From 44c2e74a7acdfe22fad690c63d56fb71f58afa33 Mon Sep 17 00:00:00 2001
From: ldy
Date: Fri, 13 Jun 2025 15:00:33 +0800
Subject: [PATCH] Basic functioning

---
 benchmark_coco.py   | 136 --------------------------------------------
 embedder.py         |  19 ++-----
 find_similar_img.py |  36 ++++++++++++
 readme.md           |  16 ++++++
 starter.py          | 102 +++++++++++++++++++++++++++++++++
 text_embedder.py    |  50 ----------------
 6 files changed, 160 insertions(+), 199 deletions(-)
 delete mode 100644 benchmark_coco.py
 create mode 100644 find_similar_img.py
 create mode 100644 readme.md
 create mode 100644 starter.py
 delete mode 100644 text_embedder.py

diff --git a/benchmark_coco.py b/benchmark_coco.py
deleted file mode 100644
index 35bd611..0000000
--- a/benchmark_coco.py
+++ /dev/null
@@ -1,136 +0,0 @@
-import os
-import argparse
-import numpy as np
-from tqdm import tqdm
-from pycocotools.coco import COCO
-from PIL import Image
-import torch
-
-from embedder import ImageEmbedder, DEVICE
-from text_embedder import TextEmbedder
-
-
-def parse_args():
-    p = argparse.ArgumentParser(description="COCO retrieval benchmark")
-    p.add_argument("--coco-annotation-json", required=True,
-                   help="path to captions_val2017.json")
-    p.add_argument("--coco-image-dir", required=True,
-                   help="path to val2017/ folder")
-    p.add_argument("--llava-ckpt", required=True,
-                   help="path to pytorch_model-00003-of-00003.bin")
-    p.add_argument("--proj-model", default="openai/clip-vit-large-patch14-336")
-    p.add_argument("--proj-dim", type=int, default=5120)
-    p.add_argument("--text-model", required=True)
-    p.add_argument("--text-tokenizer", required=True)
-    p.add_argument("--batch-size", type=int, default=64)
-    return p.parse_args()
-
-
-def load_coco(ann_file):
-    coco = COCO(ann_file)
-    img_ids = coco.getImgIds()
-    img2caps = {iid: [ann["caption"] for ann in coco.loadAnns(coco.getAnnIds(imgIds=iid))]
-                for iid in img_ids}
-    return coco, img_ids, img2caps
-
-
-def compute_image_embeddings(embedder, coco, img_ids, img_dir, bs):
-    all_embs = []
-    for i in tqdm(range(0, len(img_ids), bs), desc="Images"):
-        batch = img_ids[i:i + bs]
-        imgs = []
-        for iid in batch:
-            fn = coco.loadImgs(iid)[0]["file_name"]
-            imgs.append(Image.open(os.path.join(img_dir, fn)).convert("RGB"))
-        with torch.no_grad():
-            embs = torch.stack([embedder.image_to_embedding(im) for im in imgs])
-        all_embs.append(embs.cpu().numpy())
-    return np.vstack(all_embs)
-
-
-def compute_text_embeddings(embedder, captions, bs):
-    all_embs = []
-    for i in tqdm(range(0, len(captions), bs), desc="Texts"):
-        batch = captions[i:i + bs]
-        with torch.no_grad():
-            embs = embedder.text_to_embedding(batch)
-        all_embs.append(embs.cpu().numpy())
-    return np.vstack(all_embs)
-
-
-def compute_metrics(ranks, gt, K=(1, 5, 10)):
-    R, meds = {k: 0 for k in K}, []
-    for idx, rank_list in enumerate(ranks):
-        targets = gt[idx]
-        best = min(int(np.where(rank_list == t)[0][0]) for t in targets)
-        meds.append(best + 1)
-        for k in K:
-            if best < k:
-                R[k] += 1
-    n = len(ranks)
-    recall = {k: R[k] / n for k in K}
-    return recall, np.median(meds)
-
-
-def evaluate(img_embs, txt_embs, img2caps, img_ids):
-    sims = img_embs @ txt_embs.T
-
-    # Image→Text
-    img2cap = {}
-    offset = 0
-    for i, iid in enumerate(img_ids):
-        cnt = len(img2caps[iid])
-        img2cap[i] = list(range(offset, offset + cnt))
-        offset += cnt
-
-    ranks_i2t = np.argsort(-sims, axis=1)
-    R_i2t, med_i2t = compute_metrics(ranks_i2t, img2cap)
-    print("Image→Text R@1,5,10:", [f"{R_i2t[k] * 100:.2f}%" for k in (1, 5, 10)])
-    print("Image→Text Median Rank:", med_i2t)
-
-    # Text→Image
-    cap2img, offset = {}, 0
-    for i, iid in enumerate(img_ids):
-        for _ in img2caps[iid]:
-            cap2img[offset] = i
-            offset += 1
-
-    ranks_t2i = np.argsort(-sims.T, axis=1)
-    gt = {idx: [cap2img[idx]] for idx in range(len(ranks_t2i))}
-    R_t2i, med_t2i = compute_metrics(ranks_t2i, gt)
-    print("Text→Image R@1,5,10:", [f"{R_t2i[k] * 100:.2f}%" for k in (1, 5, 10)])
-    print("Text→Image Median Rank:", med_t2i)
-
-
-def main():
-    args = parse_args()
-
-    # 1) Load COCO
-    coco, img_ids, img2caps = load_coco(args.coco_annotation_json)
-    captions = [c for iid in img_ids for c in img2caps[iid]]
-
-    # 2) Build embedders
-    img_embedder = ImageEmbedder(
-        vision_model_name=args.proj_model,
-        proj_out_dim=args.proj_dim,
-        llava_ckpt_path=args.llava_ckpt
-    )
-    txt_embedder = TextEmbedder(
-        model_name=args.text_model,
-        tokenizer_name=args.text_tokenizer
-    )
-
-    # 3) Compute embeddings
-    img_vectors = compute_image_embeddings(
-        img_embedder, coco, img_ids, args.coco_image_dir, args.batch_size
-    )
-    txt_vectors = compute_text_embeddings(
-        txt_embedder, captions, args.batch_size
-    )
-
-    # 4) Evaluate Retrieval
-    evaluate(img_vectors, txt_vectors, img2caps, img_ids)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/embedder.py b/embedder.py
index 7cfda1f..2b4fb1a 100644
--- a/embedder.py
+++ b/embedder.py
@@ -1,17 +1,9 @@
 import torch
 from transformers import CLIPImageProcessor, CLIPVisionModel
 from PIL import Image

-# Force use of GPU 0 (or change “0” to whichever GPU index you want)
-import os
-print(torch.version.cuda)
-os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-# Now select device
-if not torch.cuda.is_available():
-    raise RuntimeError("CUDA is not available—cannot run on GPU")
-DEVICE = torch.device("cuda")  # no fallback to CPU
-
-print(f"→ Running exclusively on {torch.cuda.get_device_name(0)}")
+# Use CUDA when available, otherwise fall back to CPU
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


 class ImageEmbedder:
@@ -27,7 +19,7 @@ class ImageEmbedder:
             .to(DEVICE)
             .eval()
         )
-        # Freeze it
+        # Freeze the vision encoder
         for p in self.vision_model.parameters():
             p.requires_grad = False

@@ -36,7 +28,7 @@ class ImageEmbedder:
         self.projection = torch.nn.Linear(vision_hidden_dim, proj_out_dim).to(DEVICE)
         self.projection.eval()

-        # If provided, load LLaVA’s projection weights from the full bin checkpoint
+        # Load LLaVA’s projection weights from the full bin checkpoint
        if llava_ckpt_path is not None:
             ckpt = torch.load(llava_ckpt_path, map_location=DEVICE)
             # extract only the projector weights + bias
@@ -58,5 +50,6 @@ class ImageEmbedder:
         out = self.vision_model(pixel_values=pixel_values)
         feat = out.pooler_output     # (1,1024)
         emb = self.projection(feat)  # (1,proj_out_dim)
-        # Returns a 1D tensor of size [proj_out_dim] on DEVICE
+
+        # Returns a 1D tensor of size [proj_out_dim] on DEVICE.
         return emb.squeeze(0)  # → (proj_out_dim,)
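The `# extract only the projector weights + bias` step is elided by the hunk context above. For reference, a minimal sketch of what that extraction amounts to, assuming the original LLaVA checkpoint layout where the linear projector is stored under `model.mm_projector.weight` / `model.mm_projector.bias` (the key names are an assumption; the patch itself does not show them):

```python
import torch

# Sketch only: the "model.mm_projector.*" key names are an assumption based on
# the original LLaVA checkpoint layout, not something shown in this patch.
ckpt = torch.load("datasets/pytorch_model-00003-of-00003.bin", map_location="cpu")

proj_state = {
    "weight": ckpt["model.mm_projector.weight"],  # (proj_out_dim, vision_hidden_dim)
    "bias":   ckpt["model.mm_projector.bias"],    # (proj_out_dim,)
}

# Rebuild the same Linear layer ImageEmbedder uses and load the extracted weights.
projection = torch.nn.Linear(proj_state["weight"].shape[1], proj_state["weight"].shape[0])
projection.load_state_dict(proj_state)
projection.eval()
```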
diff --git a/find_similar_img.py b/find_similar_img.py
new file mode 100644
index 0000000..cc8e8f5
--- /dev/null
+++ b/find_similar_img.py
@@ -0,0 +1,36 @@
+import numpy as np
+import pickle
+from PIL import Image
+from sklearn.metrics.pairwise import cosine_similarity
+
+from embedder import ImageEmbedder
+
+# ——— Load stored embeddings & mapping ———
+vecs = np.load("processed_images/image_vectors.npy")   # (N, D)
+with open("processed_images/index_to_file.pkl", "rb") as f:
+    idx2file = pickle.load(f)                           # dict: idx → filepath
+
+# ——— Specify query image ———
+query_path = "datasets/coco/val2017/1140002154.jpg"
+
+# ——— Embed the query image ———
+embedder = ImageEmbedder(
+    vision_model_name="openai/clip-vit-large-patch14-336",
+    proj_out_dim=5120,
+    llava_ckpt_path="datasets/pytorch_model-00003-of-00003.bin"
+)
+img = Image.open(query_path).convert("RGB")
+q_vec = embedder.image_to_embedding(img).detach().cpu().numpy()  # (D,)
+
+# ——— Compute similarities & retrieve top-k ———
+# (you can normalize if you built your DB with inner-product indexing)
+# vecs_norm = vecs / np.linalg.norm(vecs, axis=1, keepdims=True)
+# q_vec_norm = q_vec / np.linalg.norm(q_vec)
+# sims = cosine_similarity(q_vec_norm.reshape(1, -1), vecs_norm).flatten()
+sims = cosine_similarity(q_vec.reshape(1, -1), vecs).flatten()
+top5 = sims.argsort()[-5:][::-1]
+
+# ——— Print out the results ———
+print(f"Query image: {query_path}\n")
+for rank, idx in enumerate(top5, 1):
+    print(f"{rank:>2}. {idx2file[idx]} (score: {sims[idx]:.4f})")
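A side note on the similarity step in `find_similar_img.py`: `cosine_similarity` already L2-normalizes both arguments, so the commented-out normalization is only needed if the raw vectors will later be compared with a plain inner product. If the gallery grows large, a rough equivalent that normalizes once and uses `np.argpartition` to avoid a full sort could look like this sketch (the helper name `top_k_cosine` is just for illustration):

```python
import numpy as np

def top_k_cosine(q_vec: np.ndarray, vecs: np.ndarray, k: int = 5):
    """Return (indices, scores) of the k rows of `vecs` most similar to `q_vec`."""
    # Normalize once so cosine similarity reduces to a dot product.
    vecs_n = vecs / np.linalg.norm(vecs, axis=1, keepdims=True)
    q_n = q_vec / np.linalg.norm(q_vec)
    sims = vecs_n @ q_n                  # (N,)
    # argpartition picks the top-k without sorting the whole array.
    top = np.argpartition(-sims, k)[:k]
    top = top[np.argsort(-sims[top])]    # order the k hits by score
    return top, sims[top]
```

`top_k_cosine(q_vec, vecs, k=5)` would reproduce the `top5` ranking above.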
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..adae912
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,16 @@
+# Img2Vec
+
+A rough implementation of image embedding generation, following the approach introduced in LLaVA.
+
+### Structure
+Image embeddings are produced by a CLIP vision encoder whose pooled features are mapped into the LLaVA embedding space with LLaVA’s pretrained projection weights.
+
+### Prerequisites
+1. Install the packages listed in `requirements.txt`.
+2. Make sure you have downloaded `pytorch_model-00003-of-00003.bin`.
+
+### Usage
+
+Replace **image-dir** and **llava-ckpt** with your **test image folder path** and the path to **pytorch_model-00003-of-00003.bin**:
+
+`python starter.py --image-dir ./datasets/coco/val2017 --output-dir processed_images --vision-model openai/clip-vit-large-patch14-336 --proj-dim 5120 --llava-ckpt ./datasets/pytorch_model-00003-of-00003.bin --batch-size 64`
diff --git a/starter.py b/starter.py
new file mode 100644
index 0000000..004bdae
--- /dev/null
+++ b/starter.py
@@ -0,0 +1,102 @@
+import os
+import argparse
+import pickle
+
+import numpy as np
+from PIL import Image
+import torch
+
+from embedder import ImageEmbedder, DEVICE
+
+
+def parse_args():
+    p = argparse.ArgumentParser(
+        description="Batch-convert a folder of images into embedding vectors"
+    )
+    p.add_argument(
+        "--image-dir", required=True,
+        help="Path to a folder containing images (jpg/png/bmp/gif)"
+    )
+    p.add_argument(
+        "--output-dir", default="processed_images",
+        help="Where to save image_vectors.npy and index_to_file.pkl"
+    )
+    p.add_argument(
+        "--vision-model", default="openai/clip-vit-large-patch14-336",
+        help="Hugging Face name of the CLIP vision encoder"
+    )
+    p.add_argument(
+        "--proj-dim", type=int, default=5120,
+        help="Dimensionality of the projection output"
+    )
+    p.add_argument(
+        "--llava-ckpt", default=None,
+        help="(Optional) full LLaVA checkpoint .bin to load projector weights from"
+    )
+    p.add_argument(
+        "--batch-size", type=int, default=64,
+        help="How many images to encode per GPU/CPU batch"
+    )
+    return p.parse_args()
+
+
+def find_images(folder):
+    exts = (".jpg", ".jpeg", ".png", ".bmp", ".gif")
+    return sorted([
+        os.path.join(folder, fname)
+        for fname in os.listdir(folder)
+        if fname.lower().endswith(exts)
+    ])
+
+
+def main():
+    args = parse_args()
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    # Discover images
+    image_paths = find_images(args.image_dir)
+    if not image_paths:
+        raise RuntimeError(f"No images found in {args.image_dir}")
+    print(f"Found {len(image_paths)} images in {args.image_dir}")
+
+    # Build embedder
+    embedder = ImageEmbedder(
+        vision_model_name=args.vision_model,
+        proj_out_dim=args.proj_dim,
+        llava_ckpt_path=args.llava_ckpt
+    )
+    print(f"Using device: {DEVICE}")
+
+    # Process in batches
+    all_embs = []
+    index_to_file = {}
+    for batch_start in range(0, len(image_paths), args.batch_size):
+        batch_paths = image_paths[batch_start:batch_start + args.batch_size]
+        # load images
+        imgs = [Image.open(p).convert("RGB") for p in batch_paths]
+        # embed
+        with torch.no_grad():
+            embs = embedder.image_to_embedding(imgs)  # (B, D)
+        embs_np = embs.cpu().numpy()
+        all_embs.append(embs_np)
+        # record mapping
+        for i, p in enumerate(batch_paths):
+            index_to_file[batch_start + i] = p
+
+        print(f"  • Processed {batch_start + len(batch_paths)}/{len(image_paths)} images")
+
+    # Stack and save
+    vectors = np.vstack(all_embs)  # shape (N, D)
+    vec_file = os.path.join(args.output_dir, "image_vectors.npy")
+    map_file = os.path.join(args.output_dir, "index_to_file.pkl")
+
+    np.save(vec_file, vectors)
+    with open(map_file, "wb") as f:
+        pickle.dump(index_to_file, f)
+
+    print(f"\nSaved {vectors.shape[0]}×{vectors.shape[1]} vectors to\n  {vec_file}")
+    print(f"Saved index→file mapping to\n  {map_file}")
+
+
+if __name__ == "__main__":
+    main()
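`starter.py` hands a whole list of PIL images to `image_to_embedding` and relies on `CLIPImageProcessor` accepting batches; the trailing `squeeze(0)` in `embedder.py` only changes the shape when a batch happens to contain a single image, which `np.vstack` still handles. If explicit per-image embedding is ever preferred, the pattern used by the deleted `benchmark_coco.py` can be reused; roughly:

```python
import torch

def embed_batch(embedder, pil_images):
    """Embed a list of PIL images one at a time; always returns a (B, D) array."""
    with torch.no_grad():
        # image_to_embedding returns a (D,) vector per image; stack them into (B, D).
        embs = torch.stack([embedder.image_to_embedding(im) for im in pil_images])
    return embs.cpu().numpy()
```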
diff --git a/text_embedder.py b/text_embedder.py
deleted file mode 100644
index 95c994c..0000000
--- a/text_embedder.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import torch
-from transformers import AutoTokenizer, AutoModel
-from typing import List
-
-# Use the same DEVICE as embedder
-DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-
-class TextEmbedder:
-    """
-    Encodes text into the same embedding space (assumes your LLM was aligned
-    with LLaVA’s projector during fine-tuning).
-    """
-
-    def __init__(self,
-                 model_name: str,
-                 tokenizer_name: str = None):
-        tokenizer_name = tokenizer_name or model_name
-        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-        self.text_model = (
-            AutoModel
-            .from_pretrained(model_name)
-            .to(DEVICE)
-            .eval()
-        )
-        for p in self.text_model.parameters():
-            p.requires_grad = False
-
-    def text_to_embedding(self, texts: List[str]) -> torch.Tensor:
-        """
-        Returns a tensor of shape (batch_size, hidden_dim) on DEVICE.
-        Uses pooler_output if available; otherwise mean-pools tokens.
-        """
-        inputs = self.tokenizer(
-            texts,
-            padding=True,
-            truncation=True,
-            return_tensors="pt"
-        )
-        # move all inputs to GPU if available
-        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
-
-        with torch.no_grad():
-            outputs = self.text_model(**inputs)
-            if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
-                emb = outputs.pooler_output  # (batch, hidden_dim)
-            else:
-                emb = outputs.last_hidden_state.mean(dim=1)
-
-        return emb