commit e704bde69b7460464561bfeb80e4a5efae9ee759
Author: ldy
Date:   Fri Jun 13 07:21:58 2025 +0800

    Initial Commit

diff --git a/benchmark_coco.py b/benchmark_coco.py
new file mode 100644
index 0000000..35bd611
--- /dev/null
+++ b/benchmark_coco.py
@@ -0,0 +1,136 @@
+import os
+import argparse
+import numpy as np
+from tqdm import tqdm
+from pycocotools.coco import COCO
+from PIL import Image
+import torch
+
+from embedder import ImageEmbedder, DEVICE
+from text_embedder import TextEmbedder
+
+
+def parse_args():
+    p = argparse.ArgumentParser(description="COCO retrieval benchmark")
+    p.add_argument("--coco-annotation-json", required=True,
+                   help="path to captions_val2017.json")
+    p.add_argument("--coco-image-dir", required=True,
+                   help="path to val2017/ folder")
+    p.add_argument("--llava-ckpt", required=True,
+                   help="path to pytorch_model-00003-of-00003.bin")
+    p.add_argument("--proj-model", default="openai/clip-vit-large-patch14-336")
+    p.add_argument("--proj-dim", type=int, default=5120)
+    p.add_argument("--text-model", required=True)
+    p.add_argument("--text-tokenizer", required=True)
+    p.add_argument("--batch-size", type=int, default=64)
+    return p.parse_args()
+
+
+def load_coco(ann_file):
+    coco = COCO(ann_file)
+    img_ids = coco.getImgIds()
+    img2caps = {iid: [ann["caption"] for ann in coco.loadAnns(coco.getAnnIds(imgIds=iid))]
+                for iid in img_ids}
+    return coco, img_ids, img2caps
+
+
+def compute_image_embeddings(embedder, coco, img_ids, img_dir, bs):
+    all_embs = []
+    for i in tqdm(range(0, len(img_ids), bs), desc="Images"):
+        batch = img_ids[i:i + bs]
+        imgs = []
+        for iid in batch:
+            fn = coco.loadImgs(iid)[0]["file_name"]
+            imgs.append(Image.open(os.path.join(img_dir, fn)).convert("RGB"))
+        with torch.no_grad():
+            embs = torch.stack([embedder.image_to_embedding(im) for im in imgs])
+        all_embs.append(embs.cpu().numpy())
+    return np.vstack(all_embs)
+
+
+def compute_text_embeddings(embedder, captions, bs):
+    all_embs = []
+    for i in tqdm(range(0, len(captions), bs), desc="Texts"):
+        batch = captions[i:i + bs]
+        with torch.no_grad():
+            embs = embedder.text_to_embedding(batch)
+        all_embs.append(embs.cpu().numpy())
+    return np.vstack(all_embs)
+
+
+def compute_metrics(ranks, gt, K=(1, 5, 10)):
+    R, meds = {k: 0 for k in K}, []
+    for idx, rank_list in enumerate(ranks):
+        targets = gt[idx]
+        best = min(int(np.where(rank_list == t)[0][0]) for t in targets)
+        meds.append(best + 1)
+        for k in K:
+            if best < k:
+                R[k] += 1
+    n = len(ranks)
+    recall = {k: R[k] / n for k in K}
+    return recall, np.median(meds)
+
+
+def evaluate(img_embs, txt_embs, img2caps, img_ids):
+    sims = img_embs @ txt_embs.T
+
+    # Image→Text
+    img2cap = {}
+    offset = 0
+    for i, iid in enumerate(img_ids):
+        cnt = len(img2caps[iid])
+        img2cap[i] = list(range(offset, offset + cnt))
+        offset += cnt
+
+    ranks_i2t = np.argsort(-sims, axis=1)
+    R_i2t, med_i2t = compute_metrics(ranks_i2t, img2cap)
+    print("Image→Text R@1,5,10:", [f"{R_i2t[k] * 100:.2f}%" for k in (1, 5, 10)])
+    print("Image→Text Median Rank:", med_i2t)
+
+    # Text→Image
+    cap2img, offset = {}, 0
+    for i, iid in enumerate(img_ids):
+        for _ in img2caps[iid]:
+            cap2img[offset] = i
+            offset += 1
+
+    ranks_t2i = np.argsort(-sims.T, axis=1)
+    gt = {idx: [cap2img[idx]] for idx in range(len(ranks_t2i))}
+    R_t2i, med_t2i = compute_metrics(ranks_t2i, gt)
+    print("Text→Image R@1,5,10:", [f"{R_t2i[k] * 100:.2f}%" for k in (1, 5, 10)])
+    print("Text→Image Median Rank:", med_t2i)
+
+
+def main():
+    args = parse_args()
+
+    # 1) Load COCO
+    coco, img_ids, img2caps = load_coco(args.coco_annotation_json)
+    captions = [c for iid in img_ids for c in img2caps[iid]]
+
+    # 2) Build embedders
+    img_embedder = ImageEmbedder(
+        vision_model_name=args.proj_model,
+        proj_out_dim=args.proj_dim,
+        llava_ckpt_path=args.llava_ckpt
+    )
+    txt_embedder = TextEmbedder(
+        model_name=args.text_model,
+        tokenizer_name=args.text_tokenizer
+    )
+
+    # 3) Compute embeddings
+    img_vectors = compute_image_embeddings(
+        img_embedder, coco, img_ids, args.coco_image_dir, args.batch_size
+    )
+    txt_vectors = compute_text_embeddings(
+        txt_embedder, captions, args.batch_size
+    )
+
+    # 4) Evaluate retrieval
+    evaluate(img_vectors, txt_vectors, img2caps, img_ids)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/embedder.py b/embedder.py
new file mode 100644
index 0000000..7cfda1f
--- /dev/null
+++ b/embedder.py
@@ -0,0 +1,62 @@
+import torch
+from transformers import CLIPImageProcessor, CLIPVisionModel
+from PIL import Image
+# Force use of GPU 0 (or change “0” to whichever GPU index you want)
+import os
+print(torch.version.cuda)
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+# Now select device
+if not torch.cuda.is_available():
+    raise RuntimeError("CUDA is not available—cannot run on GPU")
+DEVICE = torch.device("cuda")  # no fallback to CPU
+
+print(f"→ Running exclusively on {torch.cuda.get_device_name(0)}")
+
+
+class ImageEmbedder:
+    def __init__(self,
+                 vision_model_name: str = "openai/clip-vit-large-patch14-336",
+                 proj_out_dim: int = 5120,
+                 llava_ckpt_path: str = None):
+        # Load CLIP vision encoder + processor
+        self.processor = CLIPImageProcessor.from_pretrained(vision_model_name)
+        self.vision_model = (
+            CLIPVisionModel
+            .from_pretrained(vision_model_name)
+            .to(DEVICE)
+            .eval()
+        )
+        # Freeze it
+        for p in self.vision_model.parameters():
+            p.requires_grad = False
+
+        # Build the projection layer (1024 → proj_out_dim) and move it to DEVICE
+        vision_hidden_dim = self.vision_model.config.hidden_size  # should be 1024
+        self.projection = torch.nn.Linear(vision_hidden_dim, proj_out_dim).to(DEVICE)
+        self.projection.eval()
+
+        # If provided, load LLaVA’s projection weights from the full bin checkpoint
+        if llava_ckpt_path is not None:
+            ckpt = torch.load(llava_ckpt_path, map_location=DEVICE)
+            # Extract only the projector weights + bias
+            keys = ["model.mm_projector.weight", "model.mm_projector.bias"]
+            state = {
+                k.replace("model.mm_projector.", ""): ckpt[k]
+                for k in keys
+            }
+            # Load into our linear layer
+            self.projection.load_state_dict(state)
+
+    def image_to_embedding(self, image: Image.Image) -> torch.Tensor:
+        # Preprocess & move to device
+        inputs = self.processor(images=image, return_tensors="pt")
+        pixel_values = inputs.pixel_values.to(DEVICE)  # (1, 3, H, W)
+
+        # Forward pass
+        with torch.no_grad():
+            out = self.vision_model(pixel_values=pixel_values)
+            feat = out.pooler_output  # (1, 1024)
+            emb = self.projection(feat)  # (1, proj_out_dim)
+        # Returns a 1D tensor of size [proj_out_dim] on DEVICE
+        return emb.squeeze(0)  # → (proj_out_dim,)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..f7e7049
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,15 @@
+# Core
+torch>=2.0.1
+transformers>=4.30.0
+Pillow>=9.5.0
+requests>=2.31.0
+charset-normalizer>=3.1.0
+
+# Numerical & evaluation
+numpy>=1.25.0
+scikit-learn>=1.2.2
+faiss-cpu>=1.7.3
+
+# COCO utilities & progress bars
+pycocotools>=2.0.6
+tqdm>=4.65.0
diff --git a/text_embedder.py b/text_embedder.py
new file mode 100644
index 0000000..95c994c
--- /dev/null
+++ b/text_embedder.py
@@ -0,0 +1,50 @@
+import torch
+from transformers import AutoTokenizer, AutoModel
+from typing import List
+
+# Same device convention as embedder.py, but with a CPU fallback
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+class TextEmbedder:
+    """
+    Encodes text into the same embedding space (assumes your LLM was aligned
+    with LLaVA’s projector during fine-tuning).
+    """
+
+    def __init__(self,
+                 model_name: str,
+                 tokenizer_name: str = None):
+        tokenizer_name = tokenizer_name or model_name
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+        self.text_model = (
+            AutoModel
+            .from_pretrained(model_name)
+            .to(DEVICE)
+            .eval()
+        )
+        for p in self.text_model.parameters():
+            p.requires_grad = False
+
+    def text_to_embedding(self, texts: List[str]) -> torch.Tensor:
+        """
+        Returns a tensor of shape (batch_size, hidden_dim) on DEVICE.
+        Uses pooler_output if available; otherwise mean-pools tokens.
+        """
+        inputs = self.tokenizer(
+            texts,
+            padding=True,
+            truncation=True,
+            return_tensors="pt"
+        )
+        # Move all inputs to the same device as the model
+        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+
+        with torch.no_grad():
+            outputs = self.text_model(**inputs)
+            if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
+                emb = outputs.pooler_output  # (batch, hidden_dim)
+            else:
+                emb = outputs.last_hidden_state.mean(dim=1)
+
+        return emb
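For reference, below is a minimal usage sketch of the two embedder classes introduced in this commit. It is not part of the commit itself: the checkpoint path, text-model path, image file, and captions are placeholders, and the unnormalized dot-product scoring simply mirrors what benchmark_coco.evaluate() does.

# usage_sketch.py -- illustrative only; all paths, model names, and inputs below are placeholders
from PIL import Image

from embedder import ImageEmbedder
from text_embedder import TextEmbedder

# Build the two encoders (mirrors step 2 of benchmark_coco.main)
img_embedder = ImageEmbedder(
    vision_model_name="openai/clip-vit-large-patch14-336",
    proj_out_dim=5120,
    llava_ckpt_path="/path/to/pytorch_model-00003-of-00003.bin",  # placeholder path
)
txt_embedder = TextEmbedder(model_name="/path/to/aligned-text-model")  # placeholder path

# Embed one image and two candidate captions
img_vec = img_embedder.image_to_embedding(
    Image.open("example.jpg").convert("RGB")  # placeholder image
)  # (5120,)
txt_vecs = txt_embedder.text_to_embedding(
    ["a dog rides a skateboard", "two people cook in a kitchen"]
)  # (2, hidden_dim); hidden_dim must equal proj_out_dim for the scores to be meaningful

# Unnormalized dot-product scores, the same similarity used in benchmark_coco.evaluate()
scores = txt_vecs @ img_vec  # (2,)
print(scores.tolist())

For the full COCO benchmark, benchmark_coco.py is the entry point; see parse_args() for the required --coco-annotation-json, --coco-image-dir, --llava-ckpt, --text-model, and --text-tokenizer flags.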