Initial Commit

This commit is contained in:
ldy 2025-06-13 07:21:58 +08:00
commit e704bde69b
4 changed files with 263 additions and 0 deletions

136
benchmark_coco.py Normal file
View File

@ -0,0 +1,136 @@
import os
import argparse
import numpy as np
from tqdm import tqdm
from pycocotools.coco import COCO
from PIL import Image
import torch
from embedder import ImageEmbedder, DEVICE
from text_embedder import TextEmbedder
def parse_args():
    """Parse the command-line options of the COCO retrieval benchmark.

    Returns:
        argparse.Namespace with the attributes ``coco_annotation_json``,
        ``coco_image_dir``, ``llava_ckpt``, ``proj_model``, ``proj_dim``,
        ``text_model``, ``text_tokenizer`` and ``batch_size``.
    """
    parser = argparse.ArgumentParser(description="COCO retrieval benchmark")

    # (flag, add_argument keyword arguments), registered in this order.
    option_table = (
        ("--coco-annotation-json",
         {"required": True, "help": "path to captions_val2017.json"}),
        ("--coco-image-dir",
         {"required": True, "help": "path to val2017/ folder"}),
        ("--llava-ckpt",
         {"required": True,
          "help": "path to pytorch_model-00003-of-00003.bin"}),
        ("--proj-model", {"default": "openai/clip-vit-large-patch14-336"}),
        ("--proj-dim", {"type": int, "default": 5120}),
        ("--text-model", {"required": True}),
        ("--text-tokenizer", {"required": True}),
        ("--batch-size", {"type": int, "default": 64}),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    return parser.parse_args()
def load_coco(ann_file):
    """Load a COCO captions annotation file.

    Args:
        ann_file: path to the annotation JSON handed to ``COCO``.

    Returns:
        A ``(coco, img_ids, img2caps)`` tuple: the ``COCO`` handle, the image
        ids reported by ``getImgIds()``, and a dict mapping every image id to
        the list of its caption strings.
    """
    coco = COCO(ann_file)
    img_ids = coco.getImgIds()

    img2caps = {}
    for image_id in img_ids:
        annotation_ids = coco.getAnnIds(imgIds=image_id)
        annotations = coco.loadAnns(annotation_ids)
        img2caps[image_id] = [ann["caption"] for ann in annotations]

    return coco, img_ids, img2caps
def compute_image_embeddings(embedder, coco, img_ids, img_dir, bs):
    """Embed every image in ``img_ids`` and stack the results row-wise.

    Args:
        embedder: object exposing ``image_to_embedding(PIL.Image) -> Tensor``.
        coco: ``COCO`` handle used to resolve image ids to file names.
        img_ids: sequence of COCO image ids; row ``i`` of the result belongs
            to ``img_ids[i]``.
        img_dir: directory containing the image files.
        bs: number of images decoded per progress step.

    Returns:
        ``numpy.ndarray`` with one row per image.
    """
    all_embs = []
    for start in tqdm(range(0, len(img_ids), bs), desc="Images"):
        batch_ids = img_ids[start:start + bs]
        imgs = []
        for iid in batch_ids:
            fn = coco.loadImgs(iid)[0]["file_name"]
            # Image.open keeps the file open until it is closed explicitly;
            # convert() yields an independent in-memory image, so the handle
            # can be released right away instead of leaking one per image.
            with Image.open(os.path.join(img_dir, fn)) as raw:
                imgs.append(raw.convert("RGB"))
        with torch.no_grad():
            embs = torch.stack([embedder.image_to_embedding(im) for im in imgs])
        all_embs.append(embs.cpu().numpy())
    return np.vstack(all_embs)
def compute_text_embeddings(embedder, captions, bs):
    """Embed all captions in batches of ``bs`` and stack the results row-wise.

    Args:
        embedder: object exposing ``text_to_embedding(list_of_str) -> Tensor``.
        captions: sequence of caption strings; row ``i`` of the result belongs
            to ``captions[i]``.
        bs: number of captions passed to the embedder per call.

    Returns:
        ``numpy.ndarray`` with one row per caption.
    """
    chunks = []
    batch_starts = range(0, len(captions), bs)
    for start in tqdm(batch_starts, desc="Texts"):
        stop = start + bs
        with torch.no_grad():
            batch_embs = embedder.text_to_embedding(captions[start:stop])
        chunks.append(batch_embs.cpu().numpy())
    return np.vstack(chunks)
def compute_metrics(ranks, gt, K=(1, 5, 10)):
    """Compute Recall@K and the median rank of the best-ranked target.

    Args:
        ranks: one ranking per query; ``ranks[i]`` lists candidate indices
            from best to worst. Rows may be numpy arrays or plain sequences.
        gt: mapping from query index to the collection of correct candidate
            indices for that query.
        K: cut-offs at which recall is reported.

    Returns:
        ``(recall, median_rank)`` where ``recall`` maps each ``k`` in ``K`` to
        the fraction of queries whose best target sits in the top ``k``, and
        ``median_rank`` is the median 1-based rank of that best target. With
        no queries, every recall is ``0.0`` and the median is ``nan``.

    Raises:
        IndexError: if a target index does not occur in its query's ranking.
        ValueError: if a query has no targets.
    """
    hits = {k: 0 for k in K}
    best_ranks = []
    for idx, rank_list in enumerate(ranks):
        # A plain list compared with ``==`` yields a single bool rather than
        # an element-wise mask, so normalise to an array first.
        order = np.asarray(rank_list)
        best = min(int(np.where(order == t)[0][0]) for t in gt[idx])
        best_ranks.append(best + 1)  # stored 1-based for the median
        for k in K:
            if best < k:
                hits[k] += 1

    n = len(best_ranks)
    if n == 0:
        # Nothing to average over; avoid dividing by zero.
        return {k: 0.0 for k in K}, float("nan")

    recall = {k: hits[k] / n for k in K}
    return recall, np.median(best_ranks)
def evaluate(img_embs, txt_embs, img2caps, img_ids):
    """Print Recall@1/5/10 and median rank for both retrieval directions.

    Similarity is the plain matrix product of the two embedding matrices; no
    normalisation is applied in this function.

    Args:
        img_embs: array with one row per entry of ``img_ids``.
        txt_embs: array with one row per caption, ordered image by image in
            the order of ``img_ids``.
        img2caps: dict mapping image id to its list of captions.
        img_ids: image ids in embedding-row order.
    """
    top_k = (1, 5, 10)
    similarity = img_embs @ txt_embs.T

    # Captions occupy consecutive rows per image, so one pass yields both
    # ground-truth maps: image row -> caption rows, caption row -> image row.
    captions_of_image = {}
    image_of_caption = {}
    next_row = 0
    for image_row, image_id in enumerate(img_ids):
        n_caps = len(img2caps[image_id])
        caption_rows = list(range(next_row, next_row + n_caps))
        captions_of_image[image_row] = caption_rows
        for caption_row in caption_rows:
            image_of_caption[caption_row] = image_row
        next_row += n_caps

    # Image→Text
    order_i2t = np.argsort(-similarity, axis=1)
    recall_i2t, median_i2t = compute_metrics(order_i2t, captions_of_image)
    print("Image→Text R@1,5,10:", [f"{recall_i2t[k] * 100:.2f}%" for k in top_k])
    print("Image→Text Median Rank:", median_i2t)

    # Text→Image
    order_t2i = np.argsort(-similarity.T, axis=1)
    targets_t2i = {
        row: [image_of_caption[row]] for row in range(len(order_t2i))
    }
    recall_t2i, median_t2i = compute_metrics(order_t2i, targets_t2i)
    print("Text→Image R@1,5,10:", [f"{recall_t2i[k] * 100:.2f}%" for k in top_k])
    print("Text→Image Median Rank:", median_t2i)
def main():
    """Run the benchmark: load COCO, embed images and captions, print metrics."""
    args = parse_args()

    # 1) Load COCO and flatten the captions image by image, so caption rows
    #    follow the order of img_ids.
    coco, img_ids, img2caps = load_coco(args.coco_annotation_json)
    captions = []
    for iid in img_ids:
        captions.extend(img2caps[iid])

    # 2) Build embedders
    img_embedder = ImageEmbedder(
        vision_model_name=args.proj_model,
        proj_out_dim=args.proj_dim,
        llava_ckpt_path=args.llava_ckpt,
    )
    txt_embedder = TextEmbedder(
        model_name=args.text_model,
        tokenizer_name=args.text_tokenizer,
    )

    # 3) Compute embeddings
    batch_size = args.batch_size
    img_vectors = compute_image_embeddings(
        img_embedder, coco, img_ids, args.coco_image_dir, batch_size
    )
    txt_vectors = compute_text_embeddings(txt_embedder, captions, batch_size)

    # 4) Evaluate Retrieval
    evaluate(img_vectors, txt_vectors, img2caps, img_ids)


if __name__ == "__main__":
    main()

62
embedder.py Normal file
View File

@ -0,0 +1,62 @@
import torch
from transformers import CLIPImageProcessor, CLIPVisionModel
from PIL import Image
# Force use of GPU 0 (or change "0" to whichever GPU index you want).
# NOTE: the assignment below overwrites any CUDA_VISIBLE_DEVICES value that
# was already set in the environment of this process.
import os
# Everything below runs at import time, including the two print() calls.
print(torch.version.cuda)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Now select device; importing this module fails outright when CUDA is
# unavailable, because there is deliberately no CPU fallback.
if not torch.cuda.is_available():
    raise RuntimeError("CUDA is not available—cannot run on GPU")
DEVICE = torch.device("cuda") # no fallback to CPU
# NOTE(review): index 0 here is presumably relative to the devices left
# visible by CUDA_VISIBLE_DEVICES — confirm if the index above is changed.
print(f"→ Running exclusively on {torch.cuda.get_device_name(0)}")
class ImageEmbedder:
    """Embed images with a frozen CLIP vision encoder plus a linear projection.

    The projection maps the encoder's pooled output to ``proj_out_dim``
    features. Its weights are taken from a LLaVA checkpoint when one is
    given; otherwise they keep ``torch.nn.Linear``'s random initialisation.

    NOTE(review): LLaVA itself appears to feed ``mm_projector`` per-patch
    hidden states rather than ``pooler_output`` — confirm that projecting the
    pooled vector is what this benchmark intends.
    """

    def __init__(self,
                 vision_model_name: str = "openai/clip-vit-large-patch14-336",
                 proj_out_dim: int = 5120,
                 llava_ckpt_path: str = None):
        """Load the encoder, build the projection and optionally load its weights.

        Args:
            vision_model_name: Hugging Face id or path of the CLIP vision model.
            proj_out_dim: output width of the projection layer.
            llava_ckpt_path: optional checkpoint file that contains
                ``model.mm_projector.weight`` and ``model.mm_projector.bias``.

        Raises:
            KeyError: if the checkpoint lacks either projector tensor.
        """
        # Load CLIP vision encoder + processor
        self.processor = CLIPImageProcessor.from_pretrained(vision_model_name)
        self.vision_model = (
            CLIPVisionModel
            .from_pretrained(vision_model_name)
            .to(DEVICE)
            .eval()
        )
        # Freeze it
        for p in self.vision_model.parameters():
            p.requires_grad = False

        # Build the projection layer (hidden_size -> proj_out_dim) on DEVICE
        vision_hidden_dim = self.vision_model.config.hidden_size
        self.projection = torch.nn.Linear(vision_hidden_dim, proj_out_dim).to(DEVICE)
        self.projection.eval()

        # If provided, load LLaVA's projection weights from the checkpoint
        if llava_ckpt_path is not None:
            # Deserialize on the CPU: only two tensors of the file are needed,
            # so there is no reason to move the whole checkpoint onto the GPU.
            # weights_only=True keeps torch.load from unpickling arbitrary
            # objects out of the file.
            ckpt = torch.load(llava_ckpt_path, map_location="cpu", weights_only=True)
            prefix = "model.mm_projector."
            keys = [prefix + "weight", prefix + "bias"]
            missing = [k for k in keys if k not in ckpt]
            if missing:
                raise KeyError(
                    f"{llava_ckpt_path} does not contain the projector tensors {missing}"
                )
            state = {k[len(prefix):]: ckpt[k] for k in keys}
            del ckpt  # release the rest of the checkpoint before inference
            # load_state_dict copies the values into the layer already on DEVICE
            self.projection.load_state_dict(state)

    def image_to_embedding(self, image: Image.Image) -> torch.Tensor:
        """Return the projected embedding of one image.

        Args:
            image: a PIL image.

        Returns:
            1-D tensor of size ``proj_out_dim`` on ``DEVICE``, detached from
            autograd.
        """
        # preprocess & move to device
        inputs = self.processor(images=image, return_tensors="pt")
        pixel_values = inputs.pixel_values.to(DEVICE)
        # Encoder and projection both run without autograd so the result can
        # be converted to numpy regardless of the caller's grad mode.
        with torch.no_grad():
            out = self.vision_model(pixel_values=pixel_values)
            feat = out.pooler_output
            emb = self.projection(feat)
        # Drop the leading batch dimension of size 1
        return emb.squeeze(0)

15
requirements.txt Normal file
View File

@ -0,0 +1,15 @@
# Core
torch>=2.0.1
transformers>=4.30.0
Pillow>=9.5.0
requests>=2.31.0
charset-normalizer>=3.1.0
# Numerical & evaluation
numpy>=1.25.0
scikit-learn>=1.2.2
faiss-cpu>=1.7.3
# COCO utilities & progress bars
pycocotools>=2.0.6
tqdm>=4.65.0

50
text_embedder.py Normal file
View File

@ -0,0 +1,50 @@
import torch
from transformers import AutoTokenizer, AutoModel
from typing import List
# Device for the text model: CUDA when available, otherwise CPU.
# NOTE: this is computed independently of embedder.DEVICE (it is not imported
# from there), and unlike that module it falls back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class TextEmbedder:
    """
    Encodes text into the same embedding space (assumes your LLM was aligned
    with LLaVA's projector during fine-tuning).

    The model is loaded once, moved to ``DEVICE``, put in eval mode and
    frozen.
    """

    def __init__(self,
                 model_name: str,
                 tokenizer_name: str = None):
        """Load the tokenizer and the frozen text model.

        Args:
            model_name: Hugging Face id or path of the text model.
            tokenizer_name: id or path of the tokenizer; defaults to
                ``model_name`` when omitted.
        """
        tokenizer_name = tokenizer_name or model_name
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        # Batched calls use padding=True, which fails for tokenizers that
        # define no pad token; reuse EOS as padding in that case.
        if self.tokenizer.pad_token is None and self.tokenizer.eos_token is not None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        # Remember where the model lives so inputs are always sent there.
        self.device = DEVICE
        self.text_model = (
            AutoModel
            .from_pretrained(model_name)
            .to(self.device)
            .eval()
        )
        for p in self.text_model.parameters():
            p.requires_grad = False

    def text_to_embedding(self, texts: List[str]) -> torch.Tensor:
        """
        Returns a tensor of shape (batch_size, hidden_dim) on the model's device.
        Uses pooler_output if available; otherwise mean-pools the token states,
        counting only positions the attention mask marks as real tokens so the
        result does not depend on how much padding the batch needed.
        """
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            return_tensors="pt"
        )
        # move all inputs to the model's device
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.text_model(**inputs)

        pooled = getattr(outputs, "pooler_output", None)
        if pooled is not None:
            return pooled  # (batch, hidden_dim)

        hidden = outputs.last_hidden_state  # (batch, seq, hidden_dim)
        mask = inputs.get("attention_mask")
        if mask is None:
            # Without a mask there is no padding information to exclude.
            return hidden.mean(dim=1)
        weights = mask.unsqueeze(-1).to(hidden.dtype)
        # clamp guards against division by zero for an all-padding row
        token_counts = weights.sum(dim=1).clamp(min=1)
        return (hidden * weights).sum(dim=1) / token_counts