Basic functioning
This commit is contained in:
parent
e704bde69b
commit
44c2e74a7a
@ -1,136 +0,0 @@
|
|||||||
import os
|
|
||||||
import argparse
|
|
||||||
import numpy as np
|
|
||||||
from tqdm import tqdm
|
|
||||||
from pycocotools.coco import COCO
|
|
||||||
from PIL import Image
|
|
||||||
import torch
|
|
||||||
|
|
||||||
from embedder import ImageEmbedder, DEVICE
|
|
||||||
from text_embedder import TextEmbedder
|
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
    """Parse the command-line options of the COCO retrieval benchmark.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="COCO retrieval benchmark")

    # (flag, keyword arguments) pairs, registered in this exact order.
    option_table = (
        ("--coco-annotation-json",
         {"required": True, "help": "path to captions_val2017.json"}),
        ("--coco-image-dir",
         {"required": True, "help": "path to val2017/ folder"}),
        ("--llava-ckpt",
         {"required": True, "help": "path to pytorch_model-00003-of-00003.bin"}),
        ("--proj-model", {"default": "openai/clip-vit-large-patch14-336"}),
        ("--proj-dim", {"type": int, "default": 5120}),
        ("--text-model", {"required": True}),
        ("--text-tokenizer", {"required": True}),
        ("--batch-size", {"type": int, "default": 64}),
    )
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)

    return parser.parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
def load_coco(ann_file):
    """Load a COCO caption annotation file and group captions by image.

    Args:
        ann_file: path handed to ``pycocotools.coco.COCO``.

    Returns:
        A ``(coco, img_ids, img2caps)`` tuple: the ``COCO`` handle, the image
        ids returned by ``coco.getImgIds()``, and a dict mapping each image id
        to the list of its ``"caption"`` strings.
    """
    coco = COCO(ann_file)
    img_ids = coco.getImgIds()

    img2caps = {}
    for image_id in img_ids:
        annotations = coco.loadAnns(coco.getAnnIds(imgIds=image_id))
        img2caps[image_id] = [record["caption"] for record in annotations]

    return coco, img_ids, img2caps
|
|
||||||
|
|
||||||
|
|
||||||
def compute_image_embeddings(embedder, coco, img_ids, img_dir, bs):
    """Embed every image in ``img_ids`` and stack the results into one array.

    Args:
        embedder: object exposing ``image_to_embedding(pil_image)``; it is
            called once per image and its results are combined with
            ``torch.stack``.
        coco: COCO handle used to look up each image's ``"file_name"``.
        img_ids: sequence of image ids; the output rows follow this order.
        img_dir: directory that contains the image files.
        bs: number of images processed per progress-bar step.

    Returns:
        ``np.vstack`` of the per-batch arrays (one row per image, assuming
        ``image_to_embedding`` returns a 1-D vector -- TODO confirm).

    Raises:
        ValueError: from ``np.vstack`` when ``img_ids`` is empty.
    """
    all_embs = []
    for start in tqdm(range(0, len(img_ids), bs), desc="Images"):
        imgs = []
        for iid in img_ids[start:start + bs]:
            fn = coco.loadImgs(iid)[0]["file_name"]
            # Image.open keeps the file handle open until the image is closed.
            # convert() decodes the pixels into a new, independent image, so
            # the handle can be released immediately instead of leaking one
            # descriptor per image.
            with Image.open(os.path.join(img_dir, fn)) as raw:
                imgs.append(raw.convert("RGB"))
        with torch.no_grad():
            embs = torch.stack([embedder.image_to_embedding(im) for im in imgs])
        all_embs.append(embs.cpu().numpy())
    return np.vstack(all_embs)
|
|
||||||
|
|
||||||
|
|
||||||
def compute_text_embeddings(embedder, captions, bs):
    """Embed ``captions`` in chunks of ``bs`` and stack the results.

    Args:
        embedder: object exposing ``text_to_embedding(list_of_str)`` that
            returns a tensor.
        captions: sequence of caption strings; output rows follow this order.
        bs: number of captions passed to the embedder per call.

    Returns:
        ``np.vstack`` of the per-chunk arrays.
    """
    chunks = []
    for start in tqdm(range(0, len(captions), bs), desc="Texts"):
        stop = start + bs
        with torch.no_grad():
            encoded = embedder.text_to_embedding(captions[start:stop])
        chunks.append(encoded.cpu().numpy())
    return np.vstack(chunks)
|
|
||||||
|
|
||||||
|
|
||||||
def compute_metrics(ranks, gt, K=(1, 5, 10)):
    """Compute Recall@K and the median rank over a set of retrieval queries.

    Args:
        ranks: sequence of per-query rankings. ``ranks[q]`` must be a NumPy
            array of candidate indices ordered best first (the lookup relies
            on element-wise ``==``).
        gt: container indexed by query position whose value is the iterable
            of correct candidate indices for that query.
        K: cut-offs for which recall is reported.

    Returns:
        ``(recall, median_rank)``. ``recall[k]`` is the fraction of queries
        whose best-placed correct candidate is within the top ``k``;
        ``median_rank`` is the median of those best positions, 1-based.
        When ``ranks`` is empty every recall is ``0.0`` and the median is
        NaN (previously this raised ``ZeroDivisionError``).

    Raises:
        IndexError: if a ground-truth index does not occur in its ranking.
        ValueError: if a query has no ground-truth indices.
    """
    hits = {k: 0 for k in K}
    best_ranks = []
    for idx, rank_list in enumerate(ranks):
        # 0-based position of the highest-ranked correct candidate.
        best = min(int(np.where(rank_list == t)[0][0]) for t in gt[idx])
        best_ranks.append(best + 1)
        for k in K:
            if best < k:
                hits[k] += 1

    n = len(ranks)
    if n == 0:
        # No queries: recall is undefined, report zeros rather than crash.
        return {k: 0.0 for k in K}, float("nan")

    recall = {k: hits[k] / n for k in K}
    return recall, np.median(best_ranks)
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate(img_embs, txt_embs, img2caps, img_ids):
    """Print Recall@1/5/10 and median rank for both retrieval directions.

    Captions are assumed to be laid out in ``img_ids`` order, each image
    owning a contiguous run of caption rows in ``txt_embs``. Similarity is
    the raw inner product ``img_embs @ txt_embs.T``; no normalisation is
    applied here.

    Args:
        img_embs: image embedding matrix, one row per entry of ``img_ids``.
        txt_embs: caption embedding matrix, one row per caption.
        img2caps: mapping from image id to its list of captions.
        img_ids: image ids in the order used to build both matrices.
    """
    sims = img_embs @ txt_embs.T

    # Image→Text: each image row maps to the caption rows it owns.
    img2cap = {}
    next_caption = 0
    for row, iid in enumerate(img_ids):
        first = next_caption
        next_caption = first + len(img2caps[iid])
        img2cap[row] = list(range(first, next_caption))

    ranks_i2t = np.argsort(-sims, axis=1)
    R_i2t, med_i2t = compute_metrics(ranks_i2t, img2cap)
    print("Image→Text R@1,5,10:", [f"{R_i2t[k] * 100:.2f}%" for k in (1, 5, 10)])
    print("Image→Text Median Rank:", med_i2t)

    # Text→Image: each caption row maps back to the single image that owns it.
    owners = [row for row, iid in enumerate(img_ids) for _ in img2caps[iid]]
    cap2img = dict(enumerate(owners))

    ranks_t2i = np.argsort(-sims.T, axis=1)
    gt = {idx: [cap2img[idx]] for idx in range(len(ranks_t2i))}
    R_t2i, med_t2i = compute_metrics(ranks_t2i, gt)
    print("Text→Image R@1,5,10:", [f"{R_t2i[k] * 100:.2f}%" for k in (1, 5, 10)])
    print("Text→Image Median Rank:", med_t2i)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Run the COCO image/text retrieval benchmark end to end."""
    args = parse_args()

    # 1) Load COCO and flatten the captions in img_ids order.
    coco, img_ids, img2caps = load_coco(args.coco_annotation_json)
    captions = []
    for iid in img_ids:
        captions.extend(img2caps[iid])

    # 2) Build embedders
    image_side = ImageEmbedder(
        vision_model_name=args.proj_model,
        proj_out_dim=args.proj_dim,
        llava_ckpt_path=args.llava_ckpt,
    )
    text_side = TextEmbedder(
        model_name=args.text_model,
        tokenizer_name=args.text_tokenizer,
    )

    # 3) Compute embeddings
    image_matrix = compute_image_embeddings(
        image_side, coco, img_ids, args.coco_image_dir, args.batch_size
    )
    text_matrix = compute_text_embeddings(text_side, captions, args.batch_size)

    # 4) Evaluate retrieval
    evaluate(image_matrix, text_matrix, img2caps, img_ids)


if __name__ == "__main__":
    main()
|
|
||||||
19
embedder.py
19
embedder.py
@ -1,17 +1,9 @@
|
|||||||
import torch
|
import torch
|
||||||
from transformers import CLIPImageProcessor, CLIPVisionModel
|
from transformers import CLIPImageProcessor, CLIPVisionModel
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
# Force use of GPU 0 (or change “0” to whichever GPU index you want)
|
|
||||||
import os
|
|
||||||
print(torch.version.cuda)
|
|
||||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
|
||||||
|
|
||||||
# Now select device
|
# Use CUDA
|
||||||
if not torch.cuda.is_available():
|
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||||
raise RuntimeError("CUDA is not available—cannot run on GPU")
|
|
||||||
DEVICE = torch.device("cuda") # no fallback to CPU
|
|
||||||
|
|
||||||
print(f"→ Running exclusively on {torch.cuda.get_device_name(0)}")
|
|
||||||
|
|
||||||
|
|
||||||
class ImageEmbedder:
|
class ImageEmbedder:
|
||||||
@ -27,7 +19,7 @@ class ImageEmbedder:
|
|||||||
.to(DEVICE)
|
.to(DEVICE)
|
||||||
.eval()
|
.eval()
|
||||||
)
|
)
|
||||||
# Freeze it
|
# Freeze version
|
||||||
for p in self.vision_model.parameters():
|
for p in self.vision_model.parameters():
|
||||||
p.requires_grad = False
|
p.requires_grad = False
|
||||||
|
|
||||||
@ -36,7 +28,7 @@ class ImageEmbedder:
|
|||||||
self.projection = torch.nn.Linear(vision_hidden_dim, proj_out_dim).to(DEVICE)
|
self.projection = torch.nn.Linear(vision_hidden_dim, proj_out_dim).to(DEVICE)
|
||||||
self.projection.eval()
|
self.projection.eval()
|
||||||
|
|
||||||
# If provided, load LLaVA’s projection weights from the full bin checkpoint
|
# Load LLaVA’s projection weights from the full bin checkpoint
|
||||||
if llava_ckpt_path is not None:
|
if llava_ckpt_path is not None:
|
||||||
ckpt = torch.load(llava_ckpt_path, map_location=DEVICE)
|
ckpt = torch.load(llava_ckpt_path, map_location=DEVICE)
|
||||||
# extract only the projector weights + bias
|
# extract only the projector weights + bias
|
||||||
@ -58,5 +50,6 @@ class ImageEmbedder:
|
|||||||
out = self.vision_model(pixel_values=pixel_values)
|
out = self.vision_model(pixel_values=pixel_values)
|
||||||
feat = out.pooler_output # (1,1024)
|
feat = out.pooler_output # (1,1024)
|
||||||
emb = self.projection(feat) # (1,proj_out_dim)
|
emb = self.projection(feat) # (1,proj_out_dim)
|
||||||
# Returns a 1D tensor of size [proj_out_dim] on DEVICE
|
|
||||||
|
# Returns a 1D tensor of size [proj_out_dim] on DEVICE.
|
||||||
return emb.squeeze(0) # → (proj_out_dim,)
|
return emb.squeeze(0) # → (proj_out_dim,)
|
||||||
|
|||||||
36
find_similar_img.py
Normal file
36
find_similar_img.py
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pickle
|
||||||
|
from PIL import Image
|
||||||
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
|
|
||||||
|
from embedder import ImageEmbedder
|
||||||
|
|
||||||
|
# ——— Load stored embeddings & mapping ———
vecs = np.load("processed_images/image_vectors.npy")  # (N, D)
# NOTE(security): pickle can execute arbitrary code while loading; only open
# an index file that you generated yourself.
with open("processed_images/index_to_file.pkl", "rb") as f:
    idx2file = pickle.load(f)  # dict: idx → filepath

# ——— Specify query image ———
query_path = "datasets/coco/val2017/1140002154.jpg"

# ——— Embed the query image ———
embedder = ImageEmbedder(
    vision_model_name="openai/clip-vit-large-patch14-336",
    proj_out_dim=5120,
    llava_ckpt_path="datasets/pytorch_model-00003-of-00003.bin"
)
# convert() decodes into an independent image, so the file handle opened by
# Image.open can be closed as soon as the conversion is done.
with Image.open(query_path) as raw_img:
    img = raw_img.convert("RGB")
# detach() is a no-op when the embedder already runs without gradients and
# prevents numpy() from failing if the returned tensor still tracks them.
q_vec = embedder.image_to_embedding(img).detach().cpu().numpy()  # (D,)

# ——— Compute similarities & retrieve top-k ———
# cosine_similarity normalises both operands itself, so the stored vectors and
# the query can be passed in unnormalised.
sims = cosine_similarity(q_vec.reshape(1, -1), vecs).flatten()
top5 = sims.argsort()[-5:][::-1]

# ——— Print out the results ———
print(f"Query image: {query_path}\n")
for rank, idx in enumerate(top5, 1):
    print(f"{rank:>2}. {idx2file[idx]}  (score: {sims[idx]:.4f})")
|
||||||
16
readme.md
Normal file
16
readme.md
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
# Img2Vec
|
||||||
|
|
||||||
|
A rough implementation of image-embedding generation based on the methodology introduced in LLaVA.
|
||||||
|
|
||||||
|
### Structure
|
||||||
|
The image embeddings are derived by encoding each image with a CLIP vision encoder and mapping the result through LLaVA’s pretrained projection weights.
|
||||||
|
|
||||||
|
### Prerequisites
|
||||||
|
1. Install the dependencies listed in `requirements.txt`
|
||||||
|
2. Make sure you have downloaded `pytorch_model-00003-of-00003.bin`
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
Replace the values of **image-dir** and **llava-ckpt** with the path to your **test image folder** and the path to **pytorch_model-00003-of-00003.bin**, respectively.
|
||||||
|
|
||||||
|
`python convert_images_to_vectors.py --image-dir ./datasets/coco/val2017 --output-dir imgVecs --vision-model openai/clip-vit-large-patch14-336 --proj-dim 5120 --llava-ckpt ./datasets/pytorch_model-00003-of-00003.bin --batch-size 64`
|
||||||
102
starter.py
Normal file
102
starter.py
Normal file
@ -0,0 +1,102 @@
|
|||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from PIL import Image
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from embedder import ImageEmbedder, DEVICE
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """Parse the command-line options of the batch image-to-vector converter.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(
        description="Batch-convert a folder of images into embedding vectors"
    )

    # (flag, keyword arguments) pairs, registered in this exact order.
    option_table = (
        ("--image-dir", {
            "required": True,
            "help": "Path to a folder containing images (jpg/png/bmp/gif)",
        }),
        ("--output-dir", {
            "default": "processed_images",
            "help": "Where to save image_vectors.npy and index_to_file.pkl",
        }),
        ("--vision-model", {
            "default": "openai/clip-vit-large-patch14-336",
            "help": "Hugging Face name of the CLIP vision encoder",
        }),
        ("--proj-dim", {
            "type": int,
            "default": 5120,
            "help": "Dimensionality of the projection output",
        }),
        ("--llava-ckpt", {
            "default": None,
            "help": "(Optional) full LLaVA checkpoint .bin to load projector weights from",
        }),
        ("--batch-size", {
            "type": int,
            "default": 64,
            "help": "How many images to encode per GPU/CPU batch",
        }),
    )
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)

    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def find_images(folder):
    """Return the sorted paths of the image files directly inside ``folder``.

    A name counts as an image when it ends, case-insensitively, with one of
    ``.jpg``, ``.jpeg``, ``.png``, ``.bmp`` or ``.gif``. Sub-directories are
    not searched, and entries that are not regular files (for example a
    directory that happens to be called ``scans.jpg``) are skipped so that the
    caller never tries to open them as images.

    Args:
        folder: directory to scan.

    Returns:
        A sorted list of ``os.path.join(folder, name)`` strings.
    """
    exts = (".jpg", ".jpeg", ".png", ".bmp", ".gif")
    candidates = (
        os.path.join(folder, fname)
        for fname in os.listdir(folder)
        if fname.lower().endswith(exts)
    )
    return sorted(path for path in candidates if os.path.isfile(path))
|
||||||
|
|
||||||
|
|
||||||
|
def _load_rgb(path):
    """Decode the image at ``path`` as RGB and release its file handle."""
    # Image.open keeps the file open; convert() yields an independent image,
    # so closing right away avoids leaking one descriptor per image.
    with Image.open(path) as raw:
        return raw.convert("RGB")


def main():
    """Embed every image of a folder and save the vectors plus an index map.

    Writes ``image_vectors.npy`` (the stacked embeddings) and
    ``index_to_file.pkl`` (a dict from row index to image path) into the
    output directory, creating that directory if necessary.

    Raises:
        RuntimeError: if the input folder contains no images.
    """
    args = parse_args()
    os.makedirs(args.output_dir, exist_ok=True)

    # Discover images
    image_paths = find_images(args.image_dir)
    if not image_paths:
        raise RuntimeError(f"No images found in {args.image_dir}")
    print(f"Found {len(image_paths)} images in {args.image_dir}")

    # Build embedder
    embedder = ImageEmbedder(
        vision_model_name=args.vision_model,
        proj_out_dim=args.proj_dim,
        llava_ckpt_path=args.llava_ckpt
    )
    print(f"Using device: {DEVICE}")

    # Process in batches
    all_embs = []
    index_to_file = {}
    for batch_start in range(0, len(image_paths), args.batch_size):
        batch_paths = image_paths[batch_start:batch_start + args.batch_size]
        # load images (each file is closed as soon as it has been decoded)
        imgs = [_load_rgb(p) for p in batch_paths]
        # embed; the whole list is handed to the embedder in one call
        with torch.no_grad():
            embs = embedder.image_to_embedding(imgs)
        all_embs.append(embs.cpu().numpy())
        # record mapping: global row index → source path
        for i, p in enumerate(batch_paths, start=batch_start):
            index_to_file[i] = p

        print(f"  • Processed {batch_start + len(batch_paths)}/{len(image_paths)} images")

    # Stack and save
    vectors = np.vstack(all_embs)
    vec_file = os.path.join(args.output_dir, "image_vectors.npy")
    map_file = os.path.join(args.output_dir, "index_to_file.pkl")

    np.save(vec_file, vectors)
    with open(map_file, "wb") as f:
        pickle.dump(index_to_file, f)

    print(f"\nSaved {vectors.shape[0]}×{vectors.shape[1]} vectors to\n  {vec_file}")
    print(f"Saved index→file mapping to\n  {map_file}")


if __name__ == "__main__":
    main()
|
||||||
@ -1,50 +0,0 @@
|
|||||||
import torch
|
|
||||||
from transformers import AutoTokenizer, AutoModel
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
# Use the same DEVICE as embedder
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class TextEmbedder:
    """
    Encodes text into the same embedding space (assumes your LLM was aligned
    with LLaVA’s projector during fine-tuning).
    """

    def __init__(self,
                 model_name: str,
                 tokenizer_name: str = None):
        """Load the tokenizer and a frozen, eval-mode text model on DEVICE.

        Args:
            model_name: identifier passed to ``AutoModel.from_pretrained``.
            tokenizer_name: identifier passed to
                ``AutoTokenizer.from_pretrained``; falls back to
                ``model_name`` when omitted or empty.
        """
        tokenizer_name = tokenizer_name or model_name
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        # text_to_embedding pads every batch; tokenizers that ship without a
        # pad token would raise there, so reuse the EOS token for padding.
        if self.tokenizer.pad_token is None and self.tokenizer.eos_token is not None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.text_model = (
            AutoModel
            .from_pretrained(model_name)
            .to(DEVICE)
            .eval()
        )
        for p in self.text_model.parameters():
            p.requires_grad = False

    def text_to_embedding(self, texts: List[str]) -> torch.Tensor:
        """
        Returns a tensor of shape (batch_size, hidden_dim) on DEVICE.
        Uses pooler_output if available; otherwise mean-pools the tokens,
        counting only the positions marked by the attention mask so that
        padding does not dilute the embedding of shorter texts.
        """
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            return_tensors="pt"
        )
        # move all inputs to GPU if available
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.text_model(**inputs)
            pooled = getattr(outputs, "pooler_output", None)
            if pooled is not None:
                return pooled  # (batch, hidden_dim)

            hidden = outputs.last_hidden_state
            mask = inputs.get("attention_mask")
            if mask is None:
                # No mask supplied by the tokenizer: plain mean over tokens.
                return hidden.mean(dim=1)

            weights = mask.unsqueeze(-1).to(hidden.dtype)
            # clamp guards against division by zero for an all-padding row.
            return (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
|
|
||||||
Loading…
x
Reference in New Issue
Block a user