Initial Commit

ldy 2025-06-18 16:07:02 +08:00
commit d173a94399
14 changed files with 14052 additions and 0 deletions

2
embed/__init__.py Normal file

@@ -0,0 +1,2 @@
# Placeholder stub; the full ImageEmbedder implementation lives in embed/image_embedder.py.
class ImageEmbedder:
    pass

258
embed/image_embedder.py Normal file

@@ -0,0 +1,258 @@
import pickle
import logging
from pathlib import Path
from typing import Dict, List, Optional, Union
import numpy as np
import torch
import torch.nn as nn
from PIL import Image, UnidentifiedImageError
from transformers import CLIPImageProcessor, CLIPVisionModel
from tqdm import tqdm
from atomicwrites import atomic_write
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Device & defaults
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEFAULT_VISION_MODEL = "openai/clip-vit-large-patch14-336"
PROJECTOR_WEIGHTS_PATH = (
Path(__file__).parent.parent
/ "models"
/ "llava-v1.5-mlp2x-336px-vicuna-7b"
/ "mm_projector.bin"
)
OUTPUT_DIM = 4096
DEFAULT_BATCH_SIZE = 32
# Default output path
EMBED_DIR = Path(__file__).parent.parent / "embeddings"
IMAGE_EMB_PATH = EMBED_DIR / "image_embeddings.pkl"
class ImageEmbedder:
"""
Frozen CLIP vision encoder + pretrained 2-layer MLP projector.
Uses PATCH-LEVEL features only (mean of patch embeddings) following
LLaVA's configuration: mm_vision_select_feature="patch".
Provides single-image and folder-wide embedding, with options to
print only or append to a persistent pickle store via atomic writes.
"""
def __init__(
self,
vision_model_name: str = DEFAULT_VISION_MODEL,
projector_weights_path: Union[str, Path] = PROJECTOR_WEIGHTS_PATH,
output_dim: int = OUTPUT_DIM,
):
# 1) Setup CLIP processor + model
self.processor = CLIPImageProcessor.from_pretrained(vision_model_name)
self.vision_model = (
CLIPVisionModel
.from_pretrained(vision_model_name, output_hidden_states=True)
.to(DEVICE)
.eval()
)
for p in self.vision_model.parameters():
p.requires_grad = False
# 2) Build MLP projector - PATCH ONLY (1024 input, not 2048)
# Following LLaVA config: mm_vision_select_feature="patch", mm_hidden_size=1024
hidden_dim = self.vision_model.config.hidden_size # 1024, not 1024*2
self.projector = nn.Sequential(
nn.Linear(hidden_dim, output_dim), # 1024 -> 4096
nn.GELU(),
nn.Linear(output_dim, output_dim), # 4096 -> 4096
).to(DEVICE)
# 3) Load projector weights
ckpt = torch.load(projector_weights_path, map_location=DEVICE)
state = ckpt.get("projector_state_dict", ckpt)
# Strip prefix
clean_state = {
(k[len("model.mm_projector."):] if k.startswith("model.mm_projector.") else k): v
for k, v in state.items()
}
# Load with warning on mismatch
missing, unexpected = self.projector.load_state_dict(clean_state, strict=False)
if missing or unexpected:
            logger.warning(f"Projector weights loaded with missing={missing}, unexpected={unexpected}")
def image_to_embedding(self, image: Image.Image) -> torch.Tensor:
"""
        Embed a single PIL.Image into an (output_dim,) L2-normalized vector.
Uses PATCH-LEVEL features only (mean of patches, no CLS token).
"""
try:
inputs = self.processor(images=image, return_tensors="pt")
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
except Exception as e:
logger.exception(f"Preprocessing failed: {e}")
raise
with torch.no_grad():
last_hidden = self.vision_model(**inputs).last_hidden_state
# PATCH-ONLY: Use mean of patch embeddings, drop CLS token
# last_hidden shape: (1, 1+num_patches, 1024)
# [:, 0, :] = CLS token (skip)
# [:, 1:, :] = patch tokens (use mean)
patch_mean = last_hidden[:, 1:, :].mean(dim=1) # (1, 1024)
combo = nn.functional.normalize(patch_mean, p=2, dim=1) # L2 normalize
proj = self.projector(combo) # (1, 4096)
emb = nn.functional.normalize(proj, p=2, dim=1) # L2 normalize output
return emb.squeeze(0) # → (OUTPUT_DIM,)
def embed_folder(
self,
folder_path: Union[str, Path],
save_path: Union[str, Path] = IMAGE_EMB_PATH,
        extensions: Optional[List[str]] = None,
batch_size: int = DEFAULT_BATCH_SIZE,
) -> Dict[str, np.ndarray]:
"""
Embed all images in a folder (recursively), skipping already-embedded
and corrupted files, saving after each batch.
Uses PATCH-LEVEL features only.
"""
if extensions is None:
extensions = [".jpg", ".jpeg", ".png", ".webp", ".bmp", ".tiff", ".gif"]
folder = Path(folder_path)
save_path = Path(save_path)
save_path.parent.mkdir(parents=True, exist_ok=True)
# Load existing embeddings
try:
with open(save_path, "rb") as fp:
mapping = pickle.load(fp)
except FileNotFoundError:
mapping = {}
except Exception as e:
logger.warning(f"Could not load embeddings: {e}")
mapping = {}
# Collect files to embed
all_files = [f for f in folder.rglob("*") if f.suffix.lower() in extensions]
to_process = [f for f in all_files if str(f) not in mapping]
if not to_process:
logger.info("No new images to embed.")
return mapping
logger.info(f"Embedding {len(to_process)} images (batch size {batch_size})")
for i in tqdm(range(0, len(to_process), batch_size), desc="Images"):
batch = to_process[i: i + batch_size]
imgs, paths = [], []
for f in batch:
try:
imgs.append(Image.open(f).convert("RGB"))
paths.append(f)
except UnidentifiedImageError:
logger.warning(f"Skipping corrupted: {f}")
except Exception as e:
logger.warning(f"Error loading {f}: {e}")
if not imgs:
continue
# Embed batch
inputs = self.processor(images=imgs, return_tensors="pt")
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
with torch.no_grad():
last_hidden = self.vision_model(**inputs).last_hidden_state
# PATCH-ONLY: Use mean of patches, drop CLS token
patch_means = last_hidden[:, 1:, :].mean(dim=1) # (batch_size, 1024)
combo = nn.functional.normalize(patch_means, p=2, dim=1)
proj = self.projector(combo) # (batch_size, 4096)
embs = nn.functional.normalize(proj, p=2, dim=1).cpu().detach().numpy()
# Update and save
for pth, emb in zip(paths, embs):
mapping[str(pth)] = emb
with atomic_write(save_path, mode="wb", overwrite=True) as fp:
pickle.dump(mapping, fp, protocol=pickle.HIGHEST_PROTOCOL)
logger.info(f"Done. Total embeddings: {len(mapping)}")
return mapping
def embed_image(
self,
image_path: Union[str, Path],
save_path: Union[str, Path] = IMAGE_EMB_PATH,
no_save: bool = False,
) -> torch.Tensor:
"""
Embed one image using PATCH-LEVEL features only:
- prints the vector
- unless no_save=True, also appends to the pickle at save_path
Returns the torch.Tensor.
"""
img_path = Path(image_path)
try:
img = Image.open(img_path).convert("RGB")
vec = self.image_to_embedding(img)
except Exception as e:
logger.error(f"Failed embedding {img_path}: {e}")
raise
# Print out the vector
vec_np = vec.cpu().detach().numpy()
print(f"{img_path} → shape={vec_np.shape}")
print(vec_np)
# Append to pickle unless disabled
if not no_save:
save_path = Path(save_path)
save_path.parent.mkdir(parents=True, exist_ok=True)
try:
with open(save_path, "rb") as fp:
mapping = pickle.load(fp)
except (FileNotFoundError, pickle.PickleError):
mapping = {}
mapping[str(img_path)] = vec_np
            with atomic_write(save_path, mode="wb", overwrite=True) as fp:
fp.write(pickle.dumps(mapping, protocol=pickle.HIGHEST_PROTOCOL))
logger.info(f"Saved embedding for {img_path} to {save_path}")
return vec
def main():
import argparse
    parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--image", type=str, help="Path to a single image to embed")
parser.add_argument("--folder", type=str, help="Path to a folder of images")
parser.add_argument("--out", type=str, default=str(IMAGE_EMB_PATH),
help="Pickle to save embeddings (default: embeddings/image_embeddings.pkl)")
parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE,
help="Batch size for folder embedding")
parser.add_argument("--no-save", action="store_true",
help="When embedding a single image, do not save to the pickle")
args = parser.parse_args()
embedder = ImageEmbedder()
if args.image:
embedder.embed_image(
image_path=args.image,
save_path=args.out,
no_save=args.no_save
)
elif args.folder:
embedder.embed_folder(
folder_path=args.folder,
save_path=args.out,
batch_size=args.batch_size
)
else:
parser.error("Please specify --image or --folder")
if __name__ == "__main__":
main()
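
A short sketch of driving the embedder above from Python rather than the CLI, and reading back the pickle it maintains; the folder path is a placeholder and assumes the COCO val2017 images mentioned in the readme have been downloaded.

import pickle
from embed.image_embedder import ImageEmbedder, IMAGE_EMB_PATH

embedder = ImageEmbedder()  # frozen CLIP vision tower + pretrained LLaVA mm_projector
mapping = embedder.embed_folder("data/coco/val2017", batch_size=32)

# embed_folder persists the same mapping atomically after every batch:
with open(IMAGE_EMB_PATH, "rb") as fp:
    stored = pickle.load(fp)
print(len(stored), "images embedded, dim =", next(iter(stored.values())).shape[0])  # dim = 4096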

138
embed/text_embedder.py Normal file

@@ -0,0 +1,138 @@
import pickle
import logging
from pathlib import Path
from typing import Dict, Union
import torch
import torch.nn as nn
import numpy as np
from sentence_transformers import SentenceTransformer
from cachetools import LRUCache
from tqdm import tqdm
from atomicwrites import atomic_write
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Device & defaults
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Use a model that's good for semantic similarity
DEFAULT_MODEL = "sentence-transformers/clip-ViT-B-32-multilingual-v1" # CLIP-based, good for images
OUTPUT_DIM = 4096
ARTICLE_CACHE_SIZE = 1024
# Persistence paths
EMBEDDINGS_DIR = Path(__file__).parent.parent / "embeddings"
TEXT_EMB_PATH = EMBEDDINGS_DIR / "text_embeddings.pkl"
class SimpleSentenceEmbedder:
"""
Simple text embedder using sentence-transformers with CLIP backbone.
This should give much better text-image alignment.
"""
def __init__(
self,
model_name: str = DEFAULT_MODEL,
output_dim: int = OUTPUT_DIM,
):
# Load sentence transformer model
self.model = SentenceTransformer(model_name, device=DEVICE)
# Get the model's output dimension
model_dim = self.model.get_sentence_embedding_dimension()
        # Simple projector to match the 4096-D image embeddings
self.projector = nn.Sequential(
nn.Linear(model_dim, output_dim),
nn.ReLU(),
nn.Linear(output_dim, output_dim),
).to(DEVICE)
        # Xavier initialization (this projector is randomly initialized; no pretrained weights are loaded)
for layer in self.projector:
if isinstance(layer, nn.Linear):
nn.init.xavier_uniform_(layer.weight)
nn.init.zeros_(layer.bias)
self.article_cache = LRUCache(maxsize=ARTICLE_CACHE_SIZE)
# Ensure embeddings dir
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)
logger.info(f"Initialized SentenceEmbedder with {model_name} ({model_dim}D -> {output_dim}D)")
def text_to_embedding(self, text: str) -> torch.Tensor:
"""
Embed text using sentence transformer + simple projector
"""
if text in self.article_cache:
return self.article_cache[text]
# Get sentence embedding
with torch.no_grad():
sentence_emb = self.model.encode([text], convert_to_tensor=True, device=DEVICE)
sentence_emb = nn.functional.normalize(sentence_emb, p=2, dim=1)
# Project to target dimension
projected = self.projector(sentence_emb)
projected = nn.functional.normalize(projected, p=2, dim=1)
result = projected.squeeze(0).cpu()
self.article_cache[text] = result
return result
def embed_file(
self,
text_file: Union[str, Path],
save_path: Union[str, Path] = TEXT_EMB_PATH
) -> Dict[str, np.ndarray]:
"""
Read newline-separated articles and embed them
"""
save_path = Path(save_path)
save_path.parent.mkdir(parents=True, exist_ok=True)
try:
with open(save_path, "rb") as fp:
mapping: Dict[str, np.ndarray] = pickle.load(fp)
except FileNotFoundError:
mapping = {}
except Exception as e:
logger.warning(f"Failed loading {save_path}: {e}")
mapping = {}
try:
with open(text_file, "r", encoding="utf-8") as fp:
lines = [ln.strip() for ln in fp if ln.strip()]
except Exception as e:
raise RuntimeError(f"Error reading {text_file}: {e}")
for ln in tqdm(lines, desc="Embedding articles"):
if ln not in mapping:
try:
vec = self.text_to_embedding(ln)
mapping[ln] = vec.cpu().numpy()
except Exception as e:
logger.error(f"Embedding failed for article: {e}")
with atomic_write(save_path, mode='wb', overwrite=True) as f:
pickle.dump(mapping, f, protocol=pickle.HIGHEST_PROTOCOL)
return mapping
def get_memory_usage(self):
"""Get current GPU memory usage if available."""
if torch.cuda.is_available():
return {
'allocated': torch.cuda.memory_allocated() / 1024 ** 3, # GB
'reserved': torch.cuda.memory_reserved() / 1024 ** 3, # GB
}
return {'allocated': 0, 'reserved': 0}
# Alias for compatibility
TextEmbedder = SimpleSentenceEmbedder
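
readme.md below invokes `python -m embed.text_embedder` with --text, --file and --out flags, but this file defines no command-line entry point; a minimal sketch of one such entry point, using only the methods defined above (flag names follow the readme):

import argparse
from embed.text_embedder import SimpleSentenceEmbedder, TEXT_EMB_PATH

def main():
    parser = argparse.ArgumentParser(description="Embed articles with SimpleSentenceEmbedder")
    parser.add_argument("--text", type=str, help="A single article to embed and print")
    parser.add_argument("--file", type=str, help="Path to a newline-separated article file")
    parser.add_argument("--out", type=str, default=str(TEXT_EMB_PATH),
                        help="Pickle in which to store embeddings")
    args = parser.parse_args()

    embedder = SimpleSentenceEmbedder()
    if args.text:
        vec = embedder.text_to_embedding(args.text)  # (4096,) L2-normalized tensor
        print(vec.cpu().numpy())
    elif args.file:
        mapping = embedder.embed_file(args.file, save_path=args.out)
        print(f"Embedded {len(mapping)} articles into {args.out}")
    else:
        parser.error("Please specify --text or --file")

if __name__ == "__main__":
    main()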

BIN
embeddings/chunk_texts.pkl Normal file

Binary file not shown.


103
examples/search.py Normal file

@@ -0,0 +1,103 @@
import numpy as np
import pickle
from pathlib import Path
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer
IMG_EMB_PKL = Path("E:/GiteaRepo/text2img/embeddings/image_embeddings.pkl")
TOP_K = 5
query_text = """orange cat"""
def get_text_embedding(text, model, projector, device):
"""Get text embedding using sentence transformer + projector"""
with torch.no_grad():
# Get sentence embedding
sentence_emb = model.encode([text], convert_to_tensor=True, device=device)
sentence_emb = nn.functional.normalize(sentence_emb, p=2, dim=1)
# Project to 4096D
projected = projector(sentence_emb)
projected = nn.functional.normalize(projected, p=2, dim=1)
return projected.squeeze(0).cpu().numpy()
def main():
print(f"Loading image embeddings from {IMG_EMB_PKL}")
# 1) Load image embeddings
try:
with open(IMG_EMB_PKL, "rb") as f:
data = pickle.load(f)
except FileNotFoundError:
print(f"Error: Could not find {IMG_EMB_PKL}")
return
paths = np.array(list(data.keys()))
vecs = np.stack(list(data.values())) # shape (N, 4096)
print(f"Loaded {len(paths)} image embeddings with shape {vecs.shape}")
# 2) Initialize text embedding model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Use CLIP-based sentence transformer for better text-image alignment
model_name = "sentence-transformers/clip-ViT-B-32-multilingual-v1"
print(f"Loading text model: {model_name}")
try:
model = SentenceTransformer(model_name, device=device)
model_dim = model.get_sentence_embedding_dimension()
print(f"Text model dimension: {model_dim}")
# Simple projector to match 4096D image embeddings
projector = nn.Sequential(
nn.Linear(model_dim, 4096),
nn.ReLU(),
nn.Linear(4096, 4096),
).to(device)
# Better initialization
for layer in projector:
if isinstance(layer, nn.Linear):
nn.init.xavier_uniform_(layer.weight)
nn.init.zeros_(layer.bias)
except Exception as e:
print(f"Error loading text model: {e}")
return
# 3) Embed the query
print(f"Embedding query: '{query_text}'")
try:
q = get_text_embedding(query_text, model, projector, device)
print(f"Query embedding shape: {q.shape}")
print(f"Query embedding norm: {np.linalg.norm(q):.4f}")
except Exception as e:
print(f"Error embedding query: {e}")
return
# 4) Cosine similarity
print("Computing similarities...")
scores = vecs @ q # Dot product since both are normalized
top_k_idx = np.argsort(-scores)[:TOP_K]
# 5) Display results
print(f"\nTop-{TOP_K} images for: {query_text!r}\n")
print("-" * 80)
for rank, idx in enumerate(top_k_idx, 1):
print(f"{rank:>2}. {paths[idx]:<60} score={scores[idx]:>7.4f}")
print("-" * 80)
# Show stats
print(f"\nStatistics:")
print(f"Mean similarity: {scores.mean():.4f}")
print(f"Max similarity: {scores.max():.4f}")
print(f"Min similarity: {scores.min():.4f}")
print(f"Std similarity: {scores.std():.4f}")
if __name__ == "__main__":
main()
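
The ranking above uses a plain dot product because both the stored image embeddings and the query vector are L2-normalized; a quick standalone check of that identity with random vectors:

import numpy as np

rng = np.random.default_rng(0)
a, b = rng.normal(size=4096), rng.normal(size=4096)

cosine = (a @ b) / (np.linalg.norm(a) * np.linalg.norm(b))
a_unit, b_unit = a / np.linalg.norm(a), b / np.linalg.norm(b)
assert np.isclose(a_unit @ b_unit, cosine)  # dot product of unit vectors == cosine similarity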

261
examples/similarity_finder.py Normal file

@@ -0,0 +1,261 @@
import pickle
import logging
from pathlib import Path
from typing import Any, Dict, List, Tuple, Union, Optional
import argparse
import numpy as np
from PIL import Image
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Default paths
EMBED_DIR = Path(__file__).parent.parent / "embeddings"
IMAGE_EMB_PATH = EMBED_DIR / "image_embeddings.pkl"
class ImageSimilarityFinder:
"""
Find similar images using precomputed embeddings from image_embeddings.pkl.
Uses cosine similarity to rank images by similarity.
"""
def __init__(self, embeddings_path: Union[str, Path] = IMAGE_EMB_PATH):
"""
Initialize the similarity finder with precomputed embeddings.
Args:
embeddings_path: Path to the pickle file containing image embeddings
"""
self.embeddings_path = Path(embeddings_path)
self.embeddings = self._load_embeddings()
self.image_paths = list(self.embeddings.keys())
self.embedding_matrix = np.array(list(self.embeddings.values()))
logger.info(f"Loaded {len(self.embeddings)} image embeddings")
def _load_embeddings(self) -> Dict[str, np.ndarray]:
"""Load embeddings from pickle file."""
try:
with open(self.embeddings_path, "rb") as fp:
embeddings = pickle.load(fp)
return embeddings
except FileNotFoundError:
raise FileNotFoundError(f"Embeddings file not found: {self.embeddings_path}")
except Exception as e:
raise RuntimeError(f"Failed to load embeddings: {e}")
def _cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
"""
Calculate cosine similarity between two vectors.
Since embeddings are already L2 normalized, this is just dot product.
"""
return np.dot(vec1, vec2)
def find_similar_by_path(
self,
query_image_path: Union[str, Path],
k: int = 5,
exclude_self: bool = True
) -> List[Tuple[str, float]]:
"""
Find k most similar images to a query image by its path.
Args:
query_image_path: Path to the query image
k: Number of similar images to return
exclude_self: Whether to exclude the query image itself from results
Returns:
List of tuples (image_path, similarity_score) sorted by similarity (highest first)
"""
query_path = str(Path(query_image_path))
if query_path not in self.embeddings:
raise ValueError(f"Query image not found in embeddings: {query_path}")
query_embedding = self.embeddings[query_path]
return self._find_similar_by_embedding(query_embedding, k, exclude_path=query_path if exclude_self else None)
def find_similar_by_embedding(
self,
query_embedding: np.ndarray,
k: int = 5
) -> List[Tuple[str, float]]:
"""
Find k most similar images to a query embedding.
Args:
query_embedding: Query image embedding vector
k: Number of similar images to return
Returns:
List of tuples (image_path, similarity_score) sorted by similarity (highest first)
"""
return self._find_similar_by_embedding(query_embedding, k)
def _find_similar_by_embedding(
self,
query_embedding: np.ndarray,
k: int,
exclude_path: Optional[str] = None
) -> List[Tuple[str, float]]:
"""
Internal method to find similar images by embedding.
Args:
query_embedding: Query embedding vector
k: Number of results to return
exclude_path: Path to exclude from results (typically the query image itself)
Returns:
List of tuples (image_path, similarity_score) sorted by similarity
"""
# Ensure query embedding is normalized (should already be from ImageEmbedder)
query_embedding = query_embedding / np.linalg.norm(query_embedding)
# Compute similarities with all embeddings
similarities = np.dot(self.embedding_matrix, query_embedding)
# Get indices sorted by similarity (descending)
sorted_indices = np.argsort(similarities)[::-1]
# Collect results, excluding the query image if specified
results = []
for idx in sorted_indices:
image_path = self.image_paths[idx]
similarity = similarities[idx]
# Skip if this is the path to exclude
if exclude_path and image_path == exclude_path:
continue
results.append((image_path, float(similarity)))
# Stop when we have k results
if len(results) >= k:
break
return results
def find_similar_images_batch(
self,
query_paths: List[Union[str, Path]],
k: int = 5
) -> Dict[str, List[Tuple[str, float]]]:
"""
Find similar images for multiple query images at once.
Args:
query_paths: List of paths to query images
k: Number of similar images to return for each query
Returns:
Dictionary mapping query_path -> list of (similar_path, similarity_score)
"""
results = {}
for query_path in query_paths:
try:
results[str(query_path)] = self.find_similar_by_path(query_path, k)
except ValueError as e:
logger.warning(f"Skipping {query_path}: {e}")
results[str(query_path)] = []
return results
    def get_image_info(self) -> Dict[str, Any]:
"""Get information about loaded embeddings."""
return {
"total_images": len(self.embeddings),
"embedding_dimension": self.embedding_matrix.shape[1] if len(self.embedding_matrix) > 0 else 0,
"sample_paths": self.image_paths[:5] if self.image_paths else []
}
def main():
parser = argparse.ArgumentParser(
description="Find similar images using precomputed embeddings"
)
parser.add_argument(
"--query",
type=str,
required=True,
help="Path to the query image"
)
parser.add_argument(
"--embeddings",
type=str,
default=str(IMAGE_EMB_PATH),
help="Path to the embeddings pickle file"
)
parser.add_argument(
"--k",
type=int,
default=5,
help="Number of similar images to return (default: 5)"
)
parser.add_argument(
"--include-self",
action="store_true",
help="Include the query image in results"
)
parser.add_argument(
"--info",
action="store_true",
help="Show information about loaded embeddings"
)
args = parser.parse_args()
# Initialize similarity finder
try:
finder = ImageSimilarityFinder(args.embeddings)
except Exception as e:
logger.error(f"Failed to initialize similarity finder: {e}")
logger.error(f"Make sure you're providing the path to the embeddings pickle file, not an image file")
logger.error(f"Expected: --embeddings /path/to/image_embeddings.pkl")
return
# Show info if requested
if args.info:
info = finder.get_image_info()
print(f"Embeddings Info:")
print(f" Total images: {info['total_images']}")
print(f" Embedding dimension: {info['embedding_dimension']}")
print(f" Sample paths: {info['sample_paths']}")
print()
# Find similar images
try:
similar_images = finder.find_similar_by_path(
args.query,
k=args.k,
exclude_self=not args.include_self
)
print(f"Top {len(similar_images)} similar images to '{args.query}':")
print("-" * 80)
for i, (image_path, similarity) in enumerate(similar_images, 1):
print(f"{i:2d}. {Path(image_path).name}")
print(f" Path: {image_path}")
print(f" Similarity: {similarity:.4f}")
print()
except Exception as e:
logger.error(f"Failed to find similar images: {e}")
# Check if query image exists in embeddings
if "not found in embeddings" in str(e):
logger.error("Available images in embeddings:")
info = finder.get_image_info()
for i, path in enumerate(info['sample_paths']):
logger.error(f" {i + 1}. {path}")
if len(finder.image_paths) > 5:
logger.error(f" ... and {len(finder.image_paths) - 5} more images")
if __name__ == "__main__":
main()
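
Besides path-based queries (demonstrated in examples/starter.py below), the finder also accepts a raw vector via find_similar_by_embedding; a small sketch, where the pickle path is a placeholder and a random vector stands in for a real 4096-D query embedding (e.g. one produced by ImageEmbedder):

import numpy as np
from similarity_finder import ImageSimilarityFinder

finder = ImageSimilarityFinder("embeddings/image_embeddings.pkl")

query_vec = np.random.default_rng(0).normal(size=4096).astype(np.float32)  # placeholder query
for path, score in finder.find_similar_by_embedding(query_vec, k=3):
    print(f"{score:.4f}  {path}")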

12
examples/starter.py Normal file

@@ -0,0 +1,12 @@
from similarity_finder import ImageSimilarityFinder
# Initialize with the embeddings pickle file (not an image file)
finder = ImageSimilarityFinder("E:/GiteaRepo/text2img/embeddings/image_embeddings.pkl")
# Query with the actual image path
query_image = "data/coco/val2017/1140002154.jpg"
similar_images = finder.find_similar_by_path(query_image, k=5)
print(f"Top 5 similar images to {query_image}:")
for i, (image_path, similarity) in enumerate(similar_images, 1):
print(f"{i}. {image_path} (similarity: {similarity:.4f})")


@@ -0,0 +1,41 @@
---
inference: false
---
<br>
<br>
# LLaVA Model Card
This is a pretrained checkpoint; you can use it to instruction-tune your multimodal models.
Check out the instructions [here](https://github.com/haotian-liu/LLaVA/blob/main/README.md#visual-instruction-tuning)
## Model details
**Model type:**
LLaVA is an open-source chatbot trained by fine-tuning LLaMA/Vicuna on GPT-generated multimodal instruction-following data.
It is an auto-regressive language model, based on the transformer architecture.
**Model date:**
LLaVA-v1.5-MLP2x-336px-Pretrain-Vicuna-7B-v1.5 was trained in September 2023.
**Paper or resources for more information:**
https://llava-vl.github.io/
## License
Llama 2 is licensed under the LLAMA 2 Community License,
Copyright (c) Meta Platforms, Inc. All Rights Reserved.
**Where to send questions or comments about the model:**
https://github.com/haotian-liu/LLaVA/issues
## Intended use
**Primary intended uses:**
The primary use of LLaVA is research on large multimodal models and chatbots.
**Primary intended users:**
The primary intended users of the model are researchers and hobbyists in computer vision, natural language processing, machine learning, and artificial intelligence.
## Training dataset
- 558K filtered image-text pairs from LAION/CC/SBU, captioned by BLIP.


@@ -0,0 +1,38 @@
{
"_name_or_path": "./checkpoints/vicuna-7b-v1-5",
"architectures": [
"LlamaForCausalLM"
],
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 4096,
"image_aspect_ratio": "square",
"initializer_range": 0.02,
"intermediate_size": 11008,
"max_position_embeddings": 4096,
"mm_hidden_size": 1024,
"mm_patch_merge_type": "flat",
"mm_projector_type": "mlp2x_gelu",
"mm_use_im_patch_token": false,
"mm_use_im_start_end": false,
"mm_vision_select_feature": "patch",
"mm_vision_select_layer": -2,
"mm_vision_tower": "openai/clip-vit-large-patch14-336",
"model_type": "llava",
"num_attention_heads": 32,
"num_hidden_layers": 32,
"num_key_value_heads": 32,
"pad_token_id": 0,
"pretraining_tp": 1,
"rms_norm_eps": 1e-05,
"rope_scaling": null,
"tie_word_embeddings": false,
"torch_dtype": "float16",
"transformers_version": "4.31.0",
"tune_mm_mlp_adapter": true,
"tune_mm_vision_resampler": false,
"use_cache": true,
"use_mm_proj": true,
"vocab_size": 32000
}
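
embed/image_embedder.py mirrors several of these fields: mm_hidden_size (1024) is the CLIP hidden size fed into the projector, mm_projector_type "mlp2x_gelu" is the Linear-GELU-Linear stack mapping 1024 -> hidden_size (4096), and mm_vision_select_feature "patch" is why the CLS token is dropped. A rough sketch of building and loading the projector directly from this config; placing config.json alongside mm_projector.bin in the models directory is an assumption:

import json
import torch
import torch.nn as nn

model_dir = "models/llava-v1.5-mlp2x-336px-vicuna-7b"
with open(f"{model_dir}/config.json") as fp:
    cfg = json.load(fp)

assert cfg["mm_projector_type"] == "mlp2x_gelu"
projector = nn.Sequential(
    nn.Linear(cfg["mm_hidden_size"], cfg["hidden_size"]),  # 1024 -> 4096
    nn.GELU(),
    nn.Linear(cfg["hidden_size"], cfg["hidden_size"]),     # 4096 -> 4096
)

ckpt = torch.load(f"{model_dir}/mm_projector.bin", map_location="cpu")
state = ckpt.get("projector_state_dict", ckpt)
state = {k.replace("model.mm_projector.", ""): v for k, v in state.items()}
projector.load_state_dict(state, strict=False)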


@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text

File diff suppressed because it is too large

53
readme.md Normal file

@@ -0,0 +1,53 @@
# Text2Img
A rough implementation that generates image embeddings using the methodology introduced in LLaVA.
### Structure
We derive image embeddings by encoding each image with a CLIP vision encoder and mapping the resulting features through LLaVA's pretrained MLP projector weights.
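In code, the pipeline boils down to the following (the image path is a placeholder; see `embed/image_embedder.py` for the full implementation):
```python
from PIL import Image
from embed.image_embedder import ImageEmbedder

embedder = ImageEmbedder()  # frozen CLIP vision tower + LLaVA mm_projector.bin
vec = embedder.image_to_embedding(Image.open("data/coco/val2017/1140002154.jpg").convert("RGB"))
print(vec.shape)  # torch.Size([4096]), L2-normalized
```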
### Prerequisites
1. Install the dependencies in requirements.txt (`pip install -r requirements.txt`)
2. Make sure you have downloaded [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin)
3. For example image data, I use [2017 Val images 5K/1GB](http://images.cocodataset.org/zips/val2017.zip) and [2017 Train/Val annotations 241MB](http://images.cocodataset.org/annotations/annotations_trainval2017.zip)
### Usage
For image_embedder.py:
1. Embed a single image (print only):
   `python -m embed.image_embedder --image "C:\path\img.jpg" --no-save`
2. Embed a single image (save to file):
   `python -m embed.image_embedder --image "C:\path\to\image.jpg" --out "C:\project\embeddings\image_embeddings.pkl"`
3. Embed a single folder of images:
   `python -m embed.image_embedder --folder "C:\path\to\images" --out "C:\project\embeddings\image_embeddings.pkl" --batch-size 32`

For text_embedder.py:
1. Embed a single article (print only):
   `python -m embed.text_embedder --text "This is my single-article input string."`
2. Embed a single article (save to file):
   `python -m embed.text_embedder --text "This is my single-article input string." --out "C:\project\embeddings\text_embeddings.pkl"`
3. Embed multiple articles from a file (one per line):
   `python -m embed.text_embedder --file "C:\path\to\articles.txt" --out "C:\project\embeddings\text_embeddings.pkl" --batch-size 8`