Initial Commit

ldy 2025-06-18 16:07:02 +08:00
commit d173a94399
14 changed files with 14052 additions and 0 deletions

2
embed/__init__.py Normal file

@@ -0,0 +1,2 @@
# Placeholder stub; the full ImageEmbedder implementation lives in embed/image_embedder.py.
class ImageEmbedder:
    pass

258
embed/image_embedder.py Normal file

@@ -0,0 +1,258 @@
import pickle
import logging
from pathlib import Path
from typing import Dict, List, Optional, Union
import numpy as np
import torch
import torch.nn as nn
from PIL import Image, UnidentifiedImageError
from transformers import CLIPImageProcessor, CLIPVisionModel
from tqdm import tqdm
from atomicwrites import atomic_write
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Device & defaults
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEFAULT_VISION_MODEL = "openai/clip-vit-large-patch14-336"
PROJECTOR_WEIGHTS_PATH = (
Path(__file__).parent.parent
/ "models"
/ "llava-v1.5-mlp2x-336px-vicuna-7b"
/ "mm_projector.bin"
)
OUTPUT_DIM = 4096
DEFAULT_BATCH_SIZE = 32
# Default output path
EMBED_DIR = Path(__file__).parent.parent / "embeddings"
IMAGE_EMB_PATH = EMBED_DIR / "image_embeddings.pkl"
class ImageEmbedder:
"""
Frozen CLIP vision encoder + pretrained 2-layer MLP projector.
Uses PATCH-LEVEL features only (mean of patch embeddings) following
LLaVA's configuration: mm_vision_select_feature="patch".
Provides single-image and folder-wide embedding, with options to
print only or append to a persistent pickle store via atomic writes.
"""
def __init__(
self,
vision_model_name: str = DEFAULT_VISION_MODEL,
projector_weights_path: Union[str, Path] = PROJECTOR_WEIGHTS_PATH,
output_dim: int = OUTPUT_DIM,
):
# 1) Setup CLIP processor + model
self.processor = CLIPImageProcessor.from_pretrained(vision_model_name)
self.vision_model = (
CLIPVisionModel
.from_pretrained(vision_model_name, output_hidden_states=True)
.to(DEVICE)
.eval()
)
for p in self.vision_model.parameters():
p.requires_grad = False
# 2) Build MLP projector - PATCH ONLY (1024 input, not 2048)
# Following LLaVA config: mm_vision_select_feature="patch", mm_hidden_size=1024
hidden_dim = self.vision_model.config.hidden_size # 1024, not 1024*2
self.projector = nn.Sequential(
nn.Linear(hidden_dim, output_dim), # 1024 -> 4096
nn.GELU(),
nn.Linear(output_dim, output_dim), # 4096 -> 4096
).to(DEVICE)
# 3) Load projector weights
ckpt = torch.load(projector_weights_path, map_location=DEVICE)
state = ckpt.get("projector_state_dict", ckpt)
# Strip prefix
clean_state = {
(k[len("model.mm_projector."):] if k.startswith("model.mm_projector.") else k): v
for k, v in state.items()
}
# Load with warning on mismatch
missing, unexpected = self.projector.load_state_dict(clean_state, strict=False)
if missing or unexpected:
            logger.warning(f"Projector weights loaded with missing={missing}, unexpected={unexpected}")
def image_to_embedding(self, image: Image.Image) -> torch.Tensor:
"""
        Embed a single PIL.Image into an (output_dim,) L2-normalized vector.
Uses PATCH-LEVEL features only (mean of patches, no CLS token).
"""
try:
inputs = self.processor(images=image, return_tensors="pt")
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
except Exception as e:
logger.exception(f"Preprocessing failed: {e}")
raise
with torch.no_grad():
last_hidden = self.vision_model(**inputs).last_hidden_state
# PATCH-ONLY: Use mean of patch embeddings, drop CLS token
# last_hidden shape: (1, 1+num_patches, 1024)
# [:, 0, :] = CLS token (skip)
# [:, 1:, :] = patch tokens (use mean)
patch_mean = last_hidden[:, 1:, :].mean(dim=1) # (1, 1024)
combo = nn.functional.normalize(patch_mean, p=2, dim=1) # L2 normalize
proj = self.projector(combo) # (1, 4096)
emb = nn.functional.normalize(proj, p=2, dim=1) # L2 normalize output
return emb.squeeze(0) # → (OUTPUT_DIM,)
def embed_folder(
self,
folder_path: Union[str, Path],
save_path: Union[str, Path] = IMAGE_EMB_PATH,
        extensions: Optional[List[str]] = None,
batch_size: int = DEFAULT_BATCH_SIZE,
) -> Dict[str, np.ndarray]:
"""
Embed all images in a folder (recursively), skipping already-embedded
and corrupted files, saving after each batch.
Uses PATCH-LEVEL features only.
"""
if extensions is None:
extensions = [".jpg", ".jpeg", ".png", ".webp", ".bmp", ".tiff", ".gif"]
folder = Path(folder_path)
save_path = Path(save_path)
save_path.parent.mkdir(parents=True, exist_ok=True)
# Load existing embeddings
try:
with open(save_path, "rb") as fp:
mapping = pickle.load(fp)
except FileNotFoundError:
mapping = {}
except Exception as e:
logger.warning(f"Could not load embeddings: {e}")
mapping = {}
# Collect files to embed
all_files = [f for f in folder.rglob("*") if f.suffix.lower() in extensions]
to_process = [f for f in all_files if str(f) not in mapping]
if not to_process:
logger.info("No new images to embed.")
return mapping
logger.info(f"Embedding {len(to_process)} images (batch size {batch_size})")
for i in tqdm(range(0, len(to_process), batch_size), desc="Images"):
batch = to_process[i: i + batch_size]
imgs, paths = [], []
for f in batch:
try:
imgs.append(Image.open(f).convert("RGB"))
paths.append(f)
except UnidentifiedImageError:
logger.warning(f"Skipping corrupted: {f}")
except Exception as e:
logger.warning(f"Error loading {f}: {e}")
if not imgs:
continue
# Embed batch
inputs = self.processor(images=imgs, return_tensors="pt")
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
with torch.no_grad():
last_hidden = self.vision_model(**inputs).last_hidden_state
# PATCH-ONLY: Use mean of patches, drop CLS token
patch_means = last_hidden[:, 1:, :].mean(dim=1) # (batch_size, 1024)
combo = nn.functional.normalize(patch_means, p=2, dim=1)
proj = self.projector(combo) # (batch_size, 4096)
embs = nn.functional.normalize(proj, p=2, dim=1).cpu().detach().numpy()
# Update and save
for pth, emb in zip(paths, embs):
mapping[str(pth)] = emb
with atomic_write(save_path, mode="wb", overwrite=True) as fp:
pickle.dump(mapping, fp, protocol=pickle.HIGHEST_PROTOCOL)
logger.info(f"Done. Total embeddings: {len(mapping)}")
return mapping
def embed_image(
self,
image_path: Union[str, Path],
save_path: Union[str, Path] = IMAGE_EMB_PATH,
no_save: bool = False,
) -> torch.Tensor:
"""
Embed one image using PATCH-LEVEL features only:
- prints the vector
- unless no_save=True, also appends to the pickle at save_path
Returns the torch.Tensor.
"""
img_path = Path(image_path)
try:
img = Image.open(img_path).convert("RGB")
vec = self.image_to_embedding(img)
except Exception as e:
logger.error(f"Failed embedding {img_path}: {e}")
raise
# Print out the vector
vec_np = vec.cpu().detach().numpy()
print(f"{img_path} → shape={vec_np.shape}")
print(vec_np)
# Append to pickle unless disabled
if not no_save:
save_path = Path(save_path)
save_path.parent.mkdir(parents=True, exist_ok=True)
try:
with open(save_path, "rb") as fp:
mapping = pickle.load(fp)
except (FileNotFoundError, pickle.PickleError):
mapping = {}
mapping[str(img_path)] = vec_np
            with atomic_write(save_path, mode="wb", overwrite=True) as fp:
fp.write(pickle.dumps(mapping, protocol=pickle.HIGHEST_PROTOCOL))
logger.info(f"Saved embedding for {img_path} to {save_path}")
return vec
def main():
import argparse
    parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--image", type=str, help="Path to a single image to embed")
parser.add_argument("--folder", type=str, help="Path to a folder of images")
parser.add_argument("--out", type=str, default=str(IMAGE_EMB_PATH),
help="Pickle to save embeddings (default: embeddings/image_embeddings.pkl)")
parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE,
help="Batch size for folder embedding")
parser.add_argument("--no-save", action="store_true",
help="When embedding a single image, do not save to the pickle")
args = parser.parse_args()
embedder = ImageEmbedder()
if args.image:
embedder.embed_image(
image_path=args.image,
save_path=args.out,
no_save=args.no_save
)
elif args.folder:
embedder.embed_folder(
folder_path=args.folder,
save_path=args.out,
batch_size=args.batch_size
)
else:
parser.error("Please specify --image or --folder")
if __name__ == "__main__":
main()
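
A short sketch of driving the embedder above from Python rather than the CLI, and reading back the pickle it maintains; the folder path is a placeholder and assumes the COCO val2017 images mentioned in the readme have been downloaded.

import pickle
from embed.image_embedder import ImageEmbedder, IMAGE_EMB_PATH

embedder = ImageEmbedder()  # frozen CLIP vision tower + pretrained LLaVA mm_projector
mapping = embedder.embed_folder("data/coco/val2017", batch_size=32)

# embed_folder persists the same mapping atomically after every batch:
with open(IMAGE_EMB_PATH, "rb") as fp:
    stored = pickle.load(fp)
print(len(stored), "images embedded, dim =", next(iter(stored.values())).shape[0])  # dim = 4096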

138
embed/text_embedder.py Normal file

@@ -0,0 +1,138 @@
import pickle
import logging
from pathlib import Path
from typing import Dict, Union
import torch
import torch.nn as nn
import numpy as np
from sentence_transformers import SentenceTransformer
from cachetools import LRUCache
from tqdm import tqdm
from atomicwrites import atomic_write
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Device & defaults
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Use a model that's good for semantic similarity
DEFAULT_MODEL = "sentence-transformers/clip-ViT-B-32-multilingual-v1" # CLIP-based, good for images
OUTPUT_DIM = 4096
ARTICLE_CACHE_SIZE = 1024
# Persistence paths
EMBEDDINGS_DIR = Path(__file__).parent.parent / "embeddings"
TEXT_EMB_PATH = EMBEDDINGS_DIR / "text_embeddings.pkl"
class SimpleSentenceEmbedder:
"""
Simple text embedder using sentence-transformers with CLIP backbone.
This should give much better text-image alignment.
"""
def __init__(
self,
model_name: str = DEFAULT_MODEL,
output_dim: int = OUTPUT_DIM,
):
# Load sentence transformer model
self.model = SentenceTransformer(model_name, device=DEVICE)
# Get the model's output dimension
model_dim = self.model.get_sentence_embedding_dimension()
        # Simple projector to match the 4096-D image embeddings
self.projector = nn.Sequential(
nn.Linear(model_dim, output_dim),
nn.ReLU(),
nn.Linear(output_dim, output_dim),
).to(DEVICE)
        # Xavier initialization (this projector is randomly initialized; no pretrained weights are loaded)
for layer in self.projector:
if isinstance(layer, nn.Linear):
nn.init.xavier_uniform_(layer.weight)
nn.init.zeros_(layer.bias)
self.article_cache = LRUCache(maxsize=ARTICLE_CACHE_SIZE)
# Ensure embeddings dir
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)
logger.info(f"Initialized SentenceEmbedder with {model_name} ({model_dim}D -> {output_dim}D)")
def text_to_embedding(self, text: str) -> torch.Tensor:
"""
Embed text using sentence transformer + simple projector
"""
if text in self.article_cache:
return self.article_cache[text]
# Get sentence embedding
with torch.no_grad():
sentence_emb = self.model.encode([text], convert_to_tensor=True, device=DEVICE)
sentence_emb = nn.functional.normalize(sentence_emb, p=2, dim=1)
# Project to target dimension
projected = self.projector(sentence_emb)
projected = nn.functional.normalize(projected, p=2, dim=1)
result = projected.squeeze(0).cpu()
self.article_cache[text] = result
return result
def embed_file(
self,
text_file: Union[str, Path],
save_path: Union[str, Path] = TEXT_EMB_PATH
) -> Dict[str, np.ndarray]:
"""
Read newline-separated articles and embed them
"""
save_path = Path(save_path)
save_path.parent.mkdir(parents=True, exist_ok=True)
try:
with open(save_path, "rb") as fp:
mapping: Dict[str, np.ndarray] = pickle.load(fp)
except FileNotFoundError:
mapping = {}
except Exception as e:
logger.warning(f"Failed loading {save_path}: {e}")
mapping = {}
try:
with open(text_file, "r", encoding="utf-8") as fp:
lines = [ln.strip() for ln in fp if ln.strip()]
except Exception as e:
raise RuntimeError(f"Error reading {text_file}: {e}")
for ln in tqdm(lines, desc="Embedding articles"):
if ln not in mapping:
try:
vec = self.text_to_embedding(ln)
mapping[ln] = vec.cpu().numpy()
except Exception as e:
logger.error(f"Embedding failed for article: {e}")
with atomic_write(save_path, mode='wb', overwrite=True) as f:
pickle.dump(mapping, f, protocol=pickle.HIGHEST_PROTOCOL)
return mapping
def get_memory_usage(self):
"""Get current GPU memory usage if available."""
if torch.cuda.is_available():
return {
'allocated': torch.cuda.memory_allocated() / 1024 ** 3, # GB
'reserved': torch.cuda.memory_reserved() / 1024 ** 3, # GB
}
return {'allocated': 0, 'reserved': 0}
# Alias for compatibility
TextEmbedder = SimpleSentenceEmbedder
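
readme.md below invokes `python -m embed.text_embedder` with --text, --file and --out flags, but this file defines no command-line entry point; a minimal sketch of one such entry point, using only the methods defined above (flag names follow the readme):

import argparse
from embed.text_embedder import SimpleSentenceEmbedder, TEXT_EMB_PATH

def main():
    parser = argparse.ArgumentParser(description="Embed articles with SimpleSentenceEmbedder")
    parser.add_argument("--text", type=str, help="A single article to embed and print")
    parser.add_argument("--file", type=str, help="Path to a newline-separated article file")
    parser.add_argument("--out", type=str, default=str(TEXT_EMB_PATH),
                        help="Pickle in which to store embeddings")
    args = parser.parse_args()

    embedder = SimpleSentenceEmbedder()
    if args.text:
        vec = embedder.text_to_embedding(args.text)  # (4096,) L2-normalized tensor
        print(vec.cpu().numpy())
    elif args.file:
        mapping = embedder.embed_file(args.file, save_path=args.out)
        print(f"Embedded {len(mapping)} articles into {args.out}")
    else:
        parser.error("Please specify --text or --file")

if __name__ == "__main__":
    main()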

BIN
embeddings/chunk_texts.pkl Normal file

Binary file not shown.


103
examples/search.py Normal file

@@ -0,0 +1,103 @@
import numpy as np
import pickle
from pathlib import Path
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer
IMG_EMB_PKL = Path("E:/GiteaRepo/text2img/embeddings/image_embeddings.pkl")
TOP_K = 5
query_text = """orange cat"""
def get_text_embedding(text, model, projector, device):
"""Get text embedding using sentence transformer + projector"""
with torch.no_grad():
# Get sentence embedding
sentence_emb = model.encode([text], convert_to_tensor=True, device=device)
sentence_emb = nn.functional.normalize(sentence_emb, p=2, dim=1)
# Project to 4096D
projected = projector(sentence_emb)
projected = nn.functional.normalize(projected, p=2, dim=1)
return projected.squeeze(0).cpu().numpy()
def main():
print(f"Loading image embeddings from {IMG_EMB_PKL}")
# 1) Load image embeddings
try:
with open(IMG_EMB_PKL, "rb") as f:
data = pickle.load(f)
except FileNotFoundError:
print(f"Error: Could not find {IMG_EMB_PKL}")
return
paths = np.array(list(data.keys()))
vecs = np.stack(list(data.values())) # shape (N, 4096)
print(f"Loaded {len(paths)} image embeddings with shape {vecs.shape}")
# 2) Initialize text embedding model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Use CLIP-based sentence transformer for better text-image alignment
model_name = "sentence-transformers/clip-ViT-B-32-multilingual-v1"
print(f"Loading text model: {model_name}")
try:
model = SentenceTransformer(model_name, device=device)
model_dim = model.get_sentence_embedding_dimension()
print(f"Text model dimension: {model_dim}")
# Simple projector to match 4096D image embeddings
projector = nn.Sequential(
nn.Linear(model_dim, 4096),
nn.ReLU(),
nn.Linear(4096, 4096),
).to(device)
# Better initialization
for layer in projector:
if isinstance(layer, nn.Linear):
nn.init.xavier_uniform_(layer.weight)
nn.init.zeros_(layer.bias)
except Exception as e:
print(f"Error loading text model: {e}")
return
# 3) Embed the query
print(f"Embedding query: '{query_text}'")
try:
q = get_text_embedding(query_text, model, projector, device)
print(f"Query embedding shape: {q.shape}")
print(f"Query embedding norm: {np.linalg.norm(q):.4f}")
except Exception as e:
print(f"Error embedding query: {e}")
return
# 4) Cosine similarity
print("Computing similarities...")
scores = vecs @ q # Dot product since both are normalized
top_k_idx = np.argsort(-scores)[:TOP_K]
# 5) Display results
print(f"\nTop-{TOP_K} images for: {query_text!r}\n")
print("-" * 80)
for rank, idx in enumerate(top_k_idx, 1):
print(f"{rank:>2}. {paths[idx]:<60} score={scores[idx]:>7.4f}")
print("-" * 80)
# Show stats
print(f"\nStatistics:")
print(f"Mean similarity: {scores.mean():.4f}")
print(f"Max similarity: {scores.max():.4f}")
print(f"Min similarity: {scores.min():.4f}")
print(f"Std similarity: {scores.std():.4f}")
if __name__ == "__main__":
main()
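
The ranking above uses a plain dot product because both the stored image embeddings and the query vector are L2-normalized; a quick standalone check of that identity with random vectors:

import numpy as np

rng = np.random.default_rng(0)
a, b = rng.normal(size=4096), rng.normal(size=4096)

cosine = (a @ b) / (np.linalg.norm(a) * np.linalg.norm(b))
a_unit, b_unit = a / np.linalg.norm(a), b / np.linalg.norm(b)
assert np.isclose(a_unit @ b_unit, cosine)  # dot product of unit vectors == cosine similarity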

261
examples/similarity_finder.py Normal file

@@ -0,0 +1,261 @@
import pickle
import logging
from pathlib import Path
from typing import Any, Dict, List, Tuple, Union, Optional
import argparse
import numpy as np
from PIL import Image
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Default paths
EMBED_DIR = Path(__file__).parent.parent / "embeddings"
IMAGE_EMB_PATH = EMBED_DIR / "image_embeddings.pkl"
class ImageSimilarityFinder:
"""
Find similar images using precomputed embeddings from image_embeddings.pkl.
Uses cosine similarity to rank images by similarity.
"""
def __init__(self, embeddings_path: Union[str, Path] = IMAGE_EMB_PATH):
"""
Initialize the similarity finder with precomputed embeddings.
Args:
embeddings_path: Path to the pickle file containing image embeddings
"""
self.embeddings_path = Path(embeddings_path)
self.embeddings = self._load_embeddings()
self.image_paths = list(self.embeddings.keys())
self.embedding_matrix = np.array(list(self.embeddings.values()))
logger.info(f"Loaded {len(self.embeddings)} image embeddings")
def _load_embeddings(self) -> Dict[str, np.ndarray]:
"""Load embeddings from pickle file."""
try:
with open(self.embeddings_path, "rb") as fp:
embeddings = pickle.load(fp)
return embeddings
except FileNotFoundError:
raise FileNotFoundError(f"Embeddings file not found: {self.embeddings_path}")
except Exception as e:
raise RuntimeError(f"Failed to load embeddings: {e}")
def _cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
"""
Calculate cosine similarity between two vectors.
Since embeddings are already L2 normalized, this is just dot product.
"""
return np.dot(vec1, vec2)
def find_similar_by_path(
self,
query_image_path: Union[str, Path],
k: int = 5,
exclude_self: bool = True
) -> List[Tuple[str, float]]:
"""
Find k most similar images to a query image by its path.
Args:
query_image_path: Path to the query image
k: Number of similar images to return
exclude_self: Whether to exclude the query image itself from results
Returns:
List of tuples (image_path, similarity_score) sorted by similarity (highest first)
"""
query_path = str(Path(query_image_path))
if query_path not in self.embeddings:
raise ValueError(f"Query image not found in embeddings: {query_path}")
query_embedding = self.embeddings[query_path]
return self._find_similar_by_embedding(query_embedding, k, exclude_path=query_path if exclude_self else None)
def find_similar_by_embedding(
self,
query_embedding: np.ndarray,
k: int = 5
) -> List[Tuple[str, float]]:
"""
Find k most similar images to a query embedding.
Args:
query_embedding: Query image embedding vector
k: Number of similar images to return
Returns:
List of tuples (image_path, similarity_score) sorted by similarity (highest first)
"""
return self._find_similar_by_embedding(query_embedding, k)
def _find_similar_by_embedding(
self,
query_embedding: np.ndarray,
k: int,
exclude_path: Optional[str] = None
) -> List[Tuple[str, float]]:
"""
Internal method to find similar images by embedding.
Args:
query_embedding: Query embedding vector
k: Number of results to return
exclude_path: Path to exclude from results (typically the query image itself)
Returns:
List of tuples (image_path, similarity_score) sorted by similarity
"""
# Ensure query embedding is normalized (should already be from ImageEmbedder)
query_embedding = query_embedding / np.linalg.norm(query_embedding)
# Compute similarities with all embeddings
similarities = np.dot(self.embedding_matrix, query_embedding)
# Get indices sorted by similarity (descending)
sorted_indices = np.argsort(similarities)[::-1]
# Collect results, excluding the query image if specified
results = []
for idx in sorted_indices:
image_path = self.image_paths[idx]
similarity = similarities[idx]
# Skip if this is the path to exclude
if exclude_path and image_path == exclude_path:
continue
results.append((image_path, float(similarity)))
# Stop when we have k results
if len(results) >= k:
break
return results
def find_similar_images_batch(
self,
query_paths: List[Union[str, Path]],
k: int = 5
) -> Dict[str, List[Tuple[str, float]]]:
"""
Find similar images for multiple query images at once.
Args:
query_paths: List of paths to query images
k: Number of similar images to return for each query
Returns:
Dictionary mapping query_path -> list of (similar_path, similarity_score)
"""
results = {}
for query_path in query_paths:
try:
results[str(query_path)] = self.find_similar_by_path(query_path, k)
except ValueError as e:
logger.warning(f"Skipping {query_path}: {e}")
results[str(query_path)] = []
return results
    def get_image_info(self) -> Dict[str, Any]:
"""Get information about loaded embeddings."""
return {
"total_images": len(self.embeddings),
"embedding_dimension": self.embedding_matrix.shape[1] if len(self.embedding_matrix) > 0 else 0,
"sample_paths": self.image_paths[:5] if self.image_paths else []
}
def main():
parser = argparse.ArgumentParser(
description="Find similar images using precomputed embeddings"
)
parser.add_argument(
"--query",
type=str,
required=True,
help="Path to the query image"
)
parser.add_argument(
"--embeddings",
type=str,
default=str(IMAGE_EMB_PATH),
help="Path to the embeddings pickle file"
)
parser.add_argument(
"--k",
type=int,
default=5,
help="Number of similar images to return (default: 5)"
)
parser.add_argument(
"--include-self",
action="store_true",
help="Include the query image in results"
)
parser.add_argument(
"--info",
action="store_true",
help="Show information about loaded embeddings"
)
args = parser.parse_args()
# Initialize similarity finder
try:
finder = ImageSimilarityFinder(args.embeddings)
except Exception as e:
logger.error(f"Failed to initialize similarity finder: {e}")
logger.error(f"Make sure you're providing the path to the embeddings pickle file, not an image file")
logger.error(f"Expected: --embeddings /path/to/image_embeddings.pkl")
return
# Show info if requested
if args.info:
info = finder.get_image_info()
print(f"Embeddings Info:")
print(f" Total images: {info['total_images']}")
print(f" Embedding dimension: {info['embedding_dimension']}")
print(f" Sample paths: {info['sample_paths']}")
print()
# Find similar images
try:
similar_images = finder.find_similar_by_path(
args.query,
k=args.k,
exclude_self=not args.include_self
)
print(f"Top {len(similar_images)} similar images to '{args.query}':")
print("-" * 80)
for i, (image_path, similarity) in enumerate(similar_images, 1):
print(f"{i:2d}. {Path(image_path).name}")
print(f" Path: {image_path}")
print(f" Similarity: {similarity:.4f}")
print()
except Exception as e:
logger.error(f"Failed to find similar images: {e}")
# Check if query image exists in embeddings
if "not found in embeddings" in str(e):
logger.error("Available images in embeddings:")
info = finder.get_image_info()
for i, path in enumerate(info['sample_paths']):
logger.error(f" {i + 1}. {path}")
if len(finder.image_paths) > 5:
logger.error(f" ... and {len(finder.image_paths) - 5} more images")
if __name__ == "__main__":
main()
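
Besides path-based queries (demonstrated in examples/starter.py below), the finder also accepts a raw vector via find_similar_by_embedding; a small sketch, where the pickle path is a placeholder and a random vector stands in for a real 4096-D query embedding (e.g. one produced by ImageEmbedder):

import numpy as np
from similarity_finder import ImageSimilarityFinder

finder = ImageSimilarityFinder("embeddings/image_embeddings.pkl")

query_vec = np.random.default_rng(0).normal(size=4096).astype(np.float32)  # placeholder query
for path, score in finder.find_similar_by_embedding(query_vec, k=3):
    print(f"{score:.4f}  {path}")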

12
examples/starter.py Normal file

@@ -0,0 +1,12 @@
from similarity_finder import ImageSimilarityFinder
# Initialize with the embeddings pickle file (not an image file)
finder = ImageSimilarityFinder("E:/GiteaRepo/text2img/embeddings/image_embeddings.pkl")
# Query with the actual image path
query_image = "data/coco/val2017/1140002154.jpg"
similar_images = finder.find_similar_by_path(query_image, k=5)
print(f"Top 5 similar images to {query_image}:")
for i, (image_path, similarity) in enumerate(similar_images, 1):
print(f"{i}. {image_path} (similarity: {similarity:.4f})")


@@ -0,0 +1,41 @@
---
inference: false
---
<br>
<br>
# LLaVA Model Card
This is a pretrained checkpoint; you can use it to instruction-tune your multimodal models.
Check out the instructions [here](https://github.com/haotian-liu/LLaVA/blob/main/README.md#visual-instruction-tuning)
## Model details
**Model type:**
LLaVA is an open-source chatbot trained by fine-tuning LLaMA/Vicuna on GPT-generated multimodal instruction-following data.
It is an auto-regressive language model, based on the transformer architecture.
**Model date:**
LLaVA-v1.5-MLP2x-336px-Pretrain-Vicuna-7B-v1.5 was trained in September 2023.
**Paper or resources for more information:**
https://llava-vl.github.io/
## License
Llama 2 is licensed under the LLAMA 2 Community License,
Copyright (c) Meta Platforms, Inc. All Rights Reserved.
**Where to send questions or comments about the model:**
https://github.com/haotian-liu/LLaVA/issues
## Intended use
**Primary intended uses:**
The primary use of LLaVA is research on large multimodal models and chatbots.
**Primary intended users:**
The primary intended users of the model are researchers and hobbyists in computer vision, natural language processing, machine learning, and artificial intelligence.
## Training dataset
- 558K filtered image-text pairs from LAION/CC/SBU, captioned by BLIP.


@@ -0,0 +1,38 @@
{
"_name_or_path": "./checkpoints/vicuna-7b-v1-5",
"architectures": [
"LlamaForCausalLM"
],
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 4096,
"image_aspect_ratio": "square",
"initializer_range": 0.02,
"intermediate_size": 11008,
"max_position_embeddings": 4096,
"mm_hidden_size": 1024,
"mm_patch_merge_type": "flat",
"mm_projector_type": "mlp2x_gelu",
"mm_use_im_patch_token": false,
"mm_use_im_start_end": false,
"mm_vision_select_feature": "patch",
"mm_vision_select_layer": -2,
"mm_vision_tower": "openai/clip-vit-large-patch14-336",
"model_type": "llava",
"num_attention_heads": 32,
"num_hidden_layers": 32,
"num_key_value_heads": 32,
"pad_token_id": 0,
"pretraining_tp": 1,
"rms_norm_eps": 1e-05,
"rope_scaling": null,
"tie_word_embeddings": false,
"torch_dtype": "float16",
"transformers_version": "4.31.0",
"tune_mm_mlp_adapter": true,
"tune_mm_vision_resampler": false,
"use_cache": true,
"use_mm_proj": true,
"vocab_size": 32000
}
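
embed/image_embedder.py mirrors several of these fields: mm_hidden_size (1024) is the CLIP hidden size fed into the projector, mm_projector_type "mlp2x_gelu" is the Linear-GELU-Linear stack mapping 1024 -> hidden_size (4096), and mm_vision_select_feature "patch" is why the CLS token is dropped. A rough sketch of building and loading the projector directly from this config; placing config.json alongside mm_projector.bin in the models directory is an assumption:

import json
import torch
import torch.nn as nn

model_dir = "models/llava-v1.5-mlp2x-336px-vicuna-7b"
with open(f"{model_dir}/config.json") as fp:
    cfg = json.load(fp)

assert cfg["mm_projector_type"] == "mlp2x_gelu"
projector = nn.Sequential(
    nn.Linear(cfg["mm_hidden_size"], cfg["hidden_size"]),  # 1024 -> 4096
    nn.GELU(),
    nn.Linear(cfg["hidden_size"], cfg["hidden_size"]),     # 4096 -> 4096
)

ckpt = torch.load(f"{model_dir}/mm_projector.bin", map_location="cpu")
state = ckpt.get("projector_state_dict", ckpt)
state = {k.replace("model.mm_projector.", ""): v for k, v in state.items()}
projector.load_state_dict(state, strict=False)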


@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text

File diff suppressed because it is too large

53
readme.md Normal file

@@ -0,0 +1,53 @@
# Text2Img
A rough implementation that generates image embeddings using the methodology introduced in LLaVA.
### Structure
We derive image embeddings by encoding each image with a CLIP vision encoder and mapping the resulting features through LLaVA's pretrained MLP projector weights.
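In code, the pipeline boils down to the following (the image path is a placeholder; see `embed/image_embedder.py` for the full implementation):
```python
from PIL import Image
from embed.image_embedder import ImageEmbedder

embedder = ImageEmbedder()  # frozen CLIP vision tower + LLaVA mm_projector.bin
vec = embedder.image_to_embedding(Image.open("data/coco/val2017/1140002154.jpg").convert("RGB"))
print(vec.shape)  # torch.Size([4096]), L2-normalized
```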
### Prerequisites
1. Install the dependencies in requirements.txt (`pip install -r requirements.txt`)
2. Make sure you have downloaded [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin)
3. For example image data, I use [2017 Val images 5K/1GB](http://images.cocodataset.org/zips/val2017.zip) and [2017 Train/Val annotations 241MB](http://images.cocodataset.org/annotations/annotations_trainval2017.zip)
### Usage
For image_embedder.py:
1. Embed a single image (print only):
   `python -m embed.image_embedder --image "C:\path\img.jpg" --no-save`
2. Embed a single image (save to file):
   `python -m embed.image_embedder --image "C:\path\to\image.jpg" --out "C:\project\embeddings\image_embeddings.pkl"`
3. Embed a single folder of images:
   `python -m embed.image_embedder --folder "C:\path\to\images" --out "C:\project\embeddings\image_embeddings.pkl" --batch-size 32`

For text_embedder.py:
1. Embed a single article (print only):
   `python -m embed.text_embedder --text "This is my single-article input string."`
2. Embed a single article (save to file):
   `python -m embed.text_embedder --text "This is my single-article input string." --out "C:\project\embeddings\text_embeddings.pkl"`
3. Embed multiple articles from a file (one per line):
   `python -m embed.text_embedder --file "C:\path\to\articles.txt" --out "C:\project\embeddings\text_embeddings.pkl" --batch-size 8`