Ram07 committed
Commit b3a7580 · Parent: 86e6f10

Add ColModernVBert image search app
Files changed (3)
  1. README.md +38 -11
  2. app.py +189 -0
  3. requirements.txt +11 -0
README.md CHANGED
@@ -1,13 +1,40 @@
- ---
- title: Image Search Colmodernvbert
- emoji: 📊
- colorFrom: gray
- colorTo: red
- sdk: gradio
- sdk_version: 5.48.0
- app_file: app.py
- pinned: false
- license: mit
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
+ # Image Search with ColModernVBert
+
+ A multimodal image search demo using ColModernVBert for cross-modal retrieval between text queries and images.
+
+ ## Features
+
+ - **Multimodal Search**: Query images using natural language text
+ - **ImageNet-1K Dataset**: Searches through 1000 diverse validation images
+ - **Automatic Indexing**: Indexes 1000 images on startup
+ - **512x512 Optimization**: Images are resized for optimal model performance
+ - **Multiprocessing**: Fast parallel image preprocessing
+ - **Upload Support**: Upload custom images or switch datasets
+
+ ## Usage
+
+ Enter text queries like:
+ - `"dog"` → Various dog breeds
+ - `"sports car"` → Different car models
+ - `"musical instrument"` → Guitars, pianos, violins
+ - `"food"` → Fruits, vegetables, dishes
+ - `"nature"` → Trees, flowers, landscapes
+
+ Adjust the Top-K slider to control the number of results returned.
+
+ ## Technical Details
+
+ - **Model**: ColModernVBert (ModernVBERT/colmodernvbert)
+ - **Dataset**: ImageNet-1K validation set (1000 images)
+ - **Image Size**: 512x512 pixels
+ - **Scoring**: Cosine similarity between text and image embeddings
+
+ ## Performance
+
+ - **Indexing**: ~2-3 minutes for 1000 images
+ - **Search**: Near-instant results
+ - **Memory**: Optimized for Space hardware limits
+
  ---

+ Built with Gradio and ColModernVBert for demonstration purposes.
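
For reference, a search like the ones above can also be issued programmatically once the Space is running. Below is a minimal sketch using `gradio_client`; the Space id `Ram07/image-search-colmodernvbert` and the `/search` endpoint name are assumptions (Gradio derives default API names from the handler function), so check the Space's "Use via API" panel for the real values:

```python
# Hedged sketch: call the Space's search endpoint remotely.
# ASSUMPTIONS: the Space id and api_name below are hypothetical;
# verify both in the Space's "Use via API" panel.
from gradio_client import Client

client = Client("Ram07/image-search-colmodernvbert")  # hypothetical Space id
results = client.predict(
    "sports car",        # text query
    3,                   # Top-K
    api_name="/search",  # default name derived from the `search` handler
)
print(results)  # gallery entries captioned like "score=0.8312"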
app.py ADDED
@@ -0,0 +1,189 @@
+ import os
+ import multiprocessing as mp
+
+ import gradio as gr
+ import torch
+ import tqdm
+ from PIL import Image
+ from huggingface_hub import hf_hub_download
+ from colpali_engine.models import ColModernVBert, ColModernVBertProcessor
+ from colpali_engine.utils.torch_utils import get_torch_device
+ from datasets import load_dataset
+
+ MODEL_ID = "ModernVBERT/colmodernvbert"
+
+ device = get_torch_device("auto")
+ processor = ColModernVBertProcessor.from_pretrained(MODEL_ID)
+ model = ColModernVBert.from_pretrained(
+     MODEL_ID,
+     torch_dtype=torch.float32,
+     trust_remote_code=True,
+ )
+ model.to(device)
+ model.eval()
+
+ INDEX_IMAGES = []
+ INDEX_EMB = None
+ TARGET_SIZE = (512, 512)
+ NUM_WORKERS = max(1, mp.cpu_count() // 2)  # Use half the CPU cores (at least one) to avoid contention
+
+
+ def _ensure_size(img: Image.Image) -> Image.Image:
+     if img.size != TARGET_SIZE:
+         return img.resize(TARGET_SIZE, Image.BICUBIC)
+     return img
+
+
+ def load_sample_images():
+     paths = [
+         hf_hub_download("HuggingFaceTB/SmolVLM", "example_images/rococo.jpg", repo_type="space"),
+         hf_hub_download("HuggingFaceTB/SmolVLM", "example_images/astronaut.png", repo_type="space"),
+         hf_hub_download("HuggingFaceTB/SmolVLM", "example_images/cat.png", repo_type="space"),
+     ]
+     return [_ensure_size(Image.open(p).convert("RGB")) for p in paths]
+
+
+ def build_index(images):
+     global INDEX_IMAGES, INDEX_EMB
+     processed = [_ensure_size(img.convert("RGB")) for img in images]
+     INDEX_IMAGES = processed
+     with torch.inference_mode():
+         inputs = processor.process_images(processed).to(device)
+         emb = model(**inputs)
+         INDEX_EMB = torch.nn.functional.normalize(emb, dim=-1)
+     return f"Indexed {len(processed)} images (resized to {TARGET_SIZE[0]}x{TARGET_SIZE[1]})"
+
+
+ def ensure_index():
+     if not INDEX_IMAGES:
+         # Auto-load 1000 images from the ImageNet-1K validation set.
+         # NOTE: imagenet-1k is gated on the Hub; the Space needs a token with access.
+         print("Auto-loading 1000 images from ImageNet-1K (this may take a few minutes)...")
+         status = build_index_from_dataset("imagenet-1k", "validation", "image", 1000, 64)
+         print(f"Auto-indexing completed: {status}")
+         return status
+     return f"Index ready ({len(INDEX_IMAGES)} images)"
+
+
+ def search(query, top_k=3):
+     ensure_index()
+     with torch.inference_mode():
+         q_inputs = processor.process_texts([query]).to(device)
+         q_emb = model(**q_inputs)
+         q_emb = torch.nn.functional.normalize(q_emb, dim=-1)
+         # Cosine similarity over L2-normalized embeddings
+         # (assumes the model returns one pooled vector per input)
+         sims = (q_emb @ INDEX_EMB.T).squeeze(0)
+         vals, idxs = torch.topk(sims, k=min(top_k, len(INDEX_IMAGES)))
+     results = [(INDEX_IMAGES[i], f"score={vals[j].item():.4f}") for j, i in enumerate(idxs.tolist())]
+     return results
+
+
+ def upload_and_build(files):
+     if not files:
+         return "No files uploaded"
+     # gr.File(type="filepath") yields plain path strings
+     images = [_ensure_size(Image.open(f).convert("RGB")) for f in files]
+     return build_index(images)
+
+
+ def _preprocess_image_worker(args):
+     """Resize one dataset row's image to TARGET_SIZE; runs in a worker process."""
+     row, image_col, index = args
+     if image_col not in row or row[image_col] is None:
+         return None, index
+     img = row[image_col]
+     if hasattr(img, "convert"):
+         img = img.convert("RGB")
+     return _ensure_size(img), index
+
+
+ def build_index_from_dataset(repo_id: str, split: str = "train", image_col: str = "image", limit: int = 500, batch_size: int = 64):
+     global INDEX_IMAGES, INDEX_EMB
+     ds = load_dataset(repo_id, split=split, streaming=True)
+
+     # Step 1: collect up to `limit` rows from the streaming dataset
+     print(f"Loading and preprocessing {limit} images using {NUM_WORKERS} workers...")
+     image_data = []
+     count = 0
+     for row in ds:
+         if image_col not in row or row[image_col] is None:
+             continue
+         image_data.append((row, image_col, count))
+         count += 1
+         if len(image_data) >= limit:
+             break
+
+     # Step 2: preprocess (resize/convert) in parallel
+     with mp.Pool(NUM_WORKERS) as pool:
+         results = list(tqdm.tqdm(
+             pool.imap(_preprocess_image_worker, image_data),
+             total=len(image_data),
+             desc="Preprocessing images",
+         ))
+
+     # Drop failed rows and restore the original order
+     valid_results = sorted(((img, idx) for img, idx in results if img is not None), key=lambda x: x[1])
+     images = [img for img, _ in valid_results]
+     print(f"Successfully preprocessed {len(images)} images")
+
+     # Step 3: embed in batches (GPU-bound, kept single-process)
+     print("Computing embeddings...")
+     all_emb = []
+     with torch.inference_mode():
+         for i in tqdm.tqdm(range(0, len(images), batch_size), desc="Computing embeddings"):
+             batch = images[i:i + batch_size]
+             inputs = processor.process_images(batch).to(device)
+             emb = model(**inputs)
+             all_emb.append(torch.nn.functional.normalize(emb, dim=-1).to("cpu"))
+
+     INDEX_IMAGES = images
+     INDEX_EMB = torch.cat(all_emb, dim=0).to(device)
+     return f"Indexed {len(images)} images from {repo_id}:{split} (resized to {TARGET_SIZE[0]}x{TARGET_SIZE[1]}) - Used {NUM_WORKERS} workers"
+
+
+ with gr.Blocks(theme="default") as demo:
+     gr.Markdown("# ColModernVBert Image Search (Minimal Demo)")
+     gr.Markdown("⚠️ **First load takes ~2-3 minutes**: auto-indexing 1000 images from the ImageNet-1K validation set")
+     with gr.Row():
+         with gr.Column():
+             query = gr.Textbox(label="Text query", value="a baroque painting")
+             topk = gr.Slider(1, 8, value=3, step=1, label="Top-K")
+             btn = gr.Button("Search")
+             out = gr.Gallery(label="Results", columns=3, rows=1)
+         with gr.Column():
+             up = gr.File(file_count="multiple", type="filepath", label="Upload images to index")
+             status = gr.Textbox(label="Index status", interactive=False)
+             build = gr.Button("Build Index")
+             with gr.Accordion("Load from HF dataset (replace auto-loaded images)", open=True):
+                 repo = gr.Textbox(label="Dataset repo_id", value="imagenet-1k")
+                 split = gr.Textbox(label="Split", value="validation")
+                 img_col = gr.Textbox(label="Image column", value="image")
+                 lim = gr.Number(label="Max images", value=1000, precision=0)
+                 bsize = gr.Number(label="Batch size", value=64, precision=0)
+                 build_ds = gr.Button("Build Index from Dataset")
+                 status_ds = gr.Textbox(label="Index status", interactive=False)
+
+     btn.click(fn=search, inputs=[query, topk], outputs=out)
+     build.click(fn=upload_and_build, inputs=[up], outputs=status)
+     build_ds.click(
+         lambda r, s, c, l, b: build_index_from_dataset(r, s, c, int(l), int(b)),
+         inputs=[repo, split, img_col, lim, bsize],
+         outputs=status_ds,
+     )
+
+ if __name__ == "__main__":
+     # Build the index up front so the first search is fast; search() also
+     # calls ensure_index() as a safety net if the index is still empty
+     ensure_index()
+     demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))
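A note on scoring: `search()` ranks with a dot product over L2-normalized embeddings, i.e. cosine similarity, which assumes the model emits one pooled vector per text or image. `Col*` models in `colpali_engine` are typically late-interaction (ColBERT-style) and return one embedding per token or patch; if that holds for ColModernVBert, scoring would go through the processor's multi-vector scorer instead. A minimal sketch, reusing `model`, `processor`, and `device` from app.py, and assuming `ColModernVBertProcessor` inherits `score_multi_vector` from the library's base retrieval processor:

```python
# Hedged sketch: late-interaction (MaxSim) scoring over multi-vector embeddings.
# ASSUMPTIONS: score_multi_vector is inherited from colpali_engine's base
# retrieval processor, and the model returns a (num_tokens, dim) matrix per input.
import torch

def search_multi_vector(query: str, image_embs: list, top_k: int = 3):
    with torch.inference_mode():
        q_inputs = processor.process_texts([query]).to(device)
        q_emb = model(**q_inputs)  # shape (1, num_query_tokens, dim)
    # MaxSim: for each query token take the best-matching doc token, then sum
    scores = processor.score_multi_vector(list(q_emb), image_embs)  # (1, N)
    vals, idxs = torch.topk(scores.squeeze(0), k=min(top_k, len(image_embs)))
    return list(zip(idxs.tolist(), vals.tolist()))
```

Under this scheme `image_embs` would be kept as a list of per-image tensors rather than the single concatenated `INDEX_EMB` matrix, since token counts can differ across batches.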
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ huggingface_hub>=0.35.3
+ torch>=2.2.0
+ torchvision>=0.17.0
+ transformers>=4.40.2
+ pillow>=10.3.0
+ accelerate>=0.29.0
+ gradio>=4.44.0
+ datasets>=2.20.0
+ tqdm>=4.60.0
+
+ # flash-attn>=2.0.0  # Optional: requires CUDA toolkit
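
Two notes on this dependency list. First, app.py imports `colpali_engine`, but no `colpali-engine` requirement appears above, so the Space would fail at import time unless the package is installed separately (the exact version floor that includes ColModernVBert support is not stated in the diff). Second, regarding the commented-out flash-attn line: a minimal sketch of how the model load in app.py could opt into FlashAttention 2 only when the package is present. `attn_implementation` is a standard `transformers.from_pretrained` argument, but whether ColModernVBert behaves identically under it is an assumption:

```python
# Hedged sketch: enable FlashAttention 2 only when flash-attn is installed.
# flash-attn kernels require half precision, so the dtype switches with it;
# app.py's float32 default is kept otherwise.
import importlib.util

import torch
from colpali_engine.models import ColModernVBert

use_fa2 = importlib.util.find_spec("flash_attn") is not None

model = ColModernVBert.from_pretrained(
    "ModernVBERT/colmodernvbert",
    torch_dtype=torch.bfloat16 if use_fa2 else torch.float32,
    trust_remote_code=True,
    attn_implementation="flash_attention_2" if use_fa2 else None,  # None = library default
)
```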