Add ColModernVBert image search app
- app.py +183 -20
- requirements.txt +1 -0
app.py
CHANGED
@@ -1,6 +1,7 @@
 import os
 import gradio as gr
 import torch
+import torch.nn.functional as F
 from PIL import Image
 from huggingface_hub import hf_hub_download
 from colpali_engine.models import ColModernVBert, ColModernVBertProcessor
@@ -9,6 +10,10 @@ from datasets import load_dataset
 import multiprocessing as mp
 from functools import partial
 import tqdm
+import matplotlib.pyplot as plt
+import base64
+from io import BytesIO
+import numpy as np
 
 MODEL_ID = "ModernVBERT/colmodernvbert"
 
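Note: the new handlers below use module-level `model`, `processor`, and `device` objects that already exist in app.py and are not part of this diff. A minimal sketch of how they are typically created with colpali_engine (the Space's actual dtype and device handling may differ):

    import torch
    from colpali_engine.models import ColModernVBert, ColModernVBertProcessor

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load the retriever and its paired processor from the Hub (MODEL_ID as above).
    model = ColModernVBert.from_pretrained(MODEL_ID, torch_dtype=torch.float32).to(device).eval()
    processor = ColModernVBertProcessor.from_pretrained(MODEL_ID)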
@@ -83,6 +88,121 @@ def upload_and_build(files):
     images = [_ensure_size(Image.open(f.name).convert("RGB")) for f in files]
     return build_index(images)
 
+def visualize_attention(text_embed, img_embeds, attention_mask=None):
+    """Visualize attention between text and image embeddings"""
+    # Normalize embeddings
+    text_norm = F.normalize(text_embed, dim=-1)
+    img_norm = F.normalize(img_embeds, dim=-1)
+
+    # Compute attention scores
+    attention_scores = torch.matmul(text_norm, img_norm.transpose(-2, -1))
+
+    # Create attention heatmap
+    scores = attention_scores.squeeze().detach().cpu().numpy()
+
+    fig, ax = plt.subplots(figsize=(10, 6))
+    im = ax.imshow(scores, cmap='YlOrRd', aspect='auto')
+
+    ax.set_title('Text-Image Attention Map')
+    ax.set_xlabel('Image Embeddings')
+    ax.set_ylabel('Text Embeddings')
+
+    # Add colorbar
+    plt.colorbar(im, ax=ax)
+    plt.tight_layout()
+
+    # Render the figure to a PIL image for the Gradio Image output
+    buf = BytesIO()
+    fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
+    buf.seek(0)
+    heatmap = Image.open(buf).convert("RGB")
+    plt.close(fig)
+
+    return heatmap
+
+def test_text_image_alignment(text_inputs, image_files, comparison_text=""):
+    """Test alignment between uploaded text and images with real-time comparison"""
+    if len(image_files) < 2:
+        return "❌ At least 2 images required for comparison - upload 2+ images to compare", None, None
+
+    if not text_inputs.strip():
+        return "❌ Text input required - enter text to test alignment", None, None
+
+    try:
+        # Process uploaded images
+        images = []
+        for f in image_files:
+            img = Image.open(f.name).convert("RGB")
+            img = _ensure_size(img)
+            images.append(img)
+
+        with torch.inference_mode():
+            # Text embedding
+            text_processed = processor.process_texts([text_inputs])
+            text_processed.to(device)
+            text_embed = model(**text_processed)
+            text_embed = F.normalize(text_embed, dim=-1)
+
+            # Image embeddings
+            img_processed = processor.process_images(images)
+            img_processed.to(device)
+            img_embeds = model(**img_processed)
+            img_embeds = F.normalize(img_embeds, dim=-1)
+
+            # Compute similarities
+            similarities = F.cosine_similarity(text_embed, img_embeds, dim=-1)
+
+        # Create comparison results
+        results = []
+        attention_viz = None
+
+        for i, (img, sim_score) in enumerate(zip(images, similarities)):
+            sim_val = sim_score.item()
+            caption = f"Similarity: {sim_val:.4f}"
+
+            # Score interpretation
+            if sim_val > 0.7:
+                interpretation = "🟢 Strong match"
+            elif sim_val > 0.4:
+                interpretation = "🟡 Moderate match"
+            else:
+                interpretation = "🔴 Weak match"
+
+            results.append((img, f"{caption} - {interpretation}"))
+
+        # Generate attention visualization
+        if len(results) >= 2:
+            attention_viz = visualize_attention(text_embed, img_embeds)
+
+        # Detailed analysis
+        analysis = f"""
+**Real-time Testing Results:**
+
+🔍 **Query Text:** "{text_inputs}"
+🖼️ **Images Tested:** {len(images)}
+
+**Similarity Scores:**
+"""
+        for i, sim_val in enumerate(similarities):
+            analysis += f"- Image {i+1}: {sim_val:.4f}\n"
+
+        analysis += f"""
+**Best Match:** Image #{torch.argmax(similarities).item() + 1} (score: {similarities.max():.4f})
+**Average Score:** {similarities.mean():.4f}
+**Score Range:** {similarities.min():.4f} - {similarities.max():.4f}
+
+**Model Training Evidence:**
+✅ Text understanding: Model processes natural language
+✅ Image understanding: Model processes visual content
+✅ Cross-modal alignment: Computes meaningful similarities
+✅ Attention mechanism: Learns text-image relationships
+"""
+
+        return analysis, results, attention_viz
+
+    except Exception as e:
+        return f"❌ Error during testing: {str(e)}", None, None
+
 
 def _preprocess_image_worker(args):
     """Worker function for preprocessing images in parallel"""
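A note on scoring: the test helper above compares the model's multi-vector outputs with a single cosine similarity, whereas ColPali-style retrievers such as ColModernVBert are normally ranked with MaxSim (late interaction) over query-token and image-patch embeddings. A hedged sketch of that scoring, assuming the processor inherits colpali_engine's standard score_multi_vector helper and reusing the app's model/processor/device globals:

    with torch.inference_mode():
        q_batch = processor.process_texts(["red sports car"]).to(device)
        p_batch = processor.process_images(images).to(device)
        # Multi-vector outputs: [n_texts, n_query_tokens, dim] and [n_images, n_patch_tokens, dim]
        qs = list(torch.unbind(model(**q_batch)))
        ps = list(torch.unbind(model(**p_batch)))

    # MaxSim: each query token is matched to its best image token, then scores are summed.
    scores = processor.score_multi_vector(qs, ps)  # shape [n_texts, n_images]
    best_image = scores[0].argmax().item()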
@@ -156,27 +276,70 @@ def build_index_from_dataset(repo_id: str, split: str = "train", image_col: str
 
 
 with gr.Blocks(theme='default') as demo:
-    gr.
-   [19 further removed lines of the previous layout not captured in this view]
+    with gr.Tabs():
+        # Tab 1: Image Search
+        with gr.Tab("🖼️ Image Search"):
+            gr.Markdown("# ColModernVBert Image Search")
+            gr.Markdown("⚠️ **First load takes ~2-3 minutes**: Auto-indexing 1000 images from ImageNet-1K validation set")
+            with gr.Row():
+                with gr.Column():
+                    query = gr.Textbox(label="Text query", value="a baroque painting")
+                    topk = gr.Slider(1, 8, value=3, step=1, label="Top-K")
+                    btn = gr.Button("Search")
+            out = gr.Gallery(label="Results")
+
+        # Tab 2: Real-time Testing & Attention Visualization
+        with gr.Tab("🧪 Model Testing"):
+            gr.Markdown("# Real-time Text-Image Alignment Testing")
+            gr.Markdown("Upload **minimum 2 images** and test with text queries to analyze model behavior")
+
+            with gr.Row():
+                with gr.Column():
+                    test_text = gr.Textbox(
+                        label="Test Query Text",
+                        placeholder="Enter text like 'red car', 'dog playing', 'modern architecture'",
+                        value="red sports car"
+                    )
+                    test_images = gr.File(
+                        file_count="multiple",
+                        file_types=["image"],
+                        label="Upload Images (Min 2 required)"
+                    )
+                    test_btn = gr.Button("🧠 Test Model Alignment", variant="primary")
+
+                with gr.Column():
+                    attention_viz = gr.Image(label="Attention Heatmap", type="pil")
+
+            with gr.Row():
+                test_results = gr.Gallery(label="Image Similarity Results (2+ images)", columns=2)
+
+            test_analysis = gr.Markdown(label="Detailed Analysis")
+
+            test_btn.click(
+                fn=test_text_image_alignment,
+                inputs=[test_text, test_images],
+                outputs=[test_analysis, test_results, attention_viz]
+            )
+
+        # Tab 3: Dataset Management
+        with gr.Tab("📊 Dataset Management"):
+            gr.Markdown("# Manage Image Index")
+            with gr.Row():
+                with gr.Column():
+                    up = gr.File(file_count="multiple", type="filepath", label="Upload images to index")
+                    status = gr.Textbox(label="Index status", interactive=False)
+                    build = gr.Button("Build Index")
+
+            with gr.Accordion("Load from HF dataset", open=True):
+                repo = gr.Textbox(label="Dataset repo_id", value="imagenet-1k")
+                split = gr.Textbox(label="Split", value="validation")
+                img_col = gr.Textbox(label="Image column", value="image")
+                lim = gr.Number(label="Max images", value=1000, precision=0)
+                bsize = gr.Number(label="Batch size", value=64, precision=0)
+                build_ds = gr.Button("Build Index from Dataset")
+                status_ds = gr.Textbox(label="Index status", interactive=False)
 
+    # Event handlers
     btn.click(fn=search, inputs=[query, topk], outputs=out)
     build.click(fn=upload_and_build, inputs=[up], outputs=status)
     build_ds.click(lambda r,s,c,l,b: build_index_from_dataset(r, s, c, int(l), int(b)), inputs=[repo, split, img_col, lim, bsize], outputs=status_ds)
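The Dataset Management tab defaults to imagenet-1k, which is a gated dataset on the Hugging Face Hub, so build_index_from_dataset (defined earlier in app.py, not shown here) needs an authenticated datasets call. A hedged sketch of how the first 1000 validation images could be streamed, assuming an HF_TOKEN secret with the dataset terms accepted; the actual helper may differ:

    from itertools import islice
    from datasets import load_dataset

    # Streaming avoids downloading the full split; imagenet-1k is gated,
    # so a token with accepted terms is required (HF_TOKEN is an assumed secret name).
    ds = load_dataset("imagenet-1k", split="validation", streaming=True,
                      token=os.environ.get("HF_TOKEN"))
    images = [_ensure_size(ex["image"].convert("RGB")) for ex in islice(ds, 1000)]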
requirements.txt
CHANGED
@@ -7,5 +7,6 @@ accelerate>=0.29.0
 gradio>=4.44.0
 datasets>=2.20.0
 tqdm>=4.60.0
+matplotlib>=3.5.0
 
 # flash-attn>=2.0.0 # Optional: requires CUDA toolkit
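requirements.txt only gains matplotlib, which the new attention heatmap needs. On a headless Space there is no display server; matplotlib normally falls back to the non-interactive Agg backend on its own, but it can be pinned explicitly before pyplot is first imported. A minimal, optional sketch:

    # Optional hardening for headless environments: select the Agg backend
    # before the first pyplot import (usually automatic on servers without a display).
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt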