import os
import gradio as gr
import torch
import torch.nn.functional as F
from PIL import Image
from huggingface_hub import hf_hub_download
from colpali_engine.models import ColModernVBert, ColModernVBertProcessor
from colpali_engine.utils.torch_utils import get_torch_device
from datasets import load_dataset
import multiprocessing as mp
from functools import partial
import tqdm
import matplotlib.pyplot as plt
import base64
from io import BytesIO
import numpy as np
MODEL_ID = "ModernVBERT/colmodernvbert"
device = get_torch_device("auto")
processor = ColModernVBertProcessor.from_pretrained(MODEL_ID)
model = ColModernVBert.from_pretrained(
MODEL_ID,
torch_dtype=torch.float32,
trust_remote_code=True,
)
model.to(device)
model.eval()
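# Optional (untested sketch): on a CUDA device the model can usually be loaded in bfloat16
# to roughly halve memory use; float32 above is the safe default for CPU-only Spaces.
# model = ColModernVBert.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True).to(device).eval()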
INDEX_IMAGES = []
INDEX_EMB = None
TARGET_SIZE = (512, 512)
NUM_WORKERS = max(1, mp.cpu_count() // 2)  # Use half the CPU cores (at least one) to avoid contention
def _ensure_size(img: Image.Image) -> Image.Image:
if img.size != TARGET_SIZE:
return img.resize(TARGET_SIZE, Image.BICUBIC)
return img
def load_sample_images():
paths = [
hf_hub_download("HuggingFaceTB/SmolVLM", "example_images/rococo.jpg", repo_type="space"),
hf_hub_download("HuggingFaceTB/SmolVLM", "example_images/astronaut.png", repo_type="space"),
hf_hub_download("HuggingFaceTB/SmolVLM", "example_images/cat.png", repo_type="space"),
]
return [_ensure_size(Image.open(p).convert("RGB")) for p in paths]
def build_index(images):
global INDEX_IMAGES, INDEX_EMB
processed = [_ensure_size(img.convert("RGB")) for img in images]
INDEX_IMAGES = processed
with torch.inference_mode():
inputs = processor.process_images(processed)
inputs.to(device)
emb = model(**inputs)
INDEX_EMB = torch.nn.functional.normalize(emb, dim=-1)
return f"Indexed {len(processed)} images (resized to {TARGET_SIZE[0]}x{TARGET_SIZE[1]})"
def ensure_index():
if not INDEX_IMAGES:
# Auto-load 1000 images from ImageNet-1K dataset
print("Auto-loading 1000 images from ImageNet-1K dataset (this may take a few minutes)...")
builder_status = build_index_from_dataset("imagenet-1k", "validation", "image", 1000, 64)
print(f"Auto-indexing completed: {builder_status}")
return builder_status
def search(query, top_k=3):
ensure_index()
with torch.inference_mode():
q_inputs = processor.process_texts([query])
q_inputs.to(device)
q_emb = model(**q_inputs)
q_emb = torch.nn.functional.normalize(q_emb, dim=-1)
sims = (q_emb @ INDEX_EMB.T).squeeze(0)
vals, idxs = torch.topk(sims, k=min(top_k, len(INDEX_IMAGES)))
results = [(INDEX_IMAGES[i], f"score={vals[j].item():.4f}") for j, i in enumerate(idxs.tolist())]
return results
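# NOTE (assumption, not verified here): "Col*" models in colpali_engine are typically
# ColBERT-style late-interaction retrievers whose outputs are per-token embeddings of
# shape (batch, num_tokens, dim). If that holds for ColModernVBert, the single-vector
# dot product in search() above should be replaced by MaxSim scoring. Minimal sketch,
# assuming INDEX_EMB is a padded (N, Td, D) tensor (padded tokens would also need masking):
def maxsim_scores(q_emb: torch.Tensor, doc_emb: torch.Tensor) -> torch.Tensor:
    """MaxSim scores between a (1, Tq, D) query and (N, Td, D) documents -> (N,)."""
    sim = torch.matmul(q_emb, doc_emb.transpose(1, 2))  # (N, Tq, Td) token-pair similarities
    return sim.max(dim=-1).values.sum(dim=-1)  # best document token per query token, summed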
def upload_and_build(files):
if not files:
return "No files uploaded"
    images = [_ensure_size(Image.open(getattr(f, "name", f)).convert("RGB")) for f in files]  # accept file objects or path strings
return build_index(images)
def visualize_attention(text_embed, img_embeds, attention_mask=None):
"""Visualize attention between text and image embeddings"""
# Normalize embeddings
text_norm = F.normalize(text_embed, dim=-1)
img_norm = F.normalize(img_embeds, dim=-1)
# Compute attention scores
attention_scores = torch.matmul(text_norm, img_norm.transpose(-2, -1))
# Create attention heatmap
    scores = np.atleast_2d(attention_scores.squeeze().detach().cpu().numpy())  # imshow needs a 2-D array
fig, ax = plt.subplots(figsize=(10, 6))
    im = ax.imshow(scores, cmap='YlOrRd', aspect='auto')
ax.set_title('Text-Image Attention Map')
ax.set_xlabel('Image Embeddings')
ax.set_ylabel('Text Embeddings')
# Add colorbar
plt.colorbar(im, ax=ax)
plt.tight_layout()
    # Return a PIL image so it can be displayed directly by gr.Image(type="pil")
    buf = BytesIO()
    fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    buf.seek(0)
    heatmap = Image.open(buf).convert("RGB")
    plt.close(fig)
    return heatmap
def test_text_image_alignment(text_inputs, image_files, comparison_text=""):
"""Test alignment between uploaded text and images with real-time comparison"""
    if not image_files or len(image_files) < 2:
        return "❌ At least 2 images are required for comparison - upload 2+ images to compare", None, None
    if not text_inputs.strip():
        return "❌ Text input is required - enter text to test alignment", None, None
try:
# Process uploaded images
images = []
for f in image_files:
            img = Image.open(getattr(f, "name", f)).convert("RGB")
img = _ensure_size(img)
images.append(img)
with torch.inference_mode():
# Text embedding
text_processed = processor.process_texts([text_inputs])
text_processed.to(device)
text_embed = model(**text_processed)
text_embed = F.normalize(text_embed, dim=-1)
# Image embeddings
img_processed = processor.process_images(images)
img_processed.to(device)
img_embeds = model(**img_processed)
img_embeds = F.normalize(img_embeds, dim=-1)
# Compute similarities
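            # NOTE (assumption): this treats each model output as one pooled vector per input;
            # if ColModernVBert returns per-token (multi-vector) embeddings, pool or apply
            # MaxSim (see maxsim_scores above) before comparing.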
similarities = F.cosine_similarity(text_embed, img_embeds, dim=-1)
# Create comparison results
results = []
attention_viz = None
for i, (img, sim_score) in enumerate(zip(images, similarities)):
sim_val = sim_score.item()
caption = f"Similarity: {sim_val:.4f}"
# Score interpretation
            if sim_val > 0.7:
                interpretation = "🟢 Strong match"
            elif sim_val > 0.4:
                interpretation = "🟡 Moderate match"
            else:
                interpretation = "🔴 Weak match"
results.append((img, f"{caption} - {interpretation}"))
# Generate attention visualization
if len(results) >= 2:
attention_viz = visualize_attention(text_embed, img_embeds)
# Detailed analysis
analysis = f"""
**Real-time Testing Results:**
πŸ“ **Query Text:** "{text_inputs}"
πŸ–ΌοΈ **Images Tested:** {len(images)}
**Similarity Scores:**
"""
for i, sim_val in enumerate(similarities):
analysis += f"- Image {i+1}: {sim_val:.4f}\n"
analysis += f"""
**Best Match:** Image #{torch.argmax(similarities).item() + 1} (score: {similarities.max():.4f})
**Average Score:** {similarities.mean():.4f}
**Score Range:** {similarities.min():.4f} - {similarities.max():.4f}
**What this test demonstrates:**
✅ Text encoding: the query is embedded by the model
✅ Image encoding: each uploaded image is embedded by the model
✅ Cross-modal alignment: similarities are computed in a shared embedding space
✅ Heatmap: query-image embedding similarities (not internal attention weights)
"""
return analysis, results, attention_viz
except Exception as e:
return f"❌ Error during testing: {str(e)}", None, None
def _preprocess_image_worker(args):
"""Worker function for preprocessing images in parallel"""
row_data = args
if isinstance(row_data, tuple):
row, image_col, index = row_data
else:
# Handle direct image data
row, image_col = args
index = 0
if image_col not in row or row[image_col] is None:
return None, index
img = row[image_col]
if hasattr(img, "convert"):
img = img.convert("RGB")
img = _ensure_size(img)
return img, index
def build_index_from_dataset(repo_id: str, split: str = "train", image_col: str = "image", limit: int = 500, batch_size: int = 64):
global INDEX_IMAGES, INDEX_EMB
ds = load_dataset(repo_id, split=split, streaming=True)
# Step 1: Collect images in parallel
print(f"Loading and preprocessing {limit} images using {NUM_WORKERS} workers...")
image_data = []
count = 0
# Collect raw data first
for row in ds:
if image_col not in row or row[image_col] is None:
continue
image_data.append((row, image_col, count))
count += 1
if len(image_data) >= limit:
break
# Preprocess images in parallel
with mp.Pool(NUM_WORKERS) as pool:
results = list(tqdm.tqdm(
pool.imap(_preprocess_image_worker, image_data),
total=len(image_data),
desc="Preprocessing images"
))
# Filter out None results and sort by index
valid_results = [(img, idx) for img, idx in results if img is not None]
valid_results.sort(key=lambda x: x[1]) # Sort by original index
images = [img for img, _ in valid_results]
print(f"Successfully preprocessed {len(images)} images")
# Step 2: Embed images in batches (GPU intensive, keep single-threaded)
print("Computing embeddings...")
all_emb = []
with torch.inference_mode():
for i in tqdm.tqdm(range(0, len(images), batch_size), desc="Computing embeddings"):
batch = images[i:i+batch_size]
if not batch:
continue
inputs = processor.process_images(batch)
inputs.to(device)
emb = model(**inputs)
all_emb.append(torch.nn.functional.normalize(emb, dim=-1).to("cpu"))
INDEX_IMAGES = images
INDEX_EMB = torch.cat(all_emb, dim=0).to(device)
return f"Indexed {len(images)} images from {repo_id}:{split} (resized to {TARGET_SIZE[0]}x{TARGET_SIZE[1]}) - Used {NUM_WORKERS} workers"
with gr.Blocks(theme='default') as demo:
with gr.Tabs():
# Tab 1: Image Search
with gr.Tab("πŸ–ΌοΈ Image Search"):
gr.Markdown("# ColModernVBert Image Search")
gr.Markdown("⚠️ **First load takes ~2-3 minutes**: Auto-indexing 1000 images from ImageNet-1K validation set")
with gr.Row():
with gr.Column():
query = gr.Textbox(label="Text query", value="a baroque painting")
topk = gr.Slider(1, 8, value=3, step=1, label="Top-K")
btn = gr.Button("Search")
out = gr.Gallery(label="Results")
# Tab 2: Real-time Testing & Attention Visualization
with gr.Tab("πŸ§ͺ Model Testing"):
gr.Markdown("# Real-time Text-Image Alignment Testing")
gr.Markdown("Upload **minimum 2 images** and test with text queries to analyze model behavior")
with gr.Row():
with gr.Column():
test_text = gr.Textbox(
label="Test Query Text",
placeholder="Enter text like 'red car', 'dog playing', 'modern architecture'",
value="red sports car"
)
test_images = gr.File(
file_count="multiple",
file_types=["image"],
label="Upload Images (Min 2 required)"
)
test_btn = gr.Button("🧠 Test Model Alignment", variant="primary")
with gr.Column():
attention_viz = gr.Image(label="Attention Heatmap", type="pil")
with gr.Row():
                test_results = gr.Gallery(label="Image Similarity Results (2+ images)", columns=2)
test_analysis = gr.Markdown(label="Detailed Analysis")
test_btn.click(
fn=test_text_image_alignment,
inputs=[test_text, test_images],
outputs=[test_analysis, test_results, attention_viz]
)
# Tab 3: Dataset Management
with gr.Tab("πŸ“š Dataset Management"):
gr.Markdown("# Manage Image Index")
with gr.Row():
with gr.Column():
up = gr.File(file_count="multiple", type="filepath", label="Upload images to index")
status = gr.Textbox(label="Index status", interactive=False)
build = gr.Button("Build Index")
with gr.Accordion("Load from HF dataset", open=True):
repo = gr.Textbox(label="Dataset repo_id", value="imagenet-1k")
split = gr.Textbox(label="Split", value="validation")
img_col = gr.Textbox(label="Image column", value="image")
lim = gr.Number(label="Max images", value=1000, precision=0)
bsize = gr.Number(label="Batch size", value=64, precision=0)
build_ds = gr.Button("Build Index from Dataset")
status_ds = gr.Textbox(label="Index status", interactive=False)
# Event handlers
btn.click(fn=search, inputs=[query, topk], outputs=out)
build.click(fn=upload_and_build, inputs=[up], outputs=status)
build_ds.click(lambda r,s,c,l,b: build_index_from_dataset(r, s, c, int(l), int(b)), inputs=[repo, split, img_col, lim, bsize], outputs=status_ds)
if __name__ == "__main__":
    # Build the index before launching the UI (this call blocks until indexing finishes);
    # search() also calls ensure_index() as a fallback if the index is still empty.
    status_msg = ensure_index()
demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))