Spaces:

akhaliq
/

FastVLM-7B

Running on Zero

App Files Files Community

akhaliq HF Staff committed on Sep 2

Commit

6a6a2f0

verified ·

1 Parent(s): aab58a2

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +264 -0

app.py ADDED Viewed

	@@ -0,0 +1,264 @@

+import gradio as gr
+import torch
+from PIL import Image
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import cv2
+import numpy as np
+from typing import Optional
+import tempfile
+import os
+MID = "apple/FastVLM-7B"
+IMAGE_TOKEN_INDEX = -200
+# Load model and tokenizer
+print("Loading FastVLM model...")
+tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    MID,
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    device_map="auto",
+    trust_remote_code=True,
+)
+print("Model loaded successfully!")
+def extract_frames(video_path: str, num_frames: int = 8, sampling_method: str = "uniform"):
+    """Extract frames from video"""
+    cap = cv2.VideoCapture(video_path)
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    if total_frames == 0:
+        cap.release()
+        return []
+    frames = []
+    if sampling_method == "uniform":
+        # Uniform sampling
+        indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
+    elif sampling_method == "first":
+        # Take first N frames
+        indices = list(range(min(num_frames, total_frames)))
+    elif sampling_method == "last":
+        # Take last N frames
+        start = max(0, total_frames - num_frames)
+        indices = list(range(start, total_frames))
+    else:  # middle
+        # Take frames from the middle
+        start = max(0, (total_frames - num_frames) // 2)
+        indices = list(range(start, min(start + num_frames, total_frames)))
+    for idx in indices:
+        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
+        ret, frame = cap.read()
+        if ret:
+            # Convert BGR to RGB
+            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frames.append(Image.fromarray(frame_rgb))
+    cap.release()
+    return frames
+def caption_frame(image: Image.Image, prompt: str) -> str:
+    """Generate caption for a single frame"""
+    # Build chat with custom prompt
+    messages = [
+        {"role": "user", "content": f"<image>\n{prompt}"}
+    ]
+    rendered = tok.apply_chat_template(
+        messages, add_generation_prompt=True, tokenize=False
+    )
+    pre, post = rendered.split("<image>", 1)
+    # Tokenize the text around the image token
+    pre_ids = tok(pre, return_tensors="pt", add_special_tokens=False).input_ids
+    post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids
+    # Splice in the IMAGE token id
+    img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
+    input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(model.device)
+    attention_mask = torch.ones_like(input_ids, device=model.device)
+    # Preprocess image
+    px = model.get_vision_tower().image_processor(images=image, return_tensors="pt")["pixel_values"]
+    px = px.to(model.device, dtype=model.dtype)
+    # Generate
+    with torch.no_grad():
+        out = model.generate(
+            inputs=input_ids,
+            attention_mask=attention_mask,
+            images=px,
+            max_new_tokens=256,
+            temperature=0.7,
+            do_sample=True,
+        )
+    caption = tok.decode(out[0], skip_special_tokens=True)
+    # Extract only the generated part
+    if prompt in caption:
+        caption = caption.split(prompt)[-1].strip()
+    return caption
+def process_video(
+    video_path: str,
+    num_frames: int,
+    sampling_method: str,
+    caption_mode: str,
+    custom_prompt: str,
+    progress=gr.Progress()
+) -> tuple:
+    """Process video and generate captions"""
+    if not video_path:
+        return "Please upload a video first.", None, None
+    progress(0, desc="Extracting frames...")
+    frames = extract_frames(video_path, num_frames, sampling_method)
+    if not frames:
+        return "Failed to extract frames from video.", None, None
+    # Prepare prompt based on mode
+    if caption_mode == "Detailed Description":
+        prompt = "Describe this image in detail, including all visible objects, actions, and the overall scene."
+    elif caption_mode == "Brief Summary":
+        prompt = "Provide a brief one-sentence description of what's happening in this image."
+    elif caption_mode == "Action Recognition":
+        prompt = "What action or activity is taking place in this image? Focus on the main action."
+    else:  # Custom
+        prompt = custom_prompt if custom_prompt else "Describe this image."
+    captions = []
+    frame_previews = []
+    for i, frame in enumerate(frames):
+        progress((i + 1) / (len(frames) + 1), desc=f"Analyzing frame {i + 1}/{len(frames)}...")
+        caption = caption_frame(frame, prompt)
+        captions.append(f"**Frame {i + 1}:** {caption}")
+        frame_previews.append(frame)
+    progress(1.0, desc="Generating summary...")
+    # Combine captions into a narrative
+    full_caption = "\n\n".join(captions)
+    # Generate overall summary if multiple frames
+    if len(frames) > 1:
+        summary_prompt = f"Based on these frame descriptions, provide a coherent summary of the video:\n{full_caption}\n\nSummary:"
+        # For simplicity, we'll just combine the captions
+        video_summary = f"## Video Analysis ({len(frames)} frames analyzed)\n\n{full_caption}"
+    else:
+        video_summary = f"## Video Analysis\n\n{full_caption}"
+    return video_summary, frame_previews, video_path
+# Create the Gradio interface
+with gr.Blocks(css="""
+    .video-container {
+        height: calc(100vh - 100px) !important;
+    }
+    .sidebar {
+        height: calc(100vh - 100px) !important;
+        overflow-y: auto;
+    }
+""") as demo:
+    gr.Markdown("# 🎬 FastVLM Video Captioning")
+    with gr.Row():
+        # Main video display
+        with gr.Column(scale=7):
+            video_display = gr.Video(
+                label="Video Input",
+                height=600,
+                elem_classes=["video-container"],
+                autoplay=True,
+                loop=True
+            )
+        # Sidebar with controls
+        with gr.Sidebar(width=400, elem_classes=["sidebar"]):
+            gr.Markdown("## ⚙️ Settings")
+            with gr.Group():
+                gr.Markdown("### Frame Sampling")
+                num_frames = gr.Slider(
+                    minimum=1,
+                    maximum=16,
+                    value=8,
+                    step=1,
+                    label="Number of Frames to Analyze",
+                    info="More frames = better understanding but slower processing"
+                )
+                sampling_method = gr.Radio(
+                    choices=["uniform", "first", "last", "middle"],
+                    value="uniform",
+                    label="Sampling Method",
+                    info="How to select frames from the video"
+                )
+            with gr.Group():
+                gr.Markdown("### Caption Settings")
+                caption_mode = gr.Radio(
+                    choices=["Detailed Description", "Brief Summary", "Action Recognition", "Custom"],
+                    value="Detailed Description",
+                    label="Caption Mode"
+                )
+                custom_prompt = gr.Textbox(
+                    label="Custom Prompt",
+                    placeholder="Enter your custom prompt here...",
+                    visible=False,
+                    lines=3
+                )
+            process_btn = gr.Button("🎯 Analyze Video", variant="primary", size="lg")
+            gr.Markdown("### 📝 Results")
+            output_text = gr.Markdown(
+                value="Upload a video and click 'Analyze Video' to begin.",
+                elem_classes=["output-text"]
+            )
+            with gr.Accordion("🖼️ Analyzed Frames", open=False):
+                frame_gallery = gr.Gallery(
+                    label="Extracted Frames",
+                    show_label=False,
+                    columns=2,
+                    rows=4,
+                    object_fit="contain",
+                    height="auto"
+                )
+    # Show/hide custom prompt based on mode selection
+    def toggle_custom_prompt(mode):
+        return gr.Textbox(visible=(mode == "Custom"))
+    caption_mode.change(
+        toggle_custom_prompt,
+        inputs=[caption_mode],
+        outputs=[custom_prompt]
+    )
+    # Upload handler
+    def handle_upload(video):
+        if video:
+            return video, "Video loaded! Click 'Analyze Video' to generate captions."
+        return None, "Upload a video to begin."
+    video_display.upload(
+        handle_upload,
+        inputs=[video_display],
+        outputs=[video_display, output_text]
+    )
+    # Process button
+    process_btn.click(
+        process_video,
+        inputs=[video_display, num_frames, sampling_method, caption_mode, custom_prompt],
+        outputs=[output_text, frame_gallery, video_display]
+    )
+    demo.launch()