Video-Analysis-AppleFastVLM-7B

Sleeping

App Files Files Community

rahul7star committed on Sep 3

Commit

920c71d

verified ·

1 Parent(s): 4d88196

Update app.py

Browse files

Files changed (1) hide show

app.py +90 -12

app.py CHANGED Viewed

@@ -4,8 +4,6 @@ from PIL import Image
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import cv2
 import numpy as np
-from typing import Optional
-import tempfile
 import os
 MID = "apple/FastVLM-7B"
@@ -15,6 +13,7 @@ IMAGE_TOKEN_INDEX = -200
 tok = None
 model = None
 def load_model():
     global tok, model
     if tok is None or model is None:
@@ -29,15 +28,16 @@ def load_model():
         print("Model loaded successfully on CPU!")
     return tok, model
 def extract_frames(video_path: str, num_frames: int = 8, sampling_method: str = "uniform"):
-    """Extract frames from video"""
     cap = cv2.VideoCapture(video_path)
     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
     if total_frames == 0:
         cap.release()
         return []
     frames = []
     if sampling_method == "uniform":
         indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
@@ -49,19 +49,20 @@ def extract_frames(video_path: str, num_frames: int = 8, sampling_method: str =
     else:  # middle
         start = max(0, (total_frames - num_frames) // 2)
         indices = list(range(start, min(start + num_frames, total_frames)))
     for idx in indices:
         cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
         ret, frame = cap.read()
         if ret:
             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
             frames.append(Image.fromarray(frame_rgb))
     cap.release()
     return frames
 def caption_frame(image: Image.Image, prompt: str) -> str:
-    """Generate caption for a single frame (CPU only)"""
     tok, model = load_model()
     messages = [{"role": "user", "content": f"<image>\n{prompt}"}]
@@ -75,11 +76,8 @@ def caption_frame(image: Image.Image, prompt: str) -> str:
     input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1)
     attention_mask = torch.ones_like(input_ids)
-    # Preprocess image
     px = model.get_vision_tower().image_processor(images=image, return_tensors="pt")["pixel_values"]
-    # Generate on CPU
     with torch.no_grad():
         out = model.generate(
             inputs=input_ids,
@@ -93,5 +91,85 @@ def caption_frame(image: Image.Image, prompt: str) -> str:
     caption = tok.decode(out[0], skip_special_tokens=True)
     if prompt in caption:
         caption = caption.split(prompt)[-1].strip()
     return caption

 from transformers import AutoTokenizer, AutoModelForCausalLM
 import cv2
 import numpy as np
 import os
 MID = "apple/FastVLM-7B"
 tok = None
 model = None
+# ---------------- Load Model ----------------
 def load_model():
     global tok, model
     if tok is None or model is None:
         print("Model loaded successfully on CPU!")
     return tok, model
+# ---------------- Frame Extraction ----------------
 def extract_frames(video_path: str, num_frames: int = 8, sampling_method: str = "uniform"):
     cap = cv2.VideoCapture(video_path)
     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
     if total_frames == 0:
         cap.release()
         return []
     frames = []
     if sampling_method == "uniform":
         indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
     else:  # middle
         start = max(0, (total_frames - num_frames) // 2)
         indices = list(range(start, min(start + num_frames, total_frames)))
     for idx in indices:
         cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
         ret, frame = cap.read()
         if ret:
             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
             frames.append(Image.fromarray(frame_rgb))
     cap.release()
     return frames
+# ---------------- Caption Frame ----------------
 def caption_frame(image: Image.Image, prompt: str) -> str:
     tok, model = load_model()
     messages = [{"role": "user", "content": f"<image>\n{prompt}"}]
     input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1)
     attention_mask = torch.ones_like(input_ids)
     px = model.get_vision_tower().image_processor(images=image, return_tensors="pt")["pixel_values"]
     with torch.no_grad():
         out = model.generate(
             inputs=input_ids,
     caption = tok.decode(out[0], skip_special_tokens=True)
     if prompt in caption:
         caption = caption.split(prompt)[-1].strip()
     return caption
+# ---------------- Process Video ----------------
+def process_video(video_path, num_frames, sampling_method, chat_history, progress=gr.Progress()):
+    """Caption sampled frames of a video and write the captions into the chat history.
+
+    Args:
+        video_path: Video to analyze; any falsy value is treated as "nothing uploaded".
+        num_frames: Forwarded unchanged to extract_frames.
+        sampling_method: Forwarded unchanged to extract_frames.
+        chat_history: List of two-item lists; it is appended to and modified in place.
+        progress: Callable used to report progress (defaults to gr.Progress()).
+
+    Returns:
+        A (chat_history, frames) tuple. frames is None when no video was given
+        or when extract_frames returned nothing.
+    """
+    # NOTE(review): every entry written here puts the literal "Assistant" in the
+    # first slot of the pair. Confirm how gr.Chatbot renders that slot -- it may
+    # be shown as a user message rather than as a speaker label.
+    # Guard: nothing to analyze without a video.
+    if not video_path:
+        chat_history.append(["Assistant", "Please upload a video first."])
+        return chat_history, None
+    progress(0, desc="Extracting frames...")
+    frames = extract_frames(video_path, num_frames, sampling_method)
+    # Guard: extract_frames produced an empty result.
+    if not frames:
+        chat_history.append(["Assistant", "Failed to extract frames."])
+        return chat_history, None
+    # The same fixed prompt is used for every frame.
+    prompt = "Provide a brief one-sentence description of what's happening in this image."
+    captions = []
+    # Placeholder entry; it is overwritten below as captions accumulate.
+    chat_history.append(["Assistant", "Analyzing frames..."])
+    for i, frame in enumerate(frames):
+        caption = caption_frame(frame, prompt)
+        captions.append(f"Frame {i+1}: {caption}")
+        # Replace the last entry with all captions so far, one per line.
+        # The function only returns after the loop, so whether these intermediate
+        # updates ever reach the UI is not visible here -- TODO confirm.
+        chat_history[-1] = ["Assistant", "\n".join(captions)]
+        progress((i + 1) / len(frames))
+    progress(1.0, desc="Analysis complete!")
+    return chat_history, frames
+# ---------------- Custom Apple-like Theme ----------------
+class AppleTheme(gr.themes.Base):
+    """Gradio theme derived from gr.themes.Base with fixed colors, sizes and fonts.
+
+    Uses a blue primary hue, gray secondary and neutral hues, the medium
+    spacing/radius/text size presets, and custom sans-serif and monospace
+    font stacks. The constructor takes no arguments.
+    """
+    def __init__(self):
+        super().__init__(
+            primary_hue=gr.themes.colors.blue,
+            secondary_hue=gr.themes.colors.gray,
+            neutral_hue=gr.themes.colors.gray,
+            spacing_size=gr.themes.sizes.spacing_md,
+            radius_size=gr.themes.sizes.radius_md,
+            text_size=gr.themes.sizes.text_md,
+            # Font stack: "Inter" via GoogleFont first, then the named fallbacks in order.
+            font=[gr.themes.GoogleFont("Inter"), "SF Pro Display", "Helvetica Neue", "Arial", "sans-serif"],
+            # NOTE(review): "SF Mono" is requested through GoogleFont -- verify it is
+            # actually served by Google Fonts; otherwise only the fallbacks will apply.
+            font_mono=[gr.themes.GoogleFont("SF Mono"), "Consolas", "monospace"]
+        )
+# ---------------- Gradio UI ----------------
+# Build the UI: a video input on the left, and a sidebar holding the chat log,
+# the analyze button and a collapsed gallery of the analyzed frames.
+with gr.Blocks(theme=AppleTheme()) as demo:
+    gr.Markdown("# 🎬 FastVLM Video Captioning (CPU Only)")
+    with gr.Row():
+        with gr.Column(scale=7):
+            video_display = gr.Video(label="Video Input", autoplay=True, loop=True)
+        # NOTE(review): gr.Sidebar is created inside gr.Row here -- confirm Gradio
+        # lays this out as intended.
+        with gr.Sidebar(width=400):
+            # Chat log seeded with a single greeting entry.
+            chatbot = gr.Chatbot(
+                value=[["Assistant", "Upload a video and I'll analyze it for you!"]],
+                height=400
+            )
+            process_btn = gr.Button("🎯 Analyze Video", variant="primary")
+            with gr.Accordion("🖼️ Analyzed Frames", open=False):
+                frame_gallery = gr.Gallery(columns=2, rows=4, height="auto")
+    # Fixed analysis settings held as state; no visible control changes them.
+    num_frames = gr.State(value=4)
+    sampling_method = gr.State(value="uniform")
+    # Clicking the button runs process_video; its two return values feed the
+    # chatbot and the frame gallery, in that order.
+    process_btn.click(
+        fn=process_video,
+        inputs=[video_display, num_frames, sampling_method, chatbot],
+        outputs=[chatbot, frame_gallery],
+        show_progress=True
+    )
+# ---------------- Launch ----------------
+# Start the app on all interfaces at port 7860 with sharing disabled.
+# NOTE(review): this call sits at module level with no __main__ guard visible
+# in this view, so it runs whenever the module is executed or imported.
+demo.launch(
+    server_name="0.0.0.0",  # Spaces/containers need this
+    server_port=7860,
+    share=False,
+    show_error=True
+)