Spaces:

yulu2
/

FoundationMotion

Sleeping

App Files Files Community

sunrainyg committed 24 days ago

Commit

3957f9a

1 Parent(s): 4821aa5

Update

Browse files

Files changed (2) hide show

app.py +73 -21
requirements.txt +3 -2

app.py CHANGED Viewed

@@ -1,35 +1,76 @@
 import os
 import gradio as gr
 import torch
-from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TorchAoConfig
 # ========== Basic Configuration ==========
 MODEL_ID = os.environ.get("MODEL_ID", "Efficient-Large-Model/qwen2_5vl-7b-wolfv2-tuned")
 USE_INT4 = os.environ.get("USE_INT4", "0") == "1"
 dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
 quant_cfg = None
-if USE_INT4:
     quant_cfg = TorchAoConfig("int4_weight_only", group_size=128)
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
     device_map="auto",
-    torch_dtype=dtype,
     attn_implementation="sdpa",
     quantization_config=quant_cfg,
 )
 MIN_PIXELS = 256 * 28 * 28
 MAX_PIXELS = 1024 * 28 * 28
-processor = AutoProcessor.from_pretrained(MODEL_ID, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)
-SYSTEM_PROMPT = "You are a helpful assistant that watches a user-provided video and answers questions about it concisely and accurately."
 # ========== Conversation Builder ==========
 def build_conversation(video_path: str, question: str, fps: int):
     return [
         {"role": "system", "content": SYSTEM_PROMPT},
         {
@@ -41,10 +82,14 @@ def build_conversation(video_path: str, question: str, fps: int):
         },
     ]
-# ========== Main Inference Function ==========
 @torch.inference_mode()
 def answer(video, question, fps=1, max_new_tokens=128, temperature=0.2, top_p=0.9):
     if video is None:
         return "Please upload or drag a video first."
     if not question or question.strip() == "":
@@ -60,18 +105,22 @@ def answer(video, question, fps=1, max_new_tokens=128, temperature=0.2, top_p=0.
         return_dict=True,
         return_tensors="pt",
     )
-    inputs = {k: v.to(model.device) if hasattr(v, "to") else v for k, v in inputs.items()}
     gen_kwargs = dict(
         max_new_tokens=int(max_new_tokens),
         temperature=float(temperature),
         top_p=float(top_p),
-        do_sample=(float(temperature) > 0),
         pad_token_id=processor.tokenizer.eos_token_id,
     )
     output_ids = model.generate(**inputs, **gen_kwargs)
-    generated_ids = output_ids[0, inputs["input_ids"].shape[1]:]
     text = processor.batch_decode(
         generated_ids.unsqueeze(0),
         skip_special_tokens=True,
@@ -80,19 +129,23 @@ def answer(video, question, fps=1, max_new_tokens=128, temperature=0.2, top_p=0.
     return text.strip()
 # ========== Gradio UI ==========
 with gr.Blocks(title="Video → Q&A (Qwen2.5-VL-7B WolfV2)") as demo:
-    gr.Markdown("""
-    # 🎬 Video → Q&A (Qwen2.5-VL-7B WolfV2)
-    - Drag or upload any video, type your question, then click **Ask**.
-    - Default `fps=1` (sample 1 frame per second) saves VRAM; for short or detailed videos, increase fps slightly.
-    """)
     with gr.Row():
         video = gr.Video(label="Drop your video here (mp4, mov, webm)", interactive=True)
         with gr.Column():
-            question = gr.Textbox(label="Your question", placeholder="e.g., What happens in this video? Provide 5 QA pairs.")
             ask = gr.Button("Ask", variant="primary")
             output = gr.Textbox(label="Answer", lines=12)
@@ -108,8 +161,7 @@ with gr.Blocks(title="Video → Q&A (Qwen2.5-VL-7B WolfV2)") as demo:
         outputs=[output],
     )
-# ========== App Launch ==========
 if __name__ == "__main__":
-    demo.launch(ssr_mode=False)  # <- disable SSR, avoids the GPU check

 import os
 import gradio as gr
 import torch
+import spaces  # for @spaces.GPU on Hugging Face Spaces
+# Try to import TorchAoConfig for optional 4-bit weight-only quantization.
+# If unavailable in your transformers version, we safely fall back to no quantization.
+try:
+    from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TorchAoConfig
+    _HAS_TORCHAO = True
+except Exception:
+    from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+    TorchAoConfig = None  # type: ignore
+    _HAS_TORCHAO = False
 # ========== Basic Configuration ==========
 MODEL_ID = os.environ.get("MODEL_ID", "Efficient-Large-Model/qwen2_5vl-7b-wolfv2-tuned")
 USE_INT4 = os.environ.get("USE_INT4", "0") == "1"
+# Prefer bfloat16 on GPU, float32 on CPU
 dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
 quant_cfg = None
+if USE_INT4 and _HAS_TORCHAO and TorchAoConfig is not None:
+    # Optional int4 weight-only quantization (saves VRAM on GPU)
     quant_cfg = TorchAoConfig("int4_weight_only", group_size=128)
+# ---- ZeroGPU warm-up: must exist AND be called at import time ----
+@spaces.GPU
+def _warmup():
+    """
+    Lightly touch the GPU so ZeroGPU's startup detector sees a @spaces.GPU function run.
+
+    Intended to be called once at import time. Allocates a one-element tensor on
+    CUDA when torch reports CUDA as available; otherwise does no GPU work.
+
+    Returns:
+        "gpu-ready" when no exception occurred (this includes the case where CUDA
+        is not available and nothing was allocated), or "warmup-error: <message>"
+        when something inside the body raised. Exceptions raised inside the body
+        are caught and reported through the return value rather than propagated.
+    """
+    try:
+        if torch.cuda.is_available():
+            _ = torch.tensor([0], device="cuda")  # tiny allocation; the tensor itself is discarded
+        return "gpu-ready"
+    except Exception as e:
+        # Broad catch is deliberate: a failed warm-up should yield a status string, not an error.
+        return f"warmup-error: {e}"
+# Call warmup at import time so ZeroGPU detects a @spaces.GPU function during startup.
+_WARMUP_STATUS = _warmup()
+# ========== Load Model & Processor ==========
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
     device_map="auto",
+    dtype=dtype,  # (modern arg; replaces deprecated torch_dtype)
     attn_implementation="sdpa",
     quantization_config=quant_cfg,
 )
+# Resolution bounds to balance quality vs. memory
 MIN_PIXELS = 256 * 28 * 28
 MAX_PIXELS = 1024 * 28 * 28
+processor = AutoProcessor.from_pretrained(
+    MODEL_ID,
+    min_pixels=MIN_PIXELS,
+    max_pixels=MAX_PIXELS,
+)
+SYSTEM_PROMPT = (
+    "You are a helpful assistant that watches a user-provided video and answers questions "
+    "about it concisely and accurately."
+)
 # ========== Conversation Builder ==========
 def build_conversation(video_path: str, question: str, fps: int):
+    """
+    Qwen2.5-VL expects a chat-style list where media and text are items in 'content'.
+    """
     return [
         {"role": "system", "content": SYSTEM_PROMPT},
         {
         },
     ]
+# ========== Inference ==========
 @torch.inference_mode()
 def answer(video, question, fps=1, max_new_tokens=128, temperature=0.2, top_p=0.9):
+    """
+    Main inference entry used by the Gradio UI.
+    - video: filepath from gr.Video
+    - question: user text; if empty, produce a summary + 5 QA pairs
+    """
     if video is None:
         return "Please upload or drag a video first."
     if not question or question.strip() == "":
         return_dict=True,
         return_tensors="pt",
     )
+    # move tensors to model device
+    inputs = {k: (v.to(model.device) if hasattr(v, "to") else v) for k, v in inputs.items()}
     gen_kwargs = dict(
         max_new_tokens=int(max_new_tokens),
         temperature=float(temperature),
         top_p=float(top_p),
+        do_sample=(float(temperature) > 0.0),
         pad_token_id=processor.tokenizer.eos_token_id,
     )
     output_ids = model.generate(**inputs, **gen_kwargs)
+    # Remove the prompt portion for clean decoding
+    prompt_len = inputs["input_ids"].shape[1]
+    generated_ids = output_ids[0, prompt_len:]
     text = processor.batch_decode(
         generated_ids.unsqueeze(0),
         skip_special_tokens=True,
     return text.strip()
 # ========== Gradio UI ==========
 with gr.Blocks(title="Video → Q&A (Qwen2.5-VL-7B WolfV2)") as demo:
+    gr.Markdown(
+        """
+        # 🎬 Video → Q&A (Qwen2.5-VL-7B WolfV2)
+        - Drag or upload any video, type your question, then click **Ask**.
+        - Default `fps=1` (1 frame per second) saves VRAM; for short or very detailed videos, increase fps slightly.
+        """
+    )
     with gr.Row():
         video = gr.Video(label="Drop your video here (mp4, mov, webm)", interactive=True)
         with gr.Column():
+            question = gr.Textbox(
+                label="Your question",
+                placeholder="e.g., What happens in this video? Provide 5 QA pairs."
+            )
             ask = gr.Button("Ask", variant="primary")
             output = gr.Textbox(label="Answer", lines=12)
         outputs=[output],
     )
+# ========== Launch ==========
 if __name__ == "__main__":
+    # Disable SSR to avoid extra startup constraints; works well across CPU/GPU/ZeroGPU.
+    demo.launch(ssr_mode=False)

requirements.txt CHANGED Viewed

@@ -3,9 +3,10 @@ transformers>=4.50.0
 accelerate>=0.34.0
 torch>=2.2.0
 torchvision
-sentencepiece
-protobuf
 av
 decord
 pillow
 numpy

 accelerate>=0.34.0
 torch>=2.2.0
 torchvision
+spaces
 av
 decord
+sentencepiece
 pillow
 numpy
+protobuf