Spaces:

yulu2
/

FoundationMotion

Sleeping

App Files Files Community

sunrainyg committed 28 days ago

Commit

d7f4e54

1 Parent(s): 7b5201e

Update

Browse files

Files changed (1) hide show

app.py +101 -31

app.py CHANGED Viewed

@@ -1,44 +1,114 @@
 import os
-gen_kwargs = dict(
-max_new_tokens=int(max_new_tokens),
-temperature=float(temperature),
-top_p=float(top_p),
-do_sample=(float(temperature) > 0),
-pad_token_id=processor.tokenizer.eos_token_id,
 )
-output_ids = model.generate(**inputs, **gen_kwargs)
-# Slice off the input portion for clean decoding (batch size = 1 here)
-generated_ids = output_ids[0, inputs["input_ids"].shape[1]:]
-text = processor.batch_decode(generated_ids.unsqueeze(0), skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
-return text.strip()
 with gr.Blocks(title="Video → Q&A (Qwen2.5-VL-7B WolfV2)") as demo:
     gr.Markdown("""
     # 🎬 Video → Q&A (Qwen2.5-VL-7B WolfV2)
-    - Drag and drop or upload any video, then input your question and click **Ask**.
-    - Default `fps=1` (1 frame per second) saves GPU memory; if the video is short with many details, you can increase the fps.
     """)
-with gr.Row():
-    video = gr.Video(label="Drag video here (mp4, mov, webm)", interactive=True)
-with gr.Column():
-    question = gr.Textbox(label="Your question", placeholder="Example: What's happening in the video? Provide 5 Q&A pairs.")
-    ask = gr.Button("Ask", variant="primary")
-    output = gr.Textbox(label="Answer", lines=12)
-with gr.Accordion("Advanced", open=False):
-    fps = gr.Slider(1, 6, value=1, step=1, label="Sampling rate (fps)")
-    max_new_tokens = gr.Slider(32, 512, value=192, step=16, label="max_new_tokens")
-    temperature = gr.Slider(0.0, 1.2, value=0.2, step=0.05, label="temperature")
-    top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p")
-ask.click(
-    fn=answer,
-    inputs=[video, question, fps, max_new_tokens, temperature, top_p],
-    outputs=[output],
     )
 if __name__ == "__main__":
-    demo.launch()

 import os
+import gradio as gr
+import torch
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TorchAoConfig
+# ========== Basic Configuration ==========
+MODEL_ID = os.environ.get("MODEL_ID", "Efficient-Large-Model/qwen2_5vl-7b-wolfv2-tuned")
+USE_INT4 = os.environ.get("USE_INT4", "0") == "1"
+dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+quant_cfg = None
+if USE_INT4:
+    quant_cfg = TorchAoConfig("int4_weight_only", group_size=128)
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype=dtype,
+    attn_implementation="sdpa",
+    quantization_config=quant_cfg,
 )
+MIN_PIXELS = 256 * 28 * 28
+MAX_PIXELS = 1024 * 28 * 28
+processor = AutoProcessor.from_pretrained(MODEL_ID, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)
+SYSTEM_PROMPT = "You are a helpful assistant that watches a user-provided video and answers questions about it concisely and accurately."
+# ========== Conversation Builder ==========
+def build_conversation(video_path: str, question: str, fps: int):
+    return [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {
+            "role": "user",
+            "content": [
+                {"type": "video", "path": video_path},
+                {"type": "text", "text": question},
+            ],
+        },
+    ]
+# ========== Main Inference Function ==========
+@torch.inference_mode()
+def answer(video, question, fps=1, max_new_tokens=128, temperature=0.2, top_p=0.9):
+    if video is None:
+        return "Please upload or drag a video first."
+    if not question or question.strip() == "":
+        question = "Summarize this video and provide 5 representative question–answer pairs."
+    conv = build_conversation(video, question, int(fps))
+    inputs = processor.apply_chat_template(
+        conv,
+        fps=int(fps),
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+    )
+    inputs = {k: v.to(model.device) if hasattr(v, "to") else v for k, v in inputs.items()}
+    gen_kwargs = dict(
+        max_new_tokens=int(max_new_tokens),
+        temperature=float(temperature),
+        top_p=float(top_p),
+        do_sample=(float(temperature) > 0),
+        pad_token_id=processor.tokenizer.eos_token_id,
+    )
+    output_ids = model.generate(**inputs, **gen_kwargs)
+    generated_ids = output_ids[0, inputs["input_ids"].shape[1]:]
+    text = processor.batch_decode(
+        generated_ids.unsqueeze(0),
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=True,
+    )[0]
+    return text.strip()
+# ========== Gradio UI ==========
 with gr.Blocks(title="Video → Q&A (Qwen2.5-VL-7B WolfV2)") as demo:
+    # Intro text shown at the top of the page.
     gr.Markdown("""
     # 🎬 Video → Q&A (Qwen2.5-VL-7B WolfV2)
+    - Drag or upload any video, type your question, then click **Ask**.
+    - Default `fps=1` (sample 1 frame per second) saves VRAM; for short or detailed videos, increase fps slightly.
     """)
+    # Main row: the video input, with a nested column holding the question box,
+    # the Ask button and the answer box.
+    with gr.Row():
+        video = gr.Video(label="Drop your video here (mp4, mov, webm)", interactive=True)
+        with gr.Column():
+            question = gr.Textbox(label="Your question", placeholder="e.g., What happens in this video? Provide 5 QA pairs.")
+            ask = gr.Button("Ask", variant="primary")
+            output = gr.Textbox(label="Answer", lines=12)
+    # Generation settings, inside an accordion that starts closed (open=False).
+    # The UI always passes these slider values, so 192 (not the function default
+    # of 128) is the max_new_tokens value used when the slider is left untouched.
+    with gr.Accordion("Advanced", open=False):
+        fps = gr.Slider(1, 6, value=1, step=1, label="Sampling FPS")
+        max_new_tokens = gr.Slider(32, 512, value=192, step=16, label="Max new tokens")
+        temperature = gr.Slider(0.0, 1.2, value=0.2, step=0.05, label="Temperature")
+        top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
+    # Clicking Ask calls answer(); the inputs list follows the order of
+    # answer's positional parameters, and the result goes to the answer box.
+    ask.click(
+        fn=answer,
+        inputs=[video, question, fps, max_new_tokens, temperature, top_p],
+        outputs=[output],
     )
+# ========== App Launch ==========
+# Launch the app only when this file is executed as a script.
 if __name__ == "__main__":
+    demo.launch()