sunrainyg committed on
Commit
d7f4e54
·
1 Parent(s): 7b5201e
Files changed (1) hide show
  1. app.py +101 -31
app.py CHANGED
@@ -1,44 +1,114 @@
1
  import os
 
 
 
2
 
3
- gen_kwargs = dict(
4
- max_new_tokens=int(max_new_tokens),
5
- temperature=float(temperature),
6
- top_p=float(top_p),
7
- do_sample=(float(temperature) > 0),
8
- pad_token_id=processor.tokenizer.eos_token_id,
 
 
 
 
 
 
 
 
 
 
9
  )
10
 
11
- output_ids = model.generate(**inputs, **gen_kwargs)
12
- # Slice off the input portion for clean decoding (batch size = 1 here)
13
- generated_ids = output_ids[0, inputs["input_ids"].shape[1]:]
14
- text = processor.batch_decode(generated_ids.unsqueeze(0), skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
15
- return text.strip()
 
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  with gr.Blocks(title="Video → Q&A (Qwen2.5-VL-7B WolfV2)") as demo:
18
  gr.Markdown("""
19
  # 🎬 Video → Q&A (Qwen2.5-VL-7B WolfV2)
20
- - Drag and drop or upload any video, then input your question and click **Ask**.
21
- - Default `fps=1` (1 frame per second) saves GPU memory; if the video is short with many details, you can increase the fps.
22
  """)
23
 
24
- with gr.Row():
25
- video = gr.Video(label="Drag video here (mp4, mov, webm)", interactive=True)
26
- with gr.Column():
27
- question = gr.Textbox(label="Your question", placeholder="Example: What's happening in the video? Provide 5 Q&A pairs.")
28
- ask = gr.Button("Ask", variant="primary")
29
- output = gr.Textbox(label="Answer", lines=12)
30
-
31
- with gr.Accordion("Advanced", open=False):
32
- fps = gr.Slider(1, 6, value=1, step=1, label="Sampling rate (fps)")
33
- max_new_tokens = gr.Slider(32, 512, value=192, step=16, label="max_new_tokens")
34
- temperature = gr.Slider(0.0, 1.2, value=0.2, step=0.05, label="temperature")
35
- top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p")
36
-
37
- ask.click(
38
- fn=answer,
39
- inputs=[video, question, fps, max_new_tokens, temperature, top_p],
40
- outputs=[output],
41
  )
42
 
 
 
43
  if __name__ == "__main__":
44
- demo.launch()
 
1
  import os
2
+ import gradio as gr
3
+ import torch
4
+ from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TorchAoConfig
5
 
6
+ # ========== Basic Configuration ==========
7
# Checkpoint to load; the MODEL_ID environment variable overrides the default.
MODEL_ID = os.getenv("MODEL_ID", "Efficient-Large-Model/qwen2_5vl-7b-wolfv2-tuned")
# int4 weight-only quantization is opt-in: enabled only when USE_INT4 is exactly "1".
USE_INT4 = os.getenv("USE_INT4", "0") == "1"

# bfloat16 when CUDA is available, float32 otherwise.
if torch.cuda.is_available():
    dtype = torch.bfloat16
else:
    dtype = torch.float32

# Stays None (no quantization config passed on) unless USE_INT4 is set.
quant_cfg = TorchAoConfig("int4_weight_only", group_size=128) if USE_INT4 else None

# The model is loaded once at import time and shared by every request.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=dtype,
    attn_implementation="sdpa",
    quantization_config=quant_cfg,
)

# Pixel bounds handed to the processor as min_pixels / max_pixels.
MIN_PIXELS = 256 * 28 * 28
MAX_PIXELS = 1024 * 28 * 28
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=MIN_PIXELS,
    max_pixels=MAX_PIXELS,
)

# System turn placed at the start of every conversation.
SYSTEM_PROMPT = "You are a helpful assistant that watches a user-provided video and answers questions about it concisely and accurately."
29
+
30
 
31
+ # ========== Conversation Builder ==========
32
def build_conversation(video_path: str, question: str, fps: int):
    """Build the two-turn chat (system + user) for one video question.

    Args:
        video_path: Path of the video file; placed in the user turn as a
            ``{"type": "video", "path": ...}`` part.
        question: Text of the user's question.
        fps: Accepted for call compatibility but not used here; the sampling
            rate is passed to ``apply_chat_template`` by the caller.

    Returns:
        A list of two message dicts: the system turn carrying
        ``SYSTEM_PROMPT`` and the user turn carrying the video followed by
        the question text.
    """
    # Both turns use the list-of-typed-parts content form. The processor's
    # chat-template code walks each message's content looking for parts with
    # a "type" key, so a bare string in the system turn is not uniformly
    # supported across transformers versions.
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": SYSTEM_PROMPT}],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "video", "path": video_path},
            {"type": "text", "text": question},
        ],
    }
    return [system_turn, user_turn]
43
+
44
+
45
+ # ========== Main Inference Function ==========
46
def _sampling_kwargs(temperature, top_p):
    """Return the decoding-strategy flags to pass to ``generate``."""
    temperature = float(temperature)
    if temperature > 0:
        return {
            "do_sample": True,
            "temperature": temperature,
            "top_p": float(top_p),
        }
    # Greedy decoding: temperature / top_p are sampling-only flags, so they
    # are left out instead of being sent alongside do_sample=False.
    return {"do_sample": False}


@torch.inference_mode()
def answer(video, question, fps=1, max_new_tokens=128, temperature=0.2, top_p=0.9):
    """Answer a question about a video using the module-level model.

    Args:
        video: Video value received from the UI, forwarded as the video path
            of the conversation; ``None`` when nothing was uploaded.
        question: User question. When empty or whitespace-only, a default
            "summarize + 5 Q&A pairs" prompt is used instead.
        fps: Frame sampling rate, converted to ``int`` and passed to the
            processor's chat template.
        max_new_tokens: Upper bound on generated tokens (converted to ``int``).
        temperature: Sampling temperature; a value of 0 or below selects
            greedy decoding.
        top_p: Nucleus-sampling threshold; only used when sampling.

    Returns:
        The decoded model reply with surrounding whitespace stripped, or a
        short instruction message when no video was provided.
    """
    if video is None:
        return "Please upload or drag a video first."
    if not question or not question.strip():
        question = "Summarize this video and provide 5 representative question–answer pairs."

    frame_rate = int(fps)
    conv = build_conversation(video, question, frame_rate)

    inputs = processor.apply_chat_template(
        conv,
        fps=frame_rate,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    # Move tensor entries to the model's device; anything without .to() is
    # passed through unchanged.
    inputs = {k: v.to(model.device) if hasattr(v, "to") else v for k, v in inputs.items()}

    gen_kwargs = dict(
        max_new_tokens=int(max_new_tokens),
        pad_token_id=processor.tokenizer.eos_token_id,
        **_sampling_kwargs(temperature, top_p),
    )

    output_ids = model.generate(**inputs, **gen_kwargs)
    # Drop the prompt tokens so only the newly generated reply is decoded
    # (batch size is 1 here).
    prompt_len = inputs["input_ids"].shape[1]
    text = processor.batch_decode(
        output_ids[:1, prompt_len:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )[0]

    return text.strip()
82
+
83
+
84
+ # ========== Gradio UI ==========
85
with gr.Blocks(title="Video → Q&A (Qwen2.5-VL-7B WolfV2)") as demo:
    # Page header and short usage notes.
    gr.Markdown("""
    # 🎬 Video → Q&A (Qwen2.5-VL-7B WolfV2)
    - Drag or upload any video, type your question, then click **Ask**.
    - Default `fps=1` (sample 1 frame per second) saves VRAM; for short or detailed videos, increase fps slightly.
    """)

    # Video input next to the question / answer column.
    with gr.Row():
        video = gr.Video(
            label="Drop your video here (mp4, mov, webm)",
            interactive=True,
        )
        with gr.Column():
            question = gr.Textbox(
                label="Your question",
                placeholder="e.g., What happens in this video? Provide 5 QA pairs.",
            )
            ask = gr.Button("Ask", variant="primary")
            output = gr.Textbox(label="Answer", lines=12)

    # Decoding and frame-sampling controls, collapsed by default.
    with gr.Accordion("Advanced", open=False):
        fps = gr.Slider(minimum=1, maximum=6, value=1, step=1, label="Sampling FPS")
        max_new_tokens = gr.Slider(minimum=32, maximum=512, value=192, step=16, label="Max new tokens")
        temperature = gr.Slider(minimum=0.0, maximum=1.2, value=0.2, step=0.05, label="Temperature")
        top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p")

    # Input order must match the positional parameters of answer().
    ask.click(
        fn=answer,
        inputs=[video, question, fps, max_new_tokens, temperature, top_p],
        outputs=[output],
    )


# ========== App Launch ==========
if __name__ == "__main__":
    demo.launch()