sunrainyg committed
Commit 3957f9a · 1 Parent(s): 4821aa5
Files changed (2)
  1. app.py +73 -21
  2. requirements.txt +3 -2
app.py CHANGED
@@ -1,35 +1,76 @@
 import os
 import gradio as gr
 import torch
-from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TorchAoConfig
+import spaces  # for @spaces.GPU on Hugging Face Spaces
+
+# Try to import TorchAoConfig for optional 4-bit weight-only quantization.
+# If unavailable in your transformers version, we safely fall back to no quantization.
+try:
+    from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TorchAoConfig
+    _HAS_TORCHAO = True
+except Exception:
+    from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+    TorchAoConfig = None  # type: ignore
+    _HAS_TORCHAO = False
 
 # ========== Basic Configuration ==========
 MODEL_ID = os.environ.get("MODEL_ID", "Efficient-Large-Model/qwen2_5vl-7b-wolfv2-tuned")
 USE_INT4 = os.environ.get("USE_INT4", "0") == "1"
 
+# Prefer bfloat16 on GPU, float32 on CPU
 dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
 
 quant_cfg = None
-if USE_INT4:
+if USE_INT4 and _HAS_TORCHAO and TorchAoConfig is not None:
+    # Optional int4 weight-only quantization (saves VRAM on GPU)
     quant_cfg = TorchAoConfig("int4_weight_only", group_size=128)
 
+# ---- ZeroGPU warm-up: must exist AND be called at import time ----
+@spaces.GPU
+def _warmup():
+    """
+    A very light GPU touch to satisfy ZeroGPU's startup detector.
+    Called at import time (below). Never raises; returns a short status string.
+    """
+    try:
+        if torch.cuda.is_available():
+            _ = torch.tensor([0], device="cuda")
+        return "gpu-ready"
+    except Exception as e:
+        return f"warmup-error: {e}"
+
+# Call warmup at import time so ZeroGPU detects a @spaces.GPU function during startup.
+_WARMUP_STATUS = _warmup()
+
+# ========== Load Model & Processor ==========
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
     device_map="auto",
-    torch_dtype=dtype,
+    dtype=dtype,  # modern arg; replaces the deprecated torch_dtype
     attn_implementation="sdpa",
     quantization_config=quant_cfg,
 )
 
+# Resolution bounds to balance quality vs. memory
 MIN_PIXELS = 256 * 28 * 28
 MAX_PIXELS = 1024 * 28 * 28
-processor = AutoProcessor.from_pretrained(MODEL_ID, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)
 
-SYSTEM_PROMPT = "You are a helpful assistant that watches a user-provided video and answers questions about it concisely and accurately."
+processor = AutoProcessor.from_pretrained(
+    MODEL_ID,
+    min_pixels=MIN_PIXELS,
+    max_pixels=MAX_PIXELS,
+)
 
+SYSTEM_PROMPT = (
+    "You are a helpful assistant that watches a user-provided video and answers questions "
+    "about it concisely and accurately."
+)
 
 # ========== Conversation Builder ==========
 def build_conversation(video_path: str, question: str, fps: int):
+    """
+    Qwen2.5-VL expects a chat-style list where media and text are items in 'content'.
+    """
     return [
         {"role": "system", "content": SYSTEM_PROMPT},
         {
@@ -41,10 +82,14 @@ def build_conversation(video_path: str, question: str, fps: int):
         },
     ]
 
-
-# ========== Main Inference Function ==========
+# ========== Inference ==========
 @torch.inference_mode()
 def answer(video, question, fps=1, max_new_tokens=128, temperature=0.2, top_p=0.9):
+    """
+    Main inference entry used by the Gradio UI.
+    - video: filepath from gr.Video
+    - question: user text; if empty, produce a summary + 5 QA pairs
+    """
     if video is None:
         return "Please upload or drag a video first."
     if not question or question.strip() == "":
@@ -60,18 +105,22 @@ def answer(video, question, fps=1, max_new_tokens=128, temperature=0.2, top_p=0.
         return_dict=True,
         return_tensors="pt",
     )
-    inputs = {k: v.to(model.device) if hasattr(v, "to") else v for k, v in inputs.items()}
+    # Move tensors to the model device
+    inputs = {k: (v.to(model.device) if hasattr(v, "to") else v) for k, v in inputs.items()}
 
     gen_kwargs = dict(
         max_new_tokens=int(max_new_tokens),
         temperature=float(temperature),
         top_p=float(top_p),
-        do_sample=(float(temperature) > 0),
+        do_sample=(float(temperature) > 0.0),
         pad_token_id=processor.tokenizer.eos_token_id,
     )
 
     output_ids = model.generate(**inputs, **gen_kwargs)
-    generated_ids = output_ids[0, inputs["input_ids"].shape[1]:]
+    # Remove the prompt portion for clean decoding
+    prompt_len = inputs["input_ids"].shape[1]
+    generated_ids = output_ids[0, prompt_len:]
+
     text = processor.batch_decode(
         generated_ids.unsqueeze(0),
         skip_special_tokens=True,
@@ -80,19 +129,23 @@ def answer(video, question, fps=1, max_new_tokens=128, temperature=0.2, top_p=0.
 
     return text.strip()
 
-
 # ========== Gradio UI ==========
 with gr.Blocks(title="Video → Q&A (Qwen2.5-VL-7B WolfV2)") as demo:
-    gr.Markdown("""
-    # 🎬 Video → Q&A (Qwen2.5-VL-7B WolfV2)
-    - Drag or upload any video, type your question, then click **Ask**.
-    - Default `fps=1` (sample 1 frame per second) saves VRAM; for short or detailed videos, increase fps slightly.
-    """)
+    gr.Markdown(
+        """
+        # 🎬 Video → Q&A (Qwen2.5-VL-7B WolfV2)
+        - Drag or upload any video, type your question, then click **Ask**.
+        - Default `fps=1` (1 frame per second) saves VRAM; for short or very detailed videos, increase fps slightly.
+        """
+    )
 
     with gr.Row():
         video = gr.Video(label="Drop your video here (mp4, mov, webm)", interactive=True)
         with gr.Column():
-            question = gr.Textbox(label="Your question", placeholder="e.g., What happens in this video? Provide 5 QA pairs.")
+            question = gr.Textbox(
+                label="Your question",
+                placeholder="e.g., What happens in this video? Provide 5 QA pairs."
+            )
             ask = gr.Button("Ask", variant="primary")
             output = gr.Textbox(label="Answer", lines=12)
 
@@ -108,8 +161,7 @@ with gr.Blocks(title="Video → Q&A (Qwen2.5-VL-7B WolfV2)") as demo:
         outputs=[output],
     )
 
-
-# ========== App Launch ==========
+# ========== Launch ==========
 if __name__ == "__main__":
-    demo.launch(ssr_mode=False)  # <- disable SSR, avoids the GPU check
-
+    # Disable SSR to avoid extra startup constraints; works well across CPU/GPU/ZeroGPU.
+    demo.launch(ssr_mode=False)
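The first hunk cuts off inside build_conversation, so the user message itself is elided here. For readers unfamiliar with the format the new docstring mentions, this is an illustrative sketch of the chat-style structure Qwen2.5-VL processors consume, with media and text as sibling items in a content list; the path, fps, and question values are hypothetical, and the elided lines of this file may differ in detail:

    # Illustrative sketch of the Qwen2.5-VL conversation shape (hypothetical values).
    # processor.apply_chat_template(...) turns a list like this into the tensor
    # dict that answer() moves to model.device before generate().
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {"type": "video", "video": "/tmp/clip.mp4", "fps": 1},  # media item
                {"type": "text", "text": "What happens in this video?"},  # text item
            ],
        },
    ]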
 
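The warm-up function satisfies ZeroGPU's startup check, but on ZeroGPU Spaces the per-request entry point is typically decorated as well, so a GPU is attached for the duration of each call. A sketch of that variant, which is not part of this commit (the duration value is an assumption):

    import spaces
    import torch

    @spaces.GPU(duration=120)  # assumption: up to ~120 s of GPU time per request
    @torch.inference_mode()
    def answer(video, question, fps=1, max_new_tokens=128, temperature=0.2, top_p=0.9):
        # Same body as app.py's answer(): build the conversation, run
        # processor.apply_chat_template, move tensors to model.device,
        # call model.generate(...), then decode only the new tokens.
        ...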
requirements.txt CHANGED
@@ -3,9 +3,10 @@ transformers>=4.50.0
 accelerate>=0.34.0
 torch>=2.2.0
 torchvision
-sentencepiece
-protobuf
+spaces
 av
 decord
+sentencepiece
 pillow
 numpy
+protobuf
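Because both knobs in app.py are read from the environment at import time, the optional int4 path can be exercised without editing the file. A minimal hypothetical driver script, assuming it sits next to app.py (note that TorchAO quantization also needs the torchao package, which this requirements.txt does not pin):

    import os

    # Set before importing app.py; the flags are read at module import.
    os.environ["USE_INT4"] = "1"  # opt in to int4 weight-only quantization
    os.environ["MODEL_ID"] = "Efficient-Large-Model/qwen2_5vl-7b-wolfv2-tuned"

    import app  # noqa: E402  (importing app.py loads the model with quant_cfg set)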