Spaces:
Sleeping
Sleeping
File size: 5,795 Bytes
05b3646 d7f4e54 3957f9a 0c5cb30 d7f4e54 3957f9a d7f4e54 3957f9a d7f4e54 3957f9a d7f4e54 3957f9a d7f4e54 05b3646 0c5cb30 3957f9a d7f4e54 3957f9a d7f4e54 3957f9a 0c5cb30 d7f4e54 3957f9a d7f4e54 3957f9a d7f4e54 3957f9a d7f4e54 3957f9a d7f4e54 3957f9a d7f4e54 3957f9a d7f4e54 05b3646 3957f9a 05b3646 d7f4e54 3957f9a d7f4e54 cdc1784 05b3646 3957f9a 05b3646 3957f9a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
import os
import gradio as gr
import torch
import spaces # for @spaces.GPU on Hugging Face Spaces
# Try to import TorchAoConfig for optional 4-bit weight-only quantization.
# If unavailable in your transformers version, we safely fall back to no quantization.
try:
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TorchAoConfig
_HAS_TORCHAO = True
except Exception:
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
TorchAoConfig = None # type: ignore
_HAS_TORCHAO = False
# ========== Basic Configuration ==========
MODEL_ID = os.environ.get("MODEL_ID", "Efficient-Large-Model/qwen2_5vl-7b-wolfv2-tuned")
USE_INT4 = os.environ.get("USE_INT4", "0") == "1"
# Prefer bfloat16 on GPU, float32 on CPU
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
quant_cfg = None
if USE_INT4 and _HAS_TORCHAO and TorchAoConfig is not None:
# Optional int4 weight-only quantization (saves VRAM on GPU)
quant_cfg = TorchAoConfig("int4_weight_only", group_size=128)
# ---- ZeroGPU warm-up: must exist AND be called at import time ----
@spaces.GPU
def _warmup():
"""
A very light GPU-touch to satisfy ZeroGPU's startup detector.
Called at import-time (below). Never raise; return a short status string.
"""
try:
if torch.cuda.is_available():
_ = torch.tensor([0], device="cuda")
return "gpu-ready"
except Exception as e:
return f"warmup-error: {e}"
# Call warmup at import time so ZeroGPU detects a @spaces.GPU function during startup.
_WARMUP_STATUS = _warmup()
# ========== Load Model & Processor ==========
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
MODEL_ID,
device_map="auto",
dtype=dtype, # (modern arg; replaces deprecated torch_dtype)
attn_implementation="sdpa",
quantization_config=quant_cfg,
)
# Resolution bounds to balance quality vs. memory
MIN_PIXELS = 256 * 28 * 28
MAX_PIXELS = 1024 * 28 * 28
processor = AutoProcessor.from_pretrained(
MODEL_ID,
min_pixels=MIN_PIXELS,
max_pixels=MAX_PIXELS,
)
SYSTEM_PROMPT = (
"You are a helpful assistant that watches a user-provided video and answers questions "
"about it concisely and accurately."
)
# ========== Conversation Builder ==========
def build_conversation(video_path: str, question: str, fps: int):
"""
Qwen2.5-VL expects a chat-style list where media and text are items in 'content'.
"""
return [
{"role": "system", "content": SYSTEM_PROMPT},
{
"role": "user",
"content": [
{"type": "video", "path": video_path},
{"type": "text", "text": question},
],
},
]
# ========== Inference ==========
@torch.inference_mode()
def answer(video, question, fps=1, max_new_tokens=128, temperature=0.2, top_p=0.9):
"""
Main inference entry used by the Gradio UI.
- video: filepath from gr.Video
- question: user text; if empty, produce a summary + 5 QA pairs
"""
if video is None:
return "Please upload or drag a video first."
if not question or question.strip() == "":
question = "Summarize this video and provide 5 representative question–answer pairs."
conv = build_conversation(video, question, int(fps))
inputs = processor.apply_chat_template(
conv,
fps=int(fps),
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
)
# move tensors to model device
inputs = {k: (v.to(model.device) if hasattr(v, "to") else v) for k, v in inputs.items()}
gen_kwargs = dict(
max_new_tokens=int(max_new_tokens),
temperature=float(temperature),
top_p=float(top_p),
do_sample=(float(temperature) > 0.0),
pad_token_id=processor.tokenizer.eos_token_id,
)
output_ids = model.generate(**inputs, **gen_kwargs)
# Remove the prompt portion for clean decoding
prompt_len = inputs["input_ids"].shape[1]
generated_ids = output_ids[0, prompt_len:]
text = processor.batch_decode(
generated_ids.unsqueeze(0),
skip_special_tokens=True,
clean_up_tokenization_spaces=True,
)[0]
return text.strip()
# ========== Gradio UI ==========
with gr.Blocks(title="Video → Q&A (Qwen2.5-VL-7B WolfV2)") as demo:
gr.Markdown(
"""
# 🎬 Video → Q&A (Qwen2.5-VL-7B WolfV2)
- Drag or upload any video, type your question, then click **Ask**.
- Default `fps=1` (1 frame per second) saves VRAM; for short or very detailed videos, increase fps slightly.
"""
)
with gr.Row():
video = gr.Video(label="Drop your video here (mp4, mov, webm)", interactive=True)
with gr.Column():
question = gr.Textbox(
label="Your question",
placeholder="e.g., What happens in this video? Provide 5 QA pairs."
)
ask = gr.Button("Ask", variant="primary")
output = gr.Textbox(label="Answer", lines=12)
with gr.Accordion("Advanced", open=False):
fps = gr.Slider(1, 6, value=1, step=1, label="Sampling FPS")
max_new_tokens = gr.Slider(32, 512, value=192, step=16, label="Max new tokens")
temperature = gr.Slider(0.0, 1.2, value=0.2, step=0.05, label="Temperature")
top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
ask.click(
fn=answer,
inputs=[video, question, fps, max_new_tokens, temperature, top_p],
outputs=[output],
)
# ========== Launch ==========
if __name__ == "__main__":
# Disable SSR to avoid extra startup constraints; works well across CPU/GPU/ZeroGPU.
demo.launch(ssr_mode=False)
|