import os
import gradio as gr
import torch
import spaces  # for @spaces.GPU on Hugging Face Spaces

# Try to import TorchAoConfig for optional 4-bit weight-only quantization.
# If unavailable in your transformers version, we safely fall back to no quantization.
try:
    from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TorchAoConfig
    _HAS_TORCHAO = True
except Exception:
    from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
    TorchAoConfig = None  # type: ignore
    _HAS_TORCHAO = False

# ========== Basic Configuration ==========
MODEL_ID = os.environ.get("MODEL_ID", "Efficient-Large-Model/qwen2_5vl-7b-wolfv2-tuned")
USE_INT4 = os.environ.get("USE_INT4", "0") == "1"
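# To try int4 in practice (optional): install the `torchao` package and set USE_INT4=1 in
# the environment before launching (e.g. `USE_INT4=1 python app.py`, assuming the file is
# saved as app.py). If TorchAoConfig is unavailable, the flag is simply ignored below.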

# Prefer bfloat16 on GPU, float32 on CPU
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

quant_cfg = None
if USE_INT4 and _HAS_TORCHAO and TorchAoConfig is not None:
    # Optional int4 weight-only quantization (saves VRAM on GPU)
    quant_cfg = TorchAoConfig("int4_weight_only", group_size=128)

# ---- ZeroGPU warm-up: must exist AND be called at import time ----
@spaces.GPU
def _warmup():
    """
    A very light GPU-touch to satisfy ZeroGPU's startup detector.
    Called at import-time (below). Never raise; return a short status string.
    """
    try:
        if torch.cuda.is_available():
            _ = torch.tensor([0], device="cuda")
        return "gpu-ready"
    except Exception as e:
        return f"warmup-error: {e}"

# Call warmup at import time so ZeroGPU detects a @spaces.GPU function during startup.
_WARMUP_STATUS = _warmup()

# ========== Load Model & Processor ==========
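# device_map="auto" below relies on accelerate to place the weights on the available
# GPU(s), spilling to CPU if needed; quantization_config stays None unless int4 is enabled above.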
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="auto",
    dtype=dtype,  # (modern arg; replaces deprecated torch_dtype)
    attn_implementation="sdpa",
    quantization_config=quant_cfg,
)

# Resolution bounds to balance quality vs. memory
MIN_PIXELS = 256 * 28 * 28
MAX_PIXELS = 1024 * 28 * 28
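# Note: min_pixels/max_pixels are per-frame pixel budgets. The 28*28 factor follows the
# Qwen2-VL/Qwen2.5-VL preprocessing convention, where each visual token covers roughly a
# 28x28-pixel patch, so these bounds correspond to about 256-1024 visual tokens per frame.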

processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=MIN_PIXELS,
    max_pixels=MAX_PIXELS,
)

SYSTEM_PROMPT = (
    "You are a helpful assistant that watches a user-provided video and answers questions "
    "about it concisely and accurately."
)

# ========== Conversation Builder ==========
def build_conversation(video_path: str, question: str):
    """
    Qwen2.5-VL expects a chat-style list where media and text are items in 'content'.
    The frame sampling rate is not part of the conversation itself; it is applied later
    via processor.apply_chat_template(..., fps=...).
    """
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": [
                {"type": "video", "path": video_path},
                {"type": "text", "text": question},
            ],
        },
    ]

# ========== Inference ==========
@torch.inference_mode()
def answer(video, question, fps=1, max_new_tokens=128, temperature=0.2, top_p=0.9):
    """
    Main inference entry used by the Gradio UI.
    - video: filepath from gr.Video
    - question: user text; if empty, produce a summary + 5 QA pairs
    """
    if video is None:
        return "Please upload or drag a video first."
    if not question or question.strip() == "":
        question = "Summarize this video and provide 5 representative question–answer pairs."

    conv = build_conversation(video, question)

    inputs = processor.apply_chat_template(
        conv,
        fps=int(fps),
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    # move tensors to model device
    inputs = {k: (v.to(model.device) if hasattr(v, "to") else v) for k, v in inputs.items()}

    gen_kwargs = dict(
        max_new_tokens=int(max_new_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        do_sample=(float(temperature) > 0.0),
        pad_token_id=processor.tokenizer.eos_token_id,
    )
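    # When the temperature slider is at 0, do_sample above becomes False and decoding is
    # greedy; recent transformers versions may then log a warning that temperature is unused.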

    output_ids = model.generate(**inputs, **gen_kwargs)
    # Remove the prompt portion for clean decoding
    prompt_len = inputs["input_ids"].shape[1]
    generated_ids = output_ids[0, prompt_len:]

    text = processor.batch_decode(
        generated_ids.unsqueeze(0),
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )[0]

    return text.strip()
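
# Direct (non-UI) usage sketch, assuming a local clip named "sample.mp4" exists:
#   print(answer("sample.mp4", "What happens in this video?", fps=1, max_new_tokens=128))
# This exercises the same code path the Gradio button triggers, which is handy for smoke tests.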

# ========== Gradio UI ==========
with gr.Blocks(title="Video → Q&A (Qwen2.5-VL-7B WolfV2)") as demo:
    gr.Markdown(
        """
        # 🎬 Video → Q&A (Qwen2.5-VL-7B WolfV2)
        - Drag or upload any video, type your question, then click **Ask**.
        - Default `fps=1` (1 frame per second) saves VRAM; for short or very detailed videos, increase fps slightly.
        """
    )

    with gr.Row():
        video = gr.Video(label="Drop your video here (mp4, mov, webm)", interactive=True)
        with gr.Column():
            question = gr.Textbox(
                label="Your question",
                placeholder="e.g., What happens in this video? Provide 5 QA pairs."
            )
            ask = gr.Button("Ask", variant="primary")
            output = gr.Textbox(label="Answer", lines=12)

    with gr.Accordion("Advanced", open=False):
        fps = gr.Slider(1, 6, value=1, step=1, label="Sampling FPS")
        max_new_tokens = gr.Slider(32, 512, value=192, step=16, label="Max new tokens")
        temperature = gr.Slider(0.0, 1.2, value=0.2, step=0.05, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")

    ask.click(
        fn=answer,
        inputs=[video, question, fps, max_new_tokens, temperature, top_p],
        outputs=[output],
    )

# ========== Launch ==========
if __name__ == "__main__":
    # Disable SSR to avoid extra startup constraints; works well across CPU/GPU/ZeroGPU.
    demo.launch(ssr_mode=False)
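    # When running outside Hugging Face Spaces, demo.launch(ssr_mode=False, share=True)
    # would additionally create a temporary public Gradio link (share=True is a standard
    # launch() option); inside Spaces the app is already served publicly.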