import os
import gradio as gr
import torch
import spaces  # for @spaces.GPU on Hugging Face Spaces

# Try to import TorchAoConfig for optional 4-bit weight-only quantization.
# If unavailable in your transformers version, we safely fall back to no quantization.
try:
    from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TorchAoConfig
    _HAS_TORCHAO = True
except Exception:
    from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
    TorchAoConfig = None  # type: ignore
    _HAS_TORCHAO = False

# ========== Basic Configuration ==========
MODEL_ID = os.environ.get("MODEL_ID", "Efficient-Large-Model/qwen2_5vl-7b-wolfv2-tuned")
USE_INT4 = os.environ.get("USE_INT4", "0") == "1"
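# To try int4 in practice (optional): install the `torchao` package and set USE_INT4=1 in
# the environment before launching (e.g. `USE_INT4=1 python app.py`, assuming the file is
# saved as app.py). If TorchAoConfig is unavailable, the flag is simply ignored below.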

# Prefer bfloat16 on GPU, float32 on CPU
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

quant_cfg = None
if USE_INT4 and _HAS_TORCHAO and TorchAoConfig is not None:
    # Optional int4 weight-only quantization (saves VRAM on GPU)
    quant_cfg = TorchAoConfig("int4_weight_only", group_size=128)

# ---- ZeroGPU warm-up: must exist AND be called at import time ----
@spaces.GPU
def _warmup():
    """
    A very light GPU-touch to satisfy ZeroGPU's startup detector.
    Called at import-time (below). Never raise; return a short status string.
    """
    try:
        if torch.cuda.is_available():
            _ = torch.tensor([0], device="cuda")
        return "gpu-ready"
    except Exception as e:
        return f"warmup-error: {e}"

# Call warmup at import time so ZeroGPU detects a @spaces.GPU function during startup.
_WARMUP_STATUS = _warmup()

# ========== Load Model & Processor ==========
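# device_map="auto" below relies on accelerate to place the weights on the available
# GPU(s), spilling to CPU if needed; quantization_config stays None unless int4 is enabled above.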
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="auto",
    dtype=dtype,  # (modern arg; replaces deprecated torch_dtype)
    attn_implementation="sdpa",
    quantization_config=quant_cfg,
)

# Resolution bounds to balance quality vs. memory
MIN_PIXELS = 256 * 28 * 28
MAX_PIXELS = 1024 * 28 * 28
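# Note: min_pixels/max_pixels are per-frame pixel budgets. The 28*28 factor follows the
# Qwen2-VL/Qwen2.5-VL preprocessing convention, where each visual token covers roughly a
# 28x28-pixel patch, so these bounds correspond to about 256-1024 visual tokens per frame.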

processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=MIN_PIXELS,
    max_pixels=MAX_PIXELS,
)

SYSTEM_PROMPT = (
    "You are a helpful assistant that watches a user-provided video and answers questions "
    "about it concisely and accurately."
)

# ========== Conversation Builder ==========
def build_conversation(video_path: str, question: str):
    """
    Qwen2.5-VL expects a chat-style list where media and text are items in 'content'.
    The frame sampling rate is not part of the conversation itself; it is applied later
    via processor.apply_chat_template(..., fps=...).
    """
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": [
                {"type": "video", "path": video_path},
                {"type": "text", "text": question},
            ],
        },
    ]

# ========== Inference ==========
@torch.inference_mode()
def answer(video, question, fps=1, max_new_tokens=128, temperature=0.2, top_p=0.9):
    """
    Main inference entry used by the Gradio UI.
    - video: filepath from gr.Video
    - question: user text; if empty, produce a summary + 5 QA pairs
    """
    if video is None:
        return "Please upload or drag a video first."
    if not question or question.strip() == "":
        question = "Summarize this video and provide 5 representative question–answer pairs."

    conv = build_conversation(video, question)

    inputs = processor.apply_chat_template(
        conv,
        fps=int(fps),
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    # move tensors to model device
    inputs = {k: (v.to(model.device) if hasattr(v, "to") else v) for k, v in inputs.items()}

    gen_kwargs = dict(
        max_new_tokens=int(max_new_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        do_sample=(float(temperature) > 0.0),
        pad_token_id=processor.tokenizer.eos_token_id,
    )
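    # When the temperature slider is at 0, do_sample above becomes False and decoding is
    # greedy; recent transformers versions may then log a warning that temperature is unused.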

    output_ids = model.generate(**inputs, **gen_kwargs)
    # Remove the prompt portion for clean decoding
    prompt_len = inputs["input_ids"].shape[1]
    generated_ids = output_ids[0, prompt_len:]

    text = processor.batch_decode(
        generated_ids.unsqueeze(0),
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )[0]

    return text.strip()
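
# Direct (non-UI) usage sketch, assuming a local clip named "sample.mp4" exists:
#   print(answer("sample.mp4", "What happens in this video?", fps=1, max_new_tokens=128))
# This exercises the same code path the Gradio button triggers, which is handy for smoke tests.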

# ========== Gradio UI ==========
with gr.Blocks(title="Video → Q&A (Qwen2.5-VL-7B WolfV2)") as demo:
    gr.Markdown(
        """
        # 🎬 Video → Q&A (Qwen2.5-VL-7B WolfV2)
        - Drag or upload any video, type your question, then click **Ask**.
        - Default `fps=1` (1 frame per second) saves VRAM; for short or very detailed videos, increase fps slightly.
        """
    )

    with gr.Row():
        video = gr.Video(label="Drop your video here (mp4, mov, webm)", interactive=True)
        with gr.Column():
            question = gr.Textbox(
                label="Your question",
                placeholder="e.g., What happens in this video? Provide 5 QA pairs."
            )
            ask = gr.Button("Ask", variant="primary")
            output = gr.Textbox(label="Answer", lines=12)

    with gr.Accordion("Advanced", open=False):
        fps = gr.Slider(1, 6, value=1, step=1, label="Sampling FPS")
        max_new_tokens = gr.Slider(32, 512, value=192, step=16, label="Max new tokens")
        temperature = gr.Slider(0.0, 1.2, value=0.2, step=0.05, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")

    ask.click(
        fn=answer,
        inputs=[video, question, fps, max_new_tokens, temperature, top_p],
        outputs=[output],
    )

# ========== Launch ==========
if __name__ == "__main__":
    # Disable SSR to avoid extra startup constraints; works well across CPU/GPU/ZeroGPU.
    demo.launch(ssr_mode=False)
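    # When running outside Hugging Face Spaces, demo.launch(ssr_mode=False, share=True)
    # would additionally create a temporary public Gradio link (share=True is a standard
    # launch() option); inside Spaces the app is already served publicly.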