Spaces:

Sergidev
/

Qwen2.5-OmniAgent

Runtime error

App Files Community

Sergidev committed on Apr 9

Commit

f037fe5

1 Parent(s): addbfa5

v4

Browse files

Files changed (2) hide show

app.py +128 -39
requirements.txt +4 -3

app.py CHANGED Viewed

@@ -1,54 +1,143 @@
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 import torch
 import tempfile
-# Initialize Qwen2.5-Omni-7B with multimodal support
-model = AutoModelForCausalLM.from_pretrained(
-    "Qwen/Qwen2.5-Omni-7B",
-    torch_dtype=torch.float16,
-    device_map="auto"
-)
-tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Omni-7B")
-def analyze_media(video_path, prompt, request: gr.Request):
-    # ZeroGPU rate limiting headers
-    headers = {"X-IP-Token": request.headers.get('x-ip-token', '')}
-    # Create multimodal pipeline
-    pipe = pipeline(
-        "multimodal-generation",
-        model=model,
-        tokenizer=tokenizer,
-        device=model.device,
-        max_new_tokens=1024,
-        generate_speech=True
-    )
-    # Process 120s video with TMRoPE alignment
-    result = pipe(
-        media=video_path,
-        text=prompt,
-        headers=headers,
-        timeout=120
     )
-    # Save speech output to temporary file
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-        result["speech"].export(f.name, format="wav")
-        return result["text"], f.name
 with gr.Blocks() as demo:
     gr.Markdown("## Qwen2.5-Omni-7B Multimodal Demo")
     with gr.Row():
-        media_input = gr.Video(
-            label="Upload Video (max 120s)",
-            sources=["upload"],
-            max_length=120
-        )
         prompt_input = gr.Textbox(label="Analysis Prompt", placeholder="Describe or ask about the video...")
     submit_btn = gr.Button("Analyze", variant="primary")
     with gr.Column():
@@ -56,8 +145,8 @@ with gr.Blocks() as demo:
         audio_output = gr.Audio(label="Speech Response", autoplay=True)
     submit_btn.click(
-        analyze_media,
-        inputs=[media_input, prompt_input, gr.Request()],
         outputs=[text_output, audio_output]
     )

 import gradio as gr
 import torch
+from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor, TextStreamer
+from qwen_omni_utils import process_mm_info
+import soundfile as sf
 import tempfile
+import spaces
+import gc
+# Initialize the model and processor
+# Choose the compute device once at import time: "cuda" when torch can see a GPU, else "cpu".
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Weight precision handed to from_pretrained() in get_model(): bfloat16 on GPU, float16 otherwise.
+# NOTE(review): float16 on CPU is often slow or unsupported for some ops -- confirm whether
+# the CPU fallback should be float32 or bfloat16 instead.
+torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float16
+def get_model():
+    """Load the Qwen2.5-Omni-7B checkpoint with audio output enabled.
+
+    Weights are loaded with the module-level ``torch_dtype`` and placed by
+    ``device_map="auto"``. FlashAttention-2 is requested only when CUDA is
+    available *and* the optional ``flash_attn`` package is importable;
+    otherwise the library's default attention implementation is used.
+
+    Returns:
+        The loaded ``Qwen2_5OmniModel`` instance.
+    """
+    # Function-scope import keeps this block self-contained.
+    import importlib.util
+
+    use_cuda = torch.cuda.is_available()
+    if use_cuda:
+        # Collect garbage first so tensors that just became unreachable are
+        # freed before the CUDA caching allocator is asked to release memory.
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    # Asking for flash_attention_2 without the flash-attn package installed
+    # makes from_pretrained() raise, so only opt in when it can be imported.
+    has_flash_attn = importlib.util.find_spec("flash_attn") is not None
+    attn_implementation = "flash_attention_2" if use_cuda and has_flash_attn else None
+
+    return Qwen2_5OmniModel.from_pretrained(
+        "Qwen/Qwen2.5-Omni-7B",
+        torch_dtype=torch_dtype,
+        device_map="auto",
+        enable_audio_output=True,
+        low_cpu_mem_usage=True,
+        attn_implementation=attn_implementation,
+    )
+# Load the model once at import time so every request reuses the same weights.
+model = get_model()
+# Processor for the same checkpoint; used below for chat templating, input
+# preparation and decoding generated ids.
+processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
+# System prompt
+# Prepended as the first message of every conversation built in process_input().
+SYSTEM_PROMPT = {
+    "role": "system",
+    "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
+}
+# Voice options
+# Maps the labels shown in the "Voice Type" dropdown to the speaker name
+# passed to model.generate() as `spk`.
+VOICE_OPTIONS = {
+    "Chelsie (Female)": "Chelsie",
+    "Ethan (Male)": "Ethan"
+}
+def _build_conversation(video, text):
+    """Return the chat messages (system prompt + one user turn) for a request."""
+    # The user content must be a list of typed parts; a bare
+    # {"text": ..., "video": ...} dict is not the message format that
+    # process_mm_info() and the chat template expect, so the video was
+    # never picked up.
+    parts = []
+    if video is not None:
+        parts.append({"type": "video", "video": video})
+    if text:
+        parts.append({"type": "text", "text": text})
+    return [SYSTEM_PROMPT, {"role": "user", "content": parts}]
+
+
+def _to_model_device(inputs):
+    """Move processor outputs to the model device, casting only float tensors."""
+    moved = {}
+    for key, value in inputs.items():
+        if not isinstance(value, torch.Tensor):
+            moved[key] = value
+        elif value.is_floating_point():
+            moved[key] = value.to(device=model.device, dtype=model.dtype)
+        else:
+            # Token ids, attention masks and similar integer tensors must keep
+            # their integer dtype; casting them to bfloat16/float16 breaks
+            # the embedding lookup.
+            moved[key] = value.to(device=model.device)
+    return moved
+
+
+def _save_audio(audio):
+    """Write the generated waveform to a temporary 24 kHz WAV file; return its path."""
+    # Reserve the file name, then write after the handle is closed so the
+    # file is not open twice.
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+        audio_path = tmp_file.name
+    # .float() guards against half-precision output, which numpy cannot hold
+    # when it is bfloat16.
+    samples = audio.reshape(-1).detach().float().cpu().numpy()
+    sf.write(audio_path, samples, samplerate=24000)
+    return audio_path
+
+
+@spaces.GPU(duration=120)
+def process_input(video, text, voice_type, enable_audio_output):
+    """Answer a prompt about an optional video, optionally with speech.
+
+    Args:
+        video: Path of the uploaded video, or None.
+        text: The user's prompt (may be empty when a video is supplied).
+        voice_type: A key of VOICE_OPTIONS; unknown values fall back to "Chelsie".
+        enable_audio_output: When true, try to synthesize speech as well.
+
+    Returns:
+        A ``(text_response, audio_path)`` tuple. ``audio_path`` is None when
+        audio is disabled or could not be produced. Errors are reported as
+        a short message in ``text_response``; nothing is raised.
+    """
+    try:
+        if video is None and not text:
+            return "Please provide a video or a prompt.", None
+
+        # Clear GPU memory before processing
+        if torch.cuda.is_available():
+            gc.collect()
+            torch.cuda.empty_cache()
+
+        conversation = _build_conversation(video, text)
+
+        # Process multimedia information
+        try:
+            audios, images, videos = process_mm_info(conversation, use_audio_in_video=False)
+        except Exception as e:
+            # Continuing without the video would make the model answer about
+            # media it never saw, so report the failure instead.
+            print(f"Error processing multimedia: {str(e)}")
+            return "Error processing input.", None
+
+        inputs = processor(
+            text=processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False),
+            videos=videos,
+            return_tensors="pt",
+            padding=True
+        )
+        inputs = _to_model_device(inputs)
+        prompt_ids = inputs["input_ids"]
+
+        # Sampling settings shared by the audio and text-only paths.
+        generation_kwargs = {
+            "use_audio_in_video": False,
+            "max_new_tokens": 512,
+            "do_sample": True,
+            "temperature": 0.7,
+            "top_p": 0.9,
+        }
+
+        text_ids = None
+        audio_path = None
+        if enable_audio_output:
+            voice_type_value = VOICE_OPTIONS.get(voice_type, "Chelsie")
+            try:
+                generation_output = model.generate(
+                    **inputs,
+                    return_audio=True,
+                    spk=voice_type_value,
+                    streamer=TextStreamer(processor, skip_prompt=True),
+                    **generation_kwargs
+                )
+                if isinstance(generation_output, tuple) and len(generation_output) == 2:
+                    text_ids, audio = generation_output
+                    if audio is not None:
+                        audio_path = _save_audio(audio)
+                elif isinstance(generation_output, torch.Tensor):
+                    # Text came back without audio; keep it rather than
+                    # generating a second time.
+                    text_ids = generation_output
+            except Exception as e:
+                print(f"Error during audio generation: {str(e)}")
+
+        # Fall back to text-only generation if audio is disabled or failed
+        if text_ids is None:
+            try:
+                text_ids = model.generate(
+                    **inputs,
+                    return_audio=False,
+                    streamer=TextStreamer(processor, skip_prompt=True),
+                    **generation_kwargs
+                )
+            except Exception as e:
+                print(f"Error during fallback text generation: {str(e)}")
+
+        if text_ids is None:
+            return "Error generating response.", audio_path
+
+        # The returned sequences normally start with the prompt tokens; strip
+        # them so only the assistant's reply is shown. The prefix check keeps
+        # the full output if the model returned new tokens only.
+        prompt_len = prompt_ids.shape[1]
+        if text_ids.shape[1] >= prompt_len and torch.equal(text_ids[:, :prompt_len], prompt_ids):
+            text_ids = text_ids[:, prompt_len:]
+
+        text_response = processor.batch_decode(text_ids, skip_special_tokens=True)[0]
+        return text_response.strip(), audio_path
+    except Exception as e:
+        print(f"Error in process_input: {str(e)}")
+        return "Error processing input.", None
+# Gradio interface setup
 with gr.Blocks() as demo:
     gr.Markdown("## Qwen2.5-Omni-7B Multimodal Demo")
     with gr.Row():
+        video_input = gr.Video(label="Upload Video (max 120s)", sources=["upload"], max_length=120)
         prompt_input = gr.Textbox(label="Analysis Prompt", placeholder="Describe or ask about the video...")
+    voice_selection = gr.Dropdown(label="Voice Type", choices=list(VOICE_OPTIONS.keys()), value="Chelsie (Female)")
+    enable_audio_checkbox = gr.Checkbox(label="Enable Audio Output", value=True)
     submit_btn = gr.Button("Analyze", variant="primary")
     with gr.Column():
         audio_output = gr.Audio(label="Speech Response", autoplay=True)
     submit_btn.click(
+        process_input,
+        inputs=[video_input, prompt_input, voice_selection, enable_audio_checkbox],
         outputs=[text_output, audio_output]
     )

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 torch>=2.3.0
-transformers>=4.41.0
-gradio>=4.26.0
-soundfile>=0.12.1

 torch>=2.3.0
+git+https://github.com/huggingface/transformers@f742a644ca32e65758c3adb36225aef1731bd2a8
+accelerate>=0.30.0
+qwen-omni-utils[decord]>=1.0.0  # For multimedia processing
+soundfile>=0.12.1  # Audio support