Spaces: Runtime error

v3

- app.py +42 -26
- requirements.txt +4 -3
app.py
CHANGED

@@ -1,49 +1,67 @@
 import gradio as gr
+from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
+from qwen_omni_utils import process_mm_info
+import torch
+import soundfile as sf
 import tempfile

-# Initialize Qwen2.5-Omni-7B
+# Initialize Qwen2.5-Omni-7B with multimodal support
+model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
+    "Qwen/Qwen2.5-Omni-7B",
+    torch_dtype=torch.float16,
+    device_map="auto"
 )
+processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")

+def analyze_media(video_path, prompt, request: gr.Request):
+    # ZeroGPU rate limiting headers
     headers = {"X-IP-Token": request.headers.get('x-ip-token', '')}

+    # Build a chat-style conversation; TMRoPE aligns the audio and video
+    # tracks inside the model, so the clip is passed as a single video entry
+    conversation = [
+        {"role": "user", "content": [
+            {"type": "video", "video": video_path},
+            {"type": "text", "text": prompt},
+        ]},
+    ]
+    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+    audios, images, videos = process_mm_info(conversation, use_audio_in_video=True)
+    inputs = processor(text=text, audio=audios, images=images, videos=videos,
+                       return_tensors="pt", padding=True, use_audio_in_video=True)
+    inputs = inputs.to(model.device).to(model.dtype)

+    # Generate the text analysis and a spoken response
+    text_ids, audio = model.generate(**inputs, use_audio_in_video=True, return_audio=True)
+    answer = processor.batch_decode(text_ids, skip_special_tokens=True)[0]

+    # Save speech output to a temporary file (the Talker emits 24 kHz audio)
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+        sf.write(f.name, audio.reshape(-1).detach().cpu().numpy(), samplerate=24000)
+    return answer, f.name

 with gr.Blocks() as demo:
+    gr.Markdown("## Qwen2.5-Omni-7B Multimodal Demo")

     with gr.Row():
+        media_input = gr.Video(
+            label="Upload Video (max 120s)",
+            sources=["upload"],
+            max_length=120
+        )
+        prompt_input = gr.Textbox(label="Analysis Prompt", placeholder="Describe or ask about the video...")

-        submit_btn = gr.Button("Analyze")
+        submit_btn = gr.Button("Analyze", variant="primary")

     with gr.Column():
-        text_output = gr.Textbox(label="Analysis Results")
+        text_output = gr.Textbox(label="Analysis Results", interactive=False)
+        audio_output = gr.Audio(label="Speech Response", autoplay=True)

     submit_btn.click(
+        analyze_media,
+        inputs=[media_input, prompt_input],
         outputs=[text_output, audio_output]
     )

-demo.queue(default_concurrency_limit=5)
+demo.queue(default_concurrency_limit=2)
 demo.launch(server_name="0.0.0.0", server_port=7860)
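The X-IP-Token handling above follows the ZeroGPU pattern, but on ZeroGPU hardware the GPU-bound work must also run inside a function decorated with spaces.GPU; touching CUDA outside such a function is a common cause of the "Runtime error" status shown above. A minimal sketch, assuming this Space runs on ZeroGPU (the hardware tier is not visible in this commit):

import spaces  # preinstalled on ZeroGPU Spaces

@spaces.GPU(duration=120)  # reserve a GPU for up to 120 s per call
def analyze_media(video_path, prompt, request: gr.Request):
    # ... body unchanged from app.py above ...
    ...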
requirements.txt
CHANGED

@@ -1,3 +1,6 @@
+torch>=2.3.0
+transformers>=4.52.0  # Qwen2.5-Omni classes are not in 4.41; they landed in later releases
+accelerate>=0.30.0    # required for device_map="auto"
+qwen-omni-utils       # provides process_mm_info for video/audio loading
+gradio>=4.26.0
+soundfile>=0.12.1
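Once the Space builds, the endpoint can be exercised from Python. A sketch using gradio_client, assuming a recent client version with handle_file and a placeholder Space id (the real owner/name is not part of this commit):

from gradio_client import Client, handle_file

client = Client("owner/qwen2.5-omni-demo")    # placeholder Space id
text, audio_path = client.predict(
    handle_file("sample.mp4"),                # video input (max 120 s)
    "Summarize what happens in this clip.",   # analysis prompt
    api_name="/analyze_media",                # route derived from the handler name
)
print(text)        # text analysis
print(audio_path)  # local path to the returned speech .wav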