Update app.py

app.py CHANGED
@@ -115,10 +115,10 @@ def audio_to_base64(data, rate=16000):
 def process_audio_rag(audio_file_path, query, chunk_length=30, use_openai=False, openai_key=None):
     """Main processing function"""
     if not audio_file_path:
-        return "Please upload an audio file", None
+        return "Please upload an audio file", None
 
     if not query:
-        return "Please enter a search query", None
+        return "Please enter a search query", None
 
     try:
         # Chunk audio
@@ -132,7 +132,8 @@ def process_audio_rag(audio_file_path, query, chunk_length=30, use_openai=False,
 
         # Prepare results
         result_text = f"Found {len(top_indices)} relevant audio chunks:\n"
-        result_text += f"Chunk indices: {top_indices}\n
+        result_text += f"Chunk indices: {top_indices}\n"
+        result_text += f"Total chunks in audio: {len(audios)}\n\n"
 
         # Save first result as audio file
         first_chunk_path = "result_chunk.wav"
@@ -140,6 +141,7 @@ def process_audio_rag(audio_file_path, query, chunk_length=30, use_openai=False,
 
         # Optional: Use OpenAI for answer generation
         if use_openai and openai_key:
+            result_text += "Generating textual answer from retrieved audio chunks...\n\n"
             from openai import OpenAI
             client = OpenAI(api_key=openai_key)
 
@@ -162,28 +164,26 @@ def process_audio_rag(audio_file_path, query, chunk_length=30, use_openai=False,
                     model="gpt-4o-audio-preview",
                     messages=[{"role": "user", "content": content}]
                 )
-                result_text += f"
+                result_text += f"OpenAI Answer: {completion.choices[0].message.content}"
             except Exception as e:
-                result_text += f"
+                result_text += f"OpenAI Error: {str(e)}"
 
-
-        import matplotlib.pyplot as plt
-        fig, ax = plt.subplots(figsize=(10, 4))
-        ax.plot(audios[top_indices[0]])
-        ax.set_title(f"Waveform of top matching chunk (#{top_indices[0]})")
-        ax.set_xlabel("Samples")
-        ax.set_ylabel("Amplitude")
-        plt.tight_layout()
-
-        return result_text, first_chunk_path, fig
+        return result_text, first_chunk_path
 
     except Exception as e:
-        return f"Error: {str(e)}", None
+        return f"Error: {str(e)}", None
 
 # Create Gradio interface
 with gr.Blocks(title="AudioRAG Demo") as demo:
     gr.Markdown("# AudioRAG Demo - Semantic Audio Search")
-
+    gr.Markdown("""
+    This demo builds on the work from the ColQwen team, expanding retrieval capabilities beyond images to include audio and video. Inspired by the Qwen-Omni series, ColQwen-Omni (3B) pushes the boundaries of multimodal search — embedding and retrieving almost any type of content.
+
+    **What’s new?**
+    Unlike traditional methods, this model searches directly through raw audio without converting it to text. It understands semantic meaning in sound, speech, and audio patterns — making "AudioRAG" a real possibility.
+
+    📖 [Blog post](https://huggingface.co/blog/manu/colqwen-omni-omnimodal-retrieval) | 🤗 [Model on Hugging Face](https://huggingface.co/vidore/colqwen-omni-v0.1)
+    """)
 
     with gr.Row():
         with gr.Column():
@@ -191,8 +191,9 @@ with gr.Blocks(title="AudioRAG Demo") as demo:
             query_input = gr.Textbox(label="Search Query", placeholder="What are you looking for in the audio?")
             chunk_length = gr.Slider(minimum=10, maximum=60, value=30, step=5, label="Chunk Length (seconds)")
 
-            with gr.Accordion("
-
+            with gr.Accordion("API key for textual answer (Optional)", open=False):
+                gr.Markdown("Generate a textual answer based on the retrieved audio chunks with an OpenAI api key")
+                use_openai = gr.Checkbox(label="Generate textual answer from retrieved audio")
             openai_key = gr.Textbox(label="OpenAI API Key", type="password")
 
             search_btn = gr.Button("Search Audio", variant="primary")
@@ -200,21 +201,19 @@ with gr.Blocks(title="AudioRAG Demo") as demo:
         with gr.Column():
             output_text = gr.Textbox(label="Results", lines=10)
             output_audio = gr.Audio(label="Top Matching Audio Chunk", type="filepath")
-            output_plot = gr.Plot(label="Audio Waveform")
-
-    search_btn.click(
-        fn=process_audio_rag,
-        inputs=[audio_input, query_input, chunk_length, use_openai, openai_key],
-        outputs=[output_text, output_audio, output_plot]
-    )
 
     gr.Examples(
         examples=[
-            ["
-            ["podcast.mp3", "What did they say about climate change?", 20],
+            ["test.m4a", "Who's the guest of the podcast?", 426],
         ],
         inputs=[audio_input, query_input, chunk_length]
     )
+
+    search_btn.click(
+        fn=process_audio_rag,
+        inputs=[audio_input, query_input, chunk_length, use_openai, openai_key],
+        outputs=[output_text, output_audio]
+    )
 
 if __name__ == "__main__":
     # Load model on startup
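The "# Chunk audio" step referenced in the first hunk is not part of this diff. A minimal sketch of fixed-length chunking at the 16 kHz rate used by audio_to_base64(data, rate=16000) could look like the following; librosa and the chunk_audio helper name are assumptions for illustration, not necessarily what app.py uses:

import librosa

def chunk_audio(audio_file_path, chunk_length=30, rate=16000):
    """Split an audio file into fixed-length chunks (illustrative sketch)."""
    # librosa resamples to the target rate and returns a mono float waveform
    waveform, _ = librosa.load(audio_file_path, sr=rate, mono=True)
    samples_per_chunk = chunk_length * rate
    # Consecutive chunk_length-second slices of the waveform
    return [waveform[i:i + samples_per_chunk]
            for i in range(0, len(waveform), samples_per_chunk)]

Each element of the returned list is one chunk_length-second waveform, matching the "Chunk Length (seconds)" slider in the UI.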
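top_indices is produced by scoring each chunk against the query. ColQwen-style retrievers emit multi-vector embeddings that are compared with late interaction (MaxSim). Below is a self-contained sketch of that scoring, assuming a query embedding of shape (query_tokens, dim) and one (audio_tokens, dim) tensor per chunk; the function and variable names are assumptions, not the Space's actual code:

import torch

def rank_chunks(query_emb, chunk_embs, top_k=3):
    """Late-interaction (MaxSim) scoring, ColBERT/ColPali style."""
    scores = []
    for chunk_emb in chunk_embs:
        # Similarity of every query token to every audio token,
        # keep the best-matching audio token per query token, then sum
        sim = query_emb @ chunk_emb.T          # (query_tokens, audio_tokens)
        scores.append(sim.max(dim=1).values.sum().item())
    scores = torch.tensor(scores)
    top_indices = scores.topk(min(top_k, len(scores))).indices.tolist()
    return top_indices, scores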
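The optional answer step sends the retrieved chunks to gpt-4o-audio-preview as base64 audio together with the query; the diff only shows the client.chat.completions.create(...) call and references a content variable built from audio_to_base64 output. A hedged sketch of how that payload is commonly assembled, assuming WAV-encoded chunks (the answer_from_chunks name and prompt wording are illustrative):

from openai import OpenAI

def answer_from_chunks(query, chunk_b64_list, api_key):
    """Ask gpt-4o-audio-preview to answer a question grounded in retrieved audio."""
    client = OpenAI(api_key=api_key)
    # One text part for the question, plus one input_audio part per retrieved chunk
    content = [{"type": "text", "text": f"Answer the question based on the audio: {query}"}]
    for b64 in chunk_b64_list:
        content.append({
            "type": "input_audio",
            "input_audio": {"data": b64, "format": "wav"},
        })
    completion = client.chat.completions.create(
        model="gpt-4o-audio-preview",
        messages=[{"role": "user", "content": content}],
    )
    return completion.choices[0].message.content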