fdaudens committed (verified)
Commit 4844f57 · Parent(s): 525d412

Update app.py

Files changed (1)
  1. app.py +27 -28
app.py CHANGED
@@ -115,10 +115,10 @@ def audio_to_base64(data, rate=16000):
 def process_audio_rag(audio_file_path, query, chunk_length=30, use_openai=False, openai_key=None):
     """Main processing function"""
     if not audio_file_path:
-        return "Please upload an audio file", None, None
+        return "Please upload an audio file", None
 
     if not query:
-        return "Please enter a search query", None, None
+        return "Please enter a search query", None
 
     try:
         # Chunk audio
@@ -132,7 +132,8 @@ def process_audio_rag(audio_file_path, query, chunk_length=30, use_openai=False,
 
         # Prepare results
         result_text = f"Found {len(top_indices)} relevant audio chunks:\n"
-        result_text += f"Chunk indices: {top_indices}\n\n"
+        result_text += f"Chunk indices: {top_indices}\n"
+        result_text += f"Total chunks in audio: {len(audios)}\n\n"
 
         # Save first result as audio file
         first_chunk_path = "result_chunk.wav"
@@ -140,6 +141,7 @@ def process_audio_rag(audio_file_path, query, chunk_length=30, use_openai=False,
 
         # Optional: Use OpenAI for answer generation
         if use_openai and openai_key:
+            result_text += "Generating textual answer from retrieved audio chunks...\n\n"
             from openai import OpenAI
             client = OpenAI(api_key=openai_key)
 
@@ -162,28 +164,26 @@ def process_audio_rag(audio_file_path, query, chunk_length=30, use_openai=False,
                     model="gpt-4o-audio-preview",
                     messages=[{"role": "user", "content": content}]
                 )
-                result_text += f"\nOpenAI Answer: {completion.choices[0].message.content}"
+                result_text += f"OpenAI Answer: {completion.choices[0].message.content}"
             except Exception as e:
-                result_text += f"\nOpenAI Error: {str(e)}"
+                result_text += f"OpenAI Error: {str(e)}"
 
-        # Create audio visualization
-        import matplotlib.pyplot as plt
-        fig, ax = plt.subplots(figsize=(10, 4))
-        ax.plot(audios[top_indices[0]])
-        ax.set_title(f"Waveform of top matching chunk (#{top_indices[0]})")
-        ax.set_xlabel("Samples")
-        ax.set_ylabel("Amplitude")
-        plt.tight_layout()
-
-        return result_text, first_chunk_path, fig
+        return result_text, first_chunk_path
 
     except Exception as e:
-        return f"Error: {str(e)}", None, None
+        return f"Error: {str(e)}", None
 
 # Create Gradio interface
 with gr.Blocks(title="AudioRAG Demo") as demo:
     gr.Markdown("# AudioRAG Demo - Semantic Audio Search")
-    gr.Markdown("Upload an audio file and search through it using natural language queries!")
+    gr.Markdown("""
+    This demo builds on the work from the ColQwen team, expanding retrieval capabilities beyond images to include audio and video. Inspired by the Qwen-Omni series, ColQwen-Omni (3B) pushes the boundaries of multimodal search — embedding and retrieving almost any type of content.
+
+    **What’s new?**
+    Unlike traditional methods, this model searches directly through raw audio without converting it to text. It understands semantic meaning in sound, speech, and audio patterns — making "AudioRAG" a real possibility.
+
+    📖 [Blog post](https://huggingface.co/blog/manu/colqwen-omni-omnimodal-retrieval) | 🤗 [Model on Hugging Face](https://huggingface.co/vidore/colqwen-omni-v0.1)
+    """)
 
     with gr.Row():
         with gr.Column():
@@ -191,8 +191,9 @@ with gr.Blocks(title="AudioRAG Demo") as demo:
             query_input = gr.Textbox(label="Search Query", placeholder="What are you looking for in the audio?")
             chunk_length = gr.Slider(minimum=10, maximum=60, value=30, step=5, label="Chunk Length (seconds)")
 
-            with gr.Accordion("OpenAI Integration (Optional)", open=False):
-                use_openai = gr.Checkbox(label="Use OpenAI for answer generation")
+            with gr.Accordion("API key for textual answer (Optional)", open=False):
+                gr.Markdown("Generate a textual answer based on the retrieved audio chunks with an OpenAI api key")
+                use_openai = gr.Checkbox(label="Generate textual answer from retrieved audio")
                 openai_key = gr.Textbox(label="OpenAI API Key", type="password")
 
             search_btn = gr.Button("Search Audio", variant="primary")
@@ -200,21 +201,19 @@ with gr.Blocks(title="AudioRAG Demo") as demo:
         with gr.Column():
             output_text = gr.Textbox(label="Results", lines=10)
             output_audio = gr.Audio(label="Top Matching Audio Chunk", type="filepath")
-            output_plot = gr.Plot(label="Audio Waveform")
-
-    search_btn.click(
-        fn=process_audio_rag,
-        inputs=[audio_input, query_input, chunk_length, use_openai, openai_key],
-        outputs=[output_text, output_audio, output_plot]
-    )
 
     gr.Examples(
         examples=[
-            ["example_audio.wav", "Was Hannibal well liked by his men?", 30],
-            ["podcast.mp3", "What did they say about climate change?", 20],
+            ["test.m4a", "Who's the guest of the podcast?", 426],
        ],
         inputs=[audio_input, query_input, chunk_length]
     )
+
+    search_btn.click(
+        fn=process_audio_rag,
+        inputs=[audio_input, query_input, chunk_length, use_openai, openai_key],
+        outputs=[output_text, output_audio]
+    )
 
 if __name__ == "__main__":
     # Load model on startup
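
Note on the retrieval step: the hunks above reference "# Chunk audio", `audios`, and `top_indices`, but the chunking and ColQwen-Omni scoring code sits outside this diff. The sketch below shows one way such a chunk-then-retrieve flow could look; `chunk_audio`, `rank_chunks`, `embed_audio`, and `embed_text` are hypothetical names, and the real app uses ColQwen-Omni's multi-vector (late-interaction) scoring rather than the single-vector dot product used here.

# Hypothetical sketch (not the Space's actual code): split audio into fixed-length
# 16 kHz chunks, score each chunk against the query, and keep the best-matching indices.
import numpy as np
from pydub import AudioSegment

def chunk_audio(path, chunk_length_s=30, rate=16000):
    """Split an audio file into mono float32 chunks of chunk_length_s seconds."""
    audio = AudioSegment.from_file(path).set_channels(1).set_frame_rate(rate).set_sample_width(2)
    step_ms = chunk_length_s * 1000
    chunks = []
    for start_ms in range(0, len(audio), step_ms):
        segment = audio[start_ms:start_ms + step_ms]
        # Convert 16-bit samples to floats in [-1, 1]
        samples = np.array(segment.get_array_of_samples(), dtype=np.float32) / 32768.0
        chunks.append(samples)
    return chunks

def rank_chunks(chunks, query, embed_audio, embed_text, k=3):
    """Placeholder ranking: embed_audio/embed_text stand in for the ColQwen-Omni encoders."""
    query_emb = embed_text(query)
    scores = [float(np.dot(embed_audio(chunk), query_emb)) for chunk in chunks]
    return sorted(range(len(chunks)), key=lambda i: scores[i], reverse=True)[:k]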
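
Similarly, the `content` passed to gpt-4o-audio-preview is assembled outside the changed lines. Assuming the `audio_to_base64` helper from the hunk header returns base64-encoded WAV data, the message content could be built with OpenAI's documented `input_audio` content parts roughly as follows (a sketch, not the file's implementation):

# Rough sketch of building the chat `content` from retrieved chunks; the actual
# app.py code is not part of this diff. `retrieved_chunks` is assumed to hold the
# float arrays for the top-ranked indices, sampled at 16 kHz.
def build_content(query, retrieved_chunks, rate=16000):
    content = [{"type": "text",
                "text": f"Listen to these audio excerpts and answer: {query}"}]
    for chunk in retrieved_chunks:
        content.append({
            "type": "input_audio",
            "input_audio": {"data": audio_to_base64(chunk, rate), "format": "wav"},
        })
    return content

A `content` list built this way would then feed the `client.chat.completions.create(model="gpt-4o-audio-preview", ...)` call shown in the hunk starting at line 162.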