Spaces:

tdurzynski
/

real-time-speech-translation

Running

App Files Files Community

tdurzynski committed on Feb 7

Commit

5e70a25

verified ·

1 Parent(s): 951b505

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -33

app.py CHANGED Viewed

@@ -4,12 +4,12 @@ Speech Translation Demo with Automatic TTS, Restart Option, and About Tab
 This demo performs the following:
   1. Accepts up to 15 seconds of audio recording from the microphone.
   2. Uses OpenAI’s Whisper model to transcribe the speech.
-  3. Splits the transcription into segments and translates each segment
-     on-the-fly using Facebook’s M2M100 model.
   4. Streams the cumulative translation output to the user.
   5. Automatically converts the final translated text to speech using gTTS.
   6. Provides a "Restart Recording" button (located just below the recording section)
      to reset the audio input, translated text, and TTS output.
 Note: True real-time translation (i.e. while speaking) requires a continuous streaming
 solution which is not provided by the standard browser microphone input.
 """
@@ -24,10 +24,8 @@ import uuid
 # -----------------------------------------------------------------------------
 # Global Model Loading
 # -----------------------------------------------------------------------------
-# Load the Whisper model (using "base" for a balance between speed and accuracy).
-whisper_model = whisper.load_model("base")  # Adjust model size as needed
-# Load the M2M100 model and tokenizer for translation.
 tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
 m2m100_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
@@ -44,50 +42,48 @@ LANGUAGES = {
 }
 # -----------------------------------------------------------------------------
-# Main Processing Function: Translation (streaming)
 # -----------------------------------------------------------------------------
 def translate_audio(audio, target_language):
     """
-    Process the input audio, transcribe it using Whisper, and translate each segment
-    to the chosen target language. Yields cumulative translation output for streaming.
     """
     if audio is None:
-        yield "No audio provided."
-        return
-    # Transcribe the audio using Whisper (fp16=False for CPU compatibility)
     result = whisper_model.transcribe(audio, fp16=False)
     source_lang = result.get("language", "en")
     target_lang_code = LANGUAGES.get(target_language, "en")
     cumulative_translation = ""
     for segment in result.get("segments", []):
         segment_text = segment.get("text", "").strip()
         if not segment_text:
             continue
         if source_lang == target_lang_code:
             translated_segment = segment_text
         else:
-            # Set the source language for proper translation.
-            tokenizer.src_lang = source_lang
             encoded = tokenizer(segment_text, return_tensors="pt")
             generated_tokens = m2m100_model.generate(
                 **encoded,
                 forced_bos_token_id=tokenizer.get_lang_id(target_lang_code)
             )
             translated_segment = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
         cumulative_translation += translated_segment + " "
-        yield cumulative_translation.strip()
 # -----------------------------------------------------------------------------
 # TTS Generation Function
 # -----------------------------------------------------------------------------
 def generate_tts(text, target_language):
     """
-    Convert the translated text to speech using gTTS.
-    Returns the filename of the generated audio file.
     """
     lang_code = LANGUAGES.get(target_language, "en")
     if not text or not text.strip():
@@ -102,8 +98,7 @@ def generate_tts(text, target_language):
 # -----------------------------------------------------------------------------
 def restart_recording():
     """
-    Reset the recording section by clearing the audio input, the translation textbox,
-    and the TTS audio output.
     """
     return None, "", None
@@ -112,7 +107,7 @@ def restart_recording():
 # -----------------------------------------------------------------------------
 with gr.Blocks() as demo:
     with gr.Tabs():
-        # "Demo" Tab: Contains the interactive interface.
         with gr.TabItem("Demo"):
             gr.Markdown("# Real-time Speech Translation Demo")
             gr.Markdown(
@@ -121,7 +116,7 @@ with gr.Blocks() as demo:
                 "**Note:** The translation and speech synthesis occur automatically after recording."
             )
-            # Row for audio input and target language selection.
             with gr.Row():
                 audio_input = gr.Audio(
                     sources=["microphone"],
@@ -135,7 +130,7 @@ with gr.Blocks() as demo:
                     label="Select Target Language"
                 )
-            # Row for the Restart Recording button (placed just below the recording section).
             with gr.Row():
                 restart_button = gr.Button("Restart Recording")
@@ -143,28 +138,25 @@ with gr.Blocks() as demo:
             output_text = gr.Textbox(label="Translated Text", lines=10)
             tts_audio = gr.Audio(label="Translated Speech", type="filepath")
-            # Chain the events:
-            # 1. When new audio is recorded, stream the translation text.
-            # 2. Once translation is complete, automatically generate the TTS audio.
             audio_input.change(
                 fn=translate_audio,
                 inputs=[audio_input, target_lang_dropdown],
-                outputs=output_text,
-                stream=True
             ).then(
                 fn=generate_tts,
                 inputs=[output_text, target_lang_dropdown],
                 outputs=tts_audio
             )
-            # The Restart button clears the audio input, translation text, and TTS audio.
             restart_button.click(
                 fn=restart_recording,
                 inputs=[],
                 outputs=[audio_input, output_text, tts_audio]
             )
-        # "About" Tab: Displays the descriptive text.
         with gr.TabItem("About"):
             gr.Markdown(
                 """
@@ -182,6 +174,6 @@ This demo performs the following:
                 """
             )
-# Launch the Gradio app (suitable for Hugging Face Spaces).
 demo.launch()

 This demo performs the following:
   1. Accepts up to 15 seconds of audio recording from the microphone.
   2. Uses OpenAI’s Whisper model to transcribe the speech.
+  3. Splits the transcription into segments and translates each segment on-the-fly using Facebook’s M2M100 model.
   4. Displays the complete translated text to the user once translation finishes.
   5. Automatically converts the final translated text to speech using gTTS.
   6. Provides a "Restart Recording" button (located just below the recording section)
      to reset the audio input, translated text, and TTS output.
 Note: True real-time translation (i.e. while speaking) requires a continuous streaming
 solution which is not provided by the standard browser microphone input.
 """
 # -----------------------------------------------------------------------------
 # Global Model Loading
 # -----------------------------------------------------------------------------
+whisper_model = whisper.load_model("base")  # Using "base" for a balance between speed and accuracy
 tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
 m2m100_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
 }
 # -----------------------------------------------------------------------------
+# Main Processing Function: Translation
 # -----------------------------------------------------------------------------
 def translate_audio(audio, target_language):
     """
     Transcribe the input audio with Whisper and translate it segment by segment.

     Args:
         audio: Recording handed to ``whisper_model.transcribe``; ``None`` means
             nothing was recorded.
         target_language: Key of ``LANGUAGES``; an unknown key falls back to "en".

     Returns:
         The translated segments joined by single spaces, the plain transcript
         when Whisper's detected language already equals the target code, or
         "No audio provided." when ``audio`` is ``None``.
     """
     if audio is None:
         return "No audio provided."
     # Transcribe the audio (using fp16=False for CPU compatibility)
     result = whisper_model.transcribe(audio, fp16=False)
     source_lang = result.get("language", "en")
     target_lang_code = LANGUAGES.get(target_language, "en")
     # Drop segments that are empty once surrounding whitespace is removed.
     stripped = (segment.get("text", "").strip() for segment in result.get("segments", []))
     segment_texts = [text for text in stripped if text]
     if not segment_texts or source_lang == target_lang_code:
         # Nothing to translate: hand back the transcript without touching the tokenizer.
         return " ".join(segment_texts)
     # Source language and target token id are the same for every segment,
     # so configure them once instead of on each loop iteration.
     # NOTE(review): presumably raises if Whisper reports a language code that
     # M2M100 does not know -- confirm how that case should be surfaced.
     tokenizer.src_lang = source_lang
     target_token_id = tokenizer.get_lang_id(target_lang_code)
     translated_segments = []
     for segment_text in segment_texts:
         encoded = tokenizer(segment_text, return_tensors="pt")
         generated_tokens = m2m100_model.generate(
             **encoded,
             forced_bos_token_id=target_token_id
         )
         translated_segments.append(
             tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
         )
     # strip() trims whitespace the model may emit at either end of the text.
     return " ".join(translated_segments).strip()
 # -----------------------------------------------------------------------------
 # TTS Generation Function
 # -----------------------------------------------------------------------------
 def generate_tts(text, target_language):
     """
+    Converts the given text to speech using gTTS and returns the filename of the generated audio.
     """
     lang_code = LANGUAGES.get(target_language, "en")
     if not text or not text.strip():
 # -----------------------------------------------------------------------------
 def restart_recording():
     """
     Produce the blank values that reset the recording section.

     Returns:
         A ``(None, "", None)`` tuple; the Restart button's click handler lists
         the audio input, the translated text box and the TTS audio as outputs,
         in that order.
     """
     cleared_audio, cleared_text, cleared_speech = None, "", None
     return cleared_audio, cleared_text, cleared_speech
 # -----------------------------------------------------------------------------
 with gr.Blocks() as demo:
     with gr.Tabs():
+        # Demo Tab
         with gr.TabItem("Demo"):
             gr.Markdown("# Real-time Speech Translation Demo")
             gr.Markdown(
                 "**Note:** The translation and speech synthesis occur automatically after recording."
             )
+            # Row for audio input and language selection.
             with gr.Row():
                 audio_input = gr.Audio(
                     sources=["microphone"],
                     label="Select Target Language"
                 )
+            # Restart Recording button placed just below the recording section.
             with gr.Row():
                 restart_button = gr.Button("Restart Recording")
             output_text = gr.Textbox(label="Translated Text", lines=10)
             tts_audio = gr.Audio(label="Translated Speech", type="filepath")
+            # When audio is recorded, process translation and then generate TTS.
             audio_input.change(
                 fn=translate_audio,
                 inputs=[audio_input, target_lang_dropdown],
+                outputs=output_text
             ).then(
                 fn=generate_tts,
                 inputs=[output_text, target_lang_dropdown],
                 outputs=tts_audio
             )
+            # Restart button clears all outputs.
             restart_button.click(
                 fn=restart_recording,
                 inputs=[],
                 outputs=[audio_input, output_text, tts_audio]
             )
+        # About Tab
         with gr.TabItem("About"):
             gr.Markdown(
                 """
                 """
             )
+# Launch the Gradio app.
 demo.launch()