Update app.py
Browse files
    	
        app.py
    CHANGED
    
    | @@ -4,12 +4,12 @@ Speech Translation Demo with Automatic TTS, Restart Option, and About Tab | |
| 4 | 
             
            This demo performs the following:
         | 
| 5 | 
             
              1. Accepts up to 15 seconds of audio recording from the microphone.
         | 
| 6 | 
             
              2. Uses OpenAI’s Whisper model to transcribe the speech.
         | 
| 7 | 
            -
              3. Splits the transcription into segments and translates each segment
         | 
| 8 | 
            -
                 on-the-fly using Facebook’s M2M100 model.
         | 
| 9 | 
             
              4. Streams the cumulative translation output to the user.
         | 
| 10 | 
             
              5. Automatically converts the final translated text to speech using gTTS.
         | 
| 11 | 
             
              6. Provides a "Restart Recording" button (located just below the recording section)
         | 
| 12 | 
             
                 to reset the audio input, translated text, and TTS output.
         | 
|  | |
| 13 | 
             
            Note: True real-time translation (i.e. while speaking) requires a continuous streaming
         | 
| 14 | 
             
            solution which is not provided by the standard browser microphone input.
         | 
| 15 | 
             
            """
         | 
| @@ -24,10 +24,8 @@ import uuid | |
| 24 | 
             
            # -----------------------------------------------------------------------------
         | 
| 25 | 
             
            # Global Model Loading
         | 
| 26 | 
             
            # -----------------------------------------------------------------------------
         | 
| 27 | 
            -
             | 
| 28 | 
            -
            whisper_model = whisper.load_model("base")  # Adjust model size as needed
         | 
| 29 |  | 
| 30 | 
            -
            # Load the M2M100 model and tokenizer for translation.
         | 
| 31 | 
             
            tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
         | 
| 32 | 
             
            m2m100_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
         | 
| 33 |  | 
| @@ -44,50 +42,48 @@ LANGUAGES = { | |
| 44 | 
             
            }
         | 
| 45 |  | 
| 46 | 
             
            # -----------------------------------------------------------------------------
         | 
| 47 | 
            -
            # Main Processing Function: Translation | 
| 48 | 
             
            # -----------------------------------------------------------------------------
         | 
| 49 | 
             
            def translate_audio(audio, target_language):
         | 
| 50 | 
             
                """
         | 
| 51 | 
            -
                 | 
| 52 | 
            -
                 | 
| 53 | 
             
                """
         | 
| 54 | 
             
                if audio is None:
         | 
| 55 | 
            -
                     | 
| 56 | 
            -
             | 
| 57 | 
            -
             | 
| 58 | 
            -
                # Transcribe the audio using Whisper (fp16=False for CPU compatibility)
         | 
| 59 | 
             
                result = whisper_model.transcribe(audio, fp16=False)
         | 
| 60 | 
             
                source_lang = result.get("language", "en")
         | 
| 61 | 
             
                target_lang_code = LANGUAGES.get(target_language, "en")
         | 
| 62 | 
            -
             | 
| 63 | 
             
                cumulative_translation = ""
         | 
| 64 | 
             
                for segment in result.get("segments", []):
         | 
| 65 | 
             
                    segment_text = segment.get("text", "").strip()
         | 
| 66 | 
             
                    if not segment_text:
         | 
| 67 | 
             
                        continue
         | 
| 68 | 
            -
             | 
| 69 | 
             
                    if source_lang == target_lang_code:
         | 
| 70 | 
             
                        translated_segment = segment_text
         | 
| 71 | 
             
                    else:
         | 
| 72 | 
            -
                        # Set  | 
| 73 | 
            -
                        tokenizer.src_lang = source_lang
         | 
| 74 | 
             
                        encoded = tokenizer(segment_text, return_tensors="pt")
         | 
| 75 | 
             
                        generated_tokens = m2m100_model.generate(
         | 
| 76 | 
             
                            **encoded,
         | 
| 77 | 
             
                            forced_bos_token_id=tokenizer.get_lang_id(target_lang_code)
         | 
| 78 | 
             
                        )
         | 
| 79 | 
             
                        translated_segment = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
         | 
| 80 | 
            -
             | 
| 81 | 
             
                    cumulative_translation += translated_segment + " "
         | 
| 82 | 
            -
             | 
|  | |
| 83 |  | 
| 84 | 
             
            # -----------------------------------------------------------------------------
         | 
| 85 | 
             
            # TTS Generation Function
         | 
| 86 | 
             
            # -----------------------------------------------------------------------------
         | 
| 87 | 
             
            def generate_tts(text, target_language):
         | 
| 88 | 
             
                """
         | 
| 89 | 
            -
                 | 
| 90 | 
            -
                Returns the filename of the generated audio file.
         | 
| 91 | 
             
                """
         | 
| 92 | 
             
                lang_code = LANGUAGES.get(target_language, "en")
         | 
| 93 | 
             
                if not text or not text.strip():
         | 
| @@ -102,8 +98,7 @@ def generate_tts(text, target_language): | |
| 102 | 
             
            # -----------------------------------------------------------------------------
         | 
| 103 | 
             
            def restart_recording():
         | 
| 104 | 
             
                """
         | 
| 105 | 
            -
                 | 
| 106 | 
            -
                and the TTS audio output.
         | 
| 107 | 
             
                """
         | 
| 108 | 
             
                return None, "", None
         | 
| 109 |  | 
| @@ -112,7 +107,7 @@ def restart_recording(): | |
| 112 | 
             
            # -----------------------------------------------------------------------------
         | 
| 113 | 
             
            with gr.Blocks() as demo:
         | 
| 114 | 
             
                with gr.Tabs():
         | 
| 115 | 
            -
                    #  | 
| 116 | 
             
                    with gr.TabItem("Demo"):
         | 
| 117 | 
             
                        gr.Markdown("# Real-time Speech Translation Demo")
         | 
| 118 | 
             
                        gr.Markdown(
         | 
| @@ -121,7 +116,7 @@ with gr.Blocks() as demo: | |
| 121 | 
             
                            "**Note:** The translation and speech synthesis occur automatically after recording."
         | 
| 122 | 
             
                        )
         | 
| 123 |  | 
| 124 | 
            -
                        # Row for audio input and  | 
| 125 | 
             
                        with gr.Row():
         | 
| 126 | 
             
                            audio_input = gr.Audio(
         | 
| 127 | 
             
                                sources=["microphone"],
         | 
| @@ -135,7 +130,7 @@ with gr.Blocks() as demo: | |
| 135 | 
             
                                label="Select Target Language"
         | 
| 136 | 
             
                            )
         | 
| 137 |  | 
| 138 | 
            -
                        #  | 
| 139 | 
             
                        with gr.Row():
         | 
| 140 | 
             
                            restart_button = gr.Button("Restart Recording")
         | 
| 141 |  | 
| @@ -143,28 +138,25 @@ with gr.Blocks() as demo: | |
| 143 | 
             
                        output_text = gr.Textbox(label="Translated Text", lines=10)
         | 
| 144 | 
             
                        tts_audio = gr.Audio(label="Translated Speech", type="filepath")
         | 
| 145 |  | 
| 146 | 
            -
                        #  | 
| 147 | 
            -
                        # 1. When new audio is recorded, stream the translation text.
         | 
| 148 | 
            -
                        # 2. Once translation is complete, automatically generate the TTS audio.
         | 
| 149 | 
             
                        audio_input.change(
         | 
| 150 | 
             
                            fn=translate_audio,
         | 
| 151 | 
             
                            inputs=[audio_input, target_lang_dropdown],
         | 
| 152 | 
            -
                            outputs=output_text | 
| 153 | 
            -
                            stream=True
         | 
| 154 | 
             
                        ).then(
         | 
| 155 | 
             
                            fn=generate_tts,
         | 
| 156 | 
             
                            inputs=[output_text, target_lang_dropdown],
         | 
| 157 | 
             
                            outputs=tts_audio
         | 
| 158 | 
             
                        )
         | 
| 159 |  | 
| 160 | 
            -
                        #  | 
| 161 | 
             
                        restart_button.click(
         | 
| 162 | 
             
                            fn=restart_recording,
         | 
| 163 | 
             
                            inputs=[],
         | 
| 164 | 
             
                            outputs=[audio_input, output_text, tts_audio]
         | 
| 165 | 
             
                        )
         | 
| 166 |  | 
| 167 | 
            -
                    #  | 
| 168 | 
             
                    with gr.TabItem("About"):
         | 
| 169 | 
             
                        gr.Markdown(
         | 
| 170 | 
             
                            """
         | 
| @@ -182,6 +174,6 @@ This demo performs the following: | |
| 182 | 
             
                            """
         | 
| 183 | 
             
                        )
         | 
| 184 |  | 
| 185 | 
            -
            # Launch the Gradio app | 
| 186 | 
             
            demo.launch()
         | 
| 187 |  | 
|  | |
| 4 | 
             
            This demo performs the following:
         | 
| 5 | 
             
              1. Accepts up to 15 seconds of audio recording from the microphone.
         | 
| 6 | 
             
              2. Uses OpenAI’s Whisper model to transcribe the speech.
         | 
| 7 | 
            +
              3. Splits the transcription into segments and translates each segment on-the-fly using Facebook’s M2M100 model.
         | 
|  | |
| 8 | 
             
              4. Streams the cumulative translation output to the user.
         | 
| 9 | 
             
              5. Automatically converts the final translated text to speech using gTTS.
         | 
| 10 | 
             
              6. Provides a "Restart Recording" button (located just below the recording section)
         | 
| 11 | 
             
                 to reset the audio input, translated text, and TTS output.
         | 
| 12 | 
            +
                 
         | 
| 13 | 
             
            Note: True real-time translation (i.e. while speaking) requires a continuous streaming
         | 
| 14 | 
             
            solution which is not provided by the standard browser microphone input.
         | 
| 15 | 
             
            """
         | 
|  | |
| 24 | 
             
            # -----------------------------------------------------------------------------
         | 
| 25 | 
             
            # Global Model Loading
         | 
| 26 | 
             
            # -----------------------------------------------------------------------------
         | 
| 27 | 
            +
            whisper_model = whisper.load_model("base")  # Using "base" for a balance between speed and accuracy
         | 
|  | |
| 28 |  | 
|  | |
| 29 | 
             
            tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
         | 
| 30 | 
             
            m2m100_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
         | 
| 31 |  | 
|  | |
| 42 | 
             
            }
         | 
| 43 |  | 
| 44 | 
             
            # -----------------------------------------------------------------------------
         | 
| 45 | 
            +
            # Main Processing Function: Translation
         | 
| 46 | 
             
            # -----------------------------------------------------------------------------
         | 
| 47 | 
             
            def translate_audio(audio, target_language):
                """
                Transcribe recorded audio with Whisper and translate it with M2M100.

                Args:
                    audio: Value supplied by the Gradio audio component; it is passed
                        unchanged to ``whisper_model.transcribe``. ``None`` when the
                        component is empty.
                    target_language: Key into ``LANGUAGES``. Unknown keys fall back
                        to the ``"en"`` language code.

                Returns:
                    The translated segments joined by single spaces. An empty string
                    is returned when there is no audio or no non-empty segment.
                """
                if audio is None:
                    # The restart handler writes None into the audio component, which
                    # fires this change handler again. Returning a placeholder message
                    # here would refill the just-cleared text box and hand that
                    # message to the TTS step, so return an empty string instead.
                    # NOTE(review): assumes generate_tts skips blank text - confirm.
                    return ""

                # Transcribe the audio (fp16=False for CPU compatibility).
                result = whisper_model.transcribe(audio, fp16=False)
                source_lang = result.get("language", "en")
                target_lang_code = LANGUAGES.get(target_language, "en")

                # Keep only segments that still contain text after trimming.
                stripped_texts = (
                    segment.get("text", "").strip()
                    for segment in result.get("segments", [])
                )
                segment_texts = [text for text in stripped_texts if text]
                if not segment_texts:
                    return ""

                # Nothing to translate when Whisper detected the target language.
                if source_lang == target_lang_code:
                    return " ".join(segment_texts)

                # Both values are identical for every segment, so set them up once
                # instead of on each loop iteration.
                tokenizer.src_lang = source_lang
                target_lang_id = tokenizer.get_lang_id(target_lang_code)

                translated_segments = []
                for segment_text in segment_texts:
                    encoded = tokenizer(segment_text, return_tensors="pt")
                    generated_tokens = m2m100_model.generate(
                        **encoded,
                        forced_bos_token_id=target_lang_id,
                    )
                    translated_segments.append(
                        tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
                    )

                return " ".join(translated_segments).strip()
         | 
| 80 |  | 
| 81 | 
             
            # -----------------------------------------------------------------------------
         | 
| 82 | 
             
            # TTS Generation Function
         | 
| 83 | 
             
            # -----------------------------------------------------------------------------
         | 
| 84 | 
             
            def generate_tts(text, target_language):
         | 
| 85 | 
             
                """
         | 
| 86 | 
            +
                Converts the given text to speech using gTTS and returns the filename of the generated audio.
         | 
|  | |
| 87 | 
             
                """
         | 
| 88 | 
             
                lang_code = LANGUAGES.get(target_language, "en")
         | 
| 89 | 
             
                if not text or not text.strip():
         | 
|  | |
| 98 | 
             
            # -----------------------------------------------------------------------------
         | 
| 99 | 
             
            def restart_recording():
                """
                Produce the reset values for the restart button's three outputs.

                Returns:
                    A 3-tuple ``(None, "", None)``: no audio input, empty translated
                    text, and no TTS audio.
                """
                cleared_audio = None
                cleared_text = ""
                cleared_tts = None
                return cleared_audio, cleared_text, cleared_tts
         | 
| 104 |  | 
|  | |
| 107 | 
             
            # -----------------------------------------------------------------------------
         | 
| 108 | 
             
            with gr.Blocks() as demo:
         | 
| 109 | 
             
                with gr.Tabs():
         | 
| 110 | 
            +
                    # Demo Tab
         | 
| 111 | 
             
                    with gr.TabItem("Demo"):
         | 
| 112 | 
             
                        gr.Markdown("# Real-time Speech Translation Demo")
         | 
| 113 | 
             
                        gr.Markdown(
         | 
|  | |
| 116 | 
             
                            "**Note:** The translation and speech synthesis occur automatically after recording."
         | 
| 117 | 
             
                        )
         | 
| 118 |  | 
| 119 | 
            +
                        # Row for audio input and language selection.
         | 
| 120 | 
             
                        with gr.Row():
         | 
| 121 | 
             
                            audio_input = gr.Audio(
         | 
| 122 | 
             
                                sources=["microphone"],
         | 
|  | |
| 130 | 
             
                                label="Select Target Language"
         | 
| 131 | 
             
                            )
         | 
| 132 |  | 
| 133 | 
            +
                        # Restart Recording button placed just below the recording section.
         | 
| 134 | 
             
                        with gr.Row():
         | 
| 135 | 
             
                            restart_button = gr.Button("Restart Recording")
         | 
| 136 |  | 
|  | |
| 138 | 
             
                        output_text = gr.Textbox(label="Translated Text", lines=10)
         | 
| 139 | 
             
                        tts_audio = gr.Audio(label="Translated Speech", type="filepath")
         | 
| 140 |  | 
| 141 | 
            +
                        # When audio is recorded, process translation and then generate TTS.
         | 
|  | |
|  | |
| 142 | 
             
                        audio_input.change(
         | 
| 143 | 
             
                            fn=translate_audio,
         | 
| 144 | 
             
                            inputs=[audio_input, target_lang_dropdown],
         | 
| 145 | 
            +
                            outputs=output_text
         | 
|  | |
| 146 | 
             
                        ).then(
         | 
| 147 | 
             
                            fn=generate_tts,
         | 
| 148 | 
             
                            inputs=[output_text, target_lang_dropdown],
         | 
| 149 | 
             
                            outputs=tts_audio
         | 
| 150 | 
             
                        )
         | 
| 151 |  | 
| 152 | 
            +
                        # Restart button clears all outputs.
         | 
| 153 | 
             
                        restart_button.click(
         | 
| 154 | 
             
                            fn=restart_recording,
         | 
| 155 | 
             
                            inputs=[],
         | 
| 156 | 
             
                            outputs=[audio_input, output_text, tts_audio]
         | 
| 157 | 
             
                        )
         | 
| 158 |  | 
| 159 | 
            +
                    # About Tab
         | 
| 160 | 
             
                    with gr.TabItem("About"):
         | 
| 161 | 
             
                        gr.Markdown(
         | 
| 162 | 
             
                            """
         | 
|  | |
| 174 | 
             
                            """
         | 
| 175 | 
             
                        )
         | 
| 176 |  | 
| 177 | 
            +
            # Launch the Gradio app.
         | 
| 178 | 
             
            demo.launch()
         | 
| 179 |  |