Spaces:

Yilin0601
/

Multimodal_Language_Learning_Aid

Running

App Files Community

Yilin0601 committed on Apr 1

Commit

1ac59b7

verified ·

1 Parent(s): f023c8e

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -18

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ import os
 from transformers import pipeline, VitsModel, AutoTokenizer
 from datasets import load_dataset
-# For Coqui TTS (XTTS-v2)
 try:
     from TTS.api import TTS as CoquiTTS
 except ImportError:
@@ -24,9 +24,10 @@ asr = pipeline(
 )
 # ------------------------------------------------------
-# 2. Translation Models (8 languages)
 # ------------------------------------------------------
 translation_models = {
     "Spanish": "Helsinki-NLP/opus-mt-en-es",
     "Vietnamese": "Helsinki-NLP/opus-mt-en-vi",
     "Indonesian": "Helsinki-NLP/opus-mt-en-id",
@@ -38,6 +39,7 @@ translation_models = {
 }
 translation_tasks = {
     "Spanish": "translation_en_to_es",
     "Vietnamese": "translation_en_to_vi",
     "Indonesian": "translation_en_to_id",
@@ -50,10 +52,11 @@ translation_tasks = {
 # ------------------------------------------------------
 # 3. TTS Configuration
-#    - MMS TTS (VITS) for: Spanish, Vietnamese, Indonesian, Turkish, Portuguese, Korean
 #    - Coqui XTTS-v2 for: Chinese and Japanese
 # ------------------------------------------------------
 tts_config = {
     "Spanish": {"model_id": "facebook/mms-tts-spa", "architecture": "vits", "type": "mms"},
     "Vietnamese": {"model_id": "facebook/mms-tts-vie", "architecture": "vits", "type": "mms"},
     "Indonesian": {"model_id": "facebook/mms-tts-ind", "architecture": "vits", "type": "mms"},
@@ -64,7 +67,7 @@ tts_config = {
     "Japanese": {"type": "coqui"}
 }
-# For Coqui, we map our languages to language codes expected by the model.
 coqui_lang_map = {
     "Chinese": "zh",
     "Japanese": "ja"
@@ -74,8 +77,8 @@ coqui_lang_map = {
 # 4. Global Caches for Translators and TTS Models
 # ------------------------------------------------------
 translator_cache = {}
-mms_tts_cache = {}     # For MMS (VITS-based) TTS models
-coqui_tts_cache = None  # Single instance for Coqui XTTS-v2
 # ------------------------------------------------------
 # 5. Translator Helper
@@ -123,7 +126,6 @@ def load_coqui_tts():
     if coqui_tts_cache is not None:
         return coqui_tts_cache
     try:
-        # Set gpu=True if a GPU is available.
         coqui_tts_cache = CoquiTTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
     except Exception as e:
         raise RuntimeError(f"Failed to load Coqui XTTS-v2 TTS: {e}")
@@ -131,15 +133,14 @@ def load_coqui_tts():
 def run_coqui_tts(text, lang):
     coqui_tts = load_coqui_tts()
-    lang_code = coqui_lang_map[lang]  # "zh" for Chinese or "ja" for Japanese
-    # Write the output to a temporary file and then read it back.
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
         tmp_name = tmp.name
     try:
         coqui_tts.tts_to_file(
             text=text,
             file_path=tmp_name,
-            language=lang_code  # using default voice; for cloning, add speaker_wav parameter
         )
         data, sr = sf.read(tmp_name)
     finally:
@@ -169,7 +170,7 @@ def predict(audio, text, target_language):
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
         asr_input = {"array": audio_data, "sampling_rate": 16000}
         asr_result = asr(asr_input)
-        english_text = asr_result["text"]
     else:
         return "No input provided.", "", None
@@ -199,7 +200,7 @@ def predict(audio, text, target_language):
 # 9. Gradio Interface
 # ------------------------------------------------------
 language_choices = [
-    "Spanish", "Vietnamese", "Indonesian", "Turkish", "Portuguese", "Korean", "Chinese", "Japanese"
 ]
 iface = gr.Interface(
@@ -207,7 +208,7 @@ iface = gr.Interface(
     inputs=[
         gr.Audio(type="numpy", label="Record/Upload English Audio (optional)"),
         gr.Textbox(lines=4, placeholder="Or enter English text here", label="English Text Input (optional)"),
-        gr.Dropdown(choices=language_choices, value="Spanish", label="Target Language")
     ],
     outputs=[
         gr.Textbox(label="English Transcription"),
@@ -216,14 +217,16 @@ iface = gr.Interface(
     ],
     title="Multimodal Language Learning Aid",
     description=(
-        "This app performs the following steps:\n"
-        "1. Transcribes English speech using Wav2Vec2 (or accepts text input).\n"
         "2. Translates the English text to the target language using Helsinki-NLP models.\n"
-        "3. Provides Synthetic speech:\n"
-        "For Spanish, Vietnamese, Indonesian, Turkish, Portuguese, and Korean."
     ),
     allow_flagging="never"
 )
 if __name__ == "__main__":
-    iface.launch(server_name="0.0.0.0", server_port=7860)

 from transformers import pipeline, VitsModel, AutoTokenizer
 from datasets import load_dataset
+# For Coqui TTS (XTTS-v2) used for Chinese and Japanese
 try:
     from TTS.api import TTS as CoquiTTS
 except ImportError:
 )
 # ------------------------------------------------------
+# 2. Translation Models (9 languages)
 # ------------------------------------------------------
 translation_models = {
+    "French": "Helsinki-NLP/opus-mt-en-fr",
     "Spanish": "Helsinki-NLP/opus-mt-en-es",
     "Vietnamese": "Helsinki-NLP/opus-mt-en-vi",
     "Indonesian": "Helsinki-NLP/opus-mt-en-id",
 }
 translation_tasks = {
+    "French": "translation_en_to_fr",
     "Spanish": "translation_en_to_es",
     "Vietnamese": "translation_en_to_vi",
     "Indonesian": "translation_en_to_id",
 # ------------------------------------------------------
 # 3. TTS Configuration
+#    - MMS TTS (VITS) for: French, Spanish, Vietnamese, Indonesian, Turkish, Portuguese, Korean
 #    - Coqui XTTS-v2 for: Chinese and Japanese
 # ------------------------------------------------------
 tts_config = {
+    "French": {"model_id": "facebook/mms-tts-fra", "architecture": "vits", "type": "mms"},
     "Spanish": {"model_id": "facebook/mms-tts-spa", "architecture": "vits", "type": "mms"},
     "Vietnamese": {"model_id": "facebook/mms-tts-vie", "architecture": "vits", "type": "mms"},
     "Indonesian": {"model_id": "facebook/mms-tts-ind", "architecture": "vits", "type": "mms"},
     "Japanese": {"type": "coqui"}
 }
+# For Coqui, map languages to expected language codes.
 coqui_lang_map = {
     "Chinese": "zh",
     "Japanese": "ja"
 # 4. Global Caches for Translators and TTS Models
 # ------------------------------------------------------
 translator_cache = {}
+mms_tts_cache = {}
+coqui_tts_cache = None
 # ------------------------------------------------------
 # 5. Translator Helper
     if coqui_tts_cache is not None:
         return coqui_tts_cache
     try:
         coqui_tts_cache = CoquiTTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
     except Exception as e:
         raise RuntimeError(f"Failed to load Coqui XTTS-v2 TTS: {e}")
 def run_coqui_tts(text, lang):
     coqui_tts = load_coqui_tts()
+    lang_code = coqui_lang_map[lang]
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
         tmp_name = tmp.name
     try:
         coqui_tts.tts_to_file(
             text=text,
             file_path=tmp_name,
+            language=lang_code
         )
         data, sr = sf.read(tmp_name)
     finally:
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
         asr_input = {"array": audio_data, "sampling_rate": 16000}
         asr_result = asr(asr_input)
+        english_text = asr_result["text"].lower()
     else:
         return "No input provided.", "", None
 # 9. Gradio Interface
 # ------------------------------------------------------
 language_choices = [
+    "French", "Spanish", "Vietnamese", "Indonesian", "Turkish", "Portuguese", "Korean", "Chinese", "Japanese"
 ]
 iface = gr.Interface(
     inputs=[
         gr.Audio(type="numpy", label="Record/Upload English Audio (optional)"),
         gr.Textbox(lines=4, placeholder="Or enter English text here", label="English Text Input (optional)"),
+        gr.Dropdown(choices=language_choices, value="French", label="Target Language")
     ],
     outputs=[
         gr.Textbox(label="English Transcription"),
     ],
     title="Multimodal Language Learning Aid",
     description=(
+        "This app performs the following tasks:\n"
+        "1. Transcribes English speech using Wav2Vec2 (accepts text input as well).\n"
         "2. Translates the English text to the target language using Helsinki-NLP models.\n"
+        "3. Provides speech:\n"
+        "   - For French, Spanish, Vietnamese, Indonesian, Turkish, Portuguese, and Korean: uses Facebook MMS TTS (VITS-based).\n"
+        "   - For Chinese and Japanese: uses myshell-ai MeloTTS models (work-in-progress).\n"
+        "\nSelect your target language from the dropdown."
     ),
     allow_flagging="never"
 )
 if __name__ == "__main__":
+    iface.launch(server_name="0.0.0.0", server_port=7860)