Update app.py
Browse files
app.py
CHANGED
|
@@ -9,7 +9,7 @@ import os
|
|
| 9 |
from transformers import pipeline, VitsModel, AutoTokenizer
|
| 10 |
from datasets import load_dataset
|
| 11 |
|
| 12 |
-
# For Coqui TTS (XTTS-v2)
|
| 13 |
try:
|
| 14 |
from TTS.api import TTS as CoquiTTS
|
| 15 |
except ImportError:
|
|
@@ -24,9 +24,10 @@ asr = pipeline(
|
|
| 24 |
)
|
| 25 |
|
| 26 |
# ------------------------------------------------------
|
| 27 |
-
# 2. Translation Models (8 languages)
|
| 28 |
# ------------------------------------------------------
|
| 29 |
translation_models = {
|
|
|
|
| 30 |
"Spanish": "Helsinki-NLP/opus-mt-en-es",
|
| 31 |
"Vietnamese": "Helsinki-NLP/opus-mt-en-vi",
|
| 32 |
"Indonesian": "Helsinki-NLP/opus-mt-en-id",
|
|
@@ -38,6 +39,7 @@ translation_models = {
|
|
| 38 |
}
|
| 39 |
|
| 40 |
translation_tasks = {
|
|
|
|
| 41 |
"Spanish": "translation_en_to_es",
|
| 42 |
"Vietnamese": "translation_en_to_vi",
|
| 43 |
"Indonesian": "translation_en_to_id",
|
|
@@ -50,10 +52,11 @@ translation_tasks = {
|
|
| 50 |
|
| 51 |
# ------------------------------------------------------
|
| 52 |
# 3. TTS Configuration
|
| 53 |
-
# - MMS TTS (VITS) for: Spanish, Vietnamese, Indonesian, Turkish, Portuguese, Korean
|
| 54 |
# - Coqui XTTS-v2 for: Chinese and Japanese
|
| 55 |
# ------------------------------------------------------
|
| 56 |
tts_config = {
|
|
|
|
| 57 |
"Spanish": {"model_id": "facebook/mms-tts-spa", "architecture": "vits", "type": "mms"},
|
| 58 |
"Vietnamese": {"model_id": "facebook/mms-tts-vie", "architecture": "vits", "type": "mms"},
|
| 59 |
"Indonesian": {"model_id": "facebook/mms-tts-ind", "architecture": "vits", "type": "mms"},
|
|
@@ -64,7 +67,7 @@ tts_config = {
|
|
| 64 |
"Japanese": {"type": "coqui"}
|
| 65 |
}
|
| 66 |
|
| 67 |
-
# For Coqui,
|
| 68 |
coqui_lang_map = {
|
| 69 |
"Chinese": "zh",
|
| 70 |
"Japanese": "ja"
|
|
@@ -74,8 +77,8 @@ coqui_lang_map = {
|
|
| 74 |
# 4. Global Caches for Translators and TTS Models
|
| 75 |
# ------------------------------------------------------
|
| 76 |
translator_cache = {}
|
| 77 |
-
mms_tts_cache = {}
|
| 78 |
-
coqui_tts_cache = None
|
| 79 |
|
| 80 |
# ------------------------------------------------------
|
| 81 |
# 5. Translator Helper
|
|
@@ -123,7 +126,6 @@ def load_coqui_tts():
|
|
| 123 |
if coqui_tts_cache is not None:
|
| 124 |
return coqui_tts_cache
|
| 125 |
try:
|
| 126 |
-
# Set gpu=True if a GPU is available.
|
| 127 |
coqui_tts_cache = CoquiTTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
|
| 128 |
except Exception as e:
|
| 129 |
raise RuntimeError(f"Failed to load Coqui XTTS-v2 TTS: {e}")
|
|
@@ -131,15 +133,14 @@ def load_coqui_tts():
|
|
| 131 |
|
| 132 |
def run_coqui_tts(text, lang):
|
| 133 |
coqui_tts = load_coqui_tts()
|
| 134 |
-
lang_code = coqui_lang_map[lang]
|
| 135 |
-
# Write the output to a temporary file and then read it back.
|
| 136 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
|
| 137 |
tmp_name = tmp.name
|
| 138 |
try:
|
| 139 |
coqui_tts.tts_to_file(
|
| 140 |
text=text,
|
| 141 |
file_path=tmp_name,
|
| 142 |
-
language=lang_code
|
| 143 |
)
|
| 144 |
data, sr = sf.read(tmp_name)
|
| 145 |
finally:
|
|
@@ -169,7 +170,7 @@ def predict(audio, text, target_language):
|
|
| 169 |
audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
|
| 170 |
asr_input = {"array": audio_data, "sampling_rate": 16000}
|
| 171 |
asr_result = asr(asr_input)
|
| 172 |
-
english_text = asr_result["text"]
|
| 173 |
else:
|
| 174 |
return "No input provided.", "", None
|
| 175 |
|
|
@@ -199,7 +200,7 @@ def predict(audio, text, target_language):
|
|
| 199 |
# 9. Gradio Interface
|
| 200 |
# ------------------------------------------------------
|
| 201 |
language_choices = [
|
| 202 |
-
"Spanish", "Vietnamese", "Indonesian", "Turkish", "Portuguese", "Korean", "Chinese", "Japanese"
|
| 203 |
]
|
| 204 |
|
| 205 |
iface = gr.Interface(
|
|
@@ -207,7 +208,7 @@ iface = gr.Interface(
|
|
| 207 |
inputs=[
|
| 208 |
gr.Audio(type="numpy", label="Record/Upload English Audio (optional)"),
|
| 209 |
gr.Textbox(lines=4, placeholder="Or enter English text here", label="English Text Input (optional)"),
|
| 210 |
-
gr.Dropdown(choices=language_choices, value="
|
| 211 |
],
|
| 212 |
outputs=[
|
| 213 |
gr.Textbox(label="English Transcription"),
|
|
@@ -216,14 +217,16 @@ iface = gr.Interface(
|
|
| 216 |
],
|
| 217 |
title="Multimodal Language Learning Aid",
|
| 218 |
description=(
|
| 219 |
-
"This app performs the following
|
| 220 |
-
"1. Transcribes English speech using Wav2Vec2 (
|
| 221 |
"2. Translates the English text to the target language using Helsinki-NLP models.\n"
|
| 222 |
-
"3. Provides
|
| 223 |
-
"For Spanish, Vietnamese, Indonesian, Turkish, Portuguese, and Korean
|
|
|
|
|
|
|
| 224 |
),
|
| 225 |
allow_flagging="never"
|
| 226 |
)
|
| 227 |
|
| 228 |
if __name__ == "__main__":
|
| 229 |
-
iface.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
|
| 9 |
from transformers import pipeline, VitsModel, AutoTokenizer
|
| 10 |
from datasets import load_dataset
|
| 11 |
|
| 12 |
+
# For Coqui TTS (XTTS-v2) used for Chinese and Japanese
|
| 13 |
try:
|
| 14 |
from TTS.api import TTS as CoquiTTS
|
| 15 |
except ImportError:
|
|
|
|
| 24 |
)
|
| 25 |
|
| 26 |
# ------------------------------------------------------
|
| 27 |
+
# 2. Translation Models (9 languages)
|
| 28 |
# ------------------------------------------------------
|
| 29 |
translation_models = {
|
| 30 |
+
"French": "Helsinki-NLP/opus-mt-en-fr",
|
| 31 |
"Spanish": "Helsinki-NLP/opus-mt-en-es",
|
| 32 |
"Vietnamese": "Helsinki-NLP/opus-mt-en-vi",
|
| 33 |
"Indonesian": "Helsinki-NLP/opus-mt-en-id",
|
|
|
|
| 39 |
}
|
| 40 |
|
| 41 |
translation_tasks = {
|
| 42 |
+
"French": "translation_en_to_fr",
|
| 43 |
"Spanish": "translation_en_to_es",
|
| 44 |
"Vietnamese": "translation_en_to_vi",
|
| 45 |
"Indonesian": "translation_en_to_id",
|
|
|
|
| 52 |
|
| 53 |
# ------------------------------------------------------
|
| 54 |
# 3. TTS Configuration
|
| 55 |
+
# - MMS TTS (VITS) for: French, Spanish, Vietnamese, Indonesian, Turkish, Portuguese, Korean
|
| 56 |
# - Coqui XTTS-v2 for: Chinese and Japanese
|
| 57 |
# ------------------------------------------------------
|
| 58 |
tts_config = {
|
| 59 |
+
"French": {"model_id": "facebook/mms-tts-fra", "architecture": "vits", "type": "mms"},
|
| 60 |
"Spanish": {"model_id": "facebook/mms-tts-spa", "architecture": "vits", "type": "mms"},
|
| 61 |
"Vietnamese": {"model_id": "facebook/mms-tts-vie", "architecture": "vits", "type": "mms"},
|
| 62 |
"Indonesian": {"model_id": "facebook/mms-tts-ind", "architecture": "vits", "type": "mms"},
|
|
|
|
| 67 |
"Japanese": {"type": "coqui"}
|
| 68 |
}
|
| 69 |
|
| 70 |
+
# For Coqui, map languages to expected language codes.
|
| 71 |
coqui_lang_map = {
|
| 72 |
"Chinese": "zh",
|
| 73 |
"Japanese": "ja"
|
|
|
|
| 77 |
# 4. Global Caches for Translators and TTS Models
|
| 78 |
# ------------------------------------------------------
|
| 79 |
translator_cache = {}
|
| 80 |
+
mms_tts_cache = {}
|
| 81 |
+
coqui_tts_cache = None
|
| 82 |
|
| 83 |
# ------------------------------------------------------
|
| 84 |
# 5. Translator Helper
|
|
|
|
| 126 |
if coqui_tts_cache is not None:
|
| 127 |
return coqui_tts_cache
|
| 128 |
try:
|
|
|
|
| 129 |
coqui_tts_cache = CoquiTTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
|
| 130 |
except Exception as e:
|
| 131 |
raise RuntimeError(f"Failed to load Coqui XTTS-v2 TTS: {e}")
|
|
|
|
| 133 |
|
| 134 |
def run_coqui_tts(text, lang):
|
| 135 |
coqui_tts = load_coqui_tts()
|
| 136 |
+
lang_code = coqui_lang_map[lang]
|
|
|
|
| 137 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
|
| 138 |
tmp_name = tmp.name
|
| 139 |
try:
|
| 140 |
coqui_tts.tts_to_file(
|
| 141 |
text=text,
|
| 142 |
file_path=tmp_name,
|
| 143 |
+
language=lang_code
|
| 144 |
)
|
| 145 |
data, sr = sf.read(tmp_name)
|
| 146 |
finally:
|
|
|
|
| 170 |
audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
|
| 171 |
asr_input = {"array": audio_data, "sampling_rate": 16000}
|
| 172 |
asr_result = asr(asr_input)
|
| 173 |
+
english_text = asr_result["text"].lower()
|
| 174 |
else:
|
| 175 |
return "No input provided.", "", None
|
| 176 |
|
|
|
|
| 200 |
# 9. Gradio Interface
|
| 201 |
# ------------------------------------------------------
|
| 202 |
language_choices = [
|
| 203 |
+
"French", "Spanish", "Vietnamese", "Indonesian", "Turkish", "Portuguese", "Korean", "Chinese", "Japanese"
|
| 204 |
]
|
| 205 |
|
| 206 |
iface = gr.Interface(
|
|
|
|
| 208 |
inputs=[
|
| 209 |
gr.Audio(type="numpy", label="Record/Upload English Audio (optional)"),
|
| 210 |
gr.Textbox(lines=4, placeholder="Or enter English text here", label="English Text Input (optional)"),
|
| 211 |
+
gr.Dropdown(choices=language_choices, value="French", label="Target Language")
|
| 212 |
],
|
| 213 |
outputs=[
|
| 214 |
gr.Textbox(label="English Transcription"),
|
|
|
|
| 217 |
],
|
| 218 |
title="Multimodal Language Learning Aid",
|
| 219 |
description=(
|
| 220 |
+
"This app performs the following tasks:\n"
|
| 221 |
+
"1. Transcribes English speech using Wav2Vec2 (accepts text input as well).\n"
|
| 222 |
"2. Translates the English text to the target language using Helsinki-NLP models.\n"
|
| 223 |
+
"3. Provides speech:\n"
|
| 224 |
+
" - For French, Spanish, Vietnamese, Indonesian, Turkish, Portuguese, and Korean: uses Facebook MMS TTS (VITS-based).\n"
|
| 225 |
+
" - For Chinese and Japanese: uses Coqui XTTS-v2 (work-in-progress).\n"
|
| 226 |
+
"\nSelect your target language from the dropdown."
|
| 227 |
),
|
| 228 |
allow_flagging="never"
|
| 229 |
)
|
| 230 |
|
| 231 |
if __name__ == "__main__":
|
| 232 |
+
iface.launch(server_name="0.0.0.0", server_port=7860)
|