Spaces:

Somalitts
/

8aad

Running

App Files Files Community

Somalitts committed on Jul 16

Commit

83609f5

verified ·

1 Parent(s): f76188c

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -112

app.py CHANGED Viewed

@@ -1,155 +1,93 @@
-import gradio as gr
 import torch
 import torchaudio
-import re
-import os
-import numpy as np
-import scipy.io.wavfile
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-from speechbrain.pretrained import EncoderClassifier
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # Load models
-processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 speaker_model = EncoderClassifier.from_hparams(
     source="speechbrain/spkrec-xvect-voxceleb",
     run_opts={"device": device},
-    savedir="./spk_model"
 )
 # Speaker embedding
-EMB_PATH = "speaker_embedding.pt"
-if os.path.exists(EMB_PATH):
-    speaker_embedding = torch.load(EMB_PATH).to(device)
-else:
-    audio, sr = torchaudio.load("1.wav")
     audio = torchaudio.functional.resample(audio, sr, 16000).mean(dim=0).unsqueeze(0).to(device)
     with torch.no_grad():
         emb = speaker_model.encode_batch(audio)
         emb = torch.nn.functional.normalize(emb, dim=2).squeeze()
-    torch.save(emb.cpu(), EMB_PATH)
-    speaker_embedding = emb
-# Number conversion (Somali)
 number_words = {
     0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
     6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
-    11: "toban iyo koow", 12: "toban iyo labo", 13: "toban iyo seddex",
-    14: "toban iyo afar", 15: "toban iyo shan", 16: "toban iyo lix",
-    17: "toban iyo todobo", 18: "toban iyo sideed", 19: "toban iyo sagaal",
     20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
     60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
     100: "boqol", 1000: "kun"
 }
-shortcut_map = {
-    "asc": "asalaamu caleykum",
-    "wcs": "wacaleykum salaam",
-    "fcn": "fiican",
-    "xld": "xaaladda ka waran",
-    "kwrn": "kawaran",
-    "scw": "salalaahu caleyhi wa salam",
-    "alx": "alxamdu lilaahi",
-    "m.a": "maasha allah",
-    "sthy": "side tahey",
-    "sxp": "saaxiib"
-}
-def number_to_words(number):
-    number = int(number)
-    if number < 20:
-        return number_words[number]
-    elif number < 100:
-        tens, unit = divmod(number, 10)
-        return number_words[tens * 10] + (" iyo " + number_words[unit] if unit else "")
-    elif number < 1000:
-        hundreds, remainder = divmod(number, 100)
-        part = (number_words[hundreds] + " boqol") if hundreds > 1 else "boqol"
-        if remainder:
-            part += " iyo " + number_to_words(remainder)
-        return part
-    elif number < 1000000:
-        thousands, remainder = divmod(number, 1000)
-        words = []
-        if thousands == 1:
-            words.append("kun")
-        else:
-            words.append(number_to_words(thousands) + " kun")
-        if remainder >= 100:
-            hundreds, rem2 = divmod(remainder, 100)
-            if hundreds:
-                boqol_text = (number_words[hundreds] + " boqol") if hundreds > 1 else "boqol"
-                words.append(boqol_text)
-            if rem2:
-                words.append("iyo " + number_to_words(rem2))
-        elif remainder:
-            words.append("iyo " + number_to_words(remainder))
-        return " ".join(words)
-    elif number < 1000000000:
-        millions, remainder = divmod(number, 1000000)
-        words = []
-        if millions == 1:
-            words.append("milyan")
-        else:
-            words.append(number_to_words(millions) + " milyan")
-        if remainder:
-            words.append(number_to_words(remainder))
-        return " ".join(words)
     else:
-        return str(number)
 def replace_numbers_with_words(text):
-    def replace(match):
-        number = int(match.group())
-        return number_to_words(number)
-    return re.sub(r'\b\d+\b', replace, text)
 def normalize_text(text):
     text = text.lower()
     text = replace_numbers_with_words(text)
-    def replace_shortcuts(match):
-        word = match.group(0).lower()
-        return shortcut_map.get(word, word)
-    pattern = re.compile(r'\b(' + '|'.join(re.escape(k) for k in shortcut_map.keys()) + r')\b', re.IGNORECASE)
-    text = pattern.sub(replace_shortcuts, text)
-    def replace_countries(match):
-        word = match.group(0).lower()
-        return country_map.get(word, word)
-    country_pattern = re.compile(r'\b(' + '|'.join(re.escape(k) for k in country_map.keys()) + r')\b', re.IGNORECASE)
-    text = country_pattern.sub(replace_countries, text)
-    text = re.sub(r'(\d{1,3})(,\d{3})+', lambda m: m.group(0).replace(",", ""), text)
-    text = re.sub(r'\.\d+', '', text)
-    symbol_map = {
-        '$': 'doolar',
-        '=': 'egwal',
-        '+': 'balaas',
-        '#': 'haash'
-    }
-    for sym, word in symbol_map.items():
-        text = text.replace(sym, ' ' + word + ' ')
     text = re.sub(r'[^\w\s]', '', text)
     return text
 iface = gr.Interface(
-    fn=text_to_speech,
-    inputs=gr.Textbox(label="Geli qoraalka af-soomaali"),
-    outputs=gr.Audio(label="Codka la abuuray", type="numpy"),
-    title="Somali TTS",
-    description="TTS Soomaaliyeed oo la adeegsaday cod gaar ah (1.wav)"
 )
 iface.launch()

+import os
+import re
+import uuid
 import torch
 import torchaudio
+import soundfile as sf
+import gradio as gr
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from speechbrain.inference.speaker import EncoderClassifier
 device = "cuda" if torch.cuda.is_available() else "cpu"
+CACHE_DIR = "/tmp/hf-cache"
 # Load models
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts", cache_dir=CACHE_DIR)
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", cache_dir=CACHE_DIR).to(device)
+model_female = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad", cache_dir=CACHE_DIR).to(device)
 speaker_model = EncoderClassifier.from_hparams(
     source="speechbrain/spkrec-xvect-voxceleb",
     run_opts={"device": device},
+    savedir="/tmp/spk_model"
 )
 # Speaker embedding
def get_embedding(wav_path, pt_path):
    """Return the speaker embedding for ``wav_path``, cached on disk at ``pt_path``.

    When ``pt_path`` already exists, the tensor stored there is loaded and
    moved to ``device``. Otherwise the audio at ``wav_path`` is resampled to
    16000 Hz, encoded with ``speaker_model``, normalized along dim 2,
    squeezed, saved (as a CPU tensor) to ``pt_path`` and returned.
    """
    if not os.path.exists(pt_path):
        waveform, sample_rate = torchaudio.load(wav_path)
        resampled = torchaudio.functional.resample(waveform, sample_rate, 16000)
        # Average over dim 0 (presumably the channel axis), then restore a
        # leading dimension of size 1 before handing the audio to the encoder.
        mono = resampled.mean(dim=0).unsqueeze(0).to(device)
        with torch.no_grad():
            encoded = speaker_model.encode_batch(mono)
            embedding = torch.nn.functional.normalize(encoded, dim=2).squeeze()
        # Persist a CPU copy so later calls can skip the encoder entirely.
        torch.save(embedding.cpu(), pt_path)
        return embedding

    return torch.load(pt_path).to(device)


# Embedding of the reference recording used for the female voice.
embedding_female = get_embedding("caasho.wav", "/tmp/female_embedding.pt")
+# Text normalization
 number_words = {
     0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
     6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
     20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
     60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
     100: "boqol", 1000: "kun"
 }
+def number_to_words(n):
+    if n < 20:
+        return number_words.get(n, str(n))
+    elif n < 100:
+        tens, unit = divmod(n, 10)
+        return number_words[tens * 10] + (" " + number_words[unit] if unit else "")
+    elif n < 1000:
+        hundreds, rem = divmod(n, 100)
+        return (number_words[hundreds] + " boqol" if hundreds > 1 else "boqol") + (" " + number_to_words(rem) if rem else "")
+    elif n < 1_000_000:
+        th, rem = divmod(n, 1000)
+        return (number_to_words(th) + " kun") + (" " + number_to_words(rem) if rem else "")
     else:
+        return str(n)
 def replace_numbers_with_words(text):
+    return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)
 def normalize_text(text):
     text = text.lower()
     text = replace_numbers_with_words(text)
     text = re.sub(r'[^\w\s]', '', text)
     return text
+# Gradio TTS Function
def tts(text):
    """Synthesize ``text`` with the female voice and return the WAV file path.

    The text is normalized, tokenized with ``processor``, and passed to
    ``model_female.generate_speech`` together with ``embedding_female`` and
    ``vocoder``. The result is written as a 16000 Hz WAV file under ``/tmp``
    with a random hex file name, and that path is returned.
    """
    normalized = normalize_text(text)
    encoded = processor(text=normalized, return_tensors="pt").to(device)

    with torch.no_grad():
        # unsqueeze(0) adds a leading dimension of size 1 to the embedding.
        speaker = embedding_female.unsqueeze(0)
        speech = model_female.generate_speech(
            encoded["input_ids"],
            speaker,
            vocoder=vocoder,
        )

    # A fresh random file name is generated for every call.
    out_path = f"/tmp/{uuid.uuid4().hex}.wav"
    samples = speech.cpu().numpy()
    sf.write(out_path, samples, 16000)
    return out_path
+# Gradio Interface
 iface = gr.Interface(
+    fn=tts,
+    inputs=gr.Textbox(label="Geli qoraalka af Soomaali"),
+    outputs=gr.Audio(label="Codka", type="filepath"),
+    title="Somali Text-to-Speech",
+    description="Ku qor qoraal Soomaali ah si aad cod ugu dhageysato (Female voice only)."
 )
 iface.launch()