Spaces:

Somalitts
/

8aad

Running

App Files Files Community

Somalitts committed on Jul 18

Commit

2342a7b

verified ·

1 Parent(s): 83609f5

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -44

app.py CHANGED Viewed

@@ -8,35 +8,64 @@ import gradio as gr
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from speechbrain.inference.speaker import EncoderClassifier
 device = "cuda" if torch.cuda.is_available() else "cpu"
-CACHE_DIR = "/tmp/hf-cache"
-# Load models
-processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts", cache_dir=CACHE_DIR)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", cache_dir=CACHE_DIR).to(device)
-model_female = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad", cache_dir=CACHE_DIR).to(device)
-speaker_model = EncoderClassifier.from_hparams(
-    source="speechbrain/spkrec-xvect-voxceleb",
-    run_opts={"device": device},
-    savedir="/tmp/spk_model"
-)
-# Speaker embedding
-def get_embedding(wav_path, pt_path):
-    if os.path.exists(pt_path):
-        return torch.load(pt_path).to(device)
-    audio, sr = torchaudio.load(wav_path)
-    audio = torchaudio.functional.resample(audio, sr, 16000).mean(dim=0).unsqueeze(0).to(device)
     with torch.no_grad():
-        emb = speaker_model.encode_batch(audio)
-        emb = torch.nn.functional.normalize(emb, dim=2).squeeze()
-    torch.save(emb.cpu(), pt_path)
-    return emb
-embedding_female = get_embedding("caasho.wav", "/tmp/female_embedding.pt")
-# Text normalization
 number_words = {
     0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
     6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
@@ -46,19 +75,17 @@ number_words = {
 }
 def number_to_words(n):
-    if n < 20:
-        return number_words.get(n, str(n))
-    elif n < 100:
         tens, unit = divmod(n, 10)
-        return number_words[tens * 10] + (" " + number_words[unit] if unit else "")
-    elif n < 1000:
         hundreds, rem = divmod(n, 100)
-        return (number_words[hundreds] + " boqol" if hundreds > 1 else "boqol") + (" " + number_to_words(rem) if rem else "")
-    elif n < 1_000_000:
         th, rem = divmod(n, 1000)
-        return (number_to_words(th) + " kun") + (" " + number_to_words(rem) if rem else "")
-    else:
-        return str(n)
 def replace_numbers_with_words(text):
     return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)
@@ -66,28 +93,58 @@ def replace_numbers_with_words(text):
 def normalize_text(text):
     text = text.lower()
     text = replace_numbers_with_words(text)
-    text = re.sub(r'[^\w\s]', '', text)
     return text
-# Gradio TTS Function
-def tts(text):
     clean_text = normalize_text(text)
     inputs = processor(text=clean_text, return_tensors="pt").to(device)
     with torch.no_grad():
-        waveform = model_female.generate_speech(inputs["input_ids"], embedding_female.unsqueeze(0), vocoder=vocoder)
-    out_path = f"/tmp/{uuid.uuid4().hex}.wav"
     sf.write(out_path, waveform.cpu().numpy(), 16000)
     return out_path
-# Gradio Interface
 iface = gr.Interface(
-    fn=tts,
-    inputs=gr.Textbox(label="Geli qoraalka af Soomaali"),
-    outputs=gr.Audio(label="Codka", type="filepath"),
-    title="Somali Text-to-Speech",
-    description="Ku qor qoraal Soomaali ah si aad cod ugu dhageysato (Female voice only)."
 )
-iface.launch()

 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from speechbrain.inference.speaker import EncoderClassifier
+# --- Configuration ---
+# Run on the GPU when CUDA is available, otherwise on the CPU.
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# --- STEP 1: ADD THE NAMES OF YOUR VOICE FILES HERE ---
+# Make sure these files sit in the same folder as this code.
+# Replace these names with your real ones. They must be .wav files.
+# These paths are offered as the dropdown choices and are pre-encoded at start-up.
+VOICE_SAMPLE_FILES = ["1.wav", "90.wav"]
+# Locations where temporary files are stored
+# CACHE_DIR is passed as cache_dir to the model loaders and holds the speaker-model savedir.
+CACHE_DIR = "hf_cache"
+# Directory in which get_speaker_embedding stores one .pt file per voice sample.
+SPEAKER_EMBEDDING_DIR = "speaker_embeddings"
+# Both are relative paths; create them on import if they do not exist yet.
+os.makedirs(CACHE_DIR, exist_ok=True)
+os.makedirs(SPEAKER_EMBEDDING_DIR, exist_ok=True)
+# --- Loading the models ---
+# Everything below runs at import time; any failure aborts start-up with a single error.
+try:
+    print("Waxaa la soo dejinayaa model-yada...")
+    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts", cache_dir=CACHE_DIR)
+    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", cache_dir=CACHE_DIR).to(device)
+    # The model was renamed from 'model_female' to 'model' because it now serves several voices.
+    model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad", cache_dir=CACHE_DIR).to(device)
+    # Speaker encoder used by get_speaker_embedding to turn a voice sample into an embedding.
+    speaker_model = EncoderClassifier.from_hparams(
+        source="speechbrain/spkrec-xvect-voxceleb",
+        run_opts={"device": device},
+        savedir=os.path.join(CACHE_DIR, "spk_model")
+    )
+    print("Model-yadii waa diyaar.")
+except Exception as e:
+    # NOTE(review): gr.Error is raised here at import time, outside any Gradio event
+    # handler, and the message advises checking the internet connection whatever the
+    # actual cause was -- confirm this is intended.
+    raise gr.Error(f"Cillad ayaa ka timid soo dejinta model-yada: {e}. Hubi internet-kaaga.")
+# --- Shaqada Abuurista Astaanta Codka (Speaker Embedding) ---
+def get_speaker_embedding(wav_file_path):
+    """
+    Return the speaker embedding for the voice sample at `wav_file_path`.
+
+    The embedding is cached as "<basename>.pt" inside SPEAKER_EMBEDDING_DIR. A cached
+    copy is loaded and returned directly; otherwise the sample is encoded with
+    `speaker_model`, the result is saved for later calls, and then returned.
+
+    Raises gr.Error when no cached embedding exists and the .wav file is missing.
+    """
+    cache_name = f"{os.path.basename(wav_file_path)}.pt"
+    cache_file = os.path.join(SPEAKER_EMBEDDING_DIR, cache_name)
+    # The cache is consulted before the .wav file is checked.
+    if os.path.exists(cache_file):
+        return torch.load(cache_file, map_location=device)
+    if not os.path.exists(wav_file_path):
+        raise gr.Error(f"Lama helin faylka codka: {wav_file_path}. Hubi inuu ku jiro galka saxda ah oo magaca si sax ah u qortay.")
+    print(f"Waxaa la abuurayaa astaan cod oo cusub: {wav_file_path}")
+    waveform, sample_rate = torchaudio.load(wav_file_path)
+    if sample_rate != 16000:
+        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
+    # Collapse dim 0 by averaging (torchaudio.load returns channels first by default),
+    # then re-add a leading dimension of size 1 before moving to the target device.
+    mono_batch = waveform.mean(dim=0).unsqueeze(0).to(device)
     with torch.no_grad():
+        raw_embedding = speaker_model.encode_batch(mono_batch)
+        embedding = torch.nn.functional.normalize(raw_embedding, dim=2).squeeze()
+    # Persist a CPU copy so later calls can skip the encoding step.
+    torch.save(embedding.cpu(), cache_file)
+    return embedding
+# --- Hagaajinta Qoraalka (Text Normalization) ---
 number_words = {
     0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
     6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
 }
 def number_to_words(n):
+    if n < 20: return number_words.get(n, str(n))
+    if n < 100:
         tens, unit = divmod(n, 10)
+        return number_words[tens * 10] + (" iyo " + number_words[unit] if unit else "")
+    if n < 1000:
         hundreds, rem = divmod(n, 100)
+        return (number_words[hundreds] + " boqol" if hundreds > 1 else "boqol") + (" iyo " + number_to_words(rem) if rem else "")
+    if n < 1_000_000:
         th, rem = divmod(n, 1000)
+        return (number_to_words(th) + " kun") + (" iyo " + number_to_words(rem) if rem else "")
+    return str(n)
 def replace_numbers_with_words(text):
+    """Replace each digit run delimited by word boundaries in `text` with number_to_words of its integer value."""
     return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)
 def normalize_text(text):
+    """Lower-case `text`, spell out its numbers, then delete every character that is not a word character, whitespace or an apostrophe."""
     text = text.lower()
     text = replace_numbers_with_words(text)
+    # NOTE(review): punctuation is deleted, not replaced by a space, so "a,b" becomes
+    # "ab" and digit groups split by "," or "." are spelled separately and then
+    # fused -- confirm this is intended.
+    text = re.sub(r'[^\w\s\']', '', text)
     return text
+# --- Shaqada ugu Muhiimsan (TTS Function) ---
+def text_to_speech(text, voice_choice):
+    """
+    Synthesize `text` with the selected reference voice and return the .wav path.
+
+    `voice_choice` is the voice sample path chosen in the dropdown; its speaker
+    embedding is obtained through get_speaker_embedding. Returns None, after
+    showing a Gradio warning, when no voice is chosen or there is nothing to speak.
+    """
+    if not text or not voice_choice:
+        gr.Warning("Fadlan geli qoraal oo dooro cod.")
+        return None
     clean_text = normalize_text(text)
+    # Whitespace-only or punctuation-only input is non-empty but normalizes to
+    # nothing speakable; treat it like missing text instead of calling the model.
+    if not clean_text.strip():
+        gr.Warning("Fadlan geli qoraal oo dooro cod.")
+        return None
+    # Fetch the embedding of the selected voice
+    speaker_embedding = get_speaker_embedding(voice_choice)
     inputs = processor(text=clean_text, return_tensors="pt").to(device)
     with torch.no_grad():
+        waveform = model.generate_speech(inputs["input_ids"], speaker_embedding.unsqueeze(0), vocoder=vocoder)
+    # Temporarily store the generated audio under a unique file name
+    os.makedirs("/tmp/tts_outputs", exist_ok=True)
+    out_path = f"/tmp/tts_outputs/{uuid.uuid4().hex}.wav"
     sf.write(out_path, waveform.cpu().numpy(), 16000)
     return out_path
+# --- Gradio interface ---
+# It now has a field for entering the text and a field for choosing the voice.
 iface = gr.Interface(
+    fn=text_to_speech,
+    # The two inputs map, in order, to text_to_speech(text, voice_choice).
+    inputs=[
+        gr.Textbox(label="Geli qoraalka af Soomaali"),
+        gr.Dropdown(
+            choices=VOICE_SAMPLE_FILES,
+            label="Dooro Codkaaga (Select Your Voice)",
+            # Pre-select the first configured voice; no default when the list is empty.
+            value=VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None,
+            info="Dooro mid ka mid ah codadkaaga aad diyaarisay."
+        )
+    ],
+    # text_to_speech returns the path of the written .wav file, hence type="filepath".
+    outputs=gr.Audio(label="Codka La Abuuray", type="filepath"),
+    title="Soomaali Text-to-Speech (Codad Kala Duwan)",
+    description="Ku qor qoraal Soomaali ah, dooro codka aad rabto, kadibna riix 'Submit' si aad cod ugu dhageysato."
 )
+# Prepare the voices before the app is launched
+if __name__ == "__main__":
+    print("Hubinta faylasha codadka...")
+    # Start-up is aborted when the list is empty or any listed sample file is missing.
+    if not (VOICE_SAMPLE_FILES and all(map(os.path.exists, VOICE_SAMPLE_FILES))):
+        raise FileNotFoundError("Mid ka mid ah faylasha ku jira 'VOICE_SAMPLE_FILES' lama helin. Fadlan hubi magacyada iyo meesha ay ku jiraan.")
+    print("Diyaarinta codadkaaga...")
+    # Each call computes and caches the voice embedding when no cached copy exists yet.
+    for voice in VOICE_SAMPLE_FILES:
+        get_speaker_embedding(voice)
+    print("Dhammaan waa diyaar. Barnaamijku wuu furmayaa.")
+    # NOTE(review): share=True asks Gradio for a public share link; presumably
+    # unnecessary when the app is hosted as a Space -- confirm.
+    iface.launch(share=True)