Spaces:

Somalitts
/

8aad

Running

App Files Files Community

Somalitts committed on Jul 18

Commit

2e7b63f

verified ·

1 Parent(s): bd60963

Update app.py

Browse files

Files changed (1) hide show

app.py +88 -19

app.py CHANGED Viewed

@@ -10,15 +10,15 @@ import numpy as np
 # --- Configuration ---
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# --- HUBI INAAD SOO GELISAY FAYLASHAN ---
-# Faylashan waa inay ku jiraan Hugging Face Spaces, isla galka uu ku jiro "app.py"
 VOICE_SAMPLE_FILES = ["1.wav"]
-# Directory to store speaker embedding files
 EMBEDDING_DIR = "speaker_embeddings"
 os.makedirs(EMBEDDING_DIR, exist_ok=True)
-# --- Load Models ---
 try:
     print("Loading models... This may take a moment.")
     processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
@@ -38,51 +38,120 @@ speaker_embeddings_cache = {}
 def get_speaker_embedding(wav_file_path):
     if wav_file_path in speaker_embeddings_cache:
         return speaker_embeddings_cache[wav_file_path]
     embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
     if os.path.exists(embedding_path):
         embedding = torch.load(embedding_path, map_location=device)
         speaker_embeddings_cache[wav_file_path] = embedding
         return embedding
     if not os.path.exists(wav_file_path):
-        # Kani waa qaladka dhacay. Markaad faylasha soo geliso, meeshan wuu ka gudbayaa.
-        raise FileNotFoundError(f"Lama helin faylka codka: {wav_file_path}")
     try:
         audio, sr = torchaudio.load(wav_file_path)
-        if sr != 16000: audio = torchaudio.functional.resample(audio, sr, 16000)
-        if audio.shape[0] > 1: audio = torch.mean(audio, dim=0, keepdim=True)
         with torch.no_grad():
             embedding = speaker_model.encode_batch(audio.to(device))
             embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
         torch.save(embedding.cpu(), embedding_path)
         speaker_embeddings_cache[wav_file_path] = embedding.to(device)
         return embedding.to(device)
     except Exception as e:
         raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}")
-# ... (Inta kale ee koodhka way saxantahay) ...
-# --- Main Text-to-Speech Function ---
 def text_to_speech(text, voice_choice):
-    # ... (sidaadii hore) ...
-    pass # Koodhka intiisa kale halkan geli
 # --- Gradio Interface ---
 iface = gr.Interface(
-    # ... (sidaadii hore) ...
-    pass # Koodhka intiisa kale halkan geli
 )
 # --- Launch the web interface ---
 if __name__ == "__main__":
-    print("Hubinta faylasha codadka...")
     for f in VOICE_SAMPLE_FILES:
         if not os.path.exists(f):
-            # Qaladku halkan ayuu ka bilaabmayaa
-            raise FileNotFoundError(f"Mid ka mid ah faylasha lama helin: '{f}'. Fadlan hubi inaad soo gelisay Hugging Face Spaces.")
-    print("Diyaarinta astaamaha codadka...")
     for voice_file in VOICE_SAMPLE_FILES:
         get_speaker_embedding(voice_file)
-    print("Dhammaan codadka waa diyaar. Waxaa la furayaa interface-ka.")
     iface.launch(share=True)

 # --- Configuration ---
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# --- Add your voice sample files ---
+# Make sure these files have been uploaded to the Hugging Face Space
 VOICE_SAMPLE_FILES = ["1.wav"]
+# Directory where the speaker embeddings are stored
 EMBEDDING_DIR = "speaker_embeddings"
 os.makedirs(EMBEDDING_DIR, exist_ok=True)
+# --- Load Models ---
 try:
     print("Loading models... This may take a moment.")
     processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 def get_speaker_embedding(wav_file_path):
+    # Return the speaker embedding tensor for `wav_file_path`.
+    # Lookup order: the in-memory cache, then a saved .pt file in EMBEDDING_DIR,
+    # then a fresh computation from the audio file (which is saved and cached).
+    # Raises gr.Error when the audio file is missing or cannot be processed.
     if wav_file_path in speaker_embeddings_cache:
         return speaker_embeddings_cache[wav_file_path]
+    # The saved file is named after the basename only, so files with the same
+    # name in different directories map to the same embedding file.
     embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
     if os.path.exists(embedding_path):
+        print(f"Loading existing embedding for {wav_file_path}")
         embedding = torch.load(embedding_path, map_location=device)
         speaker_embeddings_cache[wav_file_path] = embedding
         return embedding
+    print(f"Creating new speaker embedding for {wav_file_path}...")
     if not os.path.exists(wav_file_path):
+        raise gr.Error(f"Audio file not found: {wav_file_path}.")
     try:
         audio, sr = torchaudio.load(wav_file_path)
+        # Bring the audio to 16 kHz before it is encoded.
+        if sr != 16000:
+            audio = torchaudio.functional.resample(audio, sr, 16000)
+        # Average over dim 0 when it has more than one entry; presumably this
+        # down-mixes multi-channel audio to mono -- TODO confirm channel layout.
+        if audio.shape[0] > 1:
+            audio = torch.mean(audio, dim=0, keepdim=True)
         with torch.no_grad():
             embedding = speaker_model.encode_batch(audio.to(device))
             embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
+        # Persist a CPU copy; the load path above maps it onto `device` again.
         torch.save(embedding.cpu(), embedding_path)
         speaker_embeddings_cache[wav_file_path] = embedding.to(device)
+        print(f"Embedding created and saved for {wav_file_path}.")
         return embedding.to(device)
     except Exception as e:
         raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}")
+# --- Text Processing Functions ---
+# Lookup table of number words: 0-19, the tens from 20 to 90, 100 and 1000.
+# number_to_words returns these entries directly and composes every other
+# value below one million from them.
+number_words = {
+    0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
+    6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
+    11: "kow iyo toban", 12: "labo iyo toban", 13: "saddex iyo toban",
+    14: "afar iyo toban", 15: "shan iyo toban", 16: "lix iyo toban",
+    17: "toddobo iyo toban", 18: "siddeed iyo toban", 19: "sagaal iyo toban",
+    20: "labaatan", 30: "soddon", 40: "afartan", 50: "konton",
+    60: "lixdan", 70: "toddobaatan", 80: "siddeetan", 90: "sagaashan",
+    100: "boqol", 1000: "kun",
+}
+def number_to_words(n):
+    # Spell out the integer `n` using the entries of number_words, joining the
+    # parts with " iyo ". Values found in number_words are returned unchanged;
+    # values of one million or more are returned as their decimal digits.
+    if n in number_words:
+        return number_words[n]
+    if n < 100:
+        tens, units = divmod(n, 10)
+        spoken = number_words[tens * 10]
+        if units:
+            spoken += " iyo " + number_words[units]
+        return spoken
+    # (exclusive upper bound, unit size, unit word) for hundreds and thousands.
+    for limit, unit, unit_word in ((1000, 100, "boqol"), (1000000, 1000, "kun")):
+        if n < limit:
+            count, remainder = divmod(n, unit)
+            # A count of one is spoken as the bare unit word.
+            if count > 1:
+                spoken = number_to_words(count) + " " + unit_word
+            else:
+                spoken = unit_word
+            if remainder:
+                spoken += " iyo " + number_to_words(remainder)
+            return spoken
+    return str(n)
+def replace_numbers_with_words(text):
+    # Replace every standalone run of digits in `text` with the words that
+    # number_to_words produces for its integer value.
+    def spell_out(match):
+        value = int(match.group())
+        return number_to_words(value)
+    return re.sub(r'\b\d+\b', spell_out, text)
+def normalize_text(text):
+    # Prepare raw input for the TTS processor: lowercase it, spell out digit
+    # runs, drop punctuation (apostrophes are kept) and tidy the whitespace.
+    # Returns the cleaned string, which is empty when nothing speakable is left.
+    text = text.lower()
+    text = replace_numbers_with_words(text)
+    # Substitute a space instead of deleting: tokens separated only by
+    # punctuation, such as "3.5" or "nabad,gelyo", must not fuse into one word.
+    text = re.sub(r'[^\w\s\']', ' ', text)
+    # Collapse the whitespace runs the substitution can leave behind and trim
+    # both ends.
+    return ' '.join(text.split())
+# --- Main Text-to-Speech Function (with quality improvements) ---
 def text_to_speech(text, voice_choice):
+    # Synthesize speech for `text` in the voice selected by `voice_choice`.
+    # Returns (16000, waveform as a NumPy array) for the interface's audio
+    # output, or None after a gr.Warning when there is nothing to synthesize.
+    # A gr.Error raised by get_speaker_embedding propagates to the caller.
+    if not text or not text.strip():
+        gr.Warning("Please enter some text.")
+        # Return one value: the interface has a single output, so a 2-tuple
+        # here would be read as (sample_rate, data) audio.
+        return None
+    if not voice_choice:
+        gr.Warning("Please select a voice from the dropdown.")
+        return None
+    normalized_text = normalize_text(text)
+    if not normalized_text.strip():
+        # Nothing is left to speak, e.g. the input was punctuation only.
+        gr.Warning("Please enter some text.")
+        return None
+    speaker_embedding = get_speaker_embedding(voice_choice)
+    inputs = processor(text=normalized_text, return_tensors="pt").to(device)
+    with torch.no_grad():
+        # NOTE(review): do_sample/top_k are token-sampling options; SpeechT5's
+        # generate presumably ignores them -- confirm against the transformers
+        # documentation before relying on them for speech quality.
+        speech = model.generate(
+            input_ids=inputs["input_ids"],
+            speaker_embeddings=speaker_embedding.unsqueeze(0),
+            do_sample=True,
+            top_k=50,
+        )
+        # Apply the vocoder separately
+        speech = vocoder(speech)
+    return (16000, speech.cpu().numpy())
 # --- Gradio Interface ---
+# Two inputs (a text box and a voice dropdown) are passed to text_to_speech;
+# its result is shown in a single audio output.
 iface = gr.Interface(
+    fn=text_to_speech,
+    inputs=[
+        gr.Textbox(label="Geli qoraalka af-Soomaaliga (Enter Somali Text)"),
+        gr.Dropdown(
+            VOICE_SAMPLE_FILES,
+            label="Select Voice",
+            info="Choose the voice you want to use for the speech.",
+            # Preselect the first voice; no preselection when the list is empty.
+            value=VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None
+        )
+    ],
+    outputs=gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy"),
+    title="Multi-Voice Somali Text-to-Speech",
+    description="Enter Somali text, choose a voice from the dropdown, and click submit to generate speech.",
+    # Each example is [text, voice]; the voice is '' when VOICE_SAMPLE_FILES
+    # is empty.
+    examples=[
+        ["Sidee tahay saaxiib? Maanta waa maalin wanaagsan.", VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else ''],
+        # Uses the second voice when there is one, otherwise the first.
+        ["Nabad gelyo, is arag dambe.", VOICE_SAMPLE_FILES[1] if len(VOICE_SAMPLE_FILES) > 1 else (VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else '')],
+    ]
 )
 # --- Launch the web interface ---
 if __name__ == "__main__":
+    # This check will run first. If it fails, the app will stop.
+    # Every file is checked before any embedding is computed.
+    print("Checking for voice files...")
     for f in VOICE_SAMPLE_FILES:
         if not os.path.exists(f):
+            raise FileNotFoundError(f"Voice file not found: '{f}'. Please upload it to your Hugging Face Space.")
+    # Load or compute each embedding up front so that later calls to
+    # get_speaker_embedding are served from speaker_embeddings_cache.
+    print("Pre-loading all voice embeddings...")
     for voice_file in VOICE_SAMPLE_FILES:
         get_speaker_embedding(voice_file)
+    print("All voices are ready. Launching interface.")
     iface.launch(share=True)