Spaces:

Somalitts
/

8aad

Running

App Files Community

Somalitts committed on Jul 18

Commit

100e7c2

verified ·

1 Parent(s): 8204814

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -39

app.py CHANGED Viewed

@@ -11,8 +11,9 @@ import numpy as np
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
-# --- KU DAR FAYLASHA CODADKAAGA ---
-VOICE_SAMPLE_FILES = ["1.wav"]
 EMBEDDING_DIR = "speaker_embeddings"
 os.makedirs(EMBEDDING_DIR, exist_ok=True)
@@ -34,6 +35,7 @@ except Exception as e:
 speaker_embeddings_cache = {}
 def get_speaker_embedding(wav_file_path):
     if wav_file_path in speaker_embeddings_cache:
         return speaker_embeddings_cache[wav_file_path]
     embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
@@ -44,7 +46,6 @@ def get_speaker_embedding(wav_file_path):
     if not os.path.exists(wav_file_path):
         raise gr.Error(f"Audio file not found: {wav_file_path}")
     try:
-        print(f"Creating new speaker embedding for {wav_file_path}...")
         audio, sr = torchaudio.load(wav_file_path)
         if sr != 16000: audio = torchaudio.functional.resample(audio, sr, 16000)
         if audio.shape[0] > 1: audio = torch.mean(audio, dim=0, keepdim=True)
@@ -53,12 +54,11 @@ def get_speaker_embedding(wav_file_path):
             embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
         torch.save(embedding.cpu(), embedding_path)
         speaker_embeddings_cache[wav_file_path] = embedding.to(device)
-        print(f"Embedding created and saved for {wav_file_path}.")
         return embedding.to(device)
     except Exception as e:
         raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}")
-# --- Text Processing Functions ---
 number_words = {
     0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
     6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
@@ -83,35 +83,33 @@ def normalize_text(text):
     text = re.sub(r'[^\w\s\']', '', text)
     return text
-# --- Main Text-to-Speech Function ---
 def text_to_speech(text, voice_choice):
-    try:
-        if not text:
-            gr.Warning("Please enter some text.")
-            return None
-        if not voice_choice:
-            gr.Warning("Please select a voice.")
-            return None
-        speaker_embedding = get_speaker_embedding(voice_choice)
-        normalized_text = normalize_text(text)
-        inputs = processor(text=normalized_text, return_tensors="pt").to(device)
-        with torch.no_grad():
-            speech = model.generate(
-                input_ids=inputs["input_ids"],
-                speaker_embeddings=speaker_embedding.unsqueeze(0),
-                do_sample=True,
-                top_k=50,
-            )
-            speech = vocoder(speech)
-        return (16000, speech.cpu().numpy())
-    except Exception as e:
-        print(f"AN ERROR OCCURRED: {e}")
-        raise gr.Error(f"An error occurred during generation: {e}")
-# --- Gradio Interface ---
 iface = gr.Interface(
     fn=text_to_speech,
     inputs=[
@@ -119,25 +117,22 @@ iface = gr.Interface(
         gr.Dropdown(
             VOICE_SAMPLE_FILES,
             label="Select Voice",
-            info="Choose the voice you want to use for the speech.",
             value=VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None
         )
     ],
     outputs=gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy"),
     title="Multi-Voice Somali Text-to-Speech",
-    description="Enter Somali text, choose a voice from the dropdown, and click submit to generate speech."
 )
-# --- Launch the web interface ---
 if __name__ == "__main__":
-    print("Checking for voice files...")
-    for f in VOICE_SAMPLE_FILES:
-        if not os.path.exists(f):
-            raise FileNotFoundError(f"Voice file not found: '{f}'. Please upload it to your Space.")
-    print("Pre-loading all voice embeddings...")
     for voice_file in VOICE_SAMPLE_FILES:
         get_speaker_embedding(voice_file)
-    print("All voices are ready. Launching interface.")
     iface.launch(share=True)

 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
+# --- KU DAR FAYLKA CODADKAAGA ---
+# TAYADA CODADKAN AYAA UGU MUHIMSAN NATIIJADA
+VOICE_SAMPLE_FILES = ["1.wav"] # Hubi in faylkan tayadiisu fiican tahay
 EMBEDDING_DIR = "speaker_embeddings"
 os.makedirs(EMBEDDING_DIR, exist_ok=True)
 speaker_embeddings_cache = {}
 def get_speaker_embedding(wav_file_path):
+    # Shaqadan sidii hore ayay u egtahay
     if wav_file_path in speaker_embeddings_cache:
         return speaker_embeddings_cache[wav_file_path]
     embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
     if not os.path.exists(wav_file_path):
         raise gr.Error(f"Audio file not found: {wav_file_path}")
     try:
         audio, sr = torchaudio.load(wav_file_path)
         if sr != 16000: audio = torchaudio.functional.resample(audio, sr, 16000)
         if audio.shape[0] > 1: audio = torch.mean(audio, dim=0, keepdim=True)
             embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
         torch.save(embedding.cpu(), embedding_path)
         speaker_embeddings_cache[wav_file_path] = embedding.to(device)
         return embedding.to(device)
     except Exception as e:
         raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}")
+# --- Text Processing Functions (sidoodii) ---
 number_words = {
     0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
     6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
     text = re.sub(r'[^\w\s\']', '', text)
     return text
+# --- Main Text-to-Speech Function (oo la Hagaajiyay) ---
 def text_to_speech(text, voice_choice):
+    if not text or not voice_choice:
+        gr.Warning("Fadlan geli qoraal oo dooro cod.")
+        return None
+    speaker_embedding = get_speaker_embedding(voice_choice)
+    normalized_text = normalize_text(text)
+    inputs = processor(text=normalized_text, return_tensors="pt").to(device)
+    with torch.no_grad():
+        speech = model.generate(
+            input_ids=inputs["input_ids"],
+            speaker_embeddings=speaker_embedding.unsqueeze(0),
+            # --- Halbeegyada Tayada Codka ---
+            do_sample=True,          # MUHIIM: Waxay ka dhigaysaa codka mid dabiici ah
+            top_k=50,                # Waxay xaddidaysaa hal-abuurka si uusan u qaldamin
+            temperature=0.75,        # Kani wuxuu xakameeyaa hal-abuurka. (0.7 - 0.8 waa fiican yahay)
+            repetition_penalty=1.2,  # Waxay ka hortagtaa inuu ku celceliyo isku dhawaaq
+            max_new_tokens=512       # Waxay siinaysaa model-ka meel ku filan oo uu ku dhameystiro hadalka
+        )
+        speech = vocoder(speech)
+    return (16000, speech.cpu().numpy())
+# --- Gradio Interface (sidii hore) ---
 iface = gr.Interface(
     fn=text_to_speech,
     inputs=[
         gr.Dropdown(
             VOICE_SAMPLE_FILES,
             label="Select Voice",
+            info="Dooro codka aad rabto inaad isticmaasho.",
             value=VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None
         )
     ],
     outputs=gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy"),
     title="Multi-Voice Somali Text-to-Speech",
+    description="Geli qoraal Soomaali ah, dooro cod, kadibna riix 'Submit' si aad u abuurto hadal."
 )
 if __name__ == "__main__":
+    if not all(os.path.exists(f) for f in VOICE_SAMPLE_FILES):
+        raise FileNotFoundError("Fadlan hubi inaad faylasha codka soo gelisay Space-ka.")
+    print("Diyaarinta codadka...")
     for voice_file in VOICE_SAMPLE_FILES:
         get_speaker_embedding(voice_file)
+    print("Dhammaan waa diyaar. Barnaamijku wuu furmayaa.")
     iface.launch(share=True)