Spaces:

Somalitts
/

8aad

Running

App Files Files Community

Somalitts committed on Jul 20

Commit

1229011

verified ·

1 Parent(s): 4aa5331

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -28

app.py CHANGED Viewed

@@ -12,8 +12,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 # --- KU DAR FAYLKA CODADKAAGA ---
-# TAYADA CODADKAN AYAA UGU MUHIMSAN NATIIJADA
-VOICE_SAMPLE_FILES = ["1.wav"] # Hubi in faylkan tayadiisu fiican tahay
 EMBEDDING_DIR = "speaker_embeddings"
 os.makedirs(EMBEDDING_DIR, exist_ok=True)
@@ -35,7 +34,6 @@ except Exception as e:
 speaker_embeddings_cache = {}
 def get_speaker_embedding(wav_file_path):
-    # Shaqadan sidii hore ayay u egtahay
     if wav_file_path in speaker_embeddings_cache:
         return speaker_embeddings_cache[wav_file_path]
     embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
@@ -47,8 +45,10 @@ def get_speaker_embedding(wav_file_path):
         raise gr.Error(f"Audio file not found: {wav_file_path}")
     try:
         audio, sr = torchaudio.load(wav_file_path)
-        if sr != 16000: audio = torchaudio.functional.resample(audio, sr, 16000)
-        if audio.shape[0] > 1: audio = torch.mean(audio, dim=0, keepdim=True)
         with torch.no_grad():
             embedding = speaker_model.encode_batch(audio.to(device))
             embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
@@ -58,7 +58,7 @@ def get_speaker_embedding(wav_file_path):
     except Exception as e:
         raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}")
-# --- Text Processing Functions (sidoodii) ---
 number_words = {
     0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
     6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
@@ -69,6 +69,7 @@ number_words = {
     60: "lixdan", 70: "toddobaatan", 80: "siddeetan", 90: "sagaashan",
     100: "boqol", 1000: "kun",
 }
 def number_to_words(n):
     if n in number_words:
         return number_words[n]
@@ -87,13 +88,14 @@ def number_to_words(n):
 def replace_numbers_with_words(text):
     return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)
 def normalize_text(text):
     text = text.lower()
     text = replace_numbers_with_words(text)
-    text = re.sub(r'[^\w\s\']', '', text)
     return text
-# --- Main Text-to-Speech Function (oo la Hagaajiyay) ---
 def text_to_speech(text, voice_choice):
     if not text or not voice_choice:
         gr.Warning("Fadlan geli qoraal oo dooro cod.")
@@ -101,25 +103,36 @@ def text_to_speech(text, voice_choice):
     speaker_embedding = get_speaker_embedding(voice_choice)
     normalized_text = normalize_text(text)
-    inputs = processor(text=normalized_text, return_tensors="pt").to(device)
-    with torch.no_grad():
-        speech = model.generate(
-            input_ids=inputs["input_ids"],
-            speaker_embeddings=speaker_embedding.unsqueeze(0),
-            # --- Halbeegyada Tayada Codka ---
-            do_sample=True,          # MUHIIM: Waxay ka dhigaysaa codka mid dabiici ah
-            top_k=50,                # Waxay xaddidaysaa hal-abuurka si uusan u qaldamin
-            temperature=0.75,        # Kani wuxuu xakameeyaa hal-abuurka. (0.7 - 0.8 waa fiican yahay)
-            repetition_penalty=1.2,  # Waxay ka hortagtaa inuu ku celceliyo isku dhawaaq
-            max_new_tokens=512       # Waxay siinaysaa model-ka meel ku filan oo uu ku dhameystiro hadalka
-        )
-        speech = vocoder(speech)
-    return (16000, speech.cpu().numpy())
-# --- Gradio Interface (sidii hore) ---
 iface = gr.Interface(
     fn=text_to_speech,
     inputs=[
@@ -136,13 +149,14 @@ iface = gr.Interface(
     description="Geli qoraal Soomaali ah, dooro cod, kadibna riix 'Submit' si aad u abuurto hadal."
 )
 if __name__ == "__main__":
     if not all(os.path.exists(f) for f in VOICE_SAMPLE_FILES):
         raise FileNotFoundError("Fadlan hubi inaad faylasha codka soo gelisay Space-ka.")
     print("Diyaarinta codadka...")
     for voice_file in VOICE_SAMPLE_FILES:
         get_speaker_embedding(voice_file)
     print("Dhammaan waa diyaar. Barnaamijku wuu furmayaa.")
-    iface.launch(share=True)

 print(f"Using device: {device}")
 # --- KU DAR FAYLKA CODADKAAGA ---
+VOICE_SAMPLE_FILES = ["1.wav"]
 EMBEDDING_DIR = "speaker_embeddings"
 os.makedirs(EMBEDDING_DIR, exist_ok=True)
 speaker_embeddings_cache = {}
 def get_speaker_embedding(wav_file_path):
     if wav_file_path in speaker_embeddings_cache:
         return speaker_embeddings_cache[wav_file_path]
     embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
         raise gr.Error(f"Audio file not found: {wav_file_path}")
     try:
         audio, sr = torchaudio.load(wav_file_path)
+        if sr != 16000:
+            audio = torchaudio.functional.resample(audio, sr, 16000)
+        if audio.shape[0] > 1:
+            audio = torch.mean(audio, dim=0, keepdim=True)
         with torch.no_grad():
             embedding = speaker_model.encode_batch(audio.to(device))
             embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
     except Exception as e:
         raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}")
+# --- Number Handling Functions ---
 number_words = {
     0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
     6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
     60: "lixdan", 70: "toddobaatan", 80: "siddeetan", 90: "sagaashan",
     100: "boqol", 1000: "kun",
 }
 def number_to_words(n):
     if n in number_words:
         return number_words[n]
 def replace_numbers_with_words(text):
     return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)
 def normalize_text(text):
     text = text.lower()
     text = replace_numbers_with_words(text)
+    text = re.sub(r'[^\w\s\'.!?]', '', text)  # Ha tirtirin calaamadaha muhiimka ah
     return text
+# --- Main TTS Function with Pause ---
 def text_to_speech(text, voice_choice):
     if not text or not voice_choice:
         gr.Warning("Fadlan geli qoraal oo dooro cod.")
     speaker_embedding = get_speaker_embedding(voice_choice)
     normalized_text = normalize_text(text)
+    # Kala qaybi jumladaha
+    lines = re.split(r'(?<=[.!?])\s+', normalized_text.strip())
+    full_audio = []
+    for line in lines:
+        if not line.strip():
+            continue
+        inputs = processor(text=line, return_tensors="pt").to(device)
+        with torch.no_grad():
+            speech = model.generate(
+                input_ids=inputs["input_ids"],
+                speaker_embeddings=speaker_embedding.unsqueeze(0),
+                do_sample=True,
+                top_k=50,
+                temperature=0.75,
+                repetition_penalty=1.2,
+                max_new_tokens=512
+            )
+            audio_chunk = vocoder(speech).cpu().numpy()
+            full_audio.append(audio_chunk)
+        # Nasasho 0.5 ilbiriqsi u dhaxeysa
+        pause = np.zeros((1, 16000 // 2), dtype=np.float32)
+        full_audio.append(pause)
+    return (16000, np.concatenate(full_audio, axis=-1))
+# --- Gradio Interface ---
 iface = gr.Interface(
     fn=text_to_speech,
     inputs=[
     description="Geli qoraal Soomaali ah, dooro cod, kadibna riix 'Submit' si aad u abuurto hadal."
 )
+# --- Launch ---
 if __name__ == "__main__":
     if not all(os.path.exists(f) for f in VOICE_SAMPLE_FILES):
         raise FileNotFoundError("Fadlan hubi inaad faylasha codka soo gelisay Space-ka.")
     print("Diyaarinta codadka...")
     for voice_file in VOICE_SAMPLE_FILES:
         get_speaker_embedding(voice_file)
     print("Dhammaan waa diyaar. Barnaamijku wuu furmayaa.")
+    iface.launch(share=True)