Spaces:

Somalitts
/

8aad

Running

App Files Community

Somalitts committed on Jul 18

Commit

e1c9728

verified ·

1 Parent(s): 2e7b63f

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -48

app.py CHANGED Viewed

@@ -9,18 +9,16 @@ import numpy as np
 # --- Configuration ---
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # --- KU DAR FAYLASHA CODADKAAGA ---
-# HUBI INAAD FAYLASHAN SOO GELISAY HUGGING FACE SPACES
 VOICE_SAMPLE_FILES = ["1.wav"]
-# Galka lagu keydinayo astaamaha codka
 EMBEDDING_DIR = "speaker_embeddings"
 os.makedirs(EMBEDDING_DIR, exist_ok=True)
 # --- Soo Dejinta Model-yada ---
 try:
-    print("Loading models... This may take a moment.")
     processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
     model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
     vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
@@ -31,37 +29,28 @@ try:
     )
     print("Models loaded successfully.")
 except Exception as e:
-    raise gr.Error(f"Error loading models: {e}. Check your internet connection.")
 speaker_embeddings_cache = {}
 def get_speaker_embedding(wav_file_path):
     if wav_file_path in speaker_embeddings_cache:
         return speaker_embeddings_cache[wav_file_path]
     embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
     if os.path.exists(embedding_path):
-        print(f"Loading existing embedding for {wav_file_path}")
         embedding = torch.load(embedding_path, map_location=device)
         speaker_embeddings_cache[wav_file_path] = embedding
         return embedding
-    print(f"Creating new speaker embedding for {wav_file_path}...")
     if not os.path.exists(wav_file_path):
-        raise gr.Error(f"Audio file not found: {wav_file_path}.")
     try:
         audio, sr = torchaudio.load(wav_file_path)
-        if sr != 16000:
-            audio = torchaudio.functional.resample(audio, sr, 16000)
-        if audio.shape[0] > 1:
-            audio = torch.mean(audio, dim=0, keepdim=True)
         with torch.no_grad():
             embedding = speaker_model.encode_batch(audio.to(device))
             embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
         torch.save(embedding.cpu(), embedding_path)
         speaker_embeddings_cache[wav_file_path] = embedding.to(device)
         print(f"Embedding created and saved for {wav_file_path}.")
@@ -94,31 +83,42 @@ def normalize_text(text):
     text = re.sub(r'[^\w\s\']', '', text)
     return text
-# --- Main Text-to-Speech Function (with quality improvements) ---
 def text_to_speech(text, voice_choice):
-    if not text:
-        gr.Warning("Please enter some text.")
-        return None, None
-    if not voice_choice:
-        gr.Warning("Please select a voice from the dropdown.")
-        return None, None
-    speaker_embedding = get_speaker_embedding(voice_choice)
-    normalized_text = normalize_text(text)
-    inputs = processor(text=normalized_text, return_tensors="pt").to(device)
-    with torch.no_grad():
-        # Using model.generate with sampling for more natural speech
-        speech = model.generate(
-            input_ids=inputs["input_ids"],
-            speaker_embeddings=speaker_embedding.unsqueeze(0),
-            do_sample=True,
-            top_k=50,
-        )
-        # Apply the vocoder separately
-        speech = vocoder(speech)
-    return (16000, speech.cpu().numpy())
 # --- Gradio Interface ---
 iface = gr.Interface(
@@ -134,22 +134,17 @@ iface = gr.Interface(
     ],
     outputs=gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy"),
     title="Multi-Voice Somali Text-to-Speech",
-    description="Enter Somali text, choose a voice from the dropdown, and click submit to generate speech.",
-    examples=[
-        ["Sidee tahay saaxiib? Maanta waa maalin wanaagsan.", VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else ''],
-        ["Nabad gelyo, is arag dambe.", VOICE_SAMPLE_FILES[1] if len(VOICE_SAMPLE_FILES) > 1 else (VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else '')],
-    ]
 )
 # --- Launch the web interface ---
 if __name__ == "__main__":
-    # This check will run first. If it fails, the app will stop.
     print("Checking for voice files...")
     for f in VOICE_SAMPLE_FILES:
         if not os.path.exists(f):
-            raise FileNotFoundError(f"Voice file not found: '{f}'. Please upload it to your Hugging Face Space.")
-    print("Pre-loading all voice embeddings...")
     for voice_file in VOICE_SAMPLE_FILES:
         get_speaker_embedding(voice_file)
     print("All voices are ready. Launching interface.")

 # --- Configuration ---
 device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
 # --- KU DAR FAYLASHA CODADKAAGA ---
 VOICE_SAMPLE_FILES = ["1.wav"]
 EMBEDDING_DIR = "speaker_embeddings"
 os.makedirs(EMBEDDING_DIR, exist_ok=True)
 # --- Soo Dejinta Model-yada ---
 try:
+    print("Loading models...")
     processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
     model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
     vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
     )
     print("Models loaded successfully.")
 except Exception as e:
+    raise gr.Error(f"Error loading models: {e}.")
 speaker_embeddings_cache = {}
 def get_speaker_embedding(wav_file_path):
     if wav_file_path in speaker_embeddings_cache:
         return speaker_embeddings_cache[wav_file_path]
     embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
     if os.path.exists(embedding_path):
         embedding = torch.load(embedding_path, map_location=device)
         speaker_embeddings_cache[wav_file_path] = embedding
         return embedding
     if not os.path.exists(wav_file_path):
+        raise gr.Error(f"Audio file not found: {wav_file_path}")
     try:
+        print(f"Creating new speaker embedding for {wav_file_path}...")
         audio, sr = torchaudio.load(wav_file_path)
+        if sr != 16000: audio = torchaudio.functional.resample(audio, sr, 16000)
+        if audio.shape[0] > 1: audio = torch.mean(audio, dim=0, keepdim=True)
         with torch.no_grad():
             embedding = speaker_model.encode_batch(audio.to(device))
             embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
         torch.save(embedding.cpu(), embedding_path)
         speaker_embeddings_cache[wav_file_path] = embedding.to(device)
         print(f"Embedding created and saved for {wav_file_path}.")
     text = re.sub(r'[^\w\s\']', '', text)
     return text
+# --- Main Text-to-Speech Function ---
 def text_to_speech(text, voice_choice):
+    try:
+        print(f"Received request: Text='{text}', Voice='{voice_choice}'")
+        if not text:
+            gr.Warning("Please enter some text.")
+            return None
+        if not voice_choice:
+            gr.Warning("Please select a voice.")
+            return None
+        print("Step 1: Getting speaker embedding...")
+        speaker_embedding = get_speaker_embedding(voice_choice)
+        print("Step 2: Normalizing text...")
+        normalized_text = normalize_text(text)
+        print("Step 3: Processing text with SpeechT5Processor...")
+        inputs = processor(text=normalized_text, return_tensors="pt").to(device)
+        print("Step 4: Generating speech with model.generate()...")
+        with torch.no_grad():
+            speech = model.generate(
+                input_ids=inputs["input_ids"],
+                speaker_embeddings=speaker_embedding.unsqueeze(0),
+                do_sample=True,
+                top_k=50,
+            )
+            print("Step 5: Applying vocoder...")
+            speech = vocoder(speech)
+        print("Step 6: Generation complete. Returning audio.")
+        return (16000, speech.cpu().numpy())
+    except Exception as e:
+        print(f"AN ERROR OCCURRED: {e}")
+        raise gr.Error(f"An error occurred during generation: {e}")
 # --- Gradio Interface ---
 iface = gr.Interface(
     ],
     outputs=gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy"),
     title="Multi-Voice Somali Text-to-Speech",
+    description="Enter Somali text, choose a voice from the dropdown, and click submit to generate speech."
 )
 # --- Launch the web interface ---
 if __name__ == "__main__":
     print("Checking for voice files...")
     for f in VOICE_SAMPLE_FILES:
         if not os.path.exists(f):
+            raise FileNotFoundError(f"Voice file not found: '{f}'. Please upload it to your Space.")
+    print("Pre-loading all voice embeddings for faster startup...")
     for voice_file in VOICE_SAMPLE_FILES:
         get_speaker_embedding(voice_file)
     print("All voices are ready. Launching interface.")