Spaces:

Somalitts
/

8aad

Running

App Files Files Community

Somalitts committed on Jul 13

Commit

61cbc18

verified ·

1 Parent(s): 25352e2

Update app.py

Browse files

Files changed (1) hide show

app.py +159 -64

app.py CHANGED Viewed

@@ -3,95 +3,190 @@ import torch
 import torchaudio
 import re
 import os
-from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-from speechbrain.pretrained import EncoderClassifier
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# Load models
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 speaker_model = EncoderClassifier.from_hparams(
     source="speechbrain/spkrec-xvect-voxceleb",
     run_opts={"device": device},
-    savedir="./spk_model"
 )
-# Speaker embedding
 EMB_PATH = "speaker_embedding.pt"
 if os.path.exists(EMB_PATH):
     speaker_embedding = torch.load(EMB_PATH).to(device)
 else:
-    audio, sr = torchaudio.load("1.wav")
-    audio = torchaudio.functional.resample(audio, sr, 16000).mean(dim=0).unsqueeze(0).to(device)
-    with torch.no_grad():
-        emb = speaker_model.encode_batch(audio)
-        emb = torch.nn.functional.normalize(emb, dim=2).squeeze()
-    torch.save(emb.cpu(), EMB_PATH)
-    speaker_embedding = emb
-# Number conversion (Somali)
-number_words = {
-    0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
-    6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
-    11: "toban iyo koow", 12: "toban iyo labo", 13: "toban iyo seddex",
-    14: "toban iyo afar", 15: "toban iyo shan", 16: "toban iyo lix",
-    17: "toban iyo todobo", 18: "toban iyo sideed", 19: "toban iyo sagaal",
-    20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
-    60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
-    100: "boqol", 1000: "kun",
-}
-def number_to_words(number):
-    if number < 20:
-        return number_words[number]
-    elif number < 100:
-        tens, unit = divmod(number, 10)
-        return number_words[tens * 10] + (" " + number_words[unit] if unit else "")
-    elif number < 1000:
-        hundreds, remainder = divmod(number, 100)
-        return (number_words[hundreds] + " boqol" if hundreds > 1 else "BOQOL") + (" " + number_to_words(remainder) if remainder else "")
-    elif number < 1000000:
-        thousands, remainder = divmod(number, 1000)
-        return (number_to_words(thousands) + " kun" if thousands > 1 else "KUN") + (" " + number_to_words(remainder) if remainder else "")
-    elif number < 1000000000:
-        millions, remainder = divmod(number, 1000000)
-        return number_to_words(millions) + " malyan" + (" " + number_to_words(remainder) if remainder else "")
-    elif number < 1000000000000:
-        billions, remainder = divmod(number, 1000000000)
-        return number_to_words(billions) + " milyaar" + (" " + number_to_words(remainder) if remainder else "")
-    else:
-        return str(number)
-def replace_numbers_with_words(text):
-    def replace(match):
-        number = int(match.group())
-        return number_to_words(number)
-    return re.sub(r'\b\d+\b', replace, text)
 def normalize_text(text):
     text = text.lower()
-    text = replace_numbers_with_words(text)
-    text = re.sub(r'[^\w\s]', '', text)
     return text
-# TTS function
 def text_to_speech(text):
-    text = normalize_text(text)
-    inputs = processor(text=text, return_tensors="pt").to(device)
     with torch.no_grad():
-        speech = model.generate_speech(inputs["input_ids"], speaker_embedding.unsqueeze(0), vocoder=vocoder)
-    return (16000, speech.cpu().numpy())
-# Gradio Interface
 iface = gr.Interface(
     fn=text_to_speech,
-    inputs=gr.Textbox(label="Geli qoraalka af-soomaali"),
-    outputs=gr.Audio(label="Codka la abuuray", type="numpy"),
-    title="Somali TTS",
-    description="TTS Soomaaliyeed oo la adeegsaday cod gaar ah (11.wav)"
 )
-iface.launch()

import os
import re

import numpy as np
import soundfile as sf
import torchaudio
from pydub import AudioSegment, effects
from speechbrain.pretrained import EncoderClassifier
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# --- Model Loading ---
print("Loading models, this may take a moment...")

# Run on the GPU when CUDA is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# SpeechT5 text processor, the TTS model from the "Somalitts/8aad" checkpoint,
# and the HiFi-GAN vocoder, all pulled from the Hugging Face Hub.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad")
model = model.to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(device)

# SpeechBrain speaker encoder; it is used further down to turn a reference
# recording into the speaker embedding that conditions the TTS model.
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    run_opts={"device": device},
    savedir=os.path.join("models", "spk_model"),  # keep downloads under ./models
)
print("Models loaded successfully.")
# --- Speaker Embedding Generation ---
# This section creates the voice identity used to condition the TTS model.
def create_speaker_embedding(audio_path):
    """
    Build a speaker embedding from a reference WAV recording.

    The recording is normalized with ``pydub.effects.normalize``, written to
    ``normalized_speaker.wav`` in the working directory, reloaded with
    torchaudio, downmixed to mono, resampled to 16 kHz when needed, and then
    encoded with the module-level ``speaker_model``.

    Args:
        audio_path: Path to the reference WAV file.

    Returns:
        The embedding tensor, L2-normalized along dim 2 and squeezed. It stays
        on ``device``; saving it to disk is left to the caller.
    """
    print("Creating speaker embedding...")

    # 1. Pre-process the audio: level-normalize and write a working copy.
    print(f"Normalizing audio file: {audio_path}")
    raw_audio = AudioSegment.from_wav(audio_path)
    normalized_audio = effects.normalize(raw_audio)
    normalized_audio_path = "normalized_speaker.wav"
    normalized_audio.export(normalized_audio_path, format="wav")

    # 2. Load the working copy; torchaudio returns (channels, samples).
    waveform, sr = torchaudio.load(normalized_audio_path)

    # encode_batch treats dim 0 as the batch axis, so a multi-channel file
    # would yield one embedding per channel. Average the channels into a
    # single mono track so exactly one embedding comes out.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # The rest of the pipeline runs at 16 kHz.
    if sr != 16000:
        waveform = torchaudio.functional.resample(waveform, sr, 16000)

    with torch.no_grad():
        embedding = speaker_model.encode_batch(waveform.to(device))
        # Normalize the embedding itself for model compatibility
        embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()

    print("Speaker embedding created and cached.")
    return embedding
# Path to the speaker audio and the cached embedding
SPEAKER_WAV = "1.wav"
EMB_PATH = "speaker_embedding.pt"

# Reuse the cached embedding when present; otherwise build it from the
# reference recording and cache it for faster startups next time.
if os.path.exists(EMB_PATH):
    print("Loading cached speaker embedding.")
    speaker_embedding = torch.load(EMB_PATH).to(device)
else:
    # The reference recording is only needed on a cache miss, so only
    # insist on it here instead of failing startup unconditionally.
    if not os.path.exists(SPEAKER_WAV):
        raise FileNotFoundError(f"Error: Speaker audio file not found at '{SPEAKER_WAV}'. Please create this file.")
    speaker_embedding = create_speaker_embedding(SPEAKER_WAV)
    # Store a CPU copy so the cache loads on machines without a GPU.
    torch.save(speaker_embedding.cpu(), EMB_PATH)
# --- Text Normalization (Somali) ---
# Lookup tables for spelling out numbers; index 0 of the units and tens
# tables is a placeholder so a digit can be used directly as the index.
_SOMALI_UNITS = ("", "koow", "labo", "saddex", "afar", "shan", "lix", "toddobo", "siddeed", "sagaal")
_SOMALI_TEENS = (
    "toban", "kow iyo toban", "laba iyo toban", "saddex iyo toban", "afar iyo toban",
    "shan iyo toban", "lix iyo toban", "toddobo iyo toban", "siddeed iyo toban", "sagaal iyo toban",
)
_SOMALI_TENS = ("", "toban", "labaatan", "soddon", "afartan", "konton", "lixdan", "toddobaatan", "siddeetan", "sagaashan")


def _somali_below_million(num):
    """Spell out an integer in the range 1..999_999."""
    if num < 10:
        return _SOMALI_UNITS[num]
    if num < 20:
        return _SOMALI_TEENS[num - 10]
    if num < 100:
        ten, unit = divmod(num, 10)
        words = _SOMALI_TENS[ten]
        return f"{words} iyo {_SOMALI_UNITS[unit]}" if unit else words

    # Hundreds and thousands share one shape: "<count> <scale> [iyo <rest>]".
    if num < 1000:
        count, rest = divmod(num, 100)
        scale = "boqol"
    else:
        count, rest = divmod(num, 1000)
        scale = "kun"

    # A count of one is spoken as the bare scale word ("boqol", "kun"),
    # with no leading count word and no stray leading space.
    words = scale if count == 1 else f"{_somali_below_million(count)} {scale}"
    if rest:
        words = f"{words} iyo {_somali_below_million(rest)}"
    return words


def number_to_somali_words(num_str):
    """
    Convert a string of digits into Somali words.

    Args:
        num_str: The number as a string accepted by ``int()``.

    Returns:
        The spelled-out number for values from 0 to 999,999; the fixed phrase
        "eber ka yar" for negative values; and ``num_str`` unchanged for
        values of one million or more.

    Raises:
        ValueError: If ``num_str`` cannot be parsed by ``int()``.
    """
    num = int(num_str)
    if num < 0:
        return "eber ka yar"  # Handle negative case
    if num == 0:
        return "eber"
    if num >= 1_000_000:
        return num_str  # Fallback for very large numbers
    return _somali_below_million(num)
 def normalize_text(text):
+    """Cleans and normalizes Somali text for TTS."""
     text = text.lower()
+    # Convert numbers to words using a regex substitution
+    text = re.sub(r"\d+", lambda m: number_to_somali_words(m.group(0)), text)
+    # Remove special characters except for basic punctuation that might indicate pauses
+    text = re.sub(r'[^\w\s,\.]', '', text)
+    text = text.strip()
     return text
# --- Core TTS Function ---
def text_to_speech(text):
    """
    Generate speech audio for the given text.

    The text is normalized with ``normalize_text``, synthesized with the
    module-level SpeechT5 ``model`` and ``vocoder`` using
    ``speaker_embedding``, converted to 16-bit PCM, and level-normalized with
    ``pydub.effects.normalize``.

    Args:
        text: The text to speak. ``None`` is treated as an empty string.

    Returns:
        A ``(sample_rate, samples)`` tuple for ``gr.Audio(type="numpy")``:
        the sample rate is 16000 and ``samples`` is a 1-D int16 array. When
        nothing is left after normalization, one second of silence is
        returned instead.
    """
    sample_rate = 16000
    print(f"Received text: {text}")

    # 1. Normalize the input text
    normalized_text = normalize_text(text or "")
    if not normalized_text:
        print("Warning: Text is empty after normalization.")
        # Return silence if there's no text to process
        return (sample_rate, np.zeros(sample_rate, dtype=np.int16))
    print(f"Normalized text: {normalized_text}")

    # 2. Process text and generate speech
    inputs = processor(text=normalized_text, return_tensors="pt").to(device)
    with torch.no_grad():
        speech_tensor = model.generate_speech(
            inputs["input_ids"],
            speaker_embedding.unsqueeze(0),
            vocoder=vocoder
        )
    speech_numpy = speech_tensor.cpu().numpy()

    # 3. Post-process the audio
    print("Post-processing generated audio...")
    # pydub interprets the raw buffer as integer PCM of `sample_width` bytes.
    # The vocoder output is floating point, so handing its bytes over as-is
    # would be misread as 32-bit integers and come out as noise. Scale to
    # 16-bit PCM first (clipping guards against overshoot beyond [-1, 1]).
    pcm16 = (np.clip(speech_numpy, -1.0, 1.0) * 32767.0).astype(np.int16)
    audio_segment = AudioSegment(
        pcm16.tobytes(),
        frame_rate=sample_rate,
        sample_width=pcm16.dtype.itemsize,
        channels=1
    )
    # Bring the overall level up to a consistent loudness
    processed_audio = effects.normalize(audio_segment)

    # Convert back to numpy array for Gradio output
    processed_numpy = np.array(processed_audio.get_array_of_samples(), dtype=np.int16)
    print("Speech generation complete.")
    return (sample_rate, processed_numpy)
# --- Gradio Web Interface ---
# One text box in, one audio player out, wired to text_to_speech.
iface = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(
        label="Qoraalka Geli (Enter Somali Text)",
        placeholder="Ku soo dhawaada barnaamijka codka ee Soomaaliyeed..."
    ),
    outputs=gr.Audio(
        label="Codka La Soo Saaray (Generated Audio)",
        type="numpy"
    ),
    title="🤖 Somali Text-to-Speech (Tayada Sare)",
    # Adjacent string literals are concatenated verbatim, so each sentence
    # must carry its own trailing space to avoid running into the next one.
    description=(
        "Ku qor qoraal af-Soomaali ah si aad ugu beddesho cod dabiici ah oo aad moodo mid dad. "
        "Codkan waxaa lagu sameeyay iyadoo la isticmaalayo faylka codka ee `1.wav`."
        "\n\n(Enter Somali text to convert it to natural, human-like speech. "
        "This voice was created using the `1.wav` audio file.)"
    ),
    examples=[
        ["Sidee tahay saaxiib? Maanta waa maalin qurux badan."],
        ["Qiimaha badeecadani waa 2500 oo shilin."],
        ["Barnaamijkan waxaa sameeyay sirdoon macmal ah."],
    ]
)

if __name__ == "__main__":
    iface.launch(share=True)  # Set share=True to get a public link