Spaces:

Somalitts
/

8aad

Running

File size: 6,587 Bytes

43ec985
b7217f7
 
43ec985
 
c872044
 
e365862
43ec985
e365862
2342a7b
b7217f7
e1c9728
b7217f7
6ce5da6
43ec985
 
2342a7b
5a3bbd1
2342a7b
e1c9728
43ec985
 
 
2342a7b
 
 
43ec985
2342a7b
43ec985
2342a7b
e1c9728
2342a7b
43ec985
2342a7b
43ec985
 
 
 
2342a7b
43ec985
 
 
2342a7b
e1c9728
43ec985
 
1229011
 
 
 
43ec985
 
 
 
 
 
 
 
 
c872044
2e7b63f
 
 
 
 
 
 
 
 
 
c872044
2e7b63f
4aa5331
 
 
 
 
 
 
 
 
 
 
 
 
2e7b63f
c872044
2e7b63f
 
c872044
2e7b63f
 
 
f685632
2e7b63f
43ec985
6ce5da6
 
 
 
 
 
 
 
5a3bbd1
6ce5da6
2342a7b
100e7c2
 
 
2e7b63f
100e7c2
6ce5da6
100e7c2
c872044
 
6ce5da6
 
 
c872044
df3f293
6ce5da6
 
df3f293
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ce5da6
 
df3f293
 
c872044
 
 
 
 
b7217f7
2e7b63f
 
f685632
2e7b63f
 
c872044
2e7b63f
 
 
c872044
2e7b63f
c872044
b7217f7
 
c872044
2342a7b
100e7c2
c872044
1229011
100e7c2
43ec985
 
100e7c2
1229011

import gradio as gr
import torch
import torchaudio
import re
import os
import numpy as np
import scipy.io.wavfile
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier

# --- Configuration ---
# Run on the GPU when torch reports one is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Reference recordings offered as selectable voices in the UI dropdown.
VOICE_SAMPLE_FILES = ["1.wav"]

# Directory holding speaker embeddings saved as .pt files; created up front
# so later saves do not have to check for it.
EMBEDDING_DIR = "speaker_embeddings"
os.makedirs(EMBEDDING_DIR, exist_ok=True)

# --- Load models ---
# Everything below runs once at module import. The resulting module-level
# objects (processor, model, vocoder, speaker_model) are read by the functions
# further down. Any failure while loading is re-raised as a gr.Error that
# carries the original exception's message.
try:
    print("Loading models...")
    # Text processor taken from the "microsoft/speecht5_tts" checkpoint.
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    # TTS model weights come from the "Somalitts/8aad" checkpoint.
    model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
    # Vocoder applied to the TTS model's output in text_to_speech.
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
    # Speaker encoder used by get_speaker_embedding (encode_batch).
    # savedir points at pretrained_models/spkrec-xvect-voxceleb.
    speaker_model = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-xvect-voxceleb",
        run_opts={"device": device},
        savedir=os.path.join("pretrained_models", "spkrec-xvect-voxceleb")
    )
    print("Models loaded successfully.")
except Exception as e:
    raise gr.Error(f"Error loading models: {e}.")

# In-memory cache: wav file path -> speaker embedding.
speaker_embeddings_cache = {}


def get_speaker_embedding(wav_file_path):
    """Return the speaker embedding for the recording at *wav_file_path*.

    Lookup order:
      1. the in-memory ``speaker_embeddings_cache``;
      2. a previously saved ``<basename>.pt`` file inside ``EMBEDDING_DIR``;
      3. a fresh computation from the audio file, which is then saved to
         ``EMBEDDING_DIR`` and cached.

    Raises:
        gr.Error: if the audio file does not exist, or if loading/encoding it
            fails for any reason.
    """
    # 1. Already resolved during this process.
    try:
        return speaker_embeddings_cache[wav_file_path]
    except KeyError:
        pass

    # 2. Saved by an earlier run; keyed by the file's base name only.
    saved_name = f"{os.path.basename(wav_file_path)}.pt"
    embedding_path = os.path.join(EMBEDDING_DIR, saved_name)
    if os.path.exists(embedding_path):
        stored = torch.load(embedding_path, map_location=device)
        speaker_embeddings_cache[wav_file_path] = stored
        return stored

    # 3. Compute from the recording itself.
    if not os.path.exists(wav_file_path):
        raise gr.Error(f"Audio file not found: {wav_file_path}")

    try:
        waveform, sample_rate = torchaudio.load(wav_file_path)
        if sample_rate != 16000:
            waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
        if waveform.shape[0] > 1:
            # More than one row along dim 0: average them into a single row.
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        with torch.no_grad():
            encoded = speaker_model.encode_batch(waveform.to(device))
            vector = torch.nn.functional.normalize(encoded, dim=2).squeeze()

        # The saved copy is stored on the CPU; the cached copy lives on `device`.
        torch.save(vector.cpu(), embedding_path)
        on_device = vector.to(device)
        speaker_embeddings_cache[wav_file_path] = on_device
        return on_device
    except Exception as e:
        raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}")

# --- Number words dictionary and functions ---
# Words for every value that is looked up directly rather than composed.
number_words = {
    # 0-10
    0: "eber",
    1: "kow",
    2: "labo",
    3: "saddex",
    4: "afar",
    5: "shan",
    6: "lix",
    7: "toddobo",
    8: "siddeed",
    9: "sagaal",
    10: "toban",
    # 11-19
    11: "kow iyo toban",
    12: "labo iyo toban",
    13: "saddex iyo toban",
    14: "afar iyo toban",
    15: "shan iyo toban",
    16: "lix iyo toban",
    17: "toddobo iyo toban",
    18: "siddeed iyo toban",
    19: "sagaal iyo toban",
    # tens
    20: "labaatan",
    30: "soddon",
    40: "afartan",
    50: "konton",
    60: "lixdan",
    70: "toddobaatan",
    80: "siddeetan",
    90: "sagaashan",
    # scale words
    100: "boqol",
    1000: "kun",
}


def number_to_words(n):
    """Spell out the integer *n* using the words in ``number_words``.

    Values present in ``number_words`` are returned as-is. Other values below
    one billion are composed from a leading part and a remainder joined with
    " iyo ". A leading count of exactly one is not spoken ("boqol", "kun",
    "milyan" stand alone). Values of one billion or more are returned as their
    decimal digit string.
    """
    exact = number_words.get(n)
    if exact is not None:
        return exact

    if n < 100:
        tens, ones = divmod(n, 10)
        words = number_words[tens * 10]
        if ones:
            words += " iyo " + number_words[ones]
        return words

    # (unit size, exclusive upper bound handled with this unit, unit word)
    scales = (
        (100, 1000, "boqol"),
        (1000, 1_000_000, "kun"),
        (1_000_000, 1_000_000_000, "milyan"),
    )
    for unit, limit, unit_word in scales:
        if n < limit:
            count, rest = divmod(n, unit)
            words = unit_word
            if count > 1:
                words = number_to_words(count) + " " + unit_word
            if rest:
                words += " iyo " + number_to_words(rest)
            return words

    return str(n)

def replace_numbers_with_words(text):
    """Return *text* with each standalone run of digits spelled out in words.

    A run of digits counts only when it sits between word boundaries, so
    digits glued to letters (e.g. "abc123") are left unchanged.
    """
    def _spell_out(match):
        return number_to_words(int(match.group()))

    digit_run = re.compile(r'\b\d+\b')
    return digit_run.sub(_spell_out, text)

def normalize_text(text):
    """Prepare *text* for the TTS processor.

    Steps, in order: lowercase the text, spell out standalone numbers, then
    delete every character that is not a word character, whitespace, or an
    apostrophe.
    """
    lowered = text.lower()
    spelled_out = replace_numbers_with_words(lowered)
    return re.sub(r'[^\w\s\']', '', spelled_out)

# --- Split long text into chunks by word count ---
def split_long_text_into_chunks(text, max_words=18):
    """Split *text* on whitespace into chunks of at most *max_words* words.

    Each chunk is its words re-joined with single spaces. Empty or
    whitespace-only text yields an empty list.
    """
    tokens = text.split()
    return [
        ' '.join(tokens[start:start + max_words])
        for start in range(0, len(tokens), max_words)
    ]

# --- Main TTS function ---
def text_to_speech(text, voice_choice):
    """Synthesize speech for *text* with the voice at *voice_choice*.

    The text is split into chunks of at most 18 words. Each chunk is
    normalized, run through the TTS model and the vocoder, and the resulting
    waveforms are joined with 0.8 s of silence between consecutive chunks.

    Args:
        text: Text to speak.
        voice_choice: Path of the reference recording whose speaker embedding
            is passed to the model.

    Returns:
        ``(16000, waveform)`` where ``waveform`` is a 1-D NumPy array, or
        ``None`` (after showing a warning) when there is nothing to speak:
        no voice selected, or text that is empty, whitespace-only, or left
        empty by normalization.

    Raises:
        gr.Error: propagated from ``get_speaker_embedding``.
    """
    sample_rate = 16000
    pause_seconds = 0.8

    # Whitespace-only text is treated like empty text. Previously it produced
    # no chunks and np.concatenate raised ValueError on the empty list.
    if not text or not text.strip() or not voice_choice:
        gr.Warning("Fadlan geli qoraal oo dooro cod.")
        return None

    speaker_embedding = get_speaker_embedding(voice_choice)

    spoken = []
    for chunk in split_long_text_into_chunks(text):
        norm_chunk = normalize_text(chunk)
        # A chunk made only of punctuation normalizes to nothing; skip it
        # instead of asking the model to speak an empty string.
        if not norm_chunk.strip():
            continue

        inputs = processor(text=norm_chunk, return_tensors="pt").to(device)

        with torch.no_grad():
            # NOTE(review): the sampling arguments below (do_sample, top_k,
            # temperature, repetition_penalty, max_new_tokens) may be ignored
            # by SpeechT5's generate(); confirm against the transformers
            # documentation before tuning them.
            speech = model.generate(
                input_ids=inputs["input_ids"],
                speaker_embeddings=speaker_embedding.unsqueeze(0),
                do_sample=True,
                top_k=50,
                temperature=0.75,
                repetition_penalty=1.2,
                max_new_tokens=512
            )
            audio = vocoder(speech).cpu().squeeze().numpy()

        spoken.append(audio)

    if not spoken:
        gr.Warning("Fadlan geli qoraal oo dooro cod.")
        return None

    # Silence uses the audio's own dtype so the result dtype does not depend
    # on how many chunks there were. It is inserted only between two spoken
    # chunks, never after the last one.
    pause = np.zeros(int(sample_rate * pause_seconds), dtype=spoken[0].dtype)
    pieces = []
    for position, audio in enumerate(spoken):
        if position:
            pieces.append(pause)
        pieces.append(audio)

    return (sample_rate, np.concatenate(pieces))

# --- Gradio Interface ---
# Pre-select the first configured voice, or nothing when the list is empty.
if VOICE_SAMPLE_FILES:
    _default_voice = VOICE_SAMPLE_FILES[0]
else:
    _default_voice = None

# Components are created in the same order as they appear in the UI:
# text box, voice selector, then the audio output.
_text_input = gr.Textbox(
    label="Geli qoraalka af-Soomaaliga (Enter Somali Text)",
    lines=7,
    placeholder="Qoraalka geli halkan...",
)
_voice_input = gr.Dropdown(
    VOICE_SAMPLE_FILES,
    label="Dooro Codka (Select Voice)",
    value=_default_voice,
)
_audio_output = gr.Audio(label="Codka La Abuuray (Generated Audio)", type="numpy")

# text_to_speech receives (text, voice) and returns (sample_rate, waveform).
iface = gr.Interface(
    fn=text_to_speech,
    inputs=[_text_input, _voice_input],
    outputs=_audio_output,
    title="Multi-Voice Somali Text-to-Speech",
    description="Geli qoraal Soomaali ah, dooro cod, kadib riix 'Submit' si aad u abuurto hadal.",
)

# --- Launch App ---
if __name__ == "__main__":
    # Refuse to start if any configured voice sample is missing on disk.
    for sample_path in VOICE_SAMPLE_FILES:
        if not os.path.exists(sample_path):
            raise FileNotFoundError("Fadlan hubi inaad faylasha codka ku dartay.")

    # Resolve every voice's embedding before the UI opens so the first
    # request does not have to compute it.
    print("Diyaarinta codadka...")
    for sample_path in VOICE_SAMPLE_FILES:
        get_speaker_embedding(sample_path)
    print("Dhammaan waa diyaar. Barnaamijku wuu furmayaa.")

    iface.launch(share=True)