File size: 6,642 Bytes
43ec985 b7217f7 43ec985 c872044 e365862 43ec985 e365862 2342a7b b7217f7 e1c9728 b7217f7 c872044 43ec985 2342a7b 5a3bbd1 2342a7b e1c9728 43ec985 2342a7b 43ec985 2342a7b 43ec985 2342a7b e1c9728 2342a7b 43ec985 2342a7b 43ec985 2342a7b 43ec985 2342a7b e1c9728 43ec985 1229011 43ec985 c872044 2e7b63f c872044 2e7b63f 4aa5331 2e7b63f c872044 2e7b63f c872044 2e7b63f f685632 2e7b63f 43ec985 c872044 5a3bbd1 c872044 5a3bbd1 c872044 df3f293 2342a7b 100e7c2 2e7b63f 100e7c2 c872044 df3f293 c872044 df3f293 c872044 b7217f7 2e7b63f f685632 2e7b63f c872044 2e7b63f c872044 2e7b63f c872044 b7217f7 c872044 2342a7b 100e7c2 c872044 1229011 100e7c2 43ec985 100e7c2 1229011 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
import gradio as gr
import torch
import torchaudio
import re
import os
import numpy as np
import scipy.io.wavfile
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
# --- Configuration ---
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Reference recordings offered as selectable voices (a single sample voice here).
VOICE_SAMPLE_FILES = ["1.wav"]
# Directory holding the speaker embeddings saved to disk as ``.pt`` files.
EMBEDDING_DIR = "speaker_embeddings"
os.makedirs(EMBEDDING_DIR, exist_ok=True)

# --- Load models ---
# All four models are loaded once at import time and shared by every request.
try:
    print("Loading models...")
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
    speaker_model = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-xvect-voxceleb",
        run_opts={"device": device},
        savedir=os.path.join("pretrained_models", "spkrec-xvect-voxceleb"),
    )
    print("Models loaded successfully.")
except Exception as e:
    # Chain explicitly so the original failure stays attached to the traceback.
    raise gr.Error(f"Error loading models: {e}.") from e

# In-memory cache mapping a wav file path to its speaker embedding tensor.
speaker_embeddings_cache = {}
def get_speaker_embedding(wav_file_path):
    """Return the speaker embedding for ``wav_file_path``, computing it on first use.

    Lookup order:
      1. the in-memory ``speaker_embeddings_cache`` (keyed by the path as given);
      2. ``<EMBEDDING_DIR>/<basename>.pt`` on disk;
      3. a fresh computation with ``speaker_model``, which is then saved to
         disk and stored in the in-memory cache.

    Args:
        wav_file_path: Path to the reference recording.

    Returns:
        The embedding tensor, placed on ``device``.

    Raises:
        gr.Error: If the audio file does not exist or cannot be processed.
    """
    cached = speaker_embeddings_cache.get(wav_file_path)
    if cached is not None:
        return cached

    # NOTE: the on-disk name uses only the basename, so two recordings with the
    # same file name in different directories share one cached embedding.
    embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
    if os.path.exists(embedding_path):
        # NOTE(review): torch.load unpickles the file. These files are expected
        # to be the ones written below; do not point EMBEDDING_DIR at untrusted data.
        embedding = torch.load(embedding_path, map_location=device)
        speaker_embeddings_cache[wav_file_path] = embedding
        return embedding

    if not os.path.exists(wav_file_path):
        raise gr.Error(f"Audio file not found: {wav_file_path}")

    try:
        audio, sr = torchaudio.load(wav_file_path)
        # Resample to 16 kHz when the recording uses another rate.
        if sr != 16000:
            audio = torchaudio.functional.resample(audio, sr, 16000)
        # Down-mix multi-channel audio to mono by averaging the channels.
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)
        with torch.no_grad():
            embedding = speaker_model.encode_batch(audio.to(device))
            embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
        # Persist a CPU copy so the file can be loaded on any device later.
        torch.save(embedding.cpu(), embedding_path)
        # Move once and reuse the same tensor for both the cache and the return value.
        embedding = embedding.to(device)
    except Exception as e:
        raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}") from e

    speaker_embeddings_cache[wav_file_path] = embedding
    return embedding
# --- Number words dictionary and functions ---
# Somali words for the numbers that cannot be composed from smaller parts.
number_words = {
    0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
    6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
    11: "kow iyo toban", 12: "labo iyo toban", 13: "saddex iyo toban",
    14: "afar iyo toban", 15: "shan iyo toban", 16: "lix iyo toban",
    17: "toddobo iyo toban", 18: "siddeed iyo toban", 19: "sagaal iyo toban",
    20: "labaatan", 30: "soddon", 40: "afartan", 50: "konton",
    60: "lixdan", 70: "toddobaatan", 80: "siddeetan", 90: "sagaashan",
    100: "boqol", 1000: "kun",
}

# Matches a comma-grouped integer ("1,000", "2,500,000") or a plain run of digits.
# The grouped form is tried first so "1,000" is read as one number, not as "1" and "000".
_NUMBER_PATTERN = re.compile(r'\b\d{1,3}(?:,\d{3})+\b|\b\d+\b')

# Scale words from largest to smallest, used to peel off the leading group.
_SCALES = ((1_000_000, "milyan"), (1000, "kun"), (100, "boqol"))


def number_to_words(n):
    """Spell out the integer ``n`` using the words in ``number_words``.

    Supports 0 up to 999,999,999. Values outside that range (including
    negative numbers) are returned unchanged as their decimal string.

    Args:
        n: The integer to convert.

    Returns:
        The spelled-out number, or ``str(n)`` when ``n`` is out of range.
    """
    if n in number_words:
        return number_words[n]
    # Out-of-range values fall back to digits instead of raising KeyError.
    if n < 0 or n >= 1_000_000_000:
        return str(n)
    if n < 100:
        tens, units = divmod(n, 10)
        words = number_words[tens * 10]
        return f"{words} iyo {number_words[units]}" if units else words

    for scale, unit in _SCALES:
        if n >= scale:
            count, rest = divmod(n, scale)
            # A count of one is expressed by the bare unit word ("kun", not "kow kun").
            head = unit if count == 1 else f"{number_to_words(count)} {unit}"
            return f"{head} iyo {number_to_words(rest)}" if rest else head

    return str(n)


def replace_numbers_with_words(text):
    """Replace every standalone integer in ``text`` with its spelled-out form.

    Comma-grouped integers such as ``"1,000"`` are treated as a single number.

    Args:
        text: The text to process.

    Returns:
        ``text`` with each matched number replaced via ``number_to_words``.
    """
    return _NUMBER_PATTERN.sub(
        lambda m: number_to_words(int(m.group().replace(",", ""))),
        text,
    )
def normalize_text(text):
    """Prepare raw input text for the TTS processor.

    Steps, in order: lower-case the text, spell out numbers with
    ``replace_numbers_with_words``, then delete every character that is not
    a word character, whitespace, or an apostrophe.

    Note that punctuation is deleted rather than replaced by a space, so
    ``"a,b"`` becomes ``"ab"``.
    """
    lowered = text.lower()
    spelled_out = replace_numbers_with_words(lowered)
    return re.sub(r'[^\w\s\']', '', spelled_out)
# --- Helper to split text into sentences ---
def split_into_sentences(text):
    """Split ``text`` after '.', '!' or '?' and return the non-empty sentences.

    The split happens on the whitespace that follows a sentence-ending
    mark, so the mark stays attached to its sentence. Each piece is
    stripped, and pieces that are empty after stripping are dropped.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text)
    sentences = []
    for piece in pieces:
        trimmed = piece.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences
# --- Main TTS function with pause after each new line only ---
def text_to_speech(text, voice_choice):
    """Synthesize ``text`` line by line with the selected voice.

    The input is split on newlines. Each line is normalized with
    ``normalize_text`` and synthesized separately, and 0.8 seconds of
    silence is inserted between consecutive synthesized lines. Lines that
    are blank, or that normalize to nothing (for example punctuation only),
    are skipped.

    Args:
        text: The text to speak; newlines mark where pauses go.
        voice_choice: Path of the reference recording, passed to
            ``get_speaker_embedding``.

    Returns:
        ``(16000, samples)`` where ``samples`` is a 1-D NumPy array, or
        ``None`` (after a UI warning) when there is nothing to synthesize.
    """
    sample_rate = 16000
    pause_seconds = 0.8

    # Whitespace-only text is treated as missing input: it would otherwise
    # produce no audio chunks and make np.concatenate raise ValueError.
    if not text or not text.strip() or not voice_choice:
        gr.Warning("Fadlan geli qoraal oo dooro cod.")
        return None

    normalized_lines = (normalize_text(line.strip()) for line in text.strip().split("\n"))
    speakable_lines = [line for line in normalized_lines if line.strip()]
    if not speakable_lines:
        gr.Warning("Fadlan geli qoraal oo dooro cod.")
        return None

    # The embedding is the same for every line, so add the batch dimension once.
    speaker_embedding = get_speaker_embedding(voice_choice).unsqueeze(0)

    audio_chunks = []
    for line in speakable_lines:
        inputs = processor(text=line, return_tensors="pt").to(device)
        with torch.no_grad():
            # NOTE(review): the sampling-style keyword arguments below are kept
            # from the original call; confirm against the SpeechT5 `generate`
            # documentation whether they have any effect.
            speech = model.generate(
                input_ids=inputs["input_ids"],
                speaker_embeddings=speaker_embedding,
                do_sample=True,
                top_k=50,
                temperature=0.75,
                repetition_penalty=1.2,
                max_new_tokens=512,
            )
            audio = vocoder(speech).cpu().squeeze().numpy()

        # Silence goes between synthesized lines only, never after the last one.
        # It uses the audio's dtype so concatenation does not upcast the output.
        if audio_chunks:
            audio_chunks.append(np.zeros(int(sample_rate * pause_seconds), dtype=audio.dtype))
        audio_chunks.append(audio)

    return (sample_rate, np.concatenate(audio_chunks))
# --- Gradio Interface ---
# One text box and one voice selector go in; the generated audio comes out.
iface = gr.Interface(
    text_to_speech,
    [
        gr.Textbox(
            lines=7,
            label="Geli qoraalka af-Soomaaliga (Enter Somali Text)",
            placeholder="Qoraalka geli halkan...",
        ),
        gr.Dropdown(
            choices=VOICE_SAMPLE_FILES,
            # Pre-select the first voice when at least one is configured.
            value=next(iter(VOICE_SAMPLE_FILES), None),
            label="Dooro Codka (Select Voice)",
        ),
    ],
    gr.Audio(type="numpy", label="Codka La Abuuray (Generated Audio)"),
    title="Multi-Voice Somali Text-to-Speech",
    description="Geli qoraal Soomaali ah, dooro cod, kadib riix 'Submit' si aad u abuurto hadal.",
)
# --- Launch App ---
if __name__ == "__main__":
    # Refuse to start when any configured voice recording is missing.
    if any(not os.path.exists(sample_path) for sample_path in VOICE_SAMPLE_FILES):
        raise FileNotFoundError("Fadlan hubi inaad faylasha codka ku dartay.")

    print("Diyaarinta codadka...")
    # Compute (or load) every speaker embedding before the UI opens.
    for sample_path in VOICE_SAMPLE_FILES:
        get_speaker_embedding(sample_path)
    print("Dhammaan waa diyaar. Barnaamijku wuu furmayaa.")

    iface.launch(share=True)
|