Somalitts committed
Commit e1c9728 · verified · 1 Parent(s): 2e7b63f

Update app.py

Files changed (1)
  1. app.py +43 -48
app.py CHANGED
@@ -9,18 +9,16 @@ import numpy as np
 
 # --- Configuration ---
 device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
 
 # --- KU DAR FAYLASHA CODADKAAGA ---
-# HUBI INAAD FAYLASHAN SOO GELISAY HUGGING FACE SPACES
 VOICE_SAMPLE_FILES = ["1.wav"]
-
-# Galka lagu keydinayo astaamaha codka
 EMBEDDING_DIR = "speaker_embeddings"
 os.makedirs(EMBEDDING_DIR, exist_ok=True)
 
 # --- Soo Dejinta Model-yada ---
 try:
-    print("Loading models... This may take a moment.")
+    print("Loading models...")
     processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
     model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
     vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
@@ -31,37 +29,28 @@ try:
     )
     print("Models loaded successfully.")
 except Exception as e:
-    raise gr.Error(f"Error loading models: {e}. Check your internet connection.")
+    raise gr.Error(f"Error loading models: {e}.")
 
 speaker_embeddings_cache = {}
 
 def get_speaker_embedding(wav_file_path):
     if wav_file_path in speaker_embeddings_cache:
         return speaker_embeddings_cache[wav_file_path]
-
     embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
-
     if os.path.exists(embedding_path):
-        print(f"Loading existing embedding for {wav_file_path}")
         embedding = torch.load(embedding_path, map_location=device)
         speaker_embeddings_cache[wav_file_path] = embedding
         return embedding
-
-    print(f"Creating new speaker embedding for {wav_file_path}...")
     if not os.path.exists(wav_file_path):
-        raise gr.Error(f"Audio file not found: {wav_file_path}.")
-
+        raise gr.Error(f"Audio file not found: {wav_file_path}")
     try:
+        print(f"Creating new speaker embedding for {wav_file_path}...")
         audio, sr = torchaudio.load(wav_file_path)
-        if sr != 16000:
-            audio = torchaudio.functional.resample(audio, sr, 16000)
-        if audio.shape[0] > 1:
-            audio = torch.mean(audio, dim=0, keepdim=True)
-
+        if sr != 16000: audio = torchaudio.functional.resample(audio, sr, 16000)
+        if audio.shape[0] > 1: audio = torch.mean(audio, dim=0, keepdim=True)
         with torch.no_grad():
             embedding = speaker_model.encode_batch(audio.to(device))
             embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
-
         torch.save(embedding.cpu(), embedding_path)
         speaker_embeddings_cache[wav_file_path] = embedding.to(device)
         print(f"Embedding created and saved for {wav_file_path}.")
@@ -94,31 +83,42 @@ def normalize_text(text):
     text = re.sub(r'[^\w\s\']', '', text)
     return text
 
-# --- Main Text-to-Speech Function (with quality improvements) ---
+# --- Main Text-to-Speech Function ---
 def text_to_speech(text, voice_choice):
-    if not text:
-        gr.Warning("Please enter some text.")
-        return None, None
-    if not voice_choice:
-        gr.Warning("Please select a voice from the dropdown.")
-        return None, None
+    try:
+        print(f"Received request: Text='{text}', Voice='{voice_choice}'")
+        if not text:
+            gr.Warning("Please enter some text.")
+            return None
+        if not voice_choice:
+            gr.Warning("Please select a voice.")
+            return None
+
+        print("Step 1: Getting speaker embedding...")
+        speaker_embedding = get_speaker_embedding(voice_choice)
 
-    speaker_embedding = get_speaker_embedding(voice_choice)
-    normalized_text = normalize_text(text)
-    inputs = processor(text=normalized_text, return_tensors="pt").to(device)
-
-    with torch.no_grad():
-        # Using model.generate with sampling for more natural speech
-        speech = model.generate(
-            input_ids=inputs["input_ids"],
-            speaker_embeddings=speaker_embedding.unsqueeze(0),
-            do_sample=True,
-            top_k=50,
-        )
-    # Apply the vocoder separately
-    speech = vocoder(speech)
+        print("Step 2: Normalizing text...")
+        normalized_text = normalize_text(text)
+
+        print("Step 3: Processing text with SpeechT5Processor...")
+        inputs = processor(text=normalized_text, return_tensors="pt").to(device)
 
-    return (16000, speech.cpu().numpy())
+        print("Step 4: Generating speech with model.generate()...")
+        with torch.no_grad():
+            speech = model.generate(
+                input_ids=inputs["input_ids"],
+                speaker_embeddings=speaker_embedding.unsqueeze(0),
+                do_sample=True,
+                top_k=50,
+            )
+        print("Step 5: Applying vocoder...")
+        speech = vocoder(speech)
+
+        print("Step 6: Generation complete. Returning audio.")
+        return (16000, speech.cpu().numpy())
+    except Exception as e:
+        print(f"AN ERROR OCCURRED: {e}")
+        raise gr.Error(f"An error occurred during generation: {e}")
 
 # --- Gradio Interface ---
 iface = gr.Interface(
@@ -134,22 +134,17 @@ iface = gr.Interface(
     ],
     outputs=gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy"),
     title="Multi-Voice Somali Text-to-Speech",
-    description="Enter Somali text, choose a voice from the dropdown, and click submit to generate speech.",
-    examples=[
-        ["Sidee tahay saaxiib? Maanta waa maalin wanaagsan.", VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else ''],
-        ["Nabad gelyo, is arag dambe.", VOICE_SAMPLE_FILES[1] if len(VOICE_SAMPLE_FILES) > 1 else (VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else '')],
-    ]
+    description="Enter Somali text, choose a voice from the dropdown, and click submit to generate speech."
 )
 
 # --- Launch the web interface ---
 if __name__ == "__main__":
-    # This check will run first. If it fails, the app will stop.
     print("Checking for voice files...")
     for f in VOICE_SAMPLE_FILES:
         if not os.path.exists(f):
-            raise FileNotFoundError(f"Voice file not found: '{f}'. Please upload it to your Hugging Face Space.")
+            raise FileNotFoundError(f"Voice file not found: '{f}'. Please upload it to your Space.")
 
-    print("Pre-loading all voice embeddings...")
+    print("Pre-loading all voice embeddings for faster startup...")
     for voice_file in VOICE_SAMPLE_FILES:
         get_speaker_embedding(voice_file)
     print("All voices are ready. Launching interface.")