Somalitts committed on
Commit
e365862
·
verified ·
1 Parent(s): 61cbc18

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -43
app.py CHANGED
@@ -1,3 +1,12 @@
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import torch
3
  import torchaudio
@@ -7,6 +16,12 @@ import numpy as np
7
  import soundfile as sf
8
  from pydub import AudioSegment, effects
9
 
 
 
 
 
 
 
10
  # --- Model Loading ---
11
  print("Loading models, this may take a moment...")
12
 
@@ -20,7 +35,6 @@ model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
20
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
21
 
22
  # Load the speaker encoder model from SpeechBrain
23
- # This model creates the voice profile (embedding) from an audio sample.
24
  speaker_model = EncoderClassifier.from_hparams(
25
  source="speechbrain/spkrec-xvect-voxceleb",
26
  run_opts={"device": device},
@@ -30,60 +44,48 @@ print("Models loaded successfully.")
30
 
31
 
32
  # --- Speaker Embedding Generation ---
33
- # This section creates the unique voice identity for the TTS.
34
-
35
  def create_speaker_embedding(audio_path):
36
  """
37
  Normalizes the input audio and creates a high-quality speaker embedding.
38
  """
39
  print("Creating speaker embedding...")
40
- # 1. Pre-process the audio for better quality
41
- print(f"Normalizing audio file: {audio_path}")
42
  raw_audio = AudioSegment.from_wav(audio_path)
43
  normalized_audio = effects.normalize(raw_audio)
44
-
45
- # pydub works with milliseconds
46
  normalized_audio_path = "normalized_speaker.wav"
47
  normalized_audio.export(normalized_audio_path, format="wav")
48
 
49
- # 2. Generate the embedding
50
  waveform, sr = torchaudio.load(normalized_audio_path)
51
- # Resample if necessary and move to the correct device
52
  if sr != 16000:
53
  waveform = torchaudio.functional.resample(waveform, sr, 16000)
54
 
55
  with torch.no_grad():
56
  embedding = speaker_model.encode_batch(waveform.to(device))
57
- # Normalize the embedding itself for model compatibility
58
  embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
59
 
60
  print("Speaker embedding created and cached.")
61
  return embedding
62
 
63
- # Path to the speaker audio and the cached embedding
64
  SPEAKER_WAV = "1.wav"
65
  EMB_PATH = "speaker_embedding.pt"
66
 
 
67
  if not os.path.exists(SPEAKER_WAV):
68
- raise FileNotFoundError(f"Error: Speaker audio file not found at '{SPEAKER_WAV}'. Please create this file.")
 
 
69
 
70
- # Create and cache the embedding if it doesn't exist
71
  if os.path.exists(EMB_PATH):
72
  print("Loading cached speaker embedding.")
73
  speaker_embedding = torch.load(EMB_PATH).to(device)
74
  else:
75
  speaker_embedding = create_speaker_embedding(SPEAKER_WAV)
76
- # Cache the embedding for faster startups next time
77
  torch.save(speaker_embedding.cpu(), EMB_PATH)
78
 
79
 
80
  # --- Text Normalization (Somali) ---
81
- # This function cleans the text and converts numbers to words.
82
-
83
  def number_to_somali_words(num_str):
84
- """Converts a string of digits into Somali words."""
85
  num = int(num_str)
86
- if num < 0: return "eber ka yar" # Handle negative case
87
 
88
  units = ["", "koow", "labo", "saddex", "afar", "shan", "lix", "toddobo", "siddeed", "sagaal"]
89
  teens = ["toban", "kow iyo toban", "laba iyo toban", "saddex iyo toban", "afar iyo toban", "shan iyo toban", "lix iyo toban", "toddobo iyo toban", "siddeed iyo toban", "sagaal iyo toban"]
@@ -101,36 +103,22 @@ def number_to_somali_words(num_str):
101
  if num < 1000000:
102
  thousand, rest = divmod(num, 1000)
103
  return number_to_somali_words(str(thousand)) + " kun" + ((" iyo " + number_to_somali_words(str(rest))) if rest != 0 else "")
104
- return num_str # Fallback for very large numbers
105
 
106
  def normalize_text(text):
107
- """Cleans and normalizes Somali text for TTS."""
108
  text = text.lower()
109
- # Convert numbers to words using a regex substitution
110
  text = re.sub(r"\d+", lambda m: number_to_somali_words(m.group(0)), text)
111
- # Remove special characters except for basic punctuation that might indicate pauses
112
  text = re.sub(r'[^\w\s,\.]', '', text)
113
  text = text.strip()
114
  return text
115
 
116
 
117
  # --- Core TTS Function ---
118
-
119
  def text_to_speech(text):
120
- """
121
- Generates speech from text, including pre- and post-processing steps.
122
- """
123
- print(f"Received text: {text}")
124
- # 1. Normalize the input text
125
  normalized_text = normalize_text(text)
126
  if not normalized_text:
127
- print("Warning: Text is empty after normalization.")
128
- # Return silence if there's no text to process
129
  return (16000, np.zeros(16000).astype(np.int16))
130
 
131
- print(f"Normalized text: {normalized_text}")
132
-
133
- # 2. Process text and generate speech
134
  inputs = processor(text=normalized_text, return_tensors="pt").to(device)
135
  with torch.no_grad():
136
  speech_tensor = model.generate_speech(
@@ -141,10 +129,6 @@ def text_to_speech(text):
141
 
142
  speech_numpy = speech_tensor.cpu().numpy()
143
 
144
- # 3. Post-process the audio to make it sound more human
145
- print("Post-processing generated audio...")
146
- # Convert numpy array to a pydub AudioSegment
147
- # Ensure numpy array is in the correct format (16-bit PCM)
148
  audio_segment = AudioSegment(
149
  speech_numpy.tobytes(),
150
  frame_rate=16000,
@@ -152,18 +136,13 @@ def text_to_speech(text):
152
  channels=1
153
  )
154
 
155
- # Apply normalization - this is a key step for better quality
156
  processed_audio = effects.normalize(audio_segment)
157
-
158
- # Convert back to numpy array for Gradio output
159
  processed_numpy = np.array(processed_audio.get_array_of_samples())
160
 
161
- print("Speech generation complete.")
162
  return (16000, processed_numpy)
163
 
164
 
165
  # --- Gradio Web Interface ---
166
-
167
  iface = gr.Interface(
168
  fn=text_to_speech,
169
  inputs=gr.Textbox(
@@ -189,4 +168,4 @@ iface = gr.Interface(
189
  )
190
 
191
  if __name__ == "__main__":
192
- iface.launch(share=True) # Set share=True to get a public link
 
1
+ # ==============================================================================
2
+ # Enhanced Somali Text-to-Speech (Corrected)
3
+ # ==============================================================================
4
+ # This script builds a Gradio web interface for Somali TTS.
5
+ #
6
+ # FIX: Added the necessary import from the 'transformers' library to resolve
7
+ # the NameError.
8
+ # ==============================================================================
9
+
10
  import gradio as gr
11
  import torch
12
  import torchaudio
 
16
  import soundfile as sf
17
  from pydub import AudioSegment, effects
18
 
19
+ # --- FIX IS HERE ---
20
+ # Import the required classes from the transformers library
21
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
22
+ from speechbrain.pretrained import EncoderClassifier
23
+
24
+
25
  # --- Model Loading ---
26
  print("Loading models, this may take a moment...")
27
 
 
35
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
36
 
37
  # Load the speaker encoder model from SpeechBrain
 
38
  speaker_model = EncoderClassifier.from_hparams(
39
  source="speechbrain/spkrec-xvect-voxceleb",
40
  run_opts={"device": device},
 
44
 
45
 
46
  # --- Speaker Embedding Generation ---
 
 
47
  def create_speaker_embedding(audio_path):
48
  """
49
  Normalizes the input audio and creates a high-quality speaker embedding.
50
  """
51
  print("Creating speaker embedding...")
 
 
52
  raw_audio = AudioSegment.from_wav(audio_path)
53
  normalized_audio = effects.normalize(raw_audio)
 
 
54
  normalized_audio_path = "normalized_speaker.wav"
55
  normalized_audio.export(normalized_audio_path, format="wav")
56
 
 
57
  waveform, sr = torchaudio.load(normalized_audio_path)
 
58
  if sr != 16000:
59
  waveform = torchaudio.functional.resample(waveform, sr, 16000)
60
 
61
  with torch.no_grad():
62
  embedding = speaker_model.encode_batch(waveform.to(device))
 
63
  embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
64
 
65
  print("Speaker embedding created and cached.")
66
  return embedding
67
 
 
68
  SPEAKER_WAV = "1.wav"
69
  EMB_PATH = "speaker_embedding.pt"
70
 
71
+ # Create a dummy 1.wav if it doesn't exist for the Space to build
72
  if not os.path.exists(SPEAKER_WAV):
73
+ print(f"Warning: Speaker file '{SPEAKER_WAV}' not found. Creating a dummy silent file.")
74
+ dummy_audio = np.zeros(16000 * 2) # 2 seconds of silence
75
+ sf.write(SPEAKER_WAV, dummy_audio, 16000)
76
 
 
77
  if os.path.exists(EMB_PATH):
78
  print("Loading cached speaker embedding.")
79
  speaker_embedding = torch.load(EMB_PATH).to(device)
80
  else:
81
  speaker_embedding = create_speaker_embedding(SPEAKER_WAV)
 
82
  torch.save(speaker_embedding.cpu(), EMB_PATH)
83
 
84
 
85
  # --- Text Normalization (Somali) ---
 
 
86
  def number_to_somali_words(num_str):
 
87
  num = int(num_str)
88
+ if num < 0: return "eber ka yar"
89
 
90
  units = ["", "koow", "labo", "saddex", "afar", "shan", "lix", "toddobo", "siddeed", "sagaal"]
91
  teens = ["toban", "kow iyo toban", "laba iyo toban", "saddex iyo toban", "afar iyo toban", "shan iyo toban", "lix iyo toban", "toddobo iyo toban", "siddeed iyo toban", "sagaal iyo toban"]
 
103
  if num < 1000000:
104
  thousand, rest = divmod(num, 1000)
105
  return number_to_somali_words(str(thousand)) + " kun" + ((" iyo " + number_to_somali_words(str(rest))) if rest != 0 else "")
106
+ return num_str
107
 
108
  def normalize_text(text):
 
109
  text = text.lower()
 
110
  text = re.sub(r"\d+", lambda m: number_to_somali_words(m.group(0)), text)
 
111
  text = re.sub(r'[^\w\s,\.]', '', text)
112
  text = text.strip()
113
  return text
114
 
115
 
116
  # --- Core TTS Function ---
 
117
  def text_to_speech(text):
 
 
 
 
 
118
  normalized_text = normalize_text(text)
119
  if not normalized_text:
 
 
120
  return (16000, np.zeros(16000).astype(np.int16))
121
 
 
 
 
122
  inputs = processor(text=normalized_text, return_tensors="pt").to(device)
123
  with torch.no_grad():
124
  speech_tensor = model.generate_speech(
 
129
 
130
  speech_numpy = speech_tensor.cpu().numpy()
131
 
 
 
 
 
132
  audio_segment = AudioSegment(
133
  speech_numpy.tobytes(),
134
  frame_rate=16000,
 
136
  channels=1
137
  )
138
 
 
139
  processed_audio = effects.normalize(audio_segment)
 
 
140
  processed_numpy = np.array(processed_audio.get_array_of_samples())
141
 
 
142
  return (16000, processed_numpy)
143
 
144
 
145
  # --- Gradio Web Interface ---
 
146
  iface = gr.Interface(
147
  fn=text_to_speech,
148
  inputs=gr.Textbox(
 
168
  )
169
 
170
  if __name__ == "__main__":
171
+ iface.launch() # share=True is not needed inside Hugging Face Spaces