Somalitts committed on
Commit
1229011
·
verified ·
1 Parent(s): 4aa5331

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -28
app.py CHANGED
@@ -12,8 +12,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
12
  print(f"Using device: {device}")
13
 
14
  # --- KU DAR FAYLKA CODADKAAGA ---
15
- # TAYADA CODADKAN AYAA UGU MUHIMSAN NATIIJADA
16
- VOICE_SAMPLE_FILES = ["1.wav"] # Hubi in faylkan tayadiisu fiican tahay
17
  EMBEDDING_DIR = "speaker_embeddings"
18
  os.makedirs(EMBEDDING_DIR, exist_ok=True)
19
 
@@ -35,7 +34,6 @@ except Exception as e:
35
  speaker_embeddings_cache = {}
36
 
37
  def get_speaker_embedding(wav_file_path):
38
- # Shaqadan sidii hore ayay u egtahay
39
  if wav_file_path in speaker_embeddings_cache:
40
  return speaker_embeddings_cache[wav_file_path]
41
  embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
@@ -47,8 +45,10 @@ def get_speaker_embedding(wav_file_path):
47
  raise gr.Error(f"Audio file not found: {wav_file_path}")
48
  try:
49
  audio, sr = torchaudio.load(wav_file_path)
50
- if sr != 16000: audio = torchaudio.functional.resample(audio, sr, 16000)
51
- if audio.shape[0] > 1: audio = torch.mean(audio, dim=0, keepdim=True)
 
 
52
  with torch.no_grad():
53
  embedding = speaker_model.encode_batch(audio.to(device))
54
  embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
@@ -58,7 +58,7 @@ def get_speaker_embedding(wav_file_path):
58
  except Exception as e:
59
  raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}")
60
 
61
- # --- Text Processing Functions (sidoodii) ---
62
  number_words = {
63
  0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
64
  6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
@@ -69,6 +69,7 @@ number_words = {
69
  60: "lixdan", 70: "toddobaatan", 80: "siddeetan", 90: "sagaashan",
70
  100: "boqol", 1000: "kun",
71
  }
 
72
  def number_to_words(n):
73
  if n in number_words:
74
  return number_words[n]
@@ -87,13 +88,14 @@ def number_to_words(n):
87
 
88
  def replace_numbers_with_words(text):
89
  return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)
 
90
  def normalize_text(text):
91
  text = text.lower()
92
  text = replace_numbers_with_words(text)
93
- text = re.sub(r'[^\w\s\']', '', text)
94
  return text
95
 
96
- # --- Main Text-to-Speech Function (oo la Hagaajiyay) ---
97
  def text_to_speech(text, voice_choice):
98
  if not text or not voice_choice:
99
  gr.Warning("Fadlan geli qoraal oo dooro cod.")
@@ -101,25 +103,36 @@ def text_to_speech(text, voice_choice):
101
 
102
  speaker_embedding = get_speaker_embedding(voice_choice)
103
  normalized_text = normalize_text(text)
104
- inputs = processor(text=normalized_text, return_tensors="pt").to(device)
105
-
106
- with torch.no_grad():
107
- speech = model.generate(
108
- input_ids=inputs["input_ids"],
109
- speaker_embeddings=speaker_embedding.unsqueeze(0),
110
-
111
- # --- Halbeegyada Tayada Codka ---
112
- do_sample=True, # MUHIIM: Waxay ka dhigaysaa codka mid dabiici ah
113
- top_k=50, # Waxay xaddidaysaa hal-abuurka si uusan u qaldamin
114
- temperature=0.75, # Kani wuxuu xakameeyaa hal-abuurka. (0.7 - 0.8 waa fiican yahay)
115
- repetition_penalty=1.2, # Waxay ka hortagtaa inuu ku celceliyo isku dhawaaq
116
- max_new_tokens=512 # Waxay siinaysaa model-ka meel ku filan oo uu ku dhameystiro hadalka
117
- )
118
- speech = vocoder(speech)
119
 
120
- return (16000, speech.cpu().numpy())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
- # --- Gradio Interface (sidii hore) ---
123
  iface = gr.Interface(
124
  fn=text_to_speech,
125
  inputs=[
@@ -136,13 +149,14 @@ iface = gr.Interface(
136
  description="Geli qoraal Soomaali ah, dooro cod, kadibna riix 'Submit' si aad u abuurto hadal."
137
  )
138
 
 
139
  if __name__ == "__main__":
140
  if not all(os.path.exists(f) for f in VOICE_SAMPLE_FILES):
141
  raise FileNotFoundError("Fadlan hubi inaad faylasha codka soo gelisay Space-ka.")
142
-
143
  print("Diyaarinta codadka...")
144
  for voice_file in VOICE_SAMPLE_FILES:
145
  get_speaker_embedding(voice_file)
146
  print("Dhammaan waa diyaar. Barnaamijku wuu furmayaa.")
147
-
148
- iface.launch(share=True)
 
12
  print(f"Using device: {device}")
13
 
14
  # --- KU DAR FAYLKA CODADKAAGA ---
15
+ VOICE_SAMPLE_FILES = ["1.wav"]
 
16
  EMBEDDING_DIR = "speaker_embeddings"
17
  os.makedirs(EMBEDDING_DIR, exist_ok=True)
18
 
 
34
  speaker_embeddings_cache = {}
35
 
36
  def get_speaker_embedding(wav_file_path):
 
37
  if wav_file_path in speaker_embeddings_cache:
38
  return speaker_embeddings_cache[wav_file_path]
39
  embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
 
45
  raise gr.Error(f"Audio file not found: {wav_file_path}")
46
  try:
47
  audio, sr = torchaudio.load(wav_file_path)
48
+ if sr != 16000:
49
+ audio = torchaudio.functional.resample(audio, sr, 16000)
50
+ if audio.shape[0] > 1:
51
+ audio = torch.mean(audio, dim=0, keepdim=True)
52
  with torch.no_grad():
53
  embedding = speaker_model.encode_batch(audio.to(device))
54
  embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
 
58
  except Exception as e:
59
  raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}")
60
 
61
+ # --- Number Handling Functions ---
62
  number_words = {
63
  0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
64
  6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
 
69
  60: "lixdan", 70: "toddobaatan", 80: "siddeetan", 90: "sagaashan",
70
  100: "boqol", 1000: "kun",
71
  }
72
+
73
  def number_to_words(n):
74
  if n in number_words:
75
  return number_words[n]
 
88
 
89
  def replace_numbers_with_words(text):
90
  return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)
91
+
92
  def normalize_text(text):
93
  text = text.lower()
94
  text = replace_numbers_with_words(text)
95
+ text = re.sub(r'[^\w\s\'.!?]', '', text) # Ha tirtirin calaamadaha muhiimka ah
96
  return text
97
 
98
+ # --- Main TTS Function with Pause ---
99
  def text_to_speech(text, voice_choice):
100
  if not text or not voice_choice:
101
  gr.Warning("Fadlan geli qoraal oo dooro cod.")
 
103
 
104
  speaker_embedding = get_speaker_embedding(voice_choice)
105
  normalized_text = normalize_text(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
+ # Kala qaybi jumladaha
108
+ lines = re.split(r'(?<=[.!?])\s+', normalized_text.strip())
109
+ full_audio = []
110
+
111
+ for line in lines:
112
+ if not line.strip():
113
+ continue
114
+ inputs = processor(text=line, return_tensors="pt").to(device)
115
+
116
+ with torch.no_grad():
117
+ speech = model.generate(
118
+ input_ids=inputs["input_ids"],
119
+ speaker_embeddings=speaker_embedding.unsqueeze(0),
120
+ do_sample=True,
121
+ top_k=50,
122
+ temperature=0.75,
123
+ repetition_penalty=1.2,
124
+ max_new_tokens=512
125
+ )
126
+ audio_chunk = vocoder(speech).cpu().numpy()
127
+ full_audio.append(audio_chunk)
128
+
129
+ # Nasasho 0.5 ilbiriqsi u dhaxeysa
130
+ pause = np.zeros((1, 16000 // 2), dtype=np.float32)
131
+ full_audio.append(pause)
132
+
133
+ return (16000, np.concatenate(full_audio, axis=-1))
134
 
135
+ # --- Gradio Interface ---
136
  iface = gr.Interface(
137
  fn=text_to_speech,
138
  inputs=[
 
149
  description="Geli qoraal Soomaali ah, dooro cod, kadibna riix 'Submit' si aad u abuurto hadal."
150
  )
151
 
152
+ # --- Launch ---
153
  if __name__ == "__main__":
154
  if not all(os.path.exists(f) for f in VOICE_SAMPLE_FILES):
155
  raise FileNotFoundError("Fadlan hubi inaad faylasha codka soo gelisay Space-ka.")
156
+
157
  print("Diyaarinta codadka...")
158
  for voice_file in VOICE_SAMPLE_FILES:
159
  get_speaker_embedding(voice_file)
160
  print("Dhammaan waa diyaar. Barnaamijku wuu furmayaa.")
161
+
162
+ iface.launch(share=True)