Somalitts committed (verified)
Commit 100e7c2 · Parent(s): 8204814

Update app.py

Files changed (1):
  1. app.py  +34 -39
app.py CHANGED
@@ -11,8 +11,9 @@ import numpy as np
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 
-# --- KU DAR FAYLASHA CODADKAAGA ---
-VOICE_SAMPLE_FILES = ["1.wav"]
+# --- KU DAR FAYLKA CODADKAAGA ---
+# TAYADA CODADKAN AYAA UGU MUHIMSAN NATIIJADA
+VOICE_SAMPLE_FILES = ["1.wav"]  # Hubi in faylkan tayadiisu fiican tahay
 EMBEDDING_DIR = "speaker_embeddings"
 os.makedirs(EMBEDDING_DIR, exist_ok=True)
 
@@ -34,6 +35,7 @@ except Exception as e:
 speaker_embeddings_cache = {}
 
 def get_speaker_embedding(wav_file_path):
+    # Shaqadan sidii hore ayay u egtahay
     if wav_file_path in speaker_embeddings_cache:
         return speaker_embeddings_cache[wav_file_path]
     embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
@@ -44,7 +46,6 @@ def get_speaker_embedding(wav_file_path):
     if not os.path.exists(wav_file_path):
         raise gr.Error(f"Audio file not found: {wav_file_path}")
     try:
-        print(f"Creating new speaker embedding for {wav_file_path}...")
         audio, sr = torchaudio.load(wav_file_path)
         if sr != 16000: audio = torchaudio.functional.resample(audio, sr, 16000)
         if audio.shape[0] > 1: audio = torch.mean(audio, dim=0, keepdim=True)
@@ -53,12 +54,11 @@ def get_speaker_embedding(wav_file_path):
         embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
         torch.save(embedding.cpu(), embedding_path)
         speaker_embeddings_cache[wav_file_path] = embedding.to(device)
-        print(f"Embedding created and saved for {wav_file_path}.")
         return embedding.to(device)
     except Exception as e:
         raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}")
 
-# --- Text Processing Functions ---
+# --- Text Processing Functions (sidoodii) ---
 number_words = {
     0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
     6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
@@ -83,35 +83,33 @@ def normalize_text(text):
     text = re.sub(r'[^\w\s\']', '', text)
     return text
 
-# --- Main Text-to-Speech Function ---
+# --- Main Text-to-Speech Function (oo la Hagaajiyay) ---
 def text_to_speech(text, voice_choice):
-    try:
-        if not text:
-            gr.Warning("Please enter some text.")
-            return None
-        if not voice_choice:
-            gr.Warning("Please select a voice.")
-            return None
-
-        speaker_embedding = get_speaker_embedding(voice_choice)
-        normalized_text = normalize_text(text)
-        inputs = processor(text=normalized_text, return_tensors="pt").to(device)
-
-        with torch.no_grad():
-            speech = model.generate(
-                input_ids=inputs["input_ids"],
-                speaker_embeddings=speaker_embedding.unsqueeze(0),
-                do_sample=True,
-                top_k=50,
-            )
-            speech = vocoder(speech)
-
-        return (16000, speech.cpu().numpy())
-    except Exception as e:
-        print(f"AN ERROR OCCURRED: {e}")
-        raise gr.Error(f"An error occurred during generation: {e}")
+    if not text or not voice_choice:
+        gr.Warning("Fadlan geli qoraal oo dooro cod.")
+        return None
+
+    speaker_embedding = get_speaker_embedding(voice_choice)
+    normalized_text = normalize_text(text)
+    inputs = processor(text=normalized_text, return_tensors="pt").to(device)
+
+    with torch.no_grad():
+        speech = model.generate(
+            input_ids=inputs["input_ids"],
+            speaker_embeddings=speaker_embedding.unsqueeze(0),
+
+            # --- Halbeegyada Tayada Codka ---
+            do_sample=True,          # MUHIIM: Waxay ka dhigaysaa codka mid dabiici ah
+            top_k=50,                # Waxay xaddidaysaa hal-abuurka si uusan u qaldamin
+            temperature=0.75,        # Kani wuxuu xakameeyaa hal-abuurka. (0.7 - 0.8 waa fiican yahay)
+            repetition_penalty=1.2,  # Waxay ka hortagtaa inuu ku celceliyo isku dhawaaq
+            max_new_tokens=512       # Waxay siinaysaa model-ka meel ku filan oo uu ku dhameystiro hadalka
+        )
+        speech = vocoder(speech)
+
+    return (16000, speech.cpu().numpy())
 
-# --- Gradio Interface ---
+# --- Gradio Interface (sidii hore) ---
 iface = gr.Interface(
     fn=text_to_speech,
     inputs=[
@@ -119,25 +117,22 @@ iface = gr.Interface(
         gr.Dropdown(
             VOICE_SAMPLE_FILES,
             label="Select Voice",
-            info="Choose the voice you want to use for the speech.",
+            info="Dooro codka aad rabto inaad isticmaasho.",
            value=VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None
         )
     ],
     outputs=gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy"),
     title="Multi-Voice Somali Text-to-Speech",
-    description="Enter Somali text, choose a voice from the dropdown, and click submit to generate speech."
+    description="Geli qoraal Soomaali ah, dooro cod, kadibna riix 'Submit' si aad u abuurto hadal."
 )
 
-# --- Launch the web interface ---
 if __name__ == "__main__":
-    print("Checking for voice files...")
-    for f in VOICE_SAMPLE_FILES:
-        if not os.path.exists(f):
-            raise FileNotFoundError(f"Voice file not found: '{f}'. Please upload it to your Space.")
+    if not all(os.path.exists(f) for f in VOICE_SAMPLE_FILES):
+        raise FileNotFoundError("Fadlan hubi inaad faylasha codka soo gelisay Space-ka.")
 
-    print("Pre-loading all voice embeddings...")
+    print("Diyaarinta codadka...")
     for voice_file in VOICE_SAMPLE_FILES:
         get_speaker_embedding(voice_file)
-    print("All voices are ready. Launching interface.")
+    print("Dhammaan waa diyaar. Barnaamijku wuu furmayaa.")
 
     iface.launch(share=True)
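The substance of this commit is the new set of sampling arguments passed to model.generate (the Somali comments say, roughly, that do_sample=True makes the voice sound more natural, top_k=50 and temperature=0.75 keep the sampling from wandering, repetition_penalty=1.2 discourages repeating the same sound, and max_new_tokens=512 gives the model room to finish the utterance). Whether the underlying TTS model's generate() actually honours all of these keywords depends on the model class, which this diff does not show. As a generic illustration of what these knobs mean in sampled decoding, here is a minimal, self-contained sketch; sample_next_token and the toy logits are hypothetical and are not part of app.py:

import torch

def sample_next_token(logits, generated, temperature=0.75, top_k=50, repetition_penalty=1.2):
    # Penalise tokens that have already been generated (simplified repetition penalty).
    for tok in set(generated):
        if logits[tok] > 0:
            logits[tok] = logits[tok] / repetition_penalty
        else:
            logits[tok] = logits[tok] * repetition_penalty
    # Temperature < 1.0 sharpens the distribution; > 1.0 flattens it.
    logits = logits / temperature
    # Top-k filtering: keep only the k most likely candidates.
    top_values, top_indices = torch.topk(logits, k=min(top_k, logits.numel()))
    probs = torch.softmax(top_values, dim=-1)
    # Sampling from the filtered distribution is what do_sample=True enables;
    # without it, decoding would just take the argmax at every step.
    choice = torch.multinomial(probs, num_samples=1)
    return top_indices[choice].item()

# Toy usage with a fake 200-token vocabulary.
logits = torch.randn(200)
print(sample_next_token(logits.clone(), generated=[3, 17, 3]))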
 
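For anyone iterating on this Space locally, a quick smoke test of the updated function can be run from a Python shell. This is a sketch under assumptions: it imports app.py directly (which loads the processor, model and vocoder at import time), it expects "1.wav", the file listed in VOICE_SAMPLE_FILES, to be in the working directory, and the example sentence is arbitrary.

# Smoke test for the updated text_to_speech(); run from the Space's root directory.
from app import text_to_speech

sample_rate, waveform = text_to_speech("Salaan, sidee tahay?", "1.wav")
print(sample_rate)     # 16000, the rate returned alongside the audio
print(waveform.shape)  # shape of the numpy array produced by the vocoder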