Somalitts committed on
Commit
2342a7b
·
verified ·
1 Parent(s): 83609f5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -44
app.py CHANGED
@@ -8,35 +8,64 @@ import gradio as gr
8
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
9
  from speechbrain.inference.speaker import EncoderClassifier
10
 
 
11
  device = "cuda" if torch.cuda.is_available() else "cpu"
12
- CACHE_DIR = "/tmp/hf-cache"
13
 
14
- # Load models
15
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts", cache_dir=CACHE_DIR)
16
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", cache_dir=CACHE_DIR).to(device)
17
- model_female = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad", cache_dir=CACHE_DIR).to(device)
18
 
19
- speaker_model = EncoderClassifier.from_hparams(
20
- source="speechbrain/spkrec-xvect-voxceleb",
21
- run_opts={"device": device},
22
- savedir="/tmp/spk_model"
23
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
- # Speaker embedding
26
- def get_embedding(wav_path, pt_path):
27
- if os.path.exists(pt_path):
28
- return torch.load(pt_path).to(device)
29
- audio, sr = torchaudio.load(wav_path)
30
- audio = torchaudio.functional.resample(audio, sr, 16000).mean(dim=0).unsqueeze(0).to(device)
31
  with torch.no_grad():
32
- emb = speaker_model.encode_batch(audio)
33
- emb = torch.nn.functional.normalize(emb, dim=2).squeeze()
34
- torch.save(emb.cpu(), pt_path)
35
- return emb
36
 
37
- embedding_female = get_embedding("caasho.wav", "/tmp/female_embedding.pt")
 
38
 
39
- # Text normalization
40
  number_words = {
41
  0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
42
  6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
@@ -46,19 +75,17 @@ number_words = {
46
  }
47
 
48
  def number_to_words(n):
49
- if n < 20:
50
- return number_words.get(n, str(n))
51
- elif n < 100:
52
  tens, unit = divmod(n, 10)
53
- return number_words[tens * 10] + (" " + number_words[unit] if unit else "")
54
- elif n < 1000:
55
  hundreds, rem = divmod(n, 100)
56
- return (number_words[hundreds] + " boqol" if hundreds > 1 else "boqol") + (" " + number_to_words(rem) if rem else "")
57
- elif n < 1_000_000:
58
  th, rem = divmod(n, 1000)
59
- return (number_to_words(th) + " kun") + (" " + number_to_words(rem) if rem else "")
60
- else:
61
- return str(n)
62
 
63
  def replace_numbers_with_words(text):
64
  return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)
@@ -66,28 +93,58 @@ def replace_numbers_with_words(text):
66
  def normalize_text(text):
67
  text = text.lower()
68
  text = replace_numbers_with_words(text)
69
- text = re.sub(r'[^\w\s]', '', text)
70
  return text
71
 
72
- # Gradio TTS Function
73
- def tts(text):
 
 
 
 
 
 
 
 
74
  clean_text = normalize_text(text)
75
  inputs = processor(text=clean_text, return_tensors="pt").to(device)
76
 
77
  with torch.no_grad():
78
- waveform = model_female.generate_speech(inputs["input_ids"], embedding_female.unsqueeze(0), vocoder=vocoder)
79
 
80
- out_path = f"/tmp/{uuid.uuid4().hex}.wav"
 
 
81
  sf.write(out_path, waveform.cpu().numpy(), 16000)
82
  return out_path
83
 
84
- # Gradio Interface
 
85
  iface = gr.Interface(
86
- fn=tts,
87
- inputs=gr.Textbox(label="Geli qoraalka af Soomaali"),
88
- outputs=gr.Audio(label="Codka", type="filepath"),
89
- title="Somali Text-to-Speech",
90
- description="Ku qor qoraal Soomaali ah si aad cod ugu dhageysato (Female voice only)."
 
 
 
 
 
 
 
 
91
  )
92
 
93
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
8
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
9
  from speechbrain.inference.speaker import EncoderClassifier
10
 
# --- Configuration ---
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# --- STEP 1: LIST YOUR VOICE SAMPLE FILES HERE ---
# The paths are relative, so the files must be reachable from the working
# directory (normally the folder that contains this script).
# Replace these names with your real ones. They must be .wav files.
VOICE_SAMPLE_FILES = ["1.wav", "90.wav"]

# Directories for downloaded models and cached speaker embeddings.
CACHE_DIR = "hf_cache"
SPEAKER_EMBEDDING_DIR = "speaker_embeddings"
os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(SPEAKER_EMBEDDING_DIR, exist_ok=True)

# --- Model loading ---
try:
    print("Waxaa la soo dejinayaa model-yada...")
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts", cache_dir=CACHE_DIR)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", cache_dir=CACHE_DIR).to(device)
    # Named `model` rather than `model_female` because it is now driven by
    # several different speaker embeddings.
    model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad", cache_dir=CACHE_DIR).to(device)
    speaker_model = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-xvect-voxceleb",
        run_opts={"device": device},
        savedir=os.path.join(CACHE_DIR, "spk_model"),
    )
    print("Model-yadii waa diyaar.")
except Exception as e:
    # This code runs at import time, before any Gradio UI exists, so gr.Error
    # (intended for event handlers) has nothing to display. Raise a plain
    # RuntimeError and chain the original exception so the startup log keeps
    # the real cause.
    raise RuntimeError(
        f"Cillad ayaa ka timid soo dejinta model-yada: {e}. Hubi internet-kaaga."
    ) from e
40
+
# --- Speaker embedding extraction ---
def get_speaker_embedding(wav_file_path):
    """Return the speaker embedding for a reference recording, cached on disk.

    The embedding is stored in SPEAKER_EMBEDDING_DIR under the recording's
    base name plus ``.pt``. When that file already exists it is loaded and
    returned without touching the audio; otherwise the embedding is computed
    with ``speaker_model``, saved, and returned.

    Args:
        wav_file_path: Path to the reference .wav file.

    Returns:
        The L2-normalized (along dim 2) and squeezed embedding tensor.

    Raises:
        gr.Error: If no cached embedding exists and the .wav file is missing.
    """
    cache_name = os.path.basename(wav_file_path) + ".pt"
    embedding_path = os.path.join(SPEAKER_EMBEDDING_DIR, cache_name)

    # Fast path: reuse a previously computed embedding.
    if os.path.exists(embedding_path):
        return torch.load(embedding_path, map_location=device)

    if not os.path.exists(wav_file_path):
        raise gr.Error(f"Lama helin faylka codka: {wav_file_path}. Hubi inuu ku jiro galka saxda ah oo magaca si sax ah u qortay.")

    print(f"Waxaa la abuurayaa astaan cod oo cusub: {wav_file_path}")
    waveform, sample_rate = torchaudio.load(wav_file_path)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    # Average over dim 0 and restore a leading dimension of size 1 before
    # handing the signal to the encoder.
    mono_batch = waveform.mean(dim=0).unsqueeze(0).to(device)

    with torch.no_grad():
        raw_embedding = speaker_model.encode_batch(mono_batch)
        embedding = torch.nn.functional.normalize(raw_embedding, dim=2).squeeze()

    # Persist a CPU copy so the cache can be loaded on any device later.
    torch.save(embedding.cpu(), embedding_path)
    return embedding
67
 
68
+ # --- Hagaajinta Qoraalka (Text Normalization) ---
69
  number_words = {
70
  0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
71
  6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
 
75
  }
76
 
def number_to_words(n):
    """Spell out an integer in Somali words using the ``number_words`` table.

    Values below 20 are looked up directly and fall back to the digit string
    when the table has no entry. Larger values are composed from tens,
    hundreds ("boqol") and thousands ("kun"), with the parts joined by
    " iyo ". Values of one million and above are returned as digits.

    A leading "one" is omitted for both 100 ("boqol") and 1000 ("kun") so the
    hundreds and thousands branches behave consistently.

    Args:
        n: The integer to convert.

    Returns:
        The spelled-out number, or ``str(n)`` when it cannot be spelled.
    """
    if n < 20:
        return number_words.get(n, str(n))
    if n < 100:
        tens, unit = divmod(n, 10)
        return number_words[tens * 10] + (" iyo " + number_words[unit] if unit else "")
    if n < 1000:
        hundreds, rem = divmod(n, 100)
        head = number_words[hundreds] + " boqol" if hundreds > 1 else "boqol"
        return head + (" iyo " + number_to_words(rem) if rem else "")
    if n < 1_000_000:
        th, rem = divmod(n, 1000)
        # Mirror the hundreds branch: exactly one thousand is just "kun".
        head = number_to_words(th) + " kun" if th > 1 else "kun"
        return head + (" iyo " + number_to_words(rem) if rem else "")
    return str(n)
 
89
 
def replace_numbers_with_words(text):
    """Replace every standalone run of digits in ``text`` with its spelled-out form.

    A run counts only when it is delimited by word boundaries, so digits glued
    to letters are left untouched. Each match is converted with
    ``number_to_words``.
    """
    def _spell(match):
        return number_to_words(int(match.group()))

    return re.sub(r'\b\d+\b', _spell, text)
 
def normalize_text(text):
    """Prepare raw user text for the TTS processor.

    Lower-cases the text, spells out standalone numbers, then deletes every
    character that is not a word character, whitespace, or an apostrophe.
    """
    lowered = text.lower()
    spelled = replace_numbers_with_words(lowered)
    return re.sub(r'[^\w\s\']', '', spelled)
98
 
# --- Main TTS function ---
def text_to_speech(text, voice_choice):
    """Synthesize speech for ``text`` in the selected reference voice.

    Args:
        text: Raw text entered by the user.
        voice_choice: Path of the reference .wav whose speaker embedding
            conditions the synthesis.

    Returns:
        Path of the generated WAV file (written at 16000 Hz under
        ``/tmp/tts_outputs``), or ``None`` when there is nothing to
        synthesize or no voice was selected.
    """
    if not text or not voice_choice:
        gr.Warning("Fadlan geli qoraal oo dooro cod.")
        return None

    # Normalize before doing any expensive work: input consisting only of
    # whitespace or punctuation normalizes to an empty string, which must not
    # be passed on to the model.
    clean_text = normalize_text(text)
    if not clean_text.strip():
        gr.Warning("Fadlan geli qoraal oo dooro cod.")
        return None

    # Fetch (or lazily create) the embedding of the selected voice.
    speaker_embedding = get_speaker_embedding(voice_choice)

    inputs = processor(text=clean_text, return_tensors="pt").to(device)

    with torch.no_grad():
        # unsqueeze(0) adds a leading dimension to the squeezed embedding.
        waveform = model.generate_speech(inputs["input_ids"], speaker_embedding.unsqueeze(0), vocoder=vocoder)

    # Store the generated audio in a temporary location under a unique name.
    os.makedirs("/tmp/tts_outputs", exist_ok=True)
    out_path = f"/tmp/tts_outputs/{uuid.uuid4().hex}.wav"
    sf.write(out_path, waveform.cpu().numpy(), 16000)
    return out_path
120
 
# --- Gradio interface ---
# Two inputs: a text box for the Somali text and a dropdown for the voice.
_text_input = gr.Textbox(label="Geli qoraalka af Soomaali")
_voice_input = gr.Dropdown(
    choices=VOICE_SAMPLE_FILES,
    label="Dooro Codkaaga (Select Your Voice)",
    # Preselect the first configured voice when there is one.
    value=VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None,
    info="Dooro mid ka mid ah codadkaaga aad diyaarisay.",
)
_audio_output = gr.Audio(label="Codka La Abuuray", type="filepath")

iface = gr.Interface(
    fn=text_to_speech,
    inputs=[_text_input, _voice_input],
    outputs=_audio_output,
    title="Soomaali Text-to-Speech (Codad Kala Duwan)",
    description="Ku qor qoraal Soomaali ah, dooro codka aad rabto, kadibna riix 'Submit' si aad cod ugu dhageysato.",
)
138
 
# Prepare every voice before the app is launched.
if __name__ == "__main__":
    print("Hubinta faylasha codadka...")
    # Abort when no voices are configured or any configured file is absent.
    if not VOICE_SAMPLE_FILES or any(not os.path.exists(f) for f in VOICE_SAMPLE_FILES):
        raise FileNotFoundError("Mid ka mid ah faylasha ku jira 'VOICE_SAMPLE_FILES' lama helin. Fadlan hubi magacyada iyo meesha ay ku jiraan.")

    print("Diyaarinta codadkaaga...")
    for voice in VOICE_SAMPLE_FILES:
        # Creates and caches the speaker embedding if it does not exist yet.
        get_speaker_embedding(voice)
    print("Dhammaan waa diyaar. Barnaamijku wuu furmayaa.")

    iface.launch(share=True)