Somalitts committed on
Commit
43ec985
·
verified ·
1 Parent(s): 2342a7b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -121
app.py CHANGED
@@ -1,150 +1,88 @@
1
- import os
2
- import re
3
- import uuid
4
  import torch
5
  import torchaudio
6
- import soundfile as sf
7
- import gradio as gr
8
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
9
- from speechbrain.inference.speaker import EncoderClassifier
 
10
 
11
  # --- Configuration ---
12
  device = "cuda" if torch.cuda.is_available() else "cpu"
13
 
14
- # --- TALLAABADA 1: KU DAR MAGACYADA FAYLASHAADA CODADKA HAKAAN ---
15
- # Hubi in faylashan ay ku jiraan isla galka uu koodhkani ku jiro.
16
- # Ku beddel magacyadan kuwaaga dhabta ah. Waa inay noqdaan faylal .wav ah.
17
- VOICE_SAMPLE_FILES = ["1.wav", "90.wav"]
18
 
19
- # Meelaha lagu keydinayo faylasha ku meel gaarka ah
20
- CACHE_DIR = "hf_cache"
21
- SPEAKER_EMBEDDING_DIR = "speaker_embeddings"
22
- os.makedirs(CACHE_DIR, exist_ok=True)
23
- os.makedirs(SPEAKER_EMBEDDING_DIR, exist_ok=True)
24
 
25
- # --- Soo Dejinta Model-yada ---
26
  try:
27
- print("Waxaa la soo dejinayaa model-yada...")
28
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts", cache_dir=CACHE_DIR)
29
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", cache_dir=CACHE_DIR).to(device)
30
- # Magaca model-ka waxaan ka beddelnay 'model_female' oo ka dhignay 'model' maadaama uu hadda codad kala duwan isticmaalayo
31
- model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad", cache_dir=CACHE_DIR).to(device)
32
  speaker_model = EncoderClassifier.from_hparams(
33
  source="speechbrain/spkrec-xvect-voxceleb",
34
  run_opts={"device": device},
35
- savedir=os.path.join(CACHE_DIR, "spk_model")
36
  )
37
- print("Model-yadii waa diyaar.")
38
  except Exception as e:
39
- raise gr.Error(f"Cillad ayaa ka timid soo dejinta model-yada: {e}. Hubi internet-kaaga.")
40
 
41
- # --- Shaqada Abuurista Astaanta Codka (Speaker Embedding) ---
42
- def get_speaker_embedding(wav_file_path):
43
- """
44
- Shaqadan waxay soo saaraysaa "astaanta codka" (speaker embedding)
45
- haddii aysan jirin, way abuuraysaa oo keydinaysaa si aan mar dambe loo sugin.
46
- """
47
- embedding_path = os.path.join(SPEAKER_EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
48
 
 
 
 
 
49
  if os.path.exists(embedding_path):
50
- return torch.load(embedding_path, map_location=device)
51
-
 
52
  if not os.path.exists(wav_file_path):
53
- raise gr.Error(f"Lama helin faylka codka: {wav_file_path}. Hubi inuu ku jiro galka saxda ah oo magaca si sax ah u qortay.")
54
-
55
- print(f"Waxaa la abuurayaa astaan cod oo cusub: {wav_file_path}")
56
- audio, sr = torchaudio.load(wav_file_path)
57
- if sr != 16000:
58
- audio = torchaudio.functional.resample(audio, sr, 16000)
59
- audio = audio.mean(dim=0).unsqueeze(0).to(device)
60
-
61
- with torch.no_grad():
62
- embedding = speaker_model.encode_batch(audio)
63
- embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
64
-
65
- torch.save(embedding.cpu(), embedding_path)
66
- return embedding
67
-
68
- # --- Hagaajinta Qoraalka (Text Normalization) ---
69
- number_words = {
70
- 0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
71
- 6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
72
- 20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
73
- 60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
74
- 100: "boqol", 1000: "kun"
75
- }
76
-
77
- def number_to_words(n):
78
- if n < 20: return number_words.get(n, str(n))
79
- if n < 100:
80
- tens, unit = divmod(n, 10)
81
- return number_words[tens * 10] + (" iyo " + number_words[unit] if unit else "")
82
- if n < 1000:
83
- hundreds, rem = divmod(n, 100)
84
- return (number_words[hundreds] + " boqol" if hundreds > 1 else "boqol") + (" iyo " + number_to_words(rem) if rem else "")
85
- if n < 1_000_000:
86
- th, rem = divmod(n, 1000)
87
- return (number_to_words(th) + " kun") + (" iyo " + number_to_words(rem) if rem else "")
88
- return str(n)
89
-
90
- def replace_numbers_with_words(text):
91
- return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)
92
-
93
- def normalize_text(text):
94
- text = text.lower()
95
- text = replace_numbers_with_words(text)
96
- text = re.sub(r'[^\w\s\']', '', text)
97
- return text
98
-
99
- # --- Shaqada ugu Muhiimsan (TTS Function) ---
100
  def text_to_speech(text, voice_choice):
101
- """ Hadda shaqadani waxay qaadanaysaa qoraalka iyo codka la doortay """
102
- if not text or not voice_choice:
103
- gr.Warning("Fadlan geli qoraal oo dooro cod.")
104
- return None
105
-
106
- # Soo qaado astaanta codka la doortay
107
- speaker_embedding = get_speaker_embedding(voice_choice)
108
-
109
- clean_text = normalize_text(text)
110
- inputs = processor(text=clean_text, return_tensors="pt").to(device)
111
-
112
- with torch.no_grad():
113
- waveform = model.generate_speech(inputs["input_ids"], speaker_embedding.unsqueeze(0), vocoder=vocoder)
114
-
115
- # Si ku meel gaar ah u keydi faylka codka la abuuray
116
- os.makedirs("/tmp/tts_outputs", exist_ok=True)
117
- out_path = f"/tmp/tts_outputs/{uuid.uuid4().hex}.wav"
118
- sf.write(out_path, waveform.cpu().numpy(), 16000)
119
- return out_path
120
 
121
- # --- Interface-ka Gradio ---
122
- # Hadda wuxuu leeyahay meel qoraalka la geliyo iyo meel codka laga doorto
123
  iface = gr.Interface(
124
- fn=text_to_speech,
125
- inputs=[
126
- gr.Textbox(label="Geli qoraalka af Soomaali"),
127
- gr.Dropdown(
128
- choices=VOICE_SAMPLE_FILES,
129
- label="Dooro Codkaaga (Select Your Voice)",
130
- value=VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None,
131
- info="Dooro mid ka mid ah codadkaaga aad diyaarisay."
132
- )
133
- ],
134
- outputs=gr.Audio(label="Codka La Abuuray", type="filepath"),
135
- title="Soomaali Text-to-Speech (Codad Kala Duwan)",
136
- description="Ku qor qoraal Soomaali ah, dooro codka aad rabto, kadibna riix 'Submit' si aad cod ugu dhageysato."
137
  )
138
 
139
- # Diyaari codadka ka hor inta aan barnaamijka la furin
140
  if __name__ == "__main__":
141
  print("Hubinta faylasha codadka...")
142
- if not VOICE_SAMPLE_FILES or not all(os.path.exists(f) for f in VOICE_SAMPLE_FILES):
143
- raise FileNotFoundError("Mid ka mid ah faylasha ku jira 'VOICE_SAMPLE_FILES' lama helin. Fadlan hubi magacyada iyo meesha ay ku jiraan.")
 
 
144
 
145
- print("Diyaarinta codadkaaga...")
146
- for voice in VOICE_SAMPLE_FILES:
147
- get_speaker_embedding(voice) # Tani waxay abuuraysaa astaamaha codka haddii aysan jirin
148
- print("Dhammaan waa diyaar. Barnaamijku wuu furmayaa.")
149
 
150
  iface.launch(share=True)
 
1
+ import gradio as gr
 
 
2
  import torch
3
  import torchaudio
4
+ import re
5
+ import os
6
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
7
+ from speechbrain.pretrained import EncoderClassifier
8
+ import numpy as np
9
 
10
  # --- Configuration ---
11
  device = "cuda" if torch.cuda.is_available() else "cpu"
12
 
13
+ # --- HUBI INAAD SOO GELISAY FAYLASHAN ---
14
+ # Faylashan waa inay ku jiraan Hugging Face Spaces, isla galka uu ku jiro "app.py"
15
+ VOICE_SAMPLE_FILES = ["1.wav"]
 
16
 
17
+ # Directory to store speaker embedding files
18
+ EMBEDDING_DIR = "speaker_embeddings"
19
+ os.makedirs(EMBEDDING_DIR, exist_ok=True)
 
 
20
 
21
+ # --- Load Models ---
22
  try:
23
+ print("Loading models... This may take a moment.")
24
+ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
25
+ model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
26
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 
27
  speaker_model = EncoderClassifier.from_hparams(
28
  source="speechbrain/spkrec-xvect-voxceleb",
29
  run_opts={"device": device},
30
+ savedir=os.path.join("pretrained_models", "spkrec-xvect-voxceleb")
31
  )
32
+ print("Models loaded successfully.")
33
  except Exception as e:
34
+ raise gr.Error(f"Error loading models: {e}. Check your internet connection.")
35
 
36
+ speaker_embeddings_cache = {}
 
 
 
 
 
 
37
 
38
+ def get_speaker_embedding(wav_file_path):
39
+ if wav_file_path in speaker_embeddings_cache:
40
+ return speaker_embeddings_cache[wav_file_path]
41
+ embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
42
  if os.path.exists(embedding_path):
43
+ embedding = torch.load(embedding_path, map_location=device)
44
+ speaker_embeddings_cache[wav_file_path] = embedding
45
+ return embedding
46
  if not os.path.exists(wav_file_path):
47
+ # Kani waa qaladka dhacay. Markaad faylasha soo geliso, meeshan wuu ka gudbayaa.
48
+ raise FileNotFoundError(f"Lama helin faylka codka: {wav_file_path}")
49
+ try:
50
+ audio, sr = torchaudio.load(wav_file_path)
51
+ if sr != 16000: audio = torchaudio.functional.resample(audio, sr, 16000)
52
+ if audio.shape[0] > 1: audio = torch.mean(audio, dim=0, keepdim=True)
53
+ with torch.no_grad():
54
+ embedding = speaker_model.encode_batch(audio.to(device))
55
+ embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
56
+ torch.save(embedding.cpu(), embedding_path)
57
+ speaker_embeddings_cache[wav_file_path] = embedding.to(device)
58
+ return embedding.to(device)
59
+ except Exception as e:
60
+ raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}")
61
+
62
+ # ... (Inta kale ee koodhka way saxantahay) ...
63
+
64
+ # --- Main Text-to-Speech Function ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  def text_to_speech(text, voice_choice):
66
+ # ... (sidaadii hore) ...
67
+ pass # Koodhka intiisa kale halkan geli
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
+ # --- Gradio Interface ---
 
70
  iface = gr.Interface(
71
+ # ... (sidaadii hore) ...
72
+ pass # Koodhka intiisa kale halkan geli
 
 
 
 
 
 
 
 
 
 
 
73
  )
74
 
75
+ # --- Launch the web interface ---
76
  if __name__ == "__main__":
77
  print("Hubinta faylasha codadka...")
78
+ for f in VOICE_SAMPLE_FILES:
79
+ if not os.path.exists(f):
80
+ # Qaladku halkan ayuu ka bilaabmayaa
81
+ raise FileNotFoundError(f"Mid ka mid ah faylasha lama helin: '{f}'. Fadlan hubi inaad soo gelisay Hugging Face Spaces.")
82
 
83
+ print("Diyaarinta astaamaha codadka...")
84
+ for voice_file in VOICE_SAMPLE_FILES:
85
+ get_speaker_embedding(voice_file)
86
+ print("Dhammaan codadka waa diyaar. Waxaa la furayaa interface-ka.")
87
 
88
  iface.launch(share=True)