Somalitts committed on
Commit
83609f5
·
verified ·
1 Parent(s): f76188c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -112
app.py CHANGED
@@ -1,155 +1,93 @@
1
- import gradio as gr
 
 
2
  import torch
3
  import torchaudio
4
- import re
5
- import os
6
- import numpy as np
7
- import scipy.io.wavfile
8
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
9
- from speechbrain.pretrained import EncoderClassifier
10
 
11
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
12
 
13
  # Load models
14
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
15
- model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
16
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
17
 
18
  speaker_model = EncoderClassifier.from_hparams(
19
  source="speechbrain/spkrec-xvect-voxceleb",
20
  run_opts={"device": device},
21
- savedir="./spk_model"
22
  )
23
 
24
  # Speaker embedding
25
- EMB_PATH = "speaker_embedding.pt"
26
- if os.path.exists(EMB_PATH):
27
- speaker_embedding = torch.load(EMB_PATH).to(device)
28
- else:
29
- audio, sr = torchaudio.load("1.wav")
30
  audio = torchaudio.functional.resample(audio, sr, 16000).mean(dim=0).unsqueeze(0).to(device)
31
  with torch.no_grad():
32
  emb = speaker_model.encode_batch(audio)
33
  emb = torch.nn.functional.normalize(emb, dim=2).squeeze()
34
- torch.save(emb.cpu(), EMB_PATH)
35
- speaker_embedding = emb
 
 
36
 
37
- # Number conversion (Somali)
38
  number_words = {
39
  0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
40
  6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
41
- 11: "toban iyo koow", 12: "toban iyo labo", 13: "toban iyo seddex",
42
- 14: "toban iyo afar", 15: "toban iyo shan", 16: "toban iyo lix",
43
- 17: "toban iyo todobo", 18: "toban iyo sideed", 19: "toban iyo sagaal",
44
  20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
45
  60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
46
  100: "boqol", 1000: "kun"
47
  }
48
 
49
- shortcut_map = {
50
- "asc": "asalaamu caleykum",
51
- "wcs": "wacaleykum salaam",
52
- "fcn": "fiican",
53
- "xld": "xaaladda ka waran",
54
- "kwrn": "kawaran",
55
- "scw": "salalaahu caleyhi wa salam",
56
- "alx": "alxamdu lilaahi",
57
- "m.a": "maasha allah",
58
- "sthy": "side tahey",
59
- "sxp": "saaxiib"
60
- }
61
-
62
- def number_to_words(number):
63
- number = int(number)
64
- if number < 20:
65
- return number_words[number]
66
- elif number < 100:
67
- tens, unit = divmod(number, 10)
68
- return number_words[tens * 10] + (" iyo " + number_words[unit] if unit else "")
69
- elif number < 1000:
70
- hundreds, remainder = divmod(number, 100)
71
- part = (number_words[hundreds] + " boqol") if hundreds > 1 else "boqol"
72
- if remainder:
73
- part += " iyo " + number_to_words(remainder)
74
- return part
75
- elif number < 1000000:
76
- thousands, remainder = divmod(number, 1000)
77
- words = []
78
- if thousands == 1:
79
- words.append("kun")
80
- else:
81
- words.append(number_to_words(thousands) + " kun")
82
- if remainder >= 100:
83
- hundreds, rem2 = divmod(remainder, 100)
84
- if hundreds:
85
- boqol_text = (number_words[hundreds] + " boqol") if hundreds > 1 else "boqol"
86
- words.append(boqol_text)
87
- if rem2:
88
- words.append("iyo " + number_to_words(rem2))
89
- elif remainder:
90
- words.append("iyo " + number_to_words(remainder))
91
- return " ".join(words)
92
- elif number < 1000000000:
93
- millions, remainder = divmod(number, 1000000)
94
- words = []
95
- if millions == 1:
96
- words.append("milyan")
97
- else:
98
- words.append(number_to_words(millions) + " milyan")
99
- if remainder:
100
- words.append(number_to_words(remainder))
101
- return " ".join(words)
102
  else:
103
- return str(number)
104
 
105
  def replace_numbers_with_words(text):
106
- def replace(match):
107
- number = int(match.group())
108
- return number_to_words(number)
109
- return re.sub(r'\b\d+\b', replace, text)
110
 
111
  def normalize_text(text):
112
  text = text.lower()
113
  text = replace_numbers_with_words(text)
114
-
115
- def replace_shortcuts(match):
116
- word = match.group(0).lower()
117
- return shortcut_map.get(word, word)
118
-
119
- pattern = re.compile(r'\b(' + '|'.join(re.escape(k) for k in shortcut_map.keys()) + r')\b', re.IGNORECASE)
120
- text = pattern.sub(replace_shortcuts, text)
121
-
122
- def replace_countries(match):
123
- word = match.group(0).lower()
124
- return country_map.get(word, word)
125
-
126
- country_pattern = re.compile(r'\b(' + '|'.join(re.escape(k) for k in country_map.keys()) + r')\b', re.IGNORECASE)
127
- text = country_pattern.sub(replace_countries, text)
128
-
129
- text = re.sub(r'(\d{1,3})(,\d{3})+', lambda m: m.group(0).replace(",", ""), text)
130
- text = re.sub(r'\.\d+', '', text)
131
-
132
- symbol_map = {
133
- '$': 'doolar',
134
- '=': 'egwal',
135
- '+': 'balaas',
136
- '#': 'haash'
137
- }
138
- for sym, word in symbol_map.items():
139
- text = text.replace(sym, ' ' + word + ' ')
140
-
141
  text = re.sub(r'[^\w\s]', '', text)
142
-
143
  return text
144
 
 
 
 
 
 
 
 
145
 
 
 
 
146
 
 
147
  iface = gr.Interface(
148
- fn=text_to_speech,
149
- inputs=gr.Textbox(label="Geli qoraalka af-soomaali"),
150
- outputs=gr.Audio(label="Codka la abuuray", type="numpy"),
151
- title="Somali TTS",
152
- description="TTS Soomaaliyeed oo la adeegsaday cod gaar ah (1.wav)"
153
  )
154
 
155
  iface.launch()
 
1
+ import os
2
+ import re
3
+ import uuid
4
  import torch
5
  import torchaudio
6
+ import soundfile as sf
7
+ import gradio as gr
 
 
8
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
9
+ from speechbrain.inference.speaker import EncoderClassifier
10
 
11
  device = "cuda" if torch.cuda.is_available() else "cpu"
12
# Directory handed to every Hugging Face ``from_pretrained`` call below.
CACHE_DIR = "/tmp/hf-cache"

# Load models
# The processor is used as-is; the vocoder and the acoustic model are moved
# to ``device`` right after loading.
processor = SpeechT5Processor.from_pretrained(
    "microsoft/speecht5_tts",
    cache_dir=CACHE_DIR,
)

vocoder = SpeechT5HifiGan.from_pretrained(
    "microsoft/speecht5_hifigan",
    cache_dir=CACHE_DIR,
)
vocoder = vocoder.to(device)

model_female = SpeechT5ForTextToSpeech.from_pretrained(
    "Somalitts/8aad",
    cache_dir=CACHE_DIR,
)
model_female = model_female.to(device)

# Speaker encoder used to derive the voice embedding from a reference clip.
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    savedir="/tmp/spk_model",
    run_opts={"device": device},
)
24
 
25
  # Speaker embedding
26
def get_embedding(wav_path, pt_path):
    """Return the speaker embedding for ``wav_path``, cached at ``pt_path``.

    If ``pt_path`` already exists it is loaded and returned on ``device``.
    Otherwise the audio at ``wav_path`` is resampled to 16000 Hz, averaged
    over dim 0, encoded with ``speaker_model``, L2-normalised along dim 2,
    squeezed, saved (as a CPU tensor) to ``pt_path`` and returned.

    Args:
        wav_path: Path of the reference recording to embed.
        pt_path: Path of the cached embedding tensor.

    Returns:
        The embedding tensor located on ``device``.
    """
    if os.path.exists(pt_path):
        # The cache file is a pickle; ``weights_only=True`` restricts loading
        # to tensor data so a replaced file cannot run arbitrary code.
        # ``map_location`` places the tensor on the current device whatever
        # device it was saved from.
        return torch.load(pt_path, map_location=device, weights_only=True)

    audio, sr = torchaudio.load(wav_path)
    audio = torchaudio.functional.resample(audio, sr, 16000).mean(dim=0).unsqueeze(0).to(device)
    with torch.no_grad():
        emb = speaker_model.encode_batch(audio)
        emb = torch.nn.functional.normalize(emb, dim=2).squeeze()

    # Persist on CPU so the cache does not depend on the device that wrote it.
    torch.save(emb.cpu(), pt_path)
    return emb


embedding_female = get_embedding("caasho.wav", "/tmp/female_embedding.pt")
38
 
39
# Text normalization
number_words = {
    0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
    6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
    20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
    60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
    100: "boqol", 1000: "kun"
}


def number_to_words(n):
    """Spell out the integer ``n`` using the words in ``number_words``.

    Values with a direct table entry are returned as-is.  Other values
    below 100 are composed as "<tens> <unit>" (this includes 11-19, which
    have no table entry), values below 1000 as "[<count>] boqol [<rest>]"
    and values below 1,000,000 as "[<count>] kun [<rest>]".  The count is
    omitted when it is 1, so 1000 reads "kun" and 1500 reads
    "kun shan boqol".

    Args:
        n: Integer to convert.

    Returns:
        The spelled-out number, or ``str(n)`` for negative values and for
        values of 1,000,000 and above.
    """
    if n in number_words:
        return number_words[n]
    if n < 0 or n >= 1_000_000:
        return str(n)

    if n < 100:
        # Not in the table, so both the tens part and the unit are non-zero.
        tens, unit = divmod(n, 10)
        return f"{number_words[tens * 10]} {number_words[unit]}"

    if n < 1000:
        count, rest = divmod(n, 100)
        scale = "boqol"
    else:
        count, rest = divmod(n, 1000)
        scale = "kun"

    # A multiplier of one is left implicit ("boqol", "kun").
    head = scale if count == 1 else f"{number_to_words(count)} {scale}"
    if rest:
        return f"{head} {number_to_words(rest)}"
    return head


def replace_numbers_with_words(text):
    """Replace every standalone run of digits in ``text`` with its words."""
    return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)


def normalize_text(text):
    """Lower-case ``text``, spell out numbers and strip punctuation.

    Thousands separators ("1,000") are removed before number conversion so
    the digit groups are read as one number rather than converted
    separately and glued together when the comma is stripped.

    Args:
        text: Raw input text.

    Returns:
        The normalised text containing only word characters and whitespace.
    """
    text = text.lower()
    # Drop a comma only when it sits between a digit and a 3-digit group.
    text = re.sub(r'(?<=\d),(?=\d{3}\b)', '', text)
    text = replace_numbers_with_words(text)
    text = re.sub(r'[^\w\s]', '', text)
    return text
71
 
72
+ # Gradio TTS Function
73
def tts(text):
    """Synthesise ``text`` and return the path of the generated WAV file.

    The text is normalised with ``normalize_text``, tokenised by
    ``processor`` and passed to ``model_female.generate_speech`` together
    with ``embedding_female`` and ``vocoder``.  The waveform is written at
    16000 Hz to a uniquely named file under ``/tmp``.

    Args:
        text: Text entered by the user.

    Returns:
        Path of the written WAV file, or ``None`` when nothing is left to
        speak after normalisation (empty, ``None`` or punctuation-only input).
    """
    clean_text = normalize_text(text or "")
    if not clean_text.strip():
        # Nothing to synthesise; skip the model call entirely.
        return None

    inputs = processor(text=clean_text, return_tensors="pt").to(device)

    with torch.no_grad():
        waveform = model_female.generate_speech(
            inputs["input_ids"],
            embedding_female.unsqueeze(0),
            vocoder=vocoder,
        )

    out_path = f"/tmp/{uuid.uuid4().hex}.wav"
    sf.write(out_path, waveform.cpu().numpy(), 16000)
    return out_path
83
 
84
# Gradio Interface
# Build the input and output components first, then wire them to ``tts``.
text_input = gr.Textbox(label="Geli qoraalka af Soomaali")
audio_output = gr.Audio(label="Codka", type="filepath")

iface = gr.Interface(
    fn=tts,
    inputs=text_input,
    outputs=audio_output,
    title="Somali Text-to-Speech",
    description="Ku qor qoraal Soomaali ah si aad cod ugu dhageysato (Female voice only).",
)

iface.launch()