Somalitts committed on
Commit
b7217f7
·
verified ·
1 Parent(s): 3eb3991

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -0
app.py CHANGED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import torchaudio
4
+ import re
5
+ import os
6
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
7
+ from speechbrain.pretrained import EncoderClassifier
8
+
9
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load models
# Text processor, the fine-tuned acoustic model and the HiFi-GAN vocoder.
# The model and vocoder are moved to the selected device after loading.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad")
model = model.to(device)

vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(device)

# Speaker encoder used to turn a reference recording into an embedding.
speaker_model = EncoderClassifier.from_hparams(
    savedir="./spk_model",
    run_opts={"device": device},
    source="speechbrain/spkrec-xvect-voxceleb",
)
21
+
22
# Speaker embedding
# The embedding is computed once from the reference recording and cached on
# disk; later starts reuse the cached tensor instead of re-encoding the audio.
EMB_PATH = "speaker_embedding.pt"
if os.path.exists(EMB_PATH):
    # map_location loads the tensor directly onto the active device, so a
    # cache file that was written from a CUDA tensor still opens on a
    # CPU-only host (and vice versa).
    speaker_embedding = torch.load(EMB_PATH, map_location=device)
else:
    audio, sr = torchaudio.load("730.wav")
    # Resample to 16 kHz, average over dim 0 (presumably the channel axis --
    # TODO confirm) and add a leading batch dimension for the encoder.
    audio = torchaudio.functional.resample(audio, sr, 16000)
    audio = audio.mean(dim=0).unsqueeze(0).to(device)
    with torch.no_grad():
        emb = speaker_model.encode_batch(audio)
        # L2-normalise along dim 2, then drop the singleton dimensions.
        emb = torch.nn.functional.normalize(emb, dim=2).squeeze()
    # Always store a CPU copy so the cache does not depend on this host's device.
    torch.save(emb.cpu(), EMB_PATH)
    speaker_embedding = emb
34
+
35
# Number conversion (Somali)
# Words that are looked up directly: 0-19, the round tens 20-90, and the
# scale words for one hundred and one thousand.
number_words = {
    0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
    6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
    11: "toban iyo koow", 12: "toban iyo labo", 13: "toban iyo seddex",
    14: "toban iyo afar", 15: "toban iyo shan", 16: "toban iyo lix",
    17: "toban iyo todobo", 18: "toban iyo sideed", 19: "toban iyo sagaal",
    20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
    60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
    100: "boqol", 1000: "kun",
}


def number_to_words(number):
    """Spell out a non-negative integer in Somali words.

    Args:
        number: Integer to convert.

    Returns:
        The words for ``number`` joined by single spaces. Values of
        10**12 and above are not spelled out and are returned as their
        decimal string instead.

    Raises:
        KeyError: If ``number`` is negative (there is no table entry for it).
    """
    if number < 20:
        return number_words[number]

    if number < 100:
        tens, unit = divmod(number, 10)
        return number_words[tens * 10] + (" " + number_words[unit] if unit else "")

    def tail(remainder):
        # Words for the part below the current scale, or nothing when it is zero.
        return " " + number_to_words(remainder) if remainder else ""

    if number < 1000:
        hundreds, remainder = divmod(number, 100)
        # Exactly one hundred is the bare scale word. It is taken from the
        # table so it stays lowercase like every other word produced here;
        # normalize_text lowercases *before* digits are expanded, so an
        # uppercase word emitted here would reach the tokenizer unchanged.
        if hundreds > 1:
            head = number_words[hundreds] + " " + number_words[100]
        else:
            head = number_words[100]
        return head + tail(remainder)

    if number < 1000000:
        thousands, remainder = divmod(number, 1000)
        # Same rule as for hundreds: a single thousand is just the scale word.
        if thousands > 1:
            head = number_to_words(thousands) + " " + number_words[1000]
        else:
            head = number_words[1000]
        return head + tail(remainder)

    if number < 1000000000:
        millions, remainder = divmod(number, 1000000)
        return number_to_words(millions) + " malyan" + tail(remainder)

    if number < 1000000000000:
        billions, remainder = divmod(number, 1000000000)
        return number_to_words(billions) + " milyaar" + tail(remainder)

    # Beyond the supported scales: hand the digits back unchanged.
    return str(number)
67
+
68
def replace_numbers_with_words(text):
    """Return *text* with every standalone run of digits spelled out.

    A run of digits is replaced only when it is delimited by word
    boundaries on both sides; each match is parsed with ``int`` and passed
    to ``number_to_words``.
    """
    return re.sub(
        r'\b\d+\b',
        lambda found: number_to_words(int(found.group())),
        text,
    )
73
+
74
def normalize_text(text):
    """Prepare raw input text for synthesis.

    The text is lowercased, digit runs are replaced by their spelled-out
    words, and finally every character that is neither a word character
    nor whitespace is removed.
    """
    spelled_out = replace_numbers_with_words(text.lower())
    return re.sub(r'[^\w\s]', '', spelled_out)
79
+
80
+ # TTS function
81
def text_to_speech(text):
    """Synthesise speech for *text* with the module-level models.

    Returns:
        A ``(16000, samples)`` tuple, where ``samples`` is the generated
        speech moved to the CPU and converted to a NumPy array.
    """
    cleaned = normalize_text(text)
    encoded = processor(text=cleaned, return_tensors="pt").to(device)
    # generate_speech is given the embedding with a leading dimension added.
    speaker = speaker_embedding.unsqueeze(0)
    with torch.no_grad():
        waveform = model.generate_speech(
            encoded["input_ids"],
            speaker,
            vocoder=vocoder,
        )
    samples = waveform.cpu().numpy()
    return (16000, samples)
87
+
88
# Gradio Interface
# One text box in, one audio player out; text_to_speech does the work.
# NOTE(review): the description mentions 11.wav, while the embedding fallback
# in this file loads 730.wav -- confirm which reference recording is meant.
iface = gr.Interface(
    title="Somali TTS",
    description="TTS Soomaaliyeed oo la adeegsaday cod gaar ah (11.wav)",
    fn=text_to_speech,
    inputs=gr.Textbox(label="Geli qoraalka af-soomaali"),
    outputs=gr.Audio(label="Codka la abuuray", type="numpy"),
)

# Start serving the interface as soon as the module is executed.
iface.launch()