Upload 2 files
app.py
CHANGED
@@ -12,22 +12,22 @@ import spaces
 import torch
 import yaml
 
-
+CUDA_AVAILABLE = torch.cuda.is_available()
 
 snapshot = snapshot_download(repo_id='hexgrad/kokoro', allow_patterns=['*.pt', '*.pth', '*.yml'], use_auth_token=os.environ['TOKEN'])
 config = yaml.safe_load(open(os.path.join(snapshot, 'config.yml')))
-
-
-_ = [model[key].to(device) for key in model]
+
+models = {device: build_model(config['model_params'], device) for device in ['cpu'] + (['cuda'] if CUDA_AVAILABLE else [])}
 for key, state_dict in torch.load(os.path.join(snapshot, 'net.pth'), map_location='cpu', weights_only=True)['net'].items():
-
-
-
-
-
-
+    for device in models:
+        assert key in models[device], key
+        try:
+            models[device][key].load_state_dict(state_dict)
+        except:
+            state_dict = {k[7:]: v for k, v in state_dict.items()}
+            models[device][key].load_state_dict(state_dict, strict=False)
 
-PARAM_COUNT = sum(p.numel() for value in
+PARAM_COUNT = sum(p.numel() for value in models['cpu'].values() for p in value.parameters())
 assert PARAM_COUNT < 82_000_000, PARAM_COUNT
 
 random_texts = {}
@@ -118,7 +118,7 @@ def phonemize(text, voice, norm=True):
     ps = re.sub(r'(?<=nˈaɪn)ti(?!ː)', 'di', ps)
     ps = ''.join(filter(lambda p: p in VOCAB, ps))
     if lang == 'j' and any(p in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' for p in ps):
-        gr.Warning('Japanese tokenizer does not handle English letters
+        gr.Warning('Japanese tokenizer does not handle English letters')
     return ps.strip()
 
 def length_to_mask(lengths):
@@ -154,7 +154,7 @@ CHOICES = {
     '🇬🇧 🚹 Lewis 🧪': 'bm_lewis',
     '🇯🇵 🚺 Japanese Female': 'jf_0',
 }
-VOICES = {k: torch.load(os.path.join(snapshot, 'voicepacks', f'{k}.pt'), weights_only=True).to(device) for k in CHOICES.values()}
+VOICES = {device: {k: torch.load(os.path.join(snapshot, 'voicepacks', f'{k}.pt'), weights_only=True).to(device) for k in CHOICES.values()} for device in models}
 
 np_log_99 = np.log(99)
 def s_curve(p):
@@ -168,19 +168,18 @@ def s_curve(p):
 
 SAMPLE_RATE = 24000
 
-@spaces.GPU(duration=10)
 @torch.no_grad()
-def forward(tokens, voice, speed):
-    ref_s = VOICES[voice][len(tokens)]
+def forward(tokens, voice, speed, device='cpu'):
+    ref_s = VOICES[device][voice][len(tokens)]
     tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
     input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
     text_mask = length_to_mask(input_lengths).to(device)
-    bert_dur =
-    d_en =
+    bert_dur = models[device].bert(tokens, attention_mask=(~text_mask).int())
+    d_en = models[device].bert_encoder(bert_dur).transpose(-1, -2)
     s = ref_s[:, 128:]
-    d =
-    x, _ =
-    duration =
+    d = models[device].predictor.text_encoder(d_en, s, input_lengths, text_mask)
+    x, _ = models[device].predictor.lstm(d)
+    duration = models[device].predictor.duration_proj(x)
     duration = torch.sigmoid(duration).sum(axis=-1) / speed
     pred_dur = torch.round(duration).clamp(min=1).long()
     pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
@@ -189,12 +188,16 @@ def forward(tokens, voice, speed):
         pred_aln_trg[i, c_frame:c_frame + pred_dur[0,i].item()] = 1
         c_frame += pred_dur[0,i].item()
     en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
-    F0_pred, N_pred =
-    t_en =
+    F0_pred, N_pred = models[device].predictor.F0Ntrain(en, s)
+    t_en = models[device].text_encoder(tokens, input_lengths, text_mask)
     asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
-    return
+    return models[device].decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
+
+@spaces.GPU(duration=10)
+def forward_gpu(tokens, voice, speed):
+    return forward(tokens, voice, speed, device='cuda')
 
-def generate(text, voice, ps=None, speed=1.0, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000,
+def generate(text, voice, ps=None, speed=1.0, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, use_gpu=None):
     if voice not in VOICES:
         # Ensure stability for https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
         voice = 'af'
@@ -206,7 +209,10 @@ def generate(text, voice, ps=None, speed=1.0, opening_cut=4000, closing_cut=2000
     tokens = tokens[:510]
     ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
     try:
-
+        if use_gpu or (use_gpu is None and len(ps) > 99):
+            out = forward_gpu(tokens, voice, speed)
+        else:
+            out = forward(tokens, voice, speed)
     except gr.exceptions.Error as e:
         raise gr.Error(e)
         return (None, '')
@@ -222,23 +228,15 @@ def generate(text, voice, ps=None, speed=1.0, opening_cut=4000, closing_cut=2000
     ease_out = min(int(ease_out / speed), len(out)//2)
     for i in range(ease_out):
         out[-i-1] *= s_curve(i / ease_out)
-    pad_before = int(pad_before / speed)
-    if pad_before > 0:
-        out = np.concatenate([np.zeros(pad_before), out])
-    pad_after = int(pad_after / speed)
-    if pad_after > 0:
-        out = np.concatenate([out, np.zeros(pad_after)])
     return ((SAMPLE_RATE, out), ps)
 
 def toggle_autoplay(autoplay):
     return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)
 
 with gr.Blocks() as basic_tts:
-    with gr.Row():
-        gr.Markdown('Generate speech for one segment of text (up to 510 tokens) using Kokoro, a TTS model with 80 million parameters.')
     with gr.Row():
         with gr.Column():
-            text = gr.Textbox(label='Input Text')
+            text = gr.Textbox(label='Input Text', info='Generate speech for one segment of text using Kokoro, a TTS model with 80 million parameters.')
             voice = gr.Dropdown(list(CHOICES.items()), label='Voice', info='⭐ Starred voices are more stable. 🧪 Experimental voices are less stable.')
             with gr.Row():
                 random_btn = gr.Button('Random Text', variant='secondary')
@@ -252,36 +250,36 @@ with gr.Blocks() as basic_tts:
             phonemize_btn.click(phonemize, inputs=[text, voice], outputs=[in_ps])
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
+            autoplay = gr.Checkbox(value=True, label='Autoplay')
+            autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
            with gr.Accordion('Output Tokens', open=True):
                out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
+    with gr.Row():
+        use_gpu = gr.Radio(
+            [('CPU', False), ('Force GPU', True), ('Dynamic', None)],
+            value=None if CUDA_AVAILABLE else False, label='⚙️ Hardware',
+            info='CPU: unlimited, ~faster <100 tokens. GPU: limited usage quota, ~faster 100+ tokens. Dynamic: switches based on # of tokens.',
+            interactive=CUDA_AVAILABLE
+        )
     with gr.Accordion('Audio Settings', open=False):
         with gr.Row():
-
-            with gr.Row():
-                speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
+            speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='⚡️ Speed', info='Adjust the speed of the audio; the settings below are auto-scaled by speed')
         with gr.Row():
             with gr.Column():
-                opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='Opening Cut', info='
+                opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='✂️ Opening Cut', info='Cut this many samples from the start')
            with gr.Column():
-                closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='Closing Cut', info='
+                closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='🎬 Closing Cut', info='Cut this many samples from the end')
        with gr.Row():
            with gr.Column():
-                ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='Ease In', info='
+                ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='🎢 Ease In', info='Ease in for this many samples, after opening cut')
           with gr.Column():
-                ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='Ease Out', info='
-
-
-                pad_before = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='Pad Before', info='🔇 How many samples of silence to insert before the start.')
-            with gr.Column():
-                pad_after = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='Pad After', info='🔇 How many samples of silence to append after the end.')
-    autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
-    text.submit(generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after], outputs=[audio, out_ps])
-    generate_btn.click(generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after], outputs=[audio, out_ps])
+                ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='🛝 Ease Out', info='Ease out for this many samples, before closing cut')
+    text.submit(generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, use_gpu], outputs=[audio, out_ps])
+    generate_btn.click(generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, use_gpu], outputs=[audio, out_ps])
 
-@spaces.GPU
 @torch.no_grad()
-def lf_forward(token_lists, voice, speed):
-    voicepack = VOICES[voice]
+def lf_forward(token_lists, voice, speed, device='cpu'):
+    voicepack = VOICES[device][voice]
     outs = []
     for tokens in token_lists:
         ref_s = voicepack[len(tokens)]
@@ -289,11 +287,11 @@ def lf_forward(token_lists, voice, speed):
         tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
         input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
         text_mask = length_to_mask(input_lengths).to(device)
-        bert_dur =
-        d_en =
-        d =
-        x, _ =
-        duration =
+        bert_dur = models[device].bert(tokens, attention_mask=(~text_mask).int())
+        d_en = models[device].bert_encoder(bert_dur).transpose(-1, -2)
+        d = models[device].predictor.text_encoder(d_en, s, input_lengths, text_mask)
+        x, _ = models[device].predictor.lstm(d)
+        duration = models[device].predictor.duration_proj(x)
         duration = torch.sigmoid(duration).sum(axis=-1) / speed
         pred_dur = torch.round(duration).clamp(min=1).long()
         pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
@@ -302,12 +300,16 @@ def lf_forward(token_lists, voice, speed):
             pred_aln_trg[i, c_frame:c_frame + pred_dur[0,i].item()] = 1
             c_frame += pred_dur[0,i].item()
         en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
-        F0_pred, N_pred =
-        t_en =
+        F0_pred, N_pred = models[device].predictor.F0Ntrain(en, s)
+        t_en = models[device].text_encoder(tokens, input_lengths, text_mask)
        asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
-        outs.append(
+        outs.append(models[device].decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy())
     return outs
 
+@spaces.GPU
+def lf_forward_gpu(token_lists, voice, speed):
+    return lf_forward(token_lists, voice, speed, device='cuda')
+
 def resplit_strings(arr):
     # Handle edge cases
     if not arr:
@@ -360,7 +362,7 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2
     segments = [row for t in texts for row in recursive_split(t, voice)]
     return [(i, *row) for i, row in enumerate(segments)]
 
-def lf_generate(segments, voice, speed=1.0, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000,
+def lf_generate(segments, voice, speed=1.0, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad_between=10000, use_gpu=True):
     token_lists = list(map(tokenize, segments['Tokens']))
     wavs = []
     opening_cut = int(opening_cut / speed)
@@ -369,7 +371,10 @@ def lf_generate(segments, voice, speed=1.0, opening_cut=4000, closing_cut=2000,
     batch_size = 100
     for i in range(0, len(token_lists), batch_size):
         try:
-
+            if use_gpu:
+                outs = lf_forward_gpu(token_lists[i:i+batch_size], voice, speed)
+            else:
+                outs = lf_forward(token_lists[i:i+batch_size], voice, speed)
         except gr.exceptions.Error as e:
             if wavs:
                 gr.Warning(str(e))
@@ -390,12 +395,6 @@ def lf_generate(segments, voice, speed=1.0, opening_cut=4000, closing_cut=2000,
             if wavs and pad_between > 0:
                 wavs.append(np.zeros(pad_between))
             wavs.append(out)
-    pad_before = int(pad_before / speed)
-    if pad_before > 0:
-        wavs.insert(0, np.zeros(pad_before))
-    pad_after = int(pad_after / speed)
-    if pad_after > 0:
-        wavs.append(np.zeros(pad_after))
     return (SAMPLE_RATE, np.concatenate(wavs)) if wavs else None
 
 def did_change_segments(segments):
@@ -416,47 +415,45 @@ def extract_text(file):
     return None
 
 with gr.Blocks() as lf_tts:
-    with gr.Row():
-        gr.Markdown('Generate speech in batches of 100 text segments and automatically join them together. This may exhaust your ZeroGPU quota.')
     with gr.Row():
         with gr.Column():
             file_input = gr.File(file_types=['.pdf', '.txt'], label='Input File: pdf or txt')
-            text = gr.Textbox(label='Input Text')
+            text = gr.Textbox(label='Input Text', info='Generate speech in batches of 100 text segments and automatically join them together.')
             file_input.upload(fn=extract_text, inputs=[file_input], outputs=[text])
             voice = gr.Dropdown(list(CHOICES.items()), label='Voice', info='⭐ Starred voices are more stable. 🧪 Experimental voices are less stable.')
             with gr.Accordion('Text Settings', open=False):
-                skip_square_brackets = gr.Checkbox(True, label='Skip [Square Brackets]', info='Recommended for academic papers, Wikipedia articles, or texts with citations
+                skip_square_brackets = gr.Checkbox(True, label='Skip [Square Brackets]', info='Recommended for academic papers, Wikipedia articles, or texts with citations')
                 newline_split = gr.Number(2, label='Newline Split', info='Split the input text on this many newlines. Affects how the text is segmented.', precision=0, minimum=0)
             with gr.Row():
                 segment_btn = gr.Button('Tokenize', variant='primary')
                 generate_btn = gr.Button('Generate x0', variant='secondary', interactive=False)
        with gr.Column():
            audio = gr.Audio(interactive=False, label='Output Audio')
+            use_gpu = gr.Checkbox(value=CUDA_AVAILABLE, label='Use ZeroGPU', info='🚀 ZeroGPU is fast but has a limited usage quota', interactive=CUDA_AVAILABLE)
+            use_gpu.change(
+                fn=lambda v: gr.Checkbox(value=v, label='Use ZeroGPU', info='🚀 ZeroGPU is fast but has a limited usage quota' if v else '🐌 CPU is slow but unlimited'),
+                inputs=[use_gpu], outputs=[use_gpu]
+            )
     with gr.Accordion('Audio Settings', open=False):
         with gr.Row():
-            speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='
-        with gr.Row():
-            with gr.Column():
-                opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='Opening Cut', info='✂️ Cut this many samples from the start.')
-            with gr.Column():
-                closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='Closing Cut', info='✂️ Cut this many samples from the end.')
+            speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='⚡️ Speed', info='Adjust the speed of the audio; the settings below are auto-scaled by speed')
        with gr.Row():
            with gr.Column():
-
+                opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='✂️ Opening Cut', info='Cut this many samples from the start')
            with gr.Column():
-
+                closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='🎬 Closing Cut', info='Cut this many samples from the end')
        with gr.Row():
           with gr.Column():
-
+                ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='🎢 Ease In', info='Ease in for this many samples, after opening cut')
           with gr.Column():
-
+                ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='🛝 Ease Out', info='Ease out for this many samples, before closing cut')
        with gr.Row():
-            pad_between = gr.Slider(minimum=0, maximum=24000, value=10000, step=1000, label='Pad Between', info='
+            pad_between = gr.Slider(minimum=0, maximum=24000, value=10000, step=1000, label='🔇 Pad Between', info='How many samples of silence to insert between segments')
    with gr.Row():
        segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
    segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
    segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
-    generate_btn.click(lf_generate, inputs=[segments, voice, speed, opening_cut, closing_cut, ease_in, ease_out,
+    generate_btn.click(lf_generate, inputs=[segments, voice, speed, opening_cut, closing_cut, ease_in, ease_out, pad_between, use_gpu], outputs=[audio])
 
 with gr.Blocks() as about:
     gr.Markdown("""
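Note: the new generate() above picks hardware per request. The sketch below restates that dispatch rule as a standalone, runnable snippet; choose_runner is a hypothetical helper (it does not exist in app.py), but the use_gpu semantics and the 99-token threshold are taken directly from the diff.

# Hypothetical helper mirroring generate()'s device dispatch in the diff above (not part of app.py).
def choose_runner(use_gpu, num_tokens):
    # use_gpu=True forces ZeroGPU, False forces CPU; None is 'Dynamic':
    # GPU only when the phoneme string is long enough to be worth the quota.
    if use_gpu or (use_gpu is None and num_tokens > 99):
        return 'cuda'
    return 'cpu'

assert choose_runner(True, 10) == 'cuda'    # Force GPU
assert choose_runner(False, 500) == 'cpu'   # CPU only
assert choose_runner(None, 50) == 'cpu'     # Dynamic, short input
assert choose_runner(None, 200) == 'cuda'   # Dynamic, long input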
models.py
CHANGED
@@ -549,7 +549,7 @@ def recursive_munch(d):
     else:
         return d
 
-def build_model(args):
+def build_model(args, device):
     args = recursive_munch(args)
     assert args.decoder.type == 'istftnet', 'Decoder type unknown'
     decoder = Decoder(dim_in=args.hidden_dim, style_dim=args.style_dim, dim_out=args.n_mels,
@@ -562,10 +562,16 @@ def build_model(args):
     text_encoder = TextEncoder(channels=args.hidden_dim, kernel_size=5, depth=args.n_layer, n_symbols=args.n_token)
     predictor = ProsodyPredictor(style_dim=args.style_dim, d_hid=args.hidden_dim, nlayers=args.n_layer, max_dur=args.max_dur, dropout=args.dropout)
     bert = load_plbert()
-
-
-
-
-
-
+    bert_encoder = nn.Linear(bert.config.hidden_size, args.hidden_dim)
+    for parent in [bert, bert_encoder, predictor, decoder, text_encoder]:
+        for child in parent.children():
+            if isinstance(child, nn.RNNBase):
+                child.flatten_parameters()
+    model = Munch(
+        bert=bert.to(device).eval(),
+        bert_encoder=bert_encoder.to(device).eval(),
+        predictor=predictor.to(device).eval(),
+        decoder=decoder.to(device).eval(),
+        text_encoder=text_encoder.to(device).eval(),
     )
+    return model