Upload app.py
Browse files
app.py
CHANGED
|
@@ -204,7 +204,8 @@ def tokenize(ps):
|
|
| 204 |
SAMPLE_RATE = 24000
|
| 205 |
|
| 206 |
@torch.no_grad()
|
| 207 |
-
def forward(tokens, voices, speed, device='cpu'):
|
|
|
|
| 208 |
ref_s = torch.mean(torch.stack([VOICES[device][v][len(tokens)] for v in voices]), dim=0)
|
| 209 |
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
|
| 210 |
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
|
|
@@ -229,8 +230,8 @@ def forward(tokens, voices, speed, device='cpu'):
|
|
| 229 |
return models[device].decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
|
| 230 |
|
| 231 |
@spaces.GPU(duration=10)
|
| 232 |
-
def forward_gpu(tokens, voices, speed):
|
| 233 |
-
return forward(tokens, voices, speed, device='cuda')
|
| 234 |
|
| 235 |
def clamp_speed(speed):
|
| 236 |
if not isinstance(speed, float) and not isinstance(speed, int):
|
|
@@ -257,18 +258,18 @@ def generate(text, voice='af', ps=None, speed=1, trim=3000, use_gpu='auto', sk=N
|
|
| 257 |
ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
|
| 258 |
use_gpu = len(ps) > 99 if use_gpu == 'auto' else use_gpu
|
| 259 |
if sk != os.environ['SK']:
|
| 260 |
-
print('❌', datetime.now(), text, voices, ps)
|
| 261 |
return (None, '')
|
| 262 |
try:
|
| 263 |
if use_gpu:
|
| 264 |
-
out = forward_gpu(tokens, voices, speed)
|
| 265 |
else:
|
| 266 |
-
out = forward(tokens, voices, speed)
|
| 267 |
except gr.exceptions.Error as e:
|
| 268 |
if use_gpu:
|
| 269 |
gr.Warning(str(e))
|
| 270 |
gr.Info('Switching to CPU')
|
| 271 |
-
out = forward(tokens, voices, speed)
|
| 272 |
else:
|
| 273 |
raise gr.Error(e)
|
| 274 |
print('🔥', datetime.now(), len(ps), use_gpu, repr(e))
|
|
@@ -342,7 +343,8 @@ with gr.Blocks() as basic_tts:
|
|
| 342 |
generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu, sk], outputs=[audio, out_ps])
|
| 343 |
|
| 344 |
@torch.no_grad()
|
| 345 |
-
def lf_forward(token_lists, voices, speed, device='cpu'):
|
|
|
|
| 346 |
voicepack = torch.mean(torch.stack([VOICES[device][v] for v in voices]), dim=0)
|
| 347 |
outs = []
|
| 348 |
for tokens in token_lists:
|
|
@@ -371,8 +373,8 @@ def lf_forward(token_lists, voices, speed, device='cpu'):
|
|
| 371 |
return outs
|
| 372 |
|
| 373 |
@spaces.GPU
|
| 374 |
-
def lf_forward_gpu(token_lists, voices, speed):
|
| 375 |
-
return lf_forward(token_lists, voices, speed, device='cuda')
|
| 376 |
|
| 377 |
def resplit_strings(arr):
|
| 378 |
# Handle edge cases
|
|
@@ -426,7 +428,7 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2
|
|
| 426 |
segments = [row for t in texts for row in recursive_split(t, voice)]
|
| 427 |
return [(i, *row) for i, row in enumerate(segments)]
|
| 428 |
|
| 429 |
-
def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True):
|
| 430 |
token_lists = list(map(tokenize, segments['Tokens']))
|
| 431 |
voices = resolve_voices(voice)
|
| 432 |
speed = clamp_speed(speed)
|
|
@@ -435,20 +437,23 @@ def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True):
|
|
| 435 |
use_gpu = True
|
| 436 |
batch_sizes = [89, 55, 34, 21, 13, 8, 5, 3, 2, 1, 1]
|
| 437 |
i = 0
|
|
|
|
|
|
|
|
|
|
| 438 |
while i < len(token_lists):
|
| 439 |
bs = batch_sizes.pop() if batch_sizes else 100
|
| 440 |
tokens = token_lists[i:i+bs]
|
| 441 |
print('📖', datetime.now(), len(tokens), voices, use_gpu)
|
| 442 |
try:
|
| 443 |
if use_gpu:
|
| 444 |
-
outs = lf_forward_gpu(tokens, voices, speed)
|
| 445 |
else:
|
| 446 |
-
outs = lf_forward(tokens, voices, speed)
|
| 447 |
except gr.exceptions.Error as e:
|
| 448 |
if use_gpu:
|
| 449 |
gr.Warning(str(e))
|
| 450 |
gr.Info('Switching to CPU')
|
| 451 |
-
outs = lf_forward(tokens, voices, speed)
|
| 452 |
use_gpu = False
|
| 453 |
elif outs:
|
| 454 |
gr.Warning(repr(e))
|
|
@@ -513,8 +518,11 @@ with gr.Blocks() as lf_tts:
|
|
| 513 |
with gr.Row():
|
| 514 |
segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
|
| 515 |
segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
|
|
|
|
|
|
|
|
|
|
| 516 |
segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
|
| 517 |
-
generate_event = generate_btn.click(lf_generate, inputs=[segments, voice, speed, trim, pad_between, use_gpu], outputs=[audio_stream])
|
| 518 |
stop_btn.click(fn=None, cancels=generate_event)
|
| 519 |
|
| 520 |
with gr.Blocks() as about:
|
|
@@ -539,7 +547,8 @@ Vast was chosen over other compute providers due to its competitive on-demand ho
|
|
| 539 |
The average hourly cost for the 1x A100-class 80GB VRAM instances used for training was below $1/hr — around half the quoted rates from other providers.
|
| 540 |
|
| 541 |
### Gradio API
|
| 542 |
-
This Space can be used via API. The following code block can be copied and run in one Google Colab cell.
|
|
|
| 543 |
```
|
| 544 |
# 1️⃣ Install the Gradio Python client
|
| 545 |
!pip install -q gradio_client
|
|
@@ -569,6 +578,7 @@ Random Japanese texts: CC0 public domain from [Common Voice](https://github.com/
|
|
| 569 |
with gr.Blocks() as changelog:
|
| 570 |
gr.Markdown('''
|
| 571 |
**28 Nov 2024**<br/>
|
|
|
|
| 572 |
🌊 Long Form streaming and stop button
|
| 573 |
|
| 574 |
**25 Nov 2024**<br/>
|
|
|
|
| 204 |
SAMPLE_RATE = 24000
|
| 205 |
|
| 206 |
@torch.no_grad()
|
| 207 |
+
def forward(tokens, voices, speed, sk, device='cpu'):
|
| 208 |
+
assert sk == os.environ['SK'], sk
|
| 209 |
ref_s = torch.mean(torch.stack([VOICES[device][v][len(tokens)] for v in voices]), dim=0)
|
| 210 |
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
|
| 211 |
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
|
|
|
|
| 230 |
return models[device].decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
|
| 231 |
|
| 232 |
@spaces.GPU(duration=10)
|
| 233 |
+
def forward_gpu(tokens, voices, speed, sk):
|
| 234 |
+
return forward(tokens, voices, speed, sk, device='cuda')
|
| 235 |
|
| 236 |
def clamp_speed(speed):
|
| 237 |
if not isinstance(speed, float) and not isinstance(speed, int):
|
|
|
|
| 258 |
ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
|
| 259 |
use_gpu = len(ps) > 99 if use_gpu == 'auto' else use_gpu
|
| 260 |
if sk != os.environ['SK']:
|
| 261 |
+
print('❌', datetime.now(), text, voices, ps, sk)
|
| 262 |
return (None, '')
|
| 263 |
try:
|
| 264 |
if use_gpu:
|
| 265 |
+
out = forward_gpu(tokens, voices, speed, sk)
|
| 266 |
else:
|
| 267 |
+
out = forward(tokens, voices, speed, sk)
|
| 268 |
except gr.exceptions.Error as e:
|
| 269 |
if use_gpu:
|
| 270 |
gr.Warning(str(e))
|
| 271 |
gr.Info('Switching to CPU')
|
| 272 |
+
out = forward(tokens, voices, speed, sk)
|
| 273 |
else:
|
| 274 |
raise gr.Error(e)
|
| 275 |
print('🔥', datetime.now(), len(ps), use_gpu, repr(e))
|
|
|
|
| 343 |
generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu, sk], outputs=[audio, out_ps])
|
| 344 |
|
| 345 |
@torch.no_grad()
|
| 346 |
+
def lf_forward(token_lists, voices, speed, sk, device='cpu'):
|
| 347 |
+
assert sk == os.environ['SK'], sk
|
| 348 |
voicepack = torch.mean(torch.stack([VOICES[device][v] for v in voices]), dim=0)
|
| 349 |
outs = []
|
| 350 |
for tokens in token_lists:
|
|
|
|
| 373 |
return outs
|
| 374 |
|
| 375 |
@spaces.GPU
|
| 376 |
+
def lf_forward_gpu(token_lists, voices, speed, sk):
|
| 377 |
+
return lf_forward(token_lists, voices, speed, sk, device='cuda')
|
| 378 |
|
| 379 |
def resplit_strings(arr):
|
| 380 |
# Handle edge cases
|
|
|
|
| 428 |
segments = [row for t in texts for row in recursive_split(t, voice)]
|
| 429 |
return [(i, *row) for i, row in enumerate(segments)]
|
| 430 |
|
| 431 |
+
def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True, sk=None):
|
| 432 |
token_lists = list(map(tokenize, segments['Tokens']))
|
| 433 |
voices = resolve_voices(voice)
|
| 434 |
speed = clamp_speed(speed)
|
|
|
|
| 437 |
use_gpu = True
|
| 438 |
batch_sizes = [89, 55, 34, 21, 13, 8, 5, 3, 2, 1, 1]
|
| 439 |
i = 0
|
| 440 |
+
if sk != os.environ['SK']:
|
| 441 |
+
print('❌', datetime.now(), len(segments), voices, sk)
|
| 442 |
+
return
|
| 443 |
while i < len(token_lists):
|
| 444 |
bs = batch_sizes.pop() if batch_sizes else 100
|
| 445 |
tokens = token_lists[i:i+bs]
|
| 446 |
print('📖', datetime.now(), len(tokens), voices, use_gpu)
|
| 447 |
try:
|
| 448 |
if use_gpu:
|
| 449 |
+
outs = lf_forward_gpu(tokens, voices, speed, sk)
|
| 450 |
else:
|
| 451 |
+
outs = lf_forward(tokens, voices, speed, sk)
|
| 452 |
except gr.exceptions.Error as e:
|
| 453 |
if use_gpu:
|
| 454 |
gr.Warning(str(e))
|
| 455 |
gr.Info('Switching to CPU')
|
| 456 |
+
outs = lf_forward(tokens, voices, speed, sk)
|
| 457 |
use_gpu = False
|
| 458 |
elif outs:
|
| 459 |
gr.Warning(repr(e))
|
|
|
|
| 518 |
with gr.Row():
|
| 519 |
segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
|
| 520 |
segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
|
| 521 |
+
with gr.Row():
|
| 522 |
+
sk = gr.Textbox(visible=False)
|
| 523 |
+
segments.change(lambda: os.environ['SK'], outputs=[sk])
|
| 524 |
segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
|
| 525 |
+
generate_event = generate_btn.click(lf_generate, inputs=[segments, voice, speed, trim, pad_between, use_gpu, sk], outputs=[audio_stream])
|
| 526 |
stop_btn.click(fn=None, cancels=generate_event)
|
| 527 |
|
| 528 |
with gr.Blocks() as about:
|
|
|
|
| 547 |
The average hourly cost for the 1x A100-class 80GB VRAM instances used for training was below $1/hr — around half the quoted rates from other providers.
|
| 548 |
|
| 549 |
### Gradio API
|
| 550 |
+
**The API has been restricted due to high request volume degrading the demo experience.**
|
| 551 |
+
~~This Space can be used via API. The following code block can be copied and run in one Google Colab cell.~~
|
| 552 |
```
|
| 553 |
# 1️⃣ Install the Gradio Python client
|
| 554 |
!pip install -q gradio_client
|
|
|
|
| 578 |
with gr.Blocks() as changelog:
|
| 579 |
gr.Markdown('''
|
| 580 |
**28 Nov 2024**<br/>
|
| 581 |
+
🥈 CPU fallback<br/>
|
| 582 |
🌊 Long Form streaming and stop button
|
| 583 |
|
| 584 |
**25 Nov 2024**<br/>
|