Upload app.py
app.py CHANGED
@@ -270,17 +270,17 @@ def clamp_speed(speed):
         return 2
     return speed
 
-def
-    if not isinstance(
-        return
-    elif
-        return
-    elif
-        return
-    return
+def clamp_trim(trim):
+    if not isinstance(trim, float) and not isinstance(trim, int):
+        return 0.5
+    elif trim < 0:
+        return 0
+    elif trim > 1:
+        return 0.5
+    return trim
 
 # Must be backwards compatible with https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
-def generate(text, voice='af', ps=None, speed=1,
+def generate(text, voice='af', ps=None, speed=1, trim=0.5, use_gpu='auto', sk=None):
     ps = ps or phonemize(text, voice)
     if not sk and (text in sents or ps.strip('"') in harvsents):
         sk = os.environ['SK']

@@ -288,7 +288,7 @@ def generate(text, voice='af', ps=None, speed=1, top_db=60, use_gpu='auto', sk=N
         return (None, '')
     voices = resolve_voices(voice, warn=ps)
     speed = clamp_speed(speed)
-
+    trim = clamp_trim(trim)
     use_gpu = use_gpu if use_gpu in ('auto', False, True) else 'auto'
     tokens = tokenize(ps)
     if not tokens:

@@ -312,8 +312,11 @@ def generate(text, voice='af', ps=None, speed=1, top_db=60, use_gpu='auto', sk=N
             raise gr.Error(e)
         print(debug, datetime.now(), voices, len(ps), use_gpu, repr(e))
         return (None, '')
-    if
-
+    if trim:
+        a, b = librosa.effects.trim(out, top_db=30)[1]
+        a = int(a*trim)
+        b = int(len(out)-(len(out)-b)*trim)
+        out = out[a:b]
     print(debug, datetime.now(), voices, len(ps), use_gpu, len(out))
     return ((SAMPLE_RATE, out), ps)
 
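For readers skimming the diff, here is an illustrative sketch (not part of app.py) of how the proportional trim added above behaves. librosa.effects.trim returns the trimmed signal plus a [start, end] index of the non-silent interval; scaling the start by trim and shrinking the tail by the same factor keeps a (1 - trim) share of the detected leading and trailing silence, so trim=1 cuts all of it, trim=0.5 keeps half, and trim=0 (falsy) skips trimming entirely. The apply_trim helper, the sample rate constant, and the synthetic test signal are assumptions for the demo; only clamp_trim and the a/b arithmetic mirror the diff.

# Illustrative sketch of the proportional trim added in this commit.
import numpy as np
import librosa

SAMPLE_RATE = 24000  # assumed to match the app's SAMPLE_RATE; used here only for the demo

def clamp_trim(trim):
    # Copied from the diff: non-numeric or >1 falls back to 0.5, negative to 0.
    if not isinstance(trim, float) and not isinstance(trim, int):
        return 0.5
    elif trim < 0:
        return 0
    elif trim > 1:
        return 0.5
    return trim

def apply_trim(out, trim):
    # Hypothetical helper wrapping the lines added to generate() / lf_generate().
    trim = clamp_trim(trim)
    if trim:
        # index [a, b] bounds the non-silent interval relative to top_db=30
        a, b = librosa.effects.trim(out, top_db=30)[1]
        a = int(a * trim)                          # keep (1 - trim) of the leading silence
        b = int(len(out) - (len(out) - b) * trim)  # keep (1 - trim) of the trailing silence
        out = out[a:b]
    return out

# 0.5 s of silence, 1 s of tone, 0.5 s of silence
tone = 0.5 * np.sin(2 * np.pi * 440 * np.arange(SAMPLE_RATE) / SAMPLE_RATE)
pad = np.zeros(SAMPLE_RATE // 2)
y = np.concatenate([pad, tone, pad]).astype(np.float32)

for t in (0, 0.5, 1):
    print(t, len(apply_trim(y, t)) / SAMPLE_RATE)  # roughly 2.0 s, 1.5 s, 1.0 s remain

The boundaries reported by librosa.effects.trim are frame-quantized (default hop of 512 samples), so the printed durations are approximate rather than exact.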
@@ -359,7 +362,7 @@ with gr.Blocks() as basic_tts:
             autoplay = gr.Checkbox(value=True, label='Autoplay')
             autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
             speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
-
+            trim = gr.Slider(minimum=0, maximum=1, value=0.5, step=0.1, label='✂️ Trim', info='How much to cut from both ends')
             with gr.Accordion('Output Tokens', open=True):
                 out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
             with gr.Accordion('Voice Mixer', open=False):

@@ -374,8 +377,8 @@ with gr.Blocks() as basic_tts:
         with gr.Row():
             sk = gr.Textbox(visible=False)
             text.change(lambda: os.environ['SK'], outputs=[sk])
-            text.submit(generate, inputs=[text, voice, in_ps, speed,
-            generate_btn.click(generate, inputs=[text, voice, in_ps, speed,
+            text.submit(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu, sk], outputs=[audio, out_ps])
+            generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu, sk], outputs=[audio, out_ps])
 
 @torch.no_grad()
 def lf_forward(token_lists, voices, speed, sk, device='cpu'):

@@ -464,13 +467,13 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2
     segments = [row for t in texts for row in recursive_split(t, voice)]
     return [(i, *row) for i, row in enumerate(segments)]
 
-def lf_generate(segments, voice, speed=1,
+def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True, sk=None):
     if sk != os.environ['SK']:
         return
     token_lists = list(map(tokenize, segments['Tokens']))
     voices = resolve_voices(voice)
     speed = clamp_speed(speed)
-
+    trim = clamp_trim(trim)
     pad_between = int(pad_between)
     use_gpu = True
     batch_sizes = [89, 55, 34, 21, 13, 8, 5, 3, 2, 1, 1]

@@ -496,8 +499,11 @@ def lf_generate(segments, voice, speed=1, top_db=0, pad_between=0, use_gpu=True,
             else:
                 raise gr.Error(e)
         for out in outs:
-            if
-
+            if trim:
+                a, b = librosa.effects.trim(out, top_db=30)[1]
+                a = int(a*trim)
+                b = int(len(out)-(len(out)-b)*trim)
+                out = out[a:b]
             if i > 0 and pad_between > 0:
                 yield (SAMPLE_RATE, np.zeros(pad_between))
             yield (SAMPLE_RATE, out)

@@ -542,7 +548,7 @@ with gr.Blocks() as lf_tts:
         audio_stream = gr.Audio(label='Output Audio Stream', interactive=False, streaming=True, autoplay=True)
         with gr.Accordion('Audio Settings', open=True):
             speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
-
+            trim = gr.Slider(minimum=0, maximum=1, value=0, step=0.1, label='✂️ Trim', info='How much to cut from both ends')
             pad_between = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='🔇 Pad Between', info='How many silent samples to insert between segments')
         with gr.Row():
             segment_btn = gr.Button('Tokenize', variant='primary')

@@ -555,7 +561,7 @@ with gr.Blocks() as lf_tts:
         sk = gr.Textbox(visible=False)
         segments.change(lambda: os.environ['SK'], outputs=[sk])
         segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
-        generate_event = generate_btn.click(lf_generate, inputs=[segments, voice, speed,
+        generate_event = generate_btn.click(lf_generate, inputs=[segments, voice, speed, trim, pad_between, use_gpu, sk], outputs=[audio_stream])
         stop_btn.click(fn=None, cancels=generate_event)
 
 with gr.Blocks() as about:
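A side note on the UI hunks above: Gradio passes component values to an event handler positionally, in the order of the inputs=[...] list, which is why this commit adds the new trim slider to every text.submit / generate_btn.click / lf_generate wiring in step with the new parameter in the generate and lf_generate signatures. A minimal, self-contained sketch of that pattern (assumed names, not from app.py):

# Minimal sketch: slider values reach the callback positionally, in inputs order.
import gradio as gr

def echo_settings(text, speed, trim):
    return f'text={text!r}, speed={speed}, trim={trim}'

with gr.Blocks() as demo:
    text = gr.Textbox(label='Text')
    speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='Speed')
    trim = gr.Slider(minimum=0, maximum=1, value=0.5, step=0.1, label='Trim')
    out = gr.Textbox(label='Result')
    btn = gr.Button('Run')
    # The order of inputs must mirror echo_settings(text, speed, trim).
    btn.click(echo_settings, inputs=[text, speed, trim], outputs=[out])

if __name__ == '__main__':
    demo.launch()

If the list order drifted from the signature, a value would silently land in the wrong parameter, for example the trim slider being received as use_gpu.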