Upload app.py
app.py
CHANGED
@@ -156,16 +156,6 @@ CHOICES = {
 }
 VOICES = {device: {k: torch.load(os.path.join(snapshot, 'voicepacks', f'{k}.pt'), weights_only=True).to(device) for k in CHOICES.values()} for device in models}
 
-np_log_99 = np.log(99)
-def s_curve(p):
-    if p <= 0:
-        return 0
-    elif p >= 1:
-        return 1
-    s = 1 / (1 + np.exp((1-p*2)*np_log_99))
-    s = (s-0.01) * 50/49
-    return s
-
 SAMPLE_RATE = 24000
 
 @torch.no_grad()
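The removed `s_curve` was a logistic ease used to fade clip edges in and out; the `(s - 0.01) * 50/49` rescaling stretches the logistic's (0.01, 0.99) range so the curve hits 0 and 1 exactly at the endpoints. A minimal standalone sketch of the deleted function, for reference:

```python
import numpy as np

np_log_99 = np.log(99)

def s_curve(p):
    # Logistic S-curve: ~0.01 just above p=0 and ~0.99 just below p=1,
    # rescaled so the endpoints map to exactly 0 and 1.
    if p <= 0:
        return 0
    elif p >= 1:
        return 1
    s = 1 / (1 + np.exp((1 - p * 2) * np_log_99))
    return (s - 0.01) * 50 / 49

print(s_curve(0.0), s_curve(0.5), s_curve(1.0))  # 0, ~0.5, 1
```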
@@ -198,10 +188,10 @@ def forward_gpu(tokens, voice, speed):
     return forward(tokens, voice, speed, device='cuda')
 
 # Must be backwards compatible with https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
-def generate(text, voice, ps, speed, _reduce_noise, opening_cut, closing_cut, ease_in, ease_out, _pad_before, _pad_after, use_gpu):
-    return _generate(text, voice, ps, speed, opening_cut, closing_cut, ease_in, ease_out, use_gpu)
+def generate(text, voice, ps, speed, _reduce_noise, trim, _closing_cut, _ease_in, _ease_out, _pad_before, _pad_after, use_gpu):
+    return _generate(text, voice, ps, speed, trim, use_gpu)
 
-def _generate(text, voice, ps, speed, opening_cut, closing_cut, ease_in, ease_out, use_gpu):
+def _generate(text, voice, ps, speed, trim, use_gpu):
     if voice not in VOICES['cpu']:
         voice = 'af'
     ps = ps or phonemize(text, voice)
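Note that `generate` keeps its full 12-slot signature because the Arena space calls it positionally; the old cut/ease slots are renamed with a leading underscore and ignored, with only the `trim` slot (plus `text`, `voice`, `ps`, `speed`, `use_gpu`) still taking effect. A hypothetical call with illustrative values:

```python
# Slots: (text, voice, ps, speed, _reduce_noise, trim, _closing_cut, _ease_in,
#         _ease_out, _pad_before, _pad_after, use_gpu) — underscored slots are ignored.
audio, tokens = generate('Hello, world.', 'af', None, 1.0, False, 4000, 2000, 3000, 1000, 0, 0, False)
# audio is (SAMPLE_RATE, samples); tokens is the phoneme string actually used.
```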
@@ -219,18 +209,11 @@ def _generate(text, voice, ps, speed, opening_cut, closing_cut, ease_in, ease_out, use_gpu):
     except gr.exceptions.Error as e:
         raise gr.Error(e)
         return (None, '')
-    opening_cut = int(opening_cut / speed)
-    if opening_cut > 0:
-        out = out[opening_cut:]
-    closing_cut = int(closing_cut / speed)
-    if closing_cut > 0:
-        out = out[:-closing_cut]
-    ease_in = min(int(ease_in / speed), len(out)//2)
-    for i in range(ease_in):
-        out[i] *= s_curve(i / ease_in)
-    ease_out = min(int(ease_out / speed), len(out)//2)
-    for i in range(ease_out):
-        out[-i-1] *= s_curve(i / ease_out)
+    trim = int(trim / speed)
+    if trim > 0:
+        if trim * 2 >= len(out):
+            return (None, '')
+        out = out[trim:-trim]
     return ((SAMPLE_RATE, out), ps)
 
 def toggle_autoplay(autoplay):
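The replacement collapses the opening/closing cuts and the two eased fades into one symmetric `trim`, scaled by `speed` so it removes the same stretch of speech at any rate, and bails out if the clip is shorter than twice the trim. A small sketch of the arithmetic, assuming `out` is a 1-D NumPy array as in the diff:

```python
import numpy as np

SAMPLE_RATE = 24000
out = np.zeros(SAMPLE_RATE)       # stand-in for one second of generated audio
speed, trim = 1.5, 6000

trim = int(trim / speed)          # 4000: faster speech shrinks the region to cut
if trim > 0:
    if trim * 2 >= len(out):
        out = None                # _generate returns (None, '') in this case
    else:
        out = out[trim:-trim]     # drop 4000 samples from each end
print(len(out))                   # 16000
```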
@@ -271,25 +254,15 @@ with gr.Blocks() as basic_tts:
             phonemize_btn.click(phonemize, inputs=[text, voice], outputs=[in_ps])
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
-            autoplay = gr.Checkbox(value=True, label='Autoplay')
-            autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
+            with gr.Accordion('Audio Settings', open=False):
+                autoplay = gr.Checkbox(value=True, label='Autoplay')
+                autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
+                speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
+                trim = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='✂️ Trim', info='Cut from both ends')
             with gr.Accordion('Output Tokens', open=True):
                 out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
-
-            with gr.Accordion('Audio Settings', open=True):
-                speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speed of the audio; the settings below are auto-scaled by speed')
-                with gr.Row():
-                    with gr.Column():
-                        opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='✂️ Opening Cut', info='Cut samples from the start')
-                    with gr.Column():
-                        closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='🎬 Closing Cut', info='Cut samples from the end')
-                with gr.Row():
-                    with gr.Column():
-                        ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='🎢 Ease In', info='Ease in samples, after opening cut')
-                    with gr.Column():
-                        ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='🛝 Ease Out', info='Ease out samples, before closing cut')
-    text.submit(_generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, use_gpu], outputs=[audio, out_ps])
-    generate_btn.click(_generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, use_gpu], outputs=[audio, out_ps])
+    text.submit(_generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
+    generate_btn.click(_generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
 
 @torch.no_grad()
 def lf_forward(token_lists, voice, speed, device='cpu'):
@@ -376,11 +349,10 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2):
     segments = [row for t in texts for row in recursive_split(t, voice)]
     return [(i, *row) for i, row in enumerate(segments)]
 
-def lf_generate(segments, voice, speed, opening_cut, closing_cut, ease_in, ease_out, pad_between, use_gpu):
+def lf_generate(segments, voice, speed, trim, pad_between, use_gpu):
     token_lists = list(map(tokenize, segments['Tokens']))
     wavs = []
-    opening_cut = int(opening_cut / speed)
-    closing_cut = int(closing_cut / speed)
+    trim = int(trim / speed)
     pad_between = int(pad_between / speed)
     batch_size = 100
     for i in range(0, len(token_lists), batch_size):
@@ -396,16 +368,10 @@ def lf_generate(segments, voice, speed, opening_cut, closing_cut, ease_in, ease_out, pad_between, use_gpu):
             raise gr.Error(e)
             break
         for out in outs:
-            if opening_cut > 0:
-                out = out[opening_cut:]
-            if closing_cut > 0:
-                out = out[:-closing_cut]
-            ease_in = min(int(ease_in / speed), len(out)//2)
-            for i in range(ease_in):
-                out[i] *= s_curve(i / ease_in)
-            ease_out = min(int(ease_out / speed), len(out)//2)
-            for i in range(ease_out):
-                out[-i-1] *= s_curve(i / ease_out)
+            if trim > 0:
+                if trim * 2 >= len(out):
+                    continue
+                out = out[trim:-trim]
             if wavs and pad_between > 0:
                 wavs.append(np.zeros(pad_between))
             wavs.append(out)
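In the long-form path the same guard uses `continue`, so an over-trimmed segment is dropped instead of aborting the whole batch, and `pad_between` samples of silence separate consecutive segments. A sketch of the stitching with made-up segment lengths:

```python
import numpy as np

pad_between = 10000                            # silence samples, already divided by speed in the diff
wavs = []
for out in (np.ones(24000), np.ones(12000)):   # two fake trimmed segments
    if wavs and pad_between > 0:
        wavs.append(np.zeros(pad_between))     # silence only between segments, never before the first
    wavs.append(out)
audio = np.concatenate(wavs)
print(len(audio))                              # 24000 + 10000 + 12000 = 46000
```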
@@ -451,26 +417,15 @@ with gr.Blocks() as lf_tts:
             generate_btn = gr.Button('Generate x0', variant='secondary', interactive=False)
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio')
-            with gr.Accordion('Audio Settings', open=True):
-                speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speed of the audio; the settings below are auto-scaled by speed')
-
-                with gr.Row():
-                    with gr.Column():
-                        opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='✂️ Opening Cut', info='Cut samples from the start')
-                    with gr.Column():
-                        closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='🎬 Closing Cut', info='Cut samples from the end')
-                with gr.Row():
-                    with gr.Column():
-                        ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='🎢 Ease In', info='Ease in samples, after opening cut')
-                    with gr.Column():
-                        ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='🛝 Ease Out', info='Ease out samples, before closing cut')
-                with gr.Row():
-                    pad_between = gr.Slider(minimum=0, maximum=24000, value=10000, step=1000, label='🔇 Pad Between', info='How many samples of silence to insert between segments')
+            with gr.Accordion('Audio Settings', open=True):
+                speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
+                trim = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='✂️ Trim', info='Cut from both ends')
+                pad_between = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='🔇 Pad Between', info='How much silence to insert between segments')
     with gr.Row():
         segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
     segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
     segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
-    generate_btn.click(lf_generate, inputs=[segments, voice, speed, opening_cut, closing_cut, ease_in, ease_out, pad_between, use_gpu], outputs=[audio])
+    generate_btn.click(lf_generate, inputs=[segments, voice, speed, trim, pad_between, use_gpu], outputs=[audio])
 
 with gr.Blocks() as about:
     gr.Markdown("""