Upload app.py
Browse files
app.py
CHANGED
|
@@ -157,7 +157,7 @@ def forward(tokens, voice, speed):
|
|
| 157 |
asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
|
| 158 |
return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
|
| 159 |
|
| 160 |
-
def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000):
|
| 161 |
ps = ps or phonemize(text, voice)
|
| 162 |
tokens = tokenize(ps)
|
| 163 |
if not tokens:
|
|
@@ -172,18 +172,24 @@ def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=4000
|
|
| 172 |
return (None, '')
|
| 173 |
if reduce_noise > 0:
|
| 174 |
out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
|
| 175 |
-
opening_cut =
|
| 176 |
if opening_cut > 0:
|
| 177 |
-
out[:
|
| 178 |
-
closing_cut =
|
| 179 |
if closing_cut > 0:
|
| 180 |
-
out[
|
| 181 |
-
ease_in = min(int(ease_in / speed), len(out)//2
|
| 182 |
for i in range(ease_in):
|
| 183 |
-
out[i
|
| 184 |
-
ease_out = min(int(ease_out / speed), len(out)//2
|
| 185 |
for i in range(ease_out):
|
| 186 |
-
out[-i-1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
return ((SAMPLE_RATE, out), ps)
|
| 188 |
|
| 189 |
with gr.Blocks() as basic_tts:
|
|
@@ -212,15 +218,20 @@ with gr.Blocks() as basic_tts:
|
|
| 212 |
speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
|
| 213 |
with gr.Row():
|
| 214 |
with gr.Column():
|
| 215 |
-
opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='Opening Cut', info='✂️
|
| 216 |
with gr.Column():
|
| 217 |
-
closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='Closing Cut', info='✂️
|
| 218 |
with gr.Row():
|
| 219 |
with gr.Column():
|
| 220 |
ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='Ease In', info='🚀 Ease in for this many samples, after opening cut.')
|
| 221 |
with gr.Column():
|
| 222 |
ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='Ease Out', info='📐 Ease out for this many samples, before closing cut.')
|
| 223 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
|
| 225 |
@spaces.GPU
|
| 226 |
@torch.no_grad()
|
|
@@ -303,12 +314,13 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2
|
|
| 303 |
segments = [row for t in texts for row in recursive_split(t, voice)]
|
| 304 |
return [(i, *row) for i, row in enumerate(segments)]
|
| 305 |
|
| 306 |
-
def lf_generate(segments, voice, speed=1.0, reduce_noise=0.5, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000,
|
| 307 |
token_lists = list(map(tokenize, segments['Tokens']))
|
| 308 |
wavs = []
|
| 309 |
opening_cut = max(0, int(opening_cut / speed))
|
| 310 |
closing_cut = max(0, int(closing_cut / speed))
|
| 311 |
-
|
|
|
|
| 312 |
batch_size = 100
|
| 313 |
for i in range(0, len(token_lists), batch_size):
|
| 314 |
try:
|
|
@@ -323,18 +335,20 @@ def lf_generate(segments, voice, speed=1.0, reduce_noise=0.5, opening_cut=4000,
|
|
| 323 |
if reduce_noise > 0:
|
| 324 |
out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
|
| 325 |
if opening_cut > 0:
|
| 326 |
-
out[:
|
| 327 |
if closing_cut > 0:
|
| 328 |
-
out[
|
| 329 |
-
ease_in = min(int(ease_in / speed), len(out)//2
|
| 330 |
for i in range(ease_in):
|
| 331 |
-
out[i
|
| 332 |
-
ease_out = min(int(ease_out / speed), len(out)//2
|
| 333 |
for i in range(ease_out):
|
| 334 |
-
out[-i-1
|
| 335 |
-
if
|
| 336 |
-
wavs.append(np.zeros(
|
| 337 |
wavs.append(out)
|
|
|
|
|
|
|
| 338 |
return (SAMPLE_RATE, np.concatenate(wavs)) if wavs else None
|
| 339 |
|
| 340 |
def did_change_segments(segments):
|
|
@@ -376,21 +390,24 @@ with gr.Blocks() as lf_tts:
|
|
| 376 |
speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
|
| 377 |
with gr.Row():
|
| 378 |
with gr.Column():
|
| 379 |
-
opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='Opening Cut', info='✂️
|
| 380 |
with gr.Column():
|
| 381 |
-
closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='Closing Cut', info='✂️
|
| 382 |
with gr.Row():
|
| 383 |
with gr.Column():
|
| 384 |
ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='Ease In', info='🚀 Ease in for this many samples, after opening cut.')
|
| 385 |
with gr.Column():
|
| 386 |
ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='Ease Out', info='📐 Ease out for this many samples, before closing cut.')
|
| 387 |
with gr.Row():
|
| 388 |
-
|
|
|
|
|
|
|
|
|
|
| 389 |
with gr.Row():
|
| 390 |
segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
|
| 391 |
segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
|
| 392 |
segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
|
| 393 |
-
generate_btn.click(lf_generate, inputs=[segments, voice, speed, reduce_noise, opening_cut, closing_cut, ease_in, ease_out,
|
| 394 |
|
| 395 |
with gr.Blocks() as app:
|
| 396 |
gr.TabbedInterface(
|
|
|
|
| 157 |
asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
|
| 158 |
return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
|
| 159 |
|
| 160 |
+
def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad_before=5000, pad_after=5000):
|
| 161 |
ps = ps or phonemize(text, voice)
|
| 162 |
tokens = tokenize(ps)
|
| 163 |
if not tokens:
|
|
|
|
| 172 |
return (None, '')
|
| 173 |
if reduce_noise > 0:
|
| 174 |
out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
|
| 175 |
+
opening_cut = int(opening_cut / speed)
|
| 176 |
if opening_cut > 0:
|
| 177 |
+
out = out[opening_cut:]
|
| 178 |
+
closing_cut = int(closing_cut / speed)
|
| 179 |
if closing_cut > 0:
|
| 180 |
+
out = out[:-closing_cut]
|
| 181 |
+
ease_in = min(int(ease_in / speed), len(out)//2)
|
| 182 |
for i in range(ease_in):
|
| 183 |
+
out[i] *= s_curve(i / ease_in)
|
| 184 |
+
ease_out = min(int(ease_out / speed), len(out)//2)
|
| 185 |
for i in range(ease_out):
|
| 186 |
+
out[-i-1] *= s_curve(i / ease_out)
|
| 187 |
+
pad_before = max(0, int(pad_before / speed))
|
| 188 |
+
if pad_before > 0:
|
| 189 |
+
out = np.concatenate([np.zeros(pad_before), out])
|
| 190 |
+
pad_after = max(0, int(pad_after / speed))
|
| 191 |
+
if pad_after > 0:
|
| 192 |
+
out = np.concatenate([out, np.zeros(pad_after)])
|
| 193 |
return ((SAMPLE_RATE, out), ps)
|
| 194 |
|
| 195 |
with gr.Blocks() as basic_tts:
|
|
|
|
| 218 |
speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
|
| 219 |
with gr.Row():
|
| 220 |
with gr.Column():
|
| 221 |
+
opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='Opening Cut', info='✂️ Cut this many samples from the start.')
|
| 222 |
with gr.Column():
|
| 223 |
+
closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='Closing Cut', info='✂️ Cut this many samples from the end.')
|
| 224 |
with gr.Row():
|
| 225 |
with gr.Column():
|
| 226 |
ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='Ease In', info='🚀 Ease in for this many samples, after opening cut.')
|
| 227 |
with gr.Column():
|
| 228 |
ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='Ease Out', info='📐 Ease out for this many samples, before closing cut.')
|
| 229 |
+
with gr.Row():
|
| 230 |
+
with gr.Column():
|
| 231 |
+
pad_before = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad Before', info='🔇 How many samples of silence to insert before the start.')
|
| 232 |
+
with gr.Column():
|
| 233 |
+
pad_after = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad After', info='🔇 How many samples of silence to append after the end.')
|
| 234 |
+
generate_btn.click(generate, inputs=[text, voice, in_ps, speed, reduce_noise, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after], outputs=[audio, out_ps])
|
| 235 |
|
| 236 |
@spaces.GPU
|
| 237 |
@torch.no_grad()
|
|
|
|
| 314 |
segments = [row for t in texts for row in recursive_split(t, voice)]
|
| 315 |
return [(i, *row) for i, row in enumerate(segments)]
|
| 316 |
|
| 317 |
+
def lf_generate(segments, voice, speed=1.0, reduce_noise=0.5, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad_before=5000, pad_after=5000):
|
| 318 |
token_lists = list(map(tokenize, segments['Tokens']))
|
| 319 |
wavs = []
|
| 320 |
opening_cut = max(0, int(opening_cut / speed))
|
| 321 |
closing_cut = max(0, int(closing_cut / speed))
|
| 322 |
+
pad_before = max(0, int(pad_before / speed))
|
| 323 |
+
pad_after = max(0, int(pad_after / speed))
|
| 324 |
batch_size = 100
|
| 325 |
for i in range(0, len(token_lists), batch_size):
|
| 326 |
try:
|
|
|
|
| 335 |
if reduce_noise > 0:
|
| 336 |
out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
|
| 337 |
if opening_cut > 0:
|
| 338 |
+
out = out[opening_cut:]
|
| 339 |
if closing_cut > 0:
|
| 340 |
+
out = out[:-closing_cut]
|
| 341 |
+
ease_in = min(int(ease_in / speed), len(out)//2)
|
| 342 |
for i in range(ease_in):
|
| 343 |
+
out[i] *= s_curve(i / ease_in)
|
| 344 |
+
ease_out = min(int(ease_out / speed), len(out)//2)
|
| 345 |
for i in range(ease_out):
|
| 346 |
+
out[-i-1] *= s_curve(i / ease_out)
|
| 347 |
+
if pad_before > 0:
|
| 348 |
+
wavs.append(np.zeros(pad_before))
|
| 349 |
wavs.append(out)
|
| 350 |
+
if pad_after > 0:
|
| 351 |
+
wavs.append(np.zeros(pad_after))
|
| 352 |
return (SAMPLE_RATE, np.concatenate(wavs)) if wavs else None
|
| 353 |
|
| 354 |
def did_change_segments(segments):
|
|
|
|
| 390 |
speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
|
| 391 |
with gr.Row():
|
| 392 |
with gr.Column():
|
| 393 |
+
opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='Opening Cut', info='✂️ Cut this many samples from the start.')
|
| 394 |
with gr.Column():
|
| 395 |
+
closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='Closing Cut', info='✂️ Cut this many samples from the end.')
|
| 396 |
with gr.Row():
|
| 397 |
with gr.Column():
|
| 398 |
ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='Ease In', info='🚀 Ease in for this many samples, after opening cut.')
|
| 399 |
with gr.Column():
|
| 400 |
ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='Ease Out', info='📐 Ease out for this many samples, before closing cut.')
|
| 401 |
with gr.Row():
|
| 402 |
+
with gr.Column():
|
| 403 |
+
pad_before = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad Before', info='🔇 How many samples of silence to insert before each segment.')
|
| 404 |
+
with gr.Column():
|
| 405 |
+
pad_after = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad After', info='🔇 How many samples of silence to append after each segment.')
|
| 406 |
with gr.Row():
|
| 407 |
segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
|
| 408 |
segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
|
| 409 |
segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
|
| 410 |
+
generate_btn.click(lf_generate, inputs=[segments, voice, speed, reduce_noise, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after], outputs=[audio])
|
| 411 |
|
| 412 |
with gr.Blocks() as app:
|
| 413 |
gr.TabbedInterface(
|