Update audio_foundation_models.py
audio_foundation_models.py CHANGED (+33, -29)
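Every hunk in this commit moves a @prompts(...) decorator so that it sits directly above the inference method of its tool class instead of above a helper method (txt2audio, img2audio, set_model_hparams, make_batch_sd). For reference, a minimal sketch of what such a decorator can look like is given below; the real prompts definition lives elsewhere in audio_foundation_models.py, is not part of this diff, and is only assumed here to attach name/description metadata to the wrapped function.

# Minimal sketch only -- the actual `prompts` decorator is defined elsewhere in
# the repository; this version merely attaches metadata, which is the assumption
# the illustration relies on.
def prompts(name, description):
    def decorator(func):
        func.name = name                  # tool name exposed to the agent
        func.description = description    # tool description exposed to the agent
        return func
    return decorator

Under that assumption, whichever method carries the decorator is the one whose name and description the surrounding agent sees, which is why the placement matters.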
@@ -135,11 +135,6 @@ class T2A:
         self.sampler = initialize_model('text_to_audio/Make_An_Audio/configs/text-to-audio/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/ta40multi_epoch=000085.ckpt', device=device)
         self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device)
 
-    @prompts(name="Generate Audio From User Input Text",
-             description="useful for when you want to generate an audio "
-                         "from a user input text and it saved it to a file."
-                         "The input to this tool should be a string, "
-                         "representing the text used to generate audio.")
 
     def txt2audio(self, text, seed = 55, scale = 1.5, ddim_steps = 100, n_samples = 3, W = 624, H = 80):
         SAMPLE_RATE = 16000
@@ -168,6 +163,12 @@ class T2A:
         best_wav = select_best_audio(text, wav_list)
         return best_wav
 
+    @prompts(name="Generate Audio From User Input Text",
+             description="useful for when you want to generate an audio "
+                         "from a user input text and it saved it to a file."
+                         "The input to this tool should be a string, "
+                         "representing the text used to generate audio.")
+
     def inference(self, text, seed = 55, scale = 1.5, ddim_steps = 100, n_samples = 3, W = 624, H = 80):
         melbins,mel_len = 80,624
         with torch.no_grad():
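Applying the two T2A hunks, the class reads roughly as sketched below (a condensed view with bodies elided; the signatures are taken verbatim from the diff). The metadata now annotates inference, the method the agent calls, rather than the txt2audio helper.

class T2A:
    ...  # __init__ loads the Make-An-Audio sampler and BigVGAN vocoder shown in the first hunk

    def txt2audio(self, text, seed = 55, scale = 1.5, ddim_steps = 100, n_samples = 3, W = 624, H = 80):
        ...  # builds wav_list and returns select_best_audio(text, wav_list)

    @prompts(name="Generate Audio From User Input Text",
             description="useful for when you want to generate an audio "
                         "from a user input text and it saved it to a file."
                         "The input to this tool should be a string, "
                         "representing the text used to generate audio.")
    def inference(self, text, seed = 55, scale = 1.5, ddim_steps = 100, n_samples = 3, W = 624, H = 80):
        ...  # body unchanged by this commit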
@@ -188,11 +189,6 @@ class I2A:
         self.sampler = initialize_model('text_to_audio/Make_An_Audio/configs/img_to_audio/img2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/ta54_epoch=000216.ckpt', device=device)
         self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device)
 
-    @prompts(name="Generate Audio From The Image",
-             description="useful for when you want to generate an audio "
-                         "based on an image. "
-                         "The input to this tool should be a string, "
-                         "representing the image_path. ")
 
     def img2audio(self, image, seed = 55, scale = 3, ddim_steps = 100, W = 624, H = 80):
         SAMPLE_RATE = 16000
@@ -224,6 +220,13 @@ class I2A:
             wav_list.append((SAMPLE_RATE,wav))
         best_wav = wav_list[0]
         return best_wav
+
+    @prompts(name="Generate Audio From The Image",
+             description="useful for when you want to generate an audio "
+                         "based on an image. "
+                         "The input to this tool should be a string, "
+                         "representing the image_path. ")
+
     def inference(self, image, seed = 55, scale = 3, ddim_steps = 100, W = 624, H = 80):
         melbins,mel_len = 80,624
         with torch.no_grad():
@@ -247,7 +250,6 @@ class TTS:
                          "representing the text used to be converted to speech.")
 
     def inference(self, text):
-        global temp_audio_filename
         inp = {"text": text}
         out = self.inferencer.infer_once(inp)
         audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
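The TTS hunk removes a leftover global temp_audio_filename declaration; the visible body builds a fresh per-call output path from a UUID rather than relying on a module-level name. The path expression, taken verbatim from the hunk, behaves like this as a standalone snippet:

import os
import uuid

# Same expression as in TTS.inference above: an 'audio/' path with a random
# 8-character stem, regenerated on every call.
audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
print(audio_filename)   # e.g. audio/3fa85f64.wav (the stem varies per call)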
@@ -270,6 +272,11 @@ class T2S:
            'notes_duration': '0.113740 | 0.329060 | 0.287950 | 0.133480 | 0.150900 | 0.484730 | 0.242010 | 0.180820 | 0.343570 | 0.152050 | 0.266720 | 0.280310 | 0.633300 | 0.444590'
         }
 
+
+    def set_model_hparams(self):
+        set_hparams(config=self.config, exp_name=self.exp_name, print_hparams=False)
+        self.hp = hp
+
     @prompts(name="Generate Singing Voice From User Input Text, Note and Duration Sequence",
              description="useful for when you want to generate a piece of singing voice (Optional: from User Input Text, Note and Duration Sequence) "
                          "and save it to a file."
@@ -278,11 +285,7 @@ class T2S:
                          "Or Like: Generate a piece of singing voice. Text is xxx, note is xxx, duration is xxx."
                          "The input to this tool should be a comma seperated string of three, "
                          "representing text, note and duration sequence since User Input Text, Note and Duration Sequence are all provided. ")
-
-    def set_model_hparams(self):
-        set_hparams(config=self.config, exp_name=self.exp_name, print_hparams=False)
-        self.hp = hp
-
+
     def inference(self, inputs):
         self.set_model_hparams()
         val = inputs.split(",")
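In T2S the set_model_hparams helper is hoisted above the decorated entry point, and inference keeps taking one comma-separated string carrying text, note and duration. A hedged sketch of how such an input can be taken apart; the sample values and the dict construction are illustrative only, not the verbatim body of T2S.inference:

# Hypothetical input in the documented "text, note, duration" form.
inputs = "Happy birthday to you, C4 | D4 | E4 | F4, 0.4071 | 0.3762 | 0.2422 | 0.5130"

val = inputs.split(",")                              # same split as in the hunk above
text, notes, notes_duration = [v.strip() for v in val]
inp = {'text': text, 'notes': notes, 'notes_duration': notes_duration}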
@@ -311,13 +314,6 @@ class TTS_OOD:
         self.set_model_hparams()
         self.pipe = GenerSpeechInfer(self.hp, device)
 
-    @prompts(name="Style Transfer",
-             description="useful for when you want to generate speech samples with styles "
-                         "(e.g., timbre, emotion, and prosody) derived from a reference custom voice. "
-                         "Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
-                         "The input to this tool should be a comma seperated string of two, "
-                         "representing reference audio path and input text. " )
-
     def set_model_hparams(self):
         set_hparams(config=self.config, exp_name=self.exp_name, print_hparams=False)
         f0_stats_fn = f'{hp["binary_data_dir"]}/train_f0s_mean_std.npy'
@@ -328,6 +324,13 @@ class TTS_OOD:
         hp['emotion_encoder_path'] = 'checkpoints/Emotion_encoder.pt'
         self.hp = hp
 
+    @prompts(name="Style Transfer",
+             description="useful for when you want to generate speech samples with styles "
+                         "(e.g., timbre, emotion, and prosody) derived from a reference custom voice. "
+                         "Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
+                         "The input to this tool should be a comma seperated string of two, "
+                         "representing reference audio path and input text. " )
+
     def inference(self, inputs):
         self.set_model_hparams()
         key = ['ref_audio', 'text']
@@ -349,12 +352,6 @@ class Inpaint:
         self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device)
         self.cmap_transform = matplotlib.cm.viridis
 
-    @prompts(name="Audio Inpainting",
-             description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, "
-                         "this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input. "
-                         "The input to this tool should be a string, "
-                         "representing the audio_path. " )
-
     def make_batch_sd(self, mel, mask, num_samples=1):
 
         mel = torch.from_numpy(mel)[None,None,...].to(dtype=torch.float32)
@@ -471,6 +468,13 @@ class Inpaint:
         audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
         soundfile.write(audio_filename, gen_wav, samplerate = 16000)
         return image_filename, audio_filename
+
+    @prompts(name="Audio Inpainting",
+             description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, "
+                         "this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input. "
+                         "The input to this tool should be a string, "
+                         "representing the audio_path. " )
+
     def inference(self, input_audio_path):
         crop_len = 500 # the full mel cannot be showed due to gradio's Image bug when using tool='sketch'
         crop_mel = self.gen_mel(input_audio_path)[:,:crop_len]
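Net effect across the file: T2A, I2A, T2S, TTS_OOD and Inpaint all end up with their @prompts metadata on inference (TTS already had it there), so a loader that scans instances for methods carrying a name attribute will register the inference entry points. That discovery step is not shown in this diff; the sketch below is one common way to write it and is an assumption, not the repository's actual code.

# Hypothetical tool discovery -- an assumption about how the decorated methods
# are consumed, not code from this commit.
def collect_tools(instance):
    tools = []
    for attr in dir(instance):
        method = getattr(instance, attr)
        if callable(method) and hasattr(method, 'name') and hasattr(method, 'description'):
            tools.append((method.name, method.description, method))
    return tools

# e.g. collect_tools(T2A(device='cuda')) would now surface
# ("Generate Audio From User Input Text", ..., <bound method T2A.inference>)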