Spaces: Build error
Update audio_foundation_models.py
audio_foundation_models.py (+81 -23) CHANGED
@@ -42,6 +42,13 @@ from utils.os_utils import move_file
 import scipy.io.wavfile as wavfile


+def prompts(name, description):
+    def decorator(func):
+        func.name = name
+        func.description = description
+        return func
+
+    return decorator

 def initialize_model(config, ckpt, device):
     config = OmegaConf.load(config)
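Note on the new helper: prompts() only attaches name and description attributes to the decorated method so that an agent framework can later register it as a tool. A minimal, self-contained sketch of how such metadata might be consumed (EchoTool and collect_tools are illustrative stand-ins, not part of this file):

# Sketch only: shows the decorator attaching metadata and a hypothetical registry reading it back.
def prompts(name, description):
    def decorator(func):
        func.name = name
        func.description = description
        return func

    return decorator


class EchoTool:
    @prompts(name="Echo Text",
             description="Returns the input text unchanged. The input should be a string.")
    def inference(self, text):
        return text


def collect_tools(instance):
    # Gather every bound method that the decorator tagged with a name/description.
    tools = []
    for attr in dir(instance):
        fn = getattr(instance, attr)
        if callable(fn) and hasattr(fn, "name") and hasattr(fn, "description"):
            tools.append({"name": fn.name, "description": fn.description, "func": fn})
    return tools


if __name__ == "__main__":
    for tool in collect_tools(EchoTool()):
        print(tool["name"], "->", tool["description"])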
@@ -64,7 +71,7 @@ def initialize_model_inpaint(config, ckpt):
     sampler = DDIMSampler(model)
     return sampler
 def select_best_audio(prompt,wav_list):
-    clap_model = CLAPWrapper('
+    clap_model = CLAPWrapper('useful_ckpts/CLAP/CLAP_weights_2022.pth','useful_ckpts/CLAP/config.yml',use_cuda=torch.cuda.is_available())
     text_embeddings = clap_model.get_text_embeddings([prompt])
     score_list = []
     for data in wav_list:
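For context, select_best_audio appears to rank candidate waveforms by their similarity to the text prompt in a shared embedding space. A rough, framework-agnostic sketch of that ranking idea using plain PyTorch cosine similarity; the embed_text/embed_audio functions below are placeholders standing in for a text/audio encoder such as CLAP, not the CLAPWrapper API:

# Sketch: pick the candidate clip whose embedding is closest to the prompt embedding.
import torch
import torch.nn.functional as F


def embed_text(prompt: str) -> torch.Tensor:
    # Placeholder: a real implementation would call the text encoder.
    torch.manual_seed(abs(hash(prompt)) % (2**31))
    return torch.randn(1, 512)


def embed_audio(wav: torch.Tensor) -> torch.Tensor:
    # Placeholder: a real implementation would call the audio encoder.
    torch.manual_seed(int(wav.sum().abs().item() * 1000) % (2**31))
    return torch.randn(1, 512)


def rank_candidates(prompt: str, wav_list):
    # Return the waveform whose embedding has the highest cosine similarity to the prompt.
    text_emb = embed_text(prompt)
    scores = [F.cosine_similarity(text_emb, embed_audio(wav)).item() for wav in wav_list]
    best_index = int(torch.tensor(scores).argmax())
    return wav_list[best_index]


if __name__ == "__main__":
    candidates = [torch.randn(16000) for _ in range(3)]  # three fake 1-second clips
    best = rank_candidates("a dog barking", candidates)
    print("selected clip with", best.shape[0], "samples")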
@@ -87,6 +94,11 @@ class T2I:
         self.text_refine_gpt2_pipe = pipeline("text-generation", model=self.text_refine_model, tokenizer=self.text_refine_tokenizer, device=self.device)
         self.pipe.to(device)

+    @prompts(name="Generate Image From User Input Text",
+             description="useful when you want to generate an image from a user input text and save it to a file. "
+                         "like: generate an image of an object or something, or generate an image that includes some objects. "
+                         "The input to this tool should be a string, representing the text used to generate image. ")
+
     def inference(self, text):
         image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
         refined_text = self.text_refine_gpt2_pipe(text)[0]["generated_text"]
@@ -103,6 +115,13 @@ class ImageCaptioning:
         self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
         self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(self.device)

+
+    @prompts(name="Remove Something From The Photo",
+             description="useful when you want to remove and object or something from the photo "
+                         "from its description or location. "
+                         "The input to this tool should be a comma separated string of two, "
+                         "representing the image_path and the object need to be removed. ")
+
     def inference(self, image_path):
         inputs = self.processor(Image.open(image_path), return_tensors="pt").to(self.device)
         out = self.model.generate(**inputs)
@@ -113,9 +132,15 @@ class T2A:
     def __init__(self, device):
         print("Initializing Make-An-Audio to %s" % device)
         self.device = device
-        self.sampler = initialize_model('
+        self.sampler = initialize_model('configs/text-to-audio/txt2audio_args.yaml', 'useful_ckpts/ta40multi_epoch=000085.ckpt', device=device)
         self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device)

+    @prompts(name="Generate Audio From User Input Text",
+             description="useful for when you want to generate an audio "
+                         "from a user input text and it saved it to a file."
+                         "The input to this tool should be a string, "
+                         "representing the text used to generate audio.")
+
     def txt2audio(self, text, seed = 55, scale = 1.5, ddim_steps = 100, n_samples = 3, W = 624, H = 80):
         SAMPLE_RATE = 16000
         prng = np.random.RandomState(seed)
@@ -160,8 +185,15 @@ class I2A:
     def __init__(self, device):
         print("Initializing Make-An-Audio-Image to %s" % device)
         self.device = device
-        self.sampler = initialize_model('text_to_audio/
-        self.vocoder = VocoderBigVGAN('text_to_audio/
+        self.sampler = initialize_model('text_to_audio/Make_An_Audio_img/configs/img_to_audio/img2audio_args.yaml', 'text_to_audio/Make_An_Audio_img/useful_ckpts/ta54_epoch=000216.ckpt', device=device)
+        self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio_img/vocoder/logs/bigv16k53w',device=device)
+
+    @prompts(name="Generate Audio From The Image",
+             description="useful for when you want to generate an audio "
+                         "based on an image. "
+                         "The input to this tool should be a string, "
+                         "representing the image_path. ")
+
     def img2audio(self, image, seed = 55, scale = 3, ddim_steps = 100, W = 624, H = 80):
         SAMPLE_RATE = 16000
         n_samples = 1 # only support 1 sample
@@ -205,18 +237,6 @@ class I2A:
         print(f"Processed I2a.run, image_filename: {image}, audio_filename: {audio_filename}")
         return audio_filename

-class TTS:
-    def __init__(self, device=None):
-        self.inferencer = TTSInference(device)
-
-    def inference(self, text):
-        global temp_audio_filename
-        inp = {"text": text}
-        out = self.inferencer.infer_once(inp)
-        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
-        soundfile.write(audio_filename, out, samplerate = 22050)
-        return audio_filename
-
 class T2S:
     def __init__(self, device= None):
         if device is None:
@@ -233,6 +253,15 @@ class T2S:
             'notes_duration': '0.113740 | 0.329060 | 0.287950 | 0.133480 | 0.150900 | 0.484730 | 0.242010 | 0.180820 | 0.343570 | 0.152050 | 0.266720 | 0.280310 | 0.633300 | 0.444590'
         }

+    @prompts(name="Generate Singing Voice From User Input Text, Note and Duration Sequence",
+             description="useful for when you want to generate a piece of singing voice (Optional: from User Input Text, Note and Duration Sequence) "
+                         "and save it to a file. "
+                         "If Like: Generate a piece of singing voice, the input to this tool should be \"\" since there is no User Input Text, Note and Duration Sequence. "
+                         "If Like: Generate a piece of singing voice. Text: xxx, Note: xxx, Duration: xxx. "
+                         "Or Like: Generate a piece of singing voice. Text is xxx, note is xxx, duration is xxx."
+                         "The input to this tool should be a comma seperated string of three, "
+                         "representing text, note and duration sequence since User Input Text, Note and Duration Sequence are all provided. ")
+
     def set_model_hparams(self):
         set_hparams(config=self.config, exp_name=self.exp_name, print_hparams=False)
         self.hp = hp
@@ -241,11 +270,13 @@ class T2S:
         self.set_model_hparams()
         val = inputs.split(",")
         key = ['text', 'notes', 'notes_duration']
-
+        try:
+            inp = {k: v for k, v in zip(key, val)}
+            wav = self.pipe.infer_once(inp)
+        except:
+            print('Error occurs. Generate default audio sample.\n')
             inp = self.default_inp
-
-        inp = {k:v for k,v in zip(key,val)}
-        wav = self.pipe.infer_once(inp)
+            wav = self.pipe.infer_once(inp)
         wav *= 32767
         audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
         wavfile.write(audio_filename, self.hp['audio_sample_rate'], wav.astype(np.int16))
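The hunk above wraps the comma-separated user input in a try/except so that a malformed request falls back to the class's default_inp sample instead of raising. A small standalone sketch of that parse-or-fall-back pattern; run_synthesis and DEFAULT_INPUT are illustrative placeholders, not the repository's pipeline:

# Sketch: try to use the user-supplied fields, fall back to a default sample on any failure.
DEFAULT_INPUT = {
    "text": "hello world",
    "notes": "C4 | D4 | E4",
    "notes_duration": "0.3 | 0.3 | 0.4",
}


def run_synthesis(inp: dict) -> str:
    # Placeholder for self.pipe.infer_once(inp); just echoes the text field.
    return f"synthesized: {inp['text']}"


def synthesize(inputs: str) -> str:
    keys = ["text", "notes", "notes_duration"]
    values = inputs.split(",")
    try:
        if len(values) != len(keys):  # stricter than zip(), which silently drops missing fields
            raise ValueError("expected text, notes and duration")
        inp = dict(zip(keys, values))
        return run_synthesis(inp)
    except Exception:
        print("Error occurs. Generate default audio sample.")
        return run_synthesis(DEFAULT_INPUT)


if __name__ == "__main__":
    print(synthesize("hello, C4 | D4, 0.5 | 0.5"))
    print(synthesize(""))  # malformed input falls back to the default sample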
@@ -263,6 +294,13 @@ class TTS_OOD:
         self.set_model_hparams()
         self.pipe = GenerSpeechInfer(self.hp, device)

+    @prompts(name="Style Transfer",
+             description="useful for when you want to generate speech samples with styles "
+                         "(e.g., timbre, emotion, and prosody) derived from a reference custom voice. "
+                         "Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
+                         "The input to this tool should be a comma seperated string of two, "
+                         "representing reference audio path and input text. ")
+
     def set_model_hparams(self):
         set_hparams(config=self.config, exp_name=self.exp_name, print_hparams=False)
         f0_stats_fn = f'{hp["binary_data_dir"]}/train_f0s_mean_std.npy'
@@ -278,7 +316,6 @@ class TTS_OOD:
         key = ['ref_audio', 'text']
         val = inputs.split(",")
         inp = {k: v for k, v in zip(key, val)}
-        print(inp)
         wav = self.pipe.infer_once(inp)
         wav *= 32767
         audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
@@ -291,9 +328,16 @@ class Inpaint:
     def __init__(self, device):
         print("Initializing Make-An-Audio-inpaint to %s" % device)
         self.device = device
-        self.sampler = initialize_model_inpaint('text_to_audio/
-        self.vocoder = VocoderBigVGAN('
+        self.sampler = initialize_model_inpaint('text_to_audio/Make_An_Audio_inpaint/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio_inpaint/useful_ckpts/inpaint7_epoch00047.ckpt')
+        self.vocoder = VocoderBigVGAN('./vocoder/logs/bigv16k53w',device=device)
         self.cmap_transform = matplotlib.cm.viridis
+
+    @prompts(name="Audio Inpainting",
+             description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, "
+                         "this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input. "
+                         "The input to this tool should be a string, "
+                         "representing the audio_path. ")
+
     def make_batch_sd(self, mel, mask, num_samples=1):

         mel = torch.from_numpy(mel)[None,None,...].to(dtype=torch.float32)
@@ -424,6 +468,13 @@ class ASR:
         print("Initializing Whisper to %s" % device)
         self.device = device
         self.model = whisper.load_model("base", device=device)
+
+    @prompts(name="Transcribe speech",
+             description="useful for when you want to know the text corresponding to a human speech, "
+                         "receives audio_path as input. "
+                         "The input to this tool should be a string, "
+                         "representing the audio_path. ")
+
     def inference(self, audio_path):
         audio = whisper.load_audio(audio_path)
         audio = whisper.pad_or_trim(audio)
@@ -438,6 +489,13 @@ class A2T:
         print("Initializing Audio-To-Text Model to %s" % device)
         self.device = device
         self.model = AudioCapModel("audio_to_text/audiocaps_cntrstv_cnn14rnn_trm")
+
+    @prompts(name="Generate Text From The Audio",
+             description="useful for when you want to describe an audio in text, "
+                         "receives audio_path as input. "
+                         "The input to this tool should be a string, "
+                         "representing the audio_path. ")
+
     def inference(self, audio_path):
         audio = whisper.load_audio(audio_path)
         caption_text = self.model(audio)