Spaces: Build error
Update audio_foundation_models.py
Browse files
audio_foundation_models.py
CHANGED
|
@@ -113,7 +113,7 @@ class T2A:
|
|
| 113 |
def __init__(self, device):
|
| 114 |
print("Initializing Make-An-Audio to %s" % device)
|
| 115 |
self.device = device
|
| 116 |
-
self.sampler = initialize_model('configs/text-to-audio/txt2audio_args.yaml', 'useful_ckpts/ta40multi_epoch=000085.ckpt', device=device)
|
| 117 |
self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device)
|
| 118 |
|
| 119 |
def txt2audio(self, text, seed = 55, scale = 1.5, ddim_steps = 100, n_samples = 3, W = 624, H = 80):
|
|
@@ -160,8 +160,8 @@ class I2A:
|
|
| 160 |
def __init__(self, device):
|
| 161 |
print("Initializing Make-An-Audio-Image to %s" % device)
|
| 162 |
self.device = device
|
| 163 |
-
self.sampler = initialize_model('text_to_audio/… [removed line truncated in the diff view]
|
| 164 |
-
self.vocoder = VocoderBigVGAN('text_to_audio/… [removed line truncated in the diff view]
|
| 165 |
def img2audio(self, image, seed = 55, scale = 3, ddim_steps = 100, W = 624, H = 80):
|
| 166 |
SAMPLE_RATE = 16000
|
| 167 |
n_samples = 1 # only support 1 sample
|
|
@@ -224,7 +224,7 @@ class T2S:
|
|
| 224 |
print("Initializing DiffSinger to %s" % device)
|
| 225 |
self.device = device
|
| 226 |
self.exp_name = 'checkpoints/0831_opencpop_ds1000'
|
| 227 |
-
self.config= '… [removed line truncated in the diff view]
|
| 228 |
self.set_model_hparams()
|
| 229 |
self.pipe = DiffSingerE2EInfer(self.hp, device)
|
| 230 |
self.default_inp = {
|
|
@@ -259,7 +259,7 @@ class TTS_OOD:
|
|
| 259 |
print("Initializing GenerSpeech to %s" % device)
|
| 260 |
self.device = device
|
| 261 |
self.exp_name = 'checkpoints/GenerSpeech'
|
| 262 |
-
self.config = '… [removed line truncated in the diff view]
|
| 263 |
self.set_model_hparams()
|
| 264 |
self.pipe = GenerSpeechInfer(self.hp, device)
|
| 265 |
|
|
@@ -291,7 +291,7 @@ class Inpaint:
|
|
| 291 |
def __init__(self, device):
|
| 292 |
print("Initializing Make-An-Audio-inpaint to %s" % device)
|
| 293 |
self.device = device
|
| 294 |
-
self.sampler = initialize_model_inpaint('text_to_audio/… [removed line truncated in the diff view]
|
| 295 |
self.vocoder = VocoderBigVGAN('./vocoder/logs/bigv16k53w',device=device)
|
| 296 |
self.cmap_transform = matplotlib.cm.viridis
|
| 297 |
def make_batch_sd(self, mel, mask, num_samples=1):
|
|
|
|
| 113 |
def __init__(self, device):
|
| 114 |
print("Initializing Make-An-Audio to %s" % device)
|
| 115 |
self.device = device
|
| 116 |
+
self.sampler = initialize_model('text_to_audio/Make_An_Audio/configs/text-to-audio/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/ta40multi_epoch=000085.ckpt', device=device)
|
| 117 |
self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device)
|
| 118 |
|
| 119 |
def txt2audio(self, text, seed = 55, scale = 1.5, ddim_steps = 100, n_samples = 3, W = 624, H = 80):
|
|
|
|
| 160 |
def __init__(self, device):
|
| 161 |
print("Initializing Make-An-Audio-Image to %s" % device)
|
| 162 |
self.device = device
|
| 163 |
+
self.sampler = initialize_model('text_to_audio/Make_An_Audio/configs/img_to_audio/img2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/ta54_epoch=000216.ckpt', device=device)
|
| 164 |
+
self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device)
|
| 165 |
def img2audio(self, image, seed = 55, scale = 3, ddim_steps = 100, W = 624, H = 80):
|
| 166 |
SAMPLE_RATE = 16000
|
| 167 |
n_samples = 1 # only support 1 sample
|
|
|
|
| 224 |
print("Initializing DiffSinger to %s" % device)
|
| 225 |
self.device = device
|
| 226 |
self.exp_name = 'checkpoints/0831_opencpop_ds1000'
|
| 227 |
+
self.config= 'NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000.yaml'
|
| 228 |
self.set_model_hparams()
|
| 229 |
self.pipe = DiffSingerE2EInfer(self.hp, device)
|
| 230 |
self.default_inp = {
|
|
|
|
| 259 |
print("Initializing GenerSpeech to %s" % device)
|
| 260 |
self.device = device
|
| 261 |
self.exp_name = 'checkpoints/GenerSpeech'
|
| 262 |
+
self.config = 'NeuralSeq/modules/GenerSpeech/config/generspeech.yaml'
|
| 263 |
self.set_model_hparams()
|
| 264 |
self.pipe = GenerSpeechInfer(self.hp, device)
|
| 265 |
|
|
|
|
| 291 |
def __init__(self, device):
|
| 292 |
print("Initializing Make-An-Audio-inpaint to %s" % device)
|
| 293 |
self.device = device
|
| 294 |
+
self.sampler = initialize_model_inpaint('text_to_audio/Make_An_Audio/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/inpaint7_epoch00047.ckpt')
|
| 295 |
self.vocoder = VocoderBigVGAN('./vocoder/logs/bigv16k53w',device=device)
|
| 296 |
self.cmap_transform = matplotlib.cm.viridis
|
| 297 |
def make_batch_sd(self, mel, mask, num_samples=1):
|