Major advancement on melody-conditioned generation.
Full length is now consistent.
Fades between sections may need more work.
- app.py +8 -6
- audiocraft/data/audio_utils.py +59 -0
- audiocraft/models/musicgen.py +6 -6
- audiocraft/utils/extend.py +65 -6
app.py
CHANGED
@@ -15,7 +15,7 @@ import time
 import warnings
 from audiocraft.models import MusicGen
 from audiocraft.data.audio import audio_write
-from audiocraft.data.audio_utils import apply_fade
+from audiocraft.data.audio_utils import apply_fade, apply_tafade
 from audiocraft.utils.extend import generate_music_segments, add_settings_to_image, INTERRUPTING
 import numpy as np
 import random

@@ -165,12 +165,14 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
 overlap_samples = overlap * MODEL.sample_rate
 #stack tracks and fade out/in
 overlapping_output_fadeout = output[:, :, -overlap_samples:]
-overlapping_output_fadeout = apply_fade(overlapping_output_fadeout,sample_rate=MODEL.sample_rate,duration=overlap,out=True,start=True, curve_end=0.
+overlapping_output_fadeout = apply_fade(overlapping_output_fadeout,sample_rate=MODEL.sample_rate,duration=overlap,out=True,start=True, curve_end=0.0, current_device=MODEL.device)
+#overlapping_output_fadeout = apply_tafade(overlapping_output_fadeout,sample_rate=MODEL.sample_rate,duration=overlap,out=True,start=True,shape="exponential")

 overlapping_output_fadein = output_segments[i][:, :, :overlap_samples]
-overlapping_output_fadein = apply_fade(overlapping_output_fadein,sample_rate=MODEL.sample_rate,duration=overlap,out=False,start=False, curve_start=0.
+overlapping_output_fadein = apply_fade(overlapping_output_fadein,sample_rate=MODEL.sample_rate,duration=overlap,out=False,start=False, curve_start=0.0, current_device=MODEL.device)
+#overlapping_output_fadein = apply_tafade(overlapping_output_fadein,sample_rate=MODEL.sample_rate,duration=overlap,out=False,start=False, shape="linear")

-overlapping_output = (overlapping_output_fadeout
+overlapping_output = torch.cat([overlapping_output_fadeout[:, :, :-(overlap_samples // 2)], overlapping_output_fadein],dim=2)
 print(f" overlap size Fade:{overlapping_output.size()}\n output: {output.size()}\n segment: {output_segments[i].size()}")
 ##overlapping_output = torch.cat([output[:, :, -overlap_samples:], output_segments[i][:, :, :overlap_samples]], dim=1) #stack tracks
 ##print(f" overlap size stack:{overlapping_output.size()}\n output: {output.size()}\n segment: {output_segments[i].size()}")

@@ -190,7 +192,7 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
 background = add_settings_to_image(title, video_description, background_path=background, font=settings_font, font_color=settings_font_color)
 audio_write(
     file.name, output, MODEL.sample_rate, strategy="loudness",
-    loudness_headroom_db=
+    loudness_headroom_db=18, loudness_compressor=True, add_suffix=False, channels=2)
 waveform_video = make_waveform(file.name,bg_image=background, bar_count=45)
 if MOVE_TO_CPU:
     MODEL.to('cpu')

@@ -245,7 +247,7 @@ def ui(**kwargs):
 model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
 with gr.Row():
     duration = gr.Slider(minimum=1, maximum=720, value=10, label="Duration", interactive=True)
-    overlap = gr.Slider(minimum=1, maximum=
+    overlap = gr.Slider(minimum=1, maximum=15, value=5, step=1, label="Overlap", interactive=True)
     dimension = gr.Slider(minimum=-2, maximum=2, value=2, step=1, label="Dimension", info="determines which direction to add new segements of audio. (1 = stack tracks, 2 = lengthen, -2..0 = ?)", interactive=True)
 with gr.Row():
     topk = gr.Number(label="Top-k", value=250, precision=0, interactive=True)
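How the new seam in app.py works: the tail of the previous output is faded toward silence with apply_fade, the head of the next segment is faded in, and the two faded pieces are concatenated, with the faded tail trimmed by half the overlap. Below is a minimal standalone sketch of that idea on dummy tensors; the 32 kHz rate, 5 s overlap and 30 s segment length are illustrative assumptions, not values taken from the Space.

import torch

# Illustrative sketch of the fade-out / fade-in / concatenate seam (not the Space's code).
sample_rate = 32000                                   # assumed MusicGen-style output rate
overlap = 5                                           # assumed overlap in seconds
overlap_samples = overlap * sample_rate

prev_segment = torch.randn(1, 1, 30 * sample_rate)    # (batch, channels, samples)
next_segment = torch.randn(1, 1, 30 * sample_rate)

fade = torch.linspace(1.0, 0.0, overlap_samples)              # 1 -> 0 ramp
tail = prev_segment[:, :, -overlap_samples:] * fade           # previous tail fades out
head = next_segment[:, :, :overlap_samples] * (1.0 - fade)    # next head fades in

# Join: audio before the overlap, the faded tail trimmed by half the overlap, then the faded head.
joined = torch.cat(
    [prev_segment[:, :, :-overlap_samples], tail[:, :, :-(overlap_samples // 2)], head],
    dim=2,
)
print(joined.shape)  # torch.Size([1, 1, 1040000]) = 25 s of prev + 2.5 s faded tail + 5 s faded head

Because the faded pieces are concatenated rather than summed, each seam keeps roughly one and a half overlaps of audio; an additive cross-fade over the same region would be the usual alternative.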
audiocraft/data/audio_utils.py
CHANGED
@@ -173,7 +173,66 @@ def i16_pcm(wav: torch.Tensor) -> torch.Tensor:
     assert wav.dtype == torch.int16
     return wav

+def apply_tafade(audio: torch.Tensor, sample_rate, duration=3.0, out=True, start=True, shape: str = "linear") -> torch.Tensor:
+    """
+    Apply fade-in and/or fade-out effects to the audio tensor.
+
+    Args:
+        audio (torch.Tensor): The input audio tensor of shape (C, L).
+        sample_rate (int): The sample rate of the audio.
+        duration (float, optional): The duration of the fade in seconds. Defaults to 3.0.
+        out (bool, optional): Determines whether to apply fade-in (False) or fade-out (True) effect. Defaults to True.
+        start (bool, optional): Determines whether the fade is applied to the beginning (True) or end (False) of the audio. Defaults to True.
+        shape (str, optional): The shape of the fade. Must be one of: "quarter_sine", "half_sine", "linear", "logarithmic", "exponential". Defaults to "linear".
+
+    Returns:
+        torch.Tensor: The audio tensor with the fade effect applied.
+    """
+    fade_samples = int(sample_rate * duration)  # Number of samples for the fade duration
+
+    # Create the fade transform
+    fade_transform = torchaudio.transforms.Fade(fade_in_len=fade_samples, fade_out_len=fade_samples, fade_shape=shape)
+
+    if out:
+        fade_transform.fade_out_len = fade_samples
+        fade_transform.fade_out_shape = shape
+
+    # Select the portion of the audio to apply the fade
+    if start:
+        audio_fade_section = audio[:, :fade_samples]
+    else:
+        audio_fade_section = audio[:, -fade_samples:]
+
+    # Apply the fade transform to the audio section
+    audio_faded = fade_transform(audio)
+
+    # Replace the selected portion of the audio with the faded section
+    if start:
+        audio_faded[:, :fade_samples] = audio_fade_section
+    else:
+        audio_faded[:, -fade_samples:] = audio_fade_section
+
+    return audio_faded
+
 def apply_fade(audio: torch.Tensor, sample_rate, duration=3.0, out=True, start=True, curve_start:float=0.0, curve_end:float=1.0, current_device:str="cpu") -> torch.Tensor:
+    """
+    Apply fade-in and/or fade-out effects to the audio tensor.
+
+    Args:
+        audio (torch.Tensor): The input audio tensor of shape (C, L).
+        sample_rate (int): The sample rate of the audio.
+        duration (float, optional): The duration of the fade in seconds. Defaults to 3.0.
+        out (bool, optional): Determines whether to apply fade-in (False) or fade-out (True) effect. Defaults to True.
+        start (bool, optional): Determines whether the fade is applied to the beginning (True) or end (False) of the audio. Defaults to True.
+        curve_start (float, optional): The starting amplitude of the fade curve. Defaults to 0.0.
+        curve_end (float, optional): The ending amplitude of the fade curve. Defaults to 1.0.
+        current_device (str, optional): The device on which the fade curve tensor should be created. Defaults to "cpu".
+
+    Returns:
+        torch.Tensor: The audio tensor with the fade effect applied.
+    """
     fade_samples = int(sample_rate * duration)  # Number of samples for the fade duration
     fade_curve = torch.linspace(curve_start, curve_end, fade_samples, device=current_device)  # Generate linear fade curve
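apply_tafade and apply_fade take two routes to the same effect: the first delegates to torchaudio.transforms.Fade, which supports the shaped curves listed in the docstring ("quarter_sine", "half_sine", "linear", "logarithmic", "exponential"), while the second multiplies the signal by a plain torch.linspace ramp. A small self-contained sketch of both routes, with illustrative sizes rather than the Space's values:

import torch
import torchaudio

sample_rate = 32000                       # assumed rate, for illustration only
fade_seconds = 3.0
fade_samples = int(sample_rate * fade_seconds)
audio = torch.randn(2, 10 * sample_rate)  # (channels, samples): 10 s of noise

# torchaudio route (what apply_tafade wraps): shaped fade-out over the last 3 s.
fade_out = torchaudio.transforms.Fade(fade_in_len=0, fade_out_len=fade_samples, fade_shape="exponential")
shaped = fade_out(audio)

# linear route (what apply_fade's torch.linspace curve does): multiply the tail by a 1 -> 0 ramp.
ramp = torch.linspace(1.0, 0.0, fade_samples)
linear = audio.clone()
linear[:, -fade_samples:] *= ramp

The linear route is what app.py calls in this commit; the apply_tafade calls are left commented out there.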
audiocraft/models/musicgen.py
CHANGED
@@ -207,8 +207,8 @@ class MusicGen:
         convert_audio(wav, sample_rate, self.sample_rate, self.audio_channels)
         if wav is not None else None
         for wav in melody_wavs]
-        attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions=descriptions, prompt=None,
-                                                                        melody_wavs=melody_wavs)
+        #attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions=descriptions, prompt=None,
+        #                                                                melody_wavs=melody_wavs)

         if prompt is not None:
             if prompt.dim() == 2:

@@ -219,11 +219,11 @@
         if descriptions is None:
             descriptions = [None] * len(prompt)

-        if prompt is not None:
-            attributes_gen, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, prompt)
+        #if prompt is not None:
+        #    attributes_gen, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, prompt)

-
-
+        attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions=descriptions, prompt=prompt,
+                                                                        melody_wavs=melody_wavs)
         if prompt is not None:
             assert prompt_tokens is not None
         else:
audiocraft/utils/extend.py
CHANGED
@@ -1,3 +1,4 @@
+from tabnanny import verbose
 import torch
 import math
 from audiocraft.models import MusicGen

@@ -54,9 +55,21 @@ def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:

     # Calculate the total number of segments
     total_segments = max(math.ceil(duration / segment_duration),1)
+    #calculate duration loss from segment overlap
+    duration_loss = max(total_segments - 1,0) * math.ceil(overlap / 2)
     #calc excess duration
     excess_duration = segment_duration - (total_segments * segment_duration - duration)
-    print(f"total Segments to Generate: {total_segments} for {duration} seconds. Each segment is {segment_duration} seconds. Excess {excess_duration}")
+    print(f"total Segments to Generate: {total_segments} for {duration} seconds. Each segment is {segment_duration} seconds. Excess {excess_duration} Overlap Loss {duration_loss}")
+    duration += duration_loss
+    while excess_duration + duration_loss > segment_duration:
+        total_segments += 1
+        #calculate duration loss from segment overlap
+        duration_loss = max(total_segments - 1,0) * math.ceil(overlap / 2)
+        #calc excess duration
+        excess_duration = segment_duration - (total_segments * segment_duration - duration)
+        print(f"total Segments to Generate: {total_segments} for {duration} seconds. Each segment is {segment_duration} seconds. Excess {excess_duration} Overlap Loss {duration_loss}")
+        if excess_duration + duration_loss > segment_duration:
+            duration += duration_loss

     # If melody_segments is shorter than total_segments, repeat the segments until the total_segments is reached
     if len(melody_segments) < total_segments:

@@ -83,24 +96,70 @@ def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:
     torch.manual_seed(seed)
     for idx, verse in enumerate(melodys):
         if INTERRUPTING:
-            return output_segments, duration
+            return output_segments, duration
+
+        print(f'Segment duration: {segment_duration}, duration: {duration}, overlap: {overlap} Overlap Loss: {duration_loss}')
+        # Compensate for the length of final segment
+        if (idx + 1) == len(melodys):
+            print(f'Modify Last verse length, duration: {duration}, overlap: {overlap} Overlap Loss: {duration_loss}')
+            MODEL.set_generation_params(
+                use_sampling=True,
+                top_k=MODEL.generation_params["top_k"],
+                top_p=MODEL.generation_params["top_p"],
+                temperature=MODEL.generation_params["temp"],
+                cfg_coef=MODEL.generation_params["cfg_coef"],
+                duration=duration,
+                two_step_cfg=False,
+                rep_penalty=0.5
+            )
+            try:
+                # get last chunk
+                verse = verse[:, :, -duration*MODEL.sample_rate:]
+                prompt_segment = prompt_segment[:, :, -duration*MODEL.sample_rate:]
+            except:
+                # get first chunk
+                verse = verse[:, :, :duration*MODEL.sample_rate]
+                prompt_segment = prompt_segment[:, :, :duration*MODEL.sample_rate]
+
+        else:
+            MODEL.set_generation_params(
+                use_sampling=True,
+                top_k=MODEL.generation_params["top_k"],
+                top_p=MODEL.generation_params["top_p"],
+                temperature=MODEL.generation_params["temp"],
+                cfg_coef=MODEL.generation_params["cfg_coef"],
+                duration=segment_duration,
+                two_step_cfg=False,
+                rep_penalty=0.5
+            )
+
+        # Generate a new prompt segment based on the first verse. This will be applied to all segments for consistency
+        if idx == 0:
+            print(f"Generating New Prompt Segment: {text}\r")
+            prompt_segment = MODEL.generate_with_all(
+                descriptions=[text],
+                melody_wavs=verse,
+                sample_rate=sr,
+                progress=False,
+                prompt=None,
+            )

         print(f"Generating New Melody Segment {idx + 1}: {text}\r")
-        if output_segments:
-            # If this isn't the first segment, use the last chunk of the previous segment as the input
-            last_chunk = output_segments[-1][:, :, -overlap*MODEL.sample_rate:]
         output = MODEL.generate_with_all(
             descriptions=[text],
             melody_wavs=verse,
             sample_rate=sr,
             progress=False,
-            prompt=
+            prompt=prompt_segment,
         )

         # Append the generated output to the list of segments
         #output_segments.append(output[:, :segment_duration])
         output_segments.append(output)
         print(f"output_segments: {len(output_segments)}: shape: {output.shape} dim {output.dim()}")
+        #track duration
+        if duration > segment_duration:
+            duration -= segment_duration
     return output_segments, excess_duration

 def save_image(image):
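The segment bookkeeping at the top of generate_music_segments is easier to follow with concrete numbers. The sketch below mirrors that arithmetic with purely illustrative inputs (60 s requested, 30 s segments, 10 s overlap); it is not a call into the Space's code.

import math

# Illustrative inputs only.
duration, segment_duration, overlap = 60, 30, 10

total_segments = max(math.ceil(duration / segment_duration), 1)                          # 2
duration_loss = max(total_segments - 1, 0) * math.ceil(overlap / 2)                      # 5 s lost at the first seam
excess_duration = segment_duration - (total_segments * segment_duration - duration)      # 30

duration += duration_loss                                                                # 65: request extra audio up front
while excess_duration + duration_loss > segment_duration:
    total_segments += 1                                                                  # now 3 segments
    duration_loss = max(total_segments - 1, 0) * math.ceil(overlap / 2)                  # 10 s lost across two seams
    excess_duration = segment_duration - (total_segments * segment_duration - duration)  # 5
    if excess_duration + duration_loss > segment_duration:
        duration += duration_loss

print(total_segments, duration, duration_loss, excess_duration)                          # 3 65 10 5

With these numbers the request is promoted from 2 to 3 segments and the generation target grows to 65 s, so the audio consumed by the seams still leaves the full requested length, which matches the note above that the full length is now consistent.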
|