import gradio as gr
import torch
import soundfile as sf
import numpy as np
import yaml
from inference import MasteringStyleTransfer
from utils import download_youtube_audio
from config import args
import pyloudnorm as pyln
import tempfile
import os
import matplotlib.pyplot as plt
import io
from PIL import Image  # used to hand the loss plot to gr.Image as a PIL image

mastering_transfer = MasteringStyleTransfer(args)
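# Note: MasteringStyleTransfer is assumed to load its checkpoint and device
# configuration from the arguments in config.py; a single shared instance is
# reused across all Gradio callbacks below.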

def denormalize_audio(audio, dtype=np.int16):
    """
    Denormalize the audio from the range [-1, 1] to the full range of the specified dtype.
    """
    if dtype == np.int16:
        audio = np.clip(audio, -1, 1)  # ensure the input is within [-1, 1]
        return (audio * 32767).astype(np.int16)
    elif dtype == np.float32:
        return audio.astype(np.float32)
    else:
        raise ValueError("Unsupported dtype. Use np.int16 or np.float32.")
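# Illustrative usage (a synthetic 1 kHz test tone; not part of the app flow):
#   t = np.arange(44100) / 44100.0
#   tone = 0.5 * np.sin(2 * np.pi * 1000.0 * t)      # float audio in [-0.5, 0.5]
#   pcm16 = denormalize_audio(tone, dtype=np.int16)  # int16 PCM for playback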

def loudness_normalize(audio, sample_rate, target_loudness=-12.0):
    # Ensure audio is float32
    if audio.dtype != np.float32:
        audio = audio.astype(np.float32)
    # If audio is mono, reshape to (samples, 1)
    if audio.ndim == 1:
        audio = audio.reshape(-1, 1)
    meter = pyln.Meter(sample_rate)  # create BS.1770 meter
    loudness = meter.integrated_loudness(audio)
    return pyln.normalize.loudness(audio, loudness, target_loudness)
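# Illustrative usage (hypothetical file; -12 LUFS matches the default target above):
#   audio, sr = sf.read("example.wav")    # soundfile returns (samples, channels)
#   audio = loudness_normalize(audio, sr) # integrated loudness moved to -12 LUFS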

def process_audio(input_audio, reference_audio):
    output_audio, predicted_params, _, _, _, sr = mastering_transfer.process_audio(
        input_audio, reference_audio, reference_audio, {}, False
    )

    param_output = mastering_transfer.get_param_output_string(predicted_params)

    # Convert output_audio to a numpy array if it's a tensor
    if isinstance(output_audio, torch.Tensor):
        output_audio = output_audio.detach().cpu().numpy()

    # # Normalize output audio
    # output_audio = loudness_normalize(output_audio, sr)

    # Denormalize the audio to int16
    output_audio = denormalize_audio(output_audio, dtype=np.int16)

    if output_audio.ndim == 1:
        output_audio = output_audio.reshape(-1, 1)
    elif output_audio.ndim > 2:
        output_audio = output_audio.squeeze()

    # Ensure the audio is in the correct shape (samples, channels)
    if output_audio.shape[1] > output_audio.shape[0]:
        output_audio = output_audio.transpose(1, 0)

    return (sr, output_audio), param_output

def perform_ito(input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights):
    if ito_reference_audio is None:
        ito_reference_audio = reference_audio

    ito_config = {
        'optimizer': optimizer,
        'learning_rate': learning_rate,
        'num_steps': num_steps,
        'af_weights': af_weights,
        'sample_rate': args.sample_rate
    }

    input_tensor = mastering_transfer.preprocess_audio(input_audio, args.sample_rate)
    reference_tensor = mastering_transfer.preprocess_audio(reference_audio, args.sample_rate)
    ito_reference_tensor = mastering_transfer.preprocess_audio(ito_reference_audio, args.sample_rate)

    initial_reference_feature = mastering_transfer.get_reference_embedding(reference_tensor)

    ito_log = ""
    loss_values = []
    for log_entry, current_output, current_params, step, loss in mastering_transfer.inference_time_optimization(
        input_tensor, ito_reference_tensor, ito_config, initial_reference_feature
    ):
        ito_log += log_entry
        ito_param_output = mastering_transfer.get_param_output_string(current_params)
        loss_values.append(loss)

        # Convert current_output to a numpy array if it's a tensor
        # (detach first: tensors from the optimization loop may still carry gradients)
        if isinstance(current_output, torch.Tensor):
            current_output = current_output.detach().cpu().numpy()

        # Normalize output audio
        current_output = loudness_normalize(current_output, args.sample_rate)
        # Denormalize the audio to int16
        current_output = denormalize_audio(current_output, dtype=np.int16)

        # Ensure the audio is in the correct shape (samples, channels)
        if current_output.ndim == 1:
            current_output = current_output.reshape(-1, 1)
        elif current_output.ndim > 2:
            current_output = current_output.squeeze()

        yield (args.sample_rate, current_output), ito_param_output, step, ito_log, loss_values

def plot_loss_curve(loss_values):
    fig = plt.figure(figsize=(10, 6))
    plt.plot(loss_values)
    plt.title('ITO Loss Curve')
    plt.xlabel('Step')
    plt.ylabel('Loss')
    plt.grid(True)

    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    plt.close(fig)  # free the figure so repeated ITO runs don't leak memory
    buf.seek(0)
    return Image.open(buf)  # gr.Image accepts a PIL image, numpy array, or filepath, not a raw buffer
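# Note: if the app is served headless, selecting the non-interactive backend via
# matplotlib.use("Agg") before importing pyplot avoids display errors; this is a
# general matplotlib consideration, not something the original code configures.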
| """ APP display """ | |
with gr.Blocks() as demo:
    gr.Markdown("# Mastering Style Transfer Demo")

    with gr.Tab("Upload Audio"):
        with gr.Row():
            input_audio = gr.Audio(label="Input Audio")
            reference_audio = gr.Audio(label="Reference Audio")

        process_button = gr.Button("Process Mastering Style Transfer")

        with gr.Row():
            output_audio = gr.Audio(label="Output Audio", type='numpy')
            param_output = gr.Textbox(label="Predicted Parameters", lines=5)

        process_button.click(
            process_audio,
            inputs=[input_audio, reference_audio],
            outputs=[output_audio, param_output]
        )

        gr.Markdown("## Inference Time Optimization (ITO)")

        with gr.Row():
            ito_reference_audio = gr.Audio(label="ITO Reference Audio (optional)")
            with gr.Column():
                num_steps = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Number of Steps")
                optimizer = gr.Dropdown(["Adam", "RAdam", "SGD"], value="RAdam", label="Optimizer")
                learning_rate = gr.Slider(minimum=0.0001, maximum=0.1, value=0.001, step=0.0001, label="Learning Rate")
                af_weights = gr.Textbox(label="AudioFeatureLoss Weights (comma-separated)", value="0.1,0.001,1.0,1.0,0.1")

        ito_button = gr.Button("Perform ITO")

        with gr.Row():
            with gr.Column():
                ito_output_audio = gr.Audio(label="ITO Output Audio")
                ito_param_output = gr.Textbox(label="ITO Predicted Parameters", lines=15)
            with gr.Column():
                ito_steps_taken = gr.Number(label="ITO Steps Taken")
                ito_loss_plot = gr.Image(label="ITO Loss Curve")

        ito_log = gr.Textbox(label="ITO Log", lines=10)

    def run_ito(input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights):
        af_weights = [float(w.strip()) for w in af_weights.split(',')]

        ito_generator = perform_ito(
            input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights
        )

        # Initialize variables to store the final results
        final_audio = None
        final_params = None
        final_steps = 0
        final_log = ""
        loss_values = []  # avoid a NameError if the generator yields nothing

        # Iterate through the generator, keeping only the final results
        for audio, params, steps, log, losses in ito_generator:
            final_audio = audio
            final_params = params
            final_steps = steps
            final_log = log
            loss_values = losses

        loss_plot = plot_loss_curve(loss_values)
        return final_audio, final_params, final_steps, final_log, loss_plot

    ito_button.click(
        run_ito,
        inputs=[input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights],
        outputs=[ito_output_audio, ito_param_output, ito_steps_taken, ito_log, ito_loss_plot]
    )

demo.launch()
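
# Note: on Hugging Face Spaces the bare launch() above is sufficient; when
# running locally, demo.launch(share=True) would additionally create a
# temporary public link (a standard Gradio option, not specific to this app).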