modify fx norm
- app.py +22 -39
- inference.py +0 -2
app.py
CHANGED
@@ -63,31 +63,34 @@ def process_audio_with_youtube(input_audio, input_youtube_url, reference_audio,
 
     return process_audio(input_audio, reference_audio)
 
+def to_numpy_audio(audio):
+    # Convert output_audio to numpy array if it's a tensor
+    if isinstance(audio, torch.Tensor):
+        audio = audio.cpu().numpy()
+    # check dimension
+    if audio.ndim == 1:
+        audio = audio.reshape(-1, 1)
+    elif audio.ndim > 2:
+        audio = audio.squeeze()
+    # Ensure the audio is in the correct shape (samples, channels)
+    if audio.shape[1] > audio.shape[0]:
+        audio = audio.transpose(1,0)
+    return audio
+
 def process_audio(input_audio, reference_audio):
     output_audio, predicted_params, sr, normalized_input = mastering_transfer.process_audio(
         input_audio, reference_audio
     )
 
     param_output = mastering_transfer.get_param_output_string(predicted_params)
-
-    # Convert output_audio to numpy array if it's a tensor
-    if isinstance(output_audio, torch.Tensor):
-        output_audio = output_audio.cpu().numpy()
-
-    if output_audio.ndim == 1:
-        output_audio = output_audio.reshape(-1, 1)
-    elif output_audio.ndim > 2:
-        output_audio = output_audio.squeeze()
-
-    # Ensure the audio is in the correct shape (samples, channels)
-    if output_audio.shape[1] > output_audio.shape[0]:
-        output_audio = output_audio.transpose(1,0)
 
+    # Convert to numpy audio
+    output_audio = to_numpy_audio(output_audio)
+    normalized_input = to_numpy_audio(normalized_input)
     # Normalize output audio
-    output_audio = loudness_normalize(output_audio, sr)
+    output_audio = loudness_normalize(output_audio, sr)
     # Denormalize the audio to int16
     output_audio = denormalize_audio(output_audio, dtype=np.int16)
-    normalized_input = denormalize_audio(normalized_input, dtype=np.int16)
 
     return (sr, output_audio), param_output, (sr, normalized_input)
 
@@ -125,18 +128,8 @@ def perform_ito(input_audio, reference_audio, ito_reference_audio, num_steps, op
     current_output = last_result['audio']
     ito_param_output = mastering_transfer.get_param_output_string(last_result['params'])
 
-    # Convert
-
-    current_output = current_output.cpu().numpy()
-
-    if current_output.ndim == 1:
-        current_output = current_output.reshape(-1, 1)
-    elif current_output.ndim > 2:
-        current_output = current_output.squeeze()
-    # Ensure the audio is in the correct shape (samples, channels)
-    if current_output.shape[1] > current_output.shape[0]:
-        current_output = current_output.transpose(1,0)
-
+    # Convert to numpy audio
+    current_output = to_numpy_audio(current_output)
     # Loudness normalize output audio
     current_output = loudness_normalize(current_output, args.sample_rate)
     # Denormalize the audio to int16
@@ -149,18 +142,8 @@ def update_ito_output(all_results, selected_step):
     current_output = selected_result['audio']
     ito_param_output = mastering_transfer.get_param_output_string(selected_result['params'])
 
-    # Convert
-
-    current_output = current_output.cpu().numpy()
-
-    if current_output.ndim == 1:
-        current_output = current_output.reshape(-1, 1)
-    elif current_output.ndim > 2:
-        current_output = current_output.squeeze()
-    # Ensure the audio is in the correct shape (samples, channels)
-    if current_output.shape[1] > current_output.shape[0]:
-        current_output = current_output.transpose(1,0)
-
+    # Convert to numpy audio
+    current_output = to_numpy_audio(current_output)
     # Loudness normalize output audio
     current_output = loudness_normalize(current_output, args.sample_rate)
     # Denormalize the audio to int16
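The refactor above consolidates three near-identical tensor-to-numpy blocks into the single to_numpy_audio helper. As a standalone, runnable sketch of that helper with an illustrative shape check (the example tensor below is made up, not from the repo):

import torch

def to_numpy_audio(audio):
    # Convert to a numpy array if the input is a torch tensor
    if isinstance(audio, torch.Tensor):
        audio = audio.cpu().numpy()
    # Promote mono 1-D audio to a (samples, 1) column
    if audio.ndim == 1:
        audio = audio.reshape(-1, 1)
    # Squeeze batched shapes such as (1, channels, samples) down to 2-D
    elif audio.ndim > 2:
        audio = audio.squeeze()
    # Ensure (samples, channels) ordering; samples should be the longer axis
    if audio.shape[1] > audio.shape[0]:
        audio = audio.transpose(1, 0)
    return audio

# Illustrative check: a (channels, samples) tensor comes back as (samples, channels)
print(to_numpy_audio(torch.randn(2, 44100)).shape)  # (44100, 2)

One caveat: a batched mono tensor of shape (1, 1, n) squeezes down to 1-D, so the audio.shape[1] lookup that follows would raise an IndexError; the committed helper implicitly assumes the squeeze leaves a 2-D array.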
inference.py
CHANGED
@@ -153,8 +153,6 @@ class MasteringStyleTransfer:
     def process_audio(self, input_audio, reference_audio):
         input_tensor = self.preprocess_audio(input_audio, self.args.sample_rate, normalize=True)
         reference_tensor = self.preprocess_audio(reference_audio, self.args.sample_rate)
-        print(f"input_tensor: {input_tensor.shape}")
-        print(f"reference_tensor: {reference_tensor.shape}")
 
         reference_feature = self.get_reference_embedding(reference_tensor)
 
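The app.py diff calls loudness_normalize and denormalize_audio without showing their definitions. As a rough sketch of what such helpers commonly look like, assuming a pyloudnorm-based meter and a simple float-to-int16 scaling convention; the -14 LUFS target and both bodies are illustrative assumptions, not the repo's actual code:

import numpy as np
import pyloudnorm as pyln

def loudness_normalize(audio, sample_rate, target_lufs=-14.0):
    # Assumed: measure integrated loudness (ITU-R BS.1770) on
    # (samples, channels) float audio and gain it to a target LUFS
    meter = pyln.Meter(sample_rate)
    loudness = meter.integrated_loudness(audio)
    return pyln.normalize.loudness(audio, loudness, target_lufs)

def denormalize_audio(audio, dtype=np.int16):
    # Assumed: clip to [-1, 1] and scale float audio to the integer range
    audio = np.clip(audio, -1.0, 1.0)
    return (audio * np.iinfo(dtype).max).astype(dtype)

The int16 conversion matches the (sr, output_audio) tuples that app.py returns to Gradio's Audio components, which accept a (sample_rate, numpy array) pair.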