Spaces:
Running
on
Zero
Running
on
Zero
Upload app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,611 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import shutil
|
| 3 |
+
import json
|
| 4 |
+
import torch
|
| 5 |
+
import torchaudio
|
| 6 |
+
import numpy as np
|
| 7 |
+
import logging
|
| 8 |
+
import warnings
|
| 9 |
+
import subprocess
|
| 10 |
+
import math
|
| 11 |
+
import random
|
| 12 |
+
import time
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from tqdm import tqdm
|
| 15 |
+
from PIL import Image
|
| 16 |
+
from huggingface_hub import snapshot_download
|
| 17 |
+
from omegaconf import DictConfig
|
| 18 |
+
import hydra
|
| 19 |
+
from hydra.utils import to_absolute_path
|
| 20 |
+
from transformers import Wav2Vec2FeatureExtractor, AutoModel
|
| 21 |
+
import mir_eval
|
| 22 |
+
import pretty_midi as pm
|
| 23 |
+
import gradio as gr
|
| 24 |
+
from gradio import Markdown
|
| 25 |
+
from music21 import converter
|
| 26 |
+
import torchaudio.transforms as T
|
| 27 |
+
|
| 28 |
+
# Custom utility imports
|
| 29 |
+
from utils import logger
|
| 30 |
+
from utils.btc_model import BTC_model
|
| 31 |
+
from utils.transformer_modules import *
|
| 32 |
+
from utils.transformer_modules import _gen_timing_signal, _gen_bias_mask
|
| 33 |
+
from utils.hparams import HParams
|
| 34 |
+
from utils.mir_eval_modules import (
|
| 35 |
+
audio_file_to_features, idx2chord, idx2voca_chord,
|
| 36 |
+
get_audio_paths, get_lab_paths
|
| 37 |
+
)
|
| 38 |
+
from utils.mert import FeatureExtractorMERT
|
| 39 |
+
from model.linear_mt_attn_ck import FeedforwardModelMTAttnCK
|
| 40 |
+
|
| 41 |
+
# Suppress unnecessary warnings and logs
|
| 42 |
+
warnings.filterwarnings("ignore")
|
| 43 |
+
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)
|
| 44 |
+
|
| 45 |
+
# from gradio import Markdown
|
| 46 |
+
|
| 47 |
+
PITCH_CLASS = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
|
| 48 |
+
|
| 49 |
+
pitch_num_dic = {
|
| 50 |
+
'C': 0, 'C#': 1, 'D': 2, 'D#': 3, 'E': 4, 'F': 5,
|
| 51 |
+
'F#': 6, 'G': 7, 'G#': 8, 'A': 9, 'A#': 10, 'B': 11
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
minor_major_dic = {
|
| 55 |
+
'D-':'C#', 'E-':'D#', 'G-':'F#', 'A-':'G#', 'B-':'A#'
|
| 56 |
+
}
|
| 57 |
+
minor_major_dic2 = {
|
| 58 |
+
'Db':'C#', 'Eb':'D#', 'Gb':'F#', 'Ab':'G#', 'Bb':'A#'
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
shift_major_dic = {
|
| 62 |
+
'C': 0, 'C#': 1, 'D': 2, 'D#': 3, 'E': 4, 'F': 5,
|
| 63 |
+
'F#': 6, 'G': 7, 'G#': 8, 'A': 9, 'A#': 10, 'B': 11
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
shift_minor_dic = {
|
| 67 |
+
'A': 0, 'A#': 1, 'B': 2, 'C': 3, 'C#': 4, 'D': 5,
|
| 68 |
+
'D#': 6, 'E': 7, 'F': 8, 'F#': 9, 'G': 10, 'G#': 11,
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
flat_to_sharp_mapping = {
|
| 72 |
+
"Cb": "B",
|
| 73 |
+
"Db": "C#",
|
| 74 |
+
"Eb": "D#",
|
| 75 |
+
"Fb": "E",
|
| 76 |
+
"Gb": "F#",
|
| 77 |
+
"Ab": "G#",
|
| 78 |
+
"Bb": "A#"
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
segment_duration = 30
|
| 82 |
+
resample_rate = 24000
|
| 83 |
+
is_split = True
|
| 84 |
+
|
| 85 |
+
def normalize_chord(file_path, key, key_type='major'):
|
| 86 |
+
with open(file_path, 'r') as f:
|
| 87 |
+
lines = f.readlines()
|
| 88 |
+
|
| 89 |
+
if key == "None":
|
| 90 |
+
new_key = "C major"
|
| 91 |
+
shift = 0
|
| 92 |
+
else:
|
| 93 |
+
#print ("asdas",key)
|
| 94 |
+
if len(key) == 1:
|
| 95 |
+
key = key[0].upper()
|
| 96 |
+
else:
|
| 97 |
+
key = key[0].upper() + key[1:]
|
| 98 |
+
|
| 99 |
+
if key in minor_major_dic2:
|
| 100 |
+
key = minor_major_dic2[key]
|
| 101 |
+
|
| 102 |
+
shift = 0
|
| 103 |
+
|
| 104 |
+
if key_type == "major":
|
| 105 |
+
new_key = "C major"
|
| 106 |
+
|
| 107 |
+
shift = shift_major_dic[key]
|
| 108 |
+
else:
|
| 109 |
+
new_key = "A minor"
|
| 110 |
+
shift = shift_minor_dic[key]
|
| 111 |
+
|
| 112 |
+
converted_lines = []
|
| 113 |
+
for line in lines:
|
| 114 |
+
if line.strip(): # Skip empty lines
|
| 115 |
+
parts = line.split()
|
| 116 |
+
start_time = parts[0]
|
| 117 |
+
end_time = parts[1]
|
| 118 |
+
chord = parts[2] # The chord is in the 3rd column
|
| 119 |
+
if chord == "N":
|
| 120 |
+
newchordnorm = "N"
|
| 121 |
+
elif chord == "X":
|
| 122 |
+
newchordnorm = "X"
|
| 123 |
+
elif ":" in chord:
|
| 124 |
+
pitch = chord.split(":")[0]
|
| 125 |
+
attr = chord.split(":")[1]
|
| 126 |
+
pnum = pitch_num_dic [pitch]
|
| 127 |
+
new_idx = (pnum - shift)%12
|
| 128 |
+
newchord = PITCH_CLASS[new_idx]
|
| 129 |
+
newchordnorm = newchord + ":" + attr
|
| 130 |
+
else:
|
| 131 |
+
pitch = chord
|
| 132 |
+
pnum = pitch_num_dic [pitch]
|
| 133 |
+
new_idx = (pnum - shift)%12
|
| 134 |
+
newchord = PITCH_CLASS[new_idx]
|
| 135 |
+
newchordnorm = newchord
|
| 136 |
+
|
| 137 |
+
converted_lines.append(f"{start_time} {end_time} {newchordnorm}\n")
|
| 138 |
+
|
| 139 |
+
return converted_lines
|
| 140 |
+
|
| 141 |
+
def sanitize_key_signature(key):
|
| 142 |
+
return key.replace('-', 'b')
|
| 143 |
+
|
| 144 |
+
def resample_waveform(waveform, original_sample_rate, target_sample_rate):
|
| 145 |
+
if original_sample_rate != target_sample_rate:
|
| 146 |
+
resampler = T.Resample(original_sample_rate, target_sample_rate)
|
| 147 |
+
return resampler(waveform), target_sample_rate
|
| 148 |
+
return waveform, original_sample_rate
|
| 149 |
+
|
| 150 |
+
def split_audio(waveform, sample_rate):
|
| 151 |
+
segment_samples = segment_duration * sample_rate
|
| 152 |
+
total_samples = waveform.size(0)
|
| 153 |
+
|
| 154 |
+
segments = []
|
| 155 |
+
for start in range(0, total_samples, segment_samples):
|
| 156 |
+
end = start + segment_samples
|
| 157 |
+
if end <= total_samples:
|
| 158 |
+
segment = waveform[start:end]
|
| 159 |
+
segments.append(segment)
|
| 160 |
+
|
| 161 |
+
# In case audio length is shorter than segment length.
|
| 162 |
+
if len(segments) == 0:
|
| 163 |
+
segment = waveform
|
| 164 |
+
segments.append(segment)
|
| 165 |
+
|
| 166 |
+
return segments
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
class Music2emo:
|
| 171 |
+
def __init__(
|
| 172 |
+
self,
|
| 173 |
+
name="amaai-lab/music2emo",
|
| 174 |
+
device="cuda:0",
|
| 175 |
+
cache_dir=None,
|
| 176 |
+
local_files_only=False,
|
| 177 |
+
):
|
| 178 |
+
|
| 179 |
+
# use_cuda = torch.cuda.is_available()
|
| 180 |
+
# self.device = torch.device("cuda" if use_cuda else "cpu")
|
| 181 |
+
model_weights = "saved_models/J_all.ckpt"
|
| 182 |
+
self.device = device
|
| 183 |
+
|
| 184 |
+
self.feature_extractor = FeatureExtractorMERT(model_name='m-a-p/MERT-v1-95M', device=self.device, sr=resample_rate)
|
| 185 |
+
self.model_weights = model_weights
|
| 186 |
+
|
| 187 |
+
self.music2emo_model = FeedforwardModelMTAttnCK(
|
| 188 |
+
input_size= 768 * 2,
|
| 189 |
+
output_size_classification=56,
|
| 190 |
+
output_size_regression=2
|
| 191 |
+
)
|
| 192 |
+
|
| 193 |
+
checkpoint = torch.load(self.model_weights, map_location=self.device, weights_only=False)
|
| 194 |
+
state_dict = checkpoint["state_dict"]
|
| 195 |
+
|
| 196 |
+
# Adjust the keys in the state_dict
|
| 197 |
+
state_dict = {key.replace("model.", ""): value for key, value in state_dict.items()}
|
| 198 |
+
|
| 199 |
+
# Filter state_dict to match model's keys
|
| 200 |
+
model_keys = set(self.music2emo_model.state_dict().keys())
|
| 201 |
+
filtered_state_dict = {key: value for key, value in state_dict.items() if key in model_keys}
|
| 202 |
+
|
| 203 |
+
# Load the filtered state_dict and set the model to evaluation mode
|
| 204 |
+
self.music2emo_model.load_state_dict(filtered_state_dict)
|
| 205 |
+
|
| 206 |
+
self.music2emo_model.to(self.device)
|
| 207 |
+
self.music2emo_model.eval()
|
| 208 |
+
|
| 209 |
+
def predict(self, audio, threshold = 0.5):
|
| 210 |
+
|
| 211 |
+
feature_dir = Path("./temp_out")
|
| 212 |
+
output_dir = Path("./output")
|
| 213 |
+
|
| 214 |
+
if feature_dir.exists():
|
| 215 |
+
shutil.rmtree(str(feature_dir))
|
| 216 |
+
if output_dir.exists():
|
| 217 |
+
shutil.rmtree(str(output_dir))
|
| 218 |
+
|
| 219 |
+
feature_dir.mkdir(parents=True)
|
| 220 |
+
output_dir.mkdir(parents=True)
|
| 221 |
+
|
| 222 |
+
warnings.filterwarnings('ignore')
|
| 223 |
+
logger.logging_verbosity(1)
|
| 224 |
+
|
| 225 |
+
mert_dir = feature_dir / "mert"
|
| 226 |
+
mert_dir.mkdir(parents=True)
|
| 227 |
+
|
| 228 |
+
waveform, sample_rate = torchaudio.load(audio)
|
| 229 |
+
if waveform.shape[0] > 1:
|
| 230 |
+
waveform = waveform.mean(dim=0).unsqueeze(0)
|
| 231 |
+
waveform = waveform.squeeze()
|
| 232 |
+
waveform, sample_rate = resample_waveform(waveform, sample_rate, resample_rate)
|
| 233 |
+
|
| 234 |
+
if is_split:
|
| 235 |
+
segments = split_audio(waveform, sample_rate)
|
| 236 |
+
for i, segment in enumerate(segments):
|
| 237 |
+
segment_save_path = os.path.join(mert_dir, f"segment_{i}.npy")
|
| 238 |
+
self.feature_extractor.extract_features_from_segment(segment, sample_rate, segment_save_path)
|
| 239 |
+
else:
|
| 240 |
+
segment_save_path = os.path.join(mert_dir, f"segment_0.npy")
|
| 241 |
+
self.feature_extractor.extract_features_from_segment(waveform, sample_rate, segment_save_path)
|
| 242 |
+
|
| 243 |
+
embeddings = []
|
| 244 |
+
layers_to_extract = [5,6]
|
| 245 |
+
segment_embeddings = []
|
| 246 |
+
for filename in sorted(os.listdir(mert_dir)): # Sort files to ensure sequential order
|
| 247 |
+
file_path = os.path.join(mert_dir, filename)
|
| 248 |
+
if os.path.isfile(file_path) and filename.endswith('.npy'):
|
| 249 |
+
segment = np.load(file_path)
|
| 250 |
+
concatenated_features = np.concatenate(
|
| 251 |
+
[segment[:, layer_idx, :] for layer_idx in layers_to_extract], axis=1
|
| 252 |
+
)
|
| 253 |
+
concatenated_features = np.squeeze(concatenated_features) # Shape: 768 * 2 = 1536
|
| 254 |
+
segment_embeddings.append(concatenated_features)
|
| 255 |
+
|
| 256 |
+
segment_embeddings = np.array(segment_embeddings)
|
| 257 |
+
if len(segment_embeddings) > 0:
|
| 258 |
+
final_embedding_mert = np.mean(segment_embeddings, axis=0)
|
| 259 |
+
else:
|
| 260 |
+
final_embedding_mert = np.zeros((1536,))
|
| 261 |
+
|
| 262 |
+
final_embedding_mert = torch.from_numpy(final_embedding_mert)
|
| 263 |
+
final_embedding_mert.to(self.device)
|
| 264 |
+
|
| 265 |
+
# --- Chord feature extract ---
|
| 266 |
+
config = HParams.load("./inference/data/run_config.yaml")
|
| 267 |
+
config.feature['large_voca'] = True
|
| 268 |
+
config.model['num_chords'] = 170
|
| 269 |
+
model_file = './inference/data/btc_model_large_voca.pt'
|
| 270 |
+
idx_to_chord = idx2voca_chord()
|
| 271 |
+
model = BTC_model(config=config.model).to(self.device)
|
| 272 |
+
|
| 273 |
+
if os.path.isfile(model_file):
|
| 274 |
+
checkpoint = torch.load(model_file)
|
| 275 |
+
mean = checkpoint['mean']
|
| 276 |
+
std = checkpoint['std']
|
| 277 |
+
model.load_state_dict(checkpoint['model'])
|
| 278 |
+
|
| 279 |
+
audio_path = audio
|
| 280 |
+
audio_id = audio_path.split("/")[-1][:-4]
|
| 281 |
+
try:
|
| 282 |
+
feature, feature_per_second, song_length_second = audio_file_to_features(audio_path, config)
|
| 283 |
+
except:
|
| 284 |
+
logger.info("audio file failed to load : %s" % audio_path)
|
| 285 |
+
assert(False)
|
| 286 |
+
|
| 287 |
+
logger.info("audio file loaded and feature computation success : %s" % audio_path)
|
| 288 |
+
|
| 289 |
+
feature = feature.T
|
| 290 |
+
feature = (feature - mean) / std
|
| 291 |
+
time_unit = feature_per_second
|
| 292 |
+
n_timestep = config.model['timestep']
|
| 293 |
+
|
| 294 |
+
num_pad = n_timestep - (feature.shape[0] % n_timestep)
|
| 295 |
+
feature = np.pad(feature, ((0, num_pad), (0, 0)), mode="constant", constant_values=0)
|
| 296 |
+
num_instance = feature.shape[0] // n_timestep
|
| 297 |
+
|
| 298 |
+
start_time = 0.0
|
| 299 |
+
lines = []
|
| 300 |
+
with torch.no_grad():
|
| 301 |
+
model.eval()
|
| 302 |
+
feature = torch.tensor(feature, dtype=torch.float32).unsqueeze(0).to(self.device)
|
| 303 |
+
for t in range(num_instance):
|
| 304 |
+
self_attn_output, _ = model.self_attn_layers(feature[:, n_timestep * t:n_timestep * (t + 1), :])
|
| 305 |
+
prediction, _ = model.output_layer(self_attn_output)
|
| 306 |
+
prediction = prediction.squeeze()
|
| 307 |
+
for i in range(n_timestep):
|
| 308 |
+
if t == 0 and i == 0:
|
| 309 |
+
prev_chord = prediction[i].item()
|
| 310 |
+
continue
|
| 311 |
+
if prediction[i].item() != prev_chord:
|
| 312 |
+
lines.append(
|
| 313 |
+
'%.3f %.3f %s\n' % (start_time, time_unit * (n_timestep * t + i), idx_to_chord[prev_chord]))
|
| 314 |
+
start_time = time_unit * (n_timestep * t + i)
|
| 315 |
+
prev_chord = prediction[i].item()
|
| 316 |
+
if t == num_instance - 1 and i + num_pad == n_timestep:
|
| 317 |
+
if start_time != time_unit * (n_timestep * t + i):
|
| 318 |
+
lines.append('%.3f %.3f %s\n' % (start_time, time_unit * (n_timestep * t + i), idx_to_chord[prev_chord]))
|
| 319 |
+
break
|
| 320 |
+
|
| 321 |
+
save_path = os.path.join(feature_dir, os.path.split(audio_path)[-1].replace('.mp3', '').replace('.wav', '') + '.lab')
|
| 322 |
+
with open(save_path, 'w') as f:
|
| 323 |
+
for line in lines:
|
| 324 |
+
f.write(line)
|
| 325 |
+
|
| 326 |
+
# logger.info("label file saved : %s" % save_path)
|
| 327 |
+
|
| 328 |
+
# lab file to midi file
|
| 329 |
+
starts, ends, pitchs = list(), list(), list()
|
| 330 |
+
|
| 331 |
+
intervals, chords = mir_eval.io.load_labeled_intervals(save_path)
|
| 332 |
+
for p in range(12):
|
| 333 |
+
for i, (interval, chord) in enumerate(zip(intervals, chords)):
|
| 334 |
+
root_num, relative_bitmap, _ = mir_eval.chord.encode(chord)
|
| 335 |
+
tmp_label = mir_eval.chord.rotate_bitmap_to_root(relative_bitmap, root_num)[p]
|
| 336 |
+
if i == 0:
|
| 337 |
+
start_time = interval[0]
|
| 338 |
+
label = tmp_label
|
| 339 |
+
continue
|
| 340 |
+
if tmp_label != label:
|
| 341 |
+
if label == 1.0:
|
| 342 |
+
starts.append(start_time), ends.append(interval[0]), pitchs.append(p + 48)
|
| 343 |
+
start_time = interval[0]
|
| 344 |
+
label = tmp_label
|
| 345 |
+
if i == (len(intervals) - 1):
|
| 346 |
+
if label == 1.0:
|
| 347 |
+
starts.append(start_time), ends.append(interval[1]), pitchs.append(p + 48)
|
| 348 |
+
|
| 349 |
+
midi = pm.PrettyMIDI()
|
| 350 |
+
instrument = pm.Instrument(program=0)
|
| 351 |
+
|
| 352 |
+
for start, end, pitch in zip(starts, ends, pitchs):
|
| 353 |
+
pm_note = pm.Note(velocity=120, pitch=pitch, start=start, end=end)
|
| 354 |
+
instrument.notes.append(pm_note)
|
| 355 |
+
|
| 356 |
+
midi.instruments.append(instrument)
|
| 357 |
+
midi.write(save_path.replace('.lab', '.midi'))
|
| 358 |
+
|
| 359 |
+
tonic_signatures = ["A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"]
|
| 360 |
+
mode_signatures = ["major", "minor"] # Major and minor modes
|
| 361 |
+
|
| 362 |
+
tonic_to_idx = {tonic: idx for idx, tonic in enumerate(tonic_signatures)}
|
| 363 |
+
mode_to_idx = {mode: idx for idx, mode in enumerate(mode_signatures)}
|
| 364 |
+
idx_to_tonic = {idx: tonic for tonic, idx in tonic_to_idx.items()}
|
| 365 |
+
idx_to_mode = {idx: mode for mode, idx in mode_to_idx.items()}
|
| 366 |
+
|
| 367 |
+
with open('inference/data/chord.json', 'r') as f:
|
| 368 |
+
chord_to_idx = json.load(f)
|
| 369 |
+
with open('inference/data/chord_inv.json', 'r') as f:
|
| 370 |
+
idx_to_chord = json.load(f)
|
| 371 |
+
idx_to_chord = {int(k): v for k, v in idx_to_chord.items()} # Ensure keys are ints
|
| 372 |
+
with open('inference/data/chord_root.json') as json_file:
|
| 373 |
+
chordRootDic = json.load(json_file)
|
| 374 |
+
with open('inference/data/chord_attr.json') as json_file:
|
| 375 |
+
chordAttrDic = json.load(json_file)
|
| 376 |
+
|
| 377 |
+
try:
|
| 378 |
+
midi_file = converter.parse(save_path.replace('.lab', '.midi'))
|
| 379 |
+
key_signature = str(midi_file.analyze('key'))
|
| 380 |
+
except Exception as e:
|
| 381 |
+
key_signature = "None"
|
| 382 |
+
|
| 383 |
+
key_parts = key_signature.split()
|
| 384 |
+
key_signature = sanitize_key_signature(key_parts[0]) # Sanitize key signature
|
| 385 |
+
key_type = key_parts[1] if len(key_parts) > 1 else 'major'
|
| 386 |
+
|
| 387 |
+
# --- Key feature (Tonic and Mode separation) ---
|
| 388 |
+
if key_signature == "None":
|
| 389 |
+
mode = "major"
|
| 390 |
+
else:
|
| 391 |
+
mode = key_signature.split()[-1]
|
| 392 |
+
|
| 393 |
+
encoded_mode = mode_to_idx.get(mode, 0)
|
| 394 |
+
mode_tensor = torch.tensor([encoded_mode], dtype=torch.long).to(self.device)
|
| 395 |
+
|
| 396 |
+
converted_lines = normalize_chord(save_path, key_signature, key_type)
|
| 397 |
+
|
| 398 |
+
lab_norm_path = save_path[:-4] + "_norm.lab"
|
| 399 |
+
|
| 400 |
+
# Write the converted lines to the new file
|
| 401 |
+
with open(lab_norm_path, 'w') as f:
|
| 402 |
+
f.writelines(converted_lines)
|
| 403 |
+
|
| 404 |
+
chords = []
|
| 405 |
+
|
| 406 |
+
if not os.path.exists(lab_norm_path):
|
| 407 |
+
chords.append((float(0), float(0), "N"))
|
| 408 |
+
else:
|
| 409 |
+
with open(lab_norm_path, 'r') as file:
|
| 410 |
+
for line in file:
|
| 411 |
+
start, end, chord = line.strip().split()
|
| 412 |
+
chords.append((float(start), float(end), chord))
|
| 413 |
+
|
| 414 |
+
encoded = []
|
| 415 |
+
encoded_root= []
|
| 416 |
+
encoded_attr=[]
|
| 417 |
+
durations = []
|
| 418 |
+
|
| 419 |
+
for start, end, chord in chords:
|
| 420 |
+
chord_arr = chord.split(":")
|
| 421 |
+
if len(chord_arr) == 1:
|
| 422 |
+
chordRootID = chordRootDic[chord_arr[0]]
|
| 423 |
+
if chord_arr[0] == "N" or chord_arr[0] == "X":
|
| 424 |
+
chordAttrID = 0
|
| 425 |
+
else:
|
| 426 |
+
chordAttrID = 1
|
| 427 |
+
elif len(chord_arr) == 2:
|
| 428 |
+
chordRootID = chordRootDic[chord_arr[0]]
|
| 429 |
+
chordAttrID = chordAttrDic[chord_arr[1]]
|
| 430 |
+
encoded_root.append(chordRootID)
|
| 431 |
+
encoded_attr.append(chordAttrID)
|
| 432 |
+
|
| 433 |
+
if chord in chord_to_idx:
|
| 434 |
+
encoded.append(chord_to_idx[chord])
|
| 435 |
+
else:
|
| 436 |
+
print(f"Warning: Chord {chord} not found in chord.json. Skipping.")
|
| 437 |
+
|
| 438 |
+
durations.append(end - start) # Compute duration
|
| 439 |
+
|
| 440 |
+
encoded_chords = np.array(encoded)
|
| 441 |
+
encoded_chords_root = np.array(encoded_root)
|
| 442 |
+
encoded_chords_attr = np.array(encoded_attr)
|
| 443 |
+
|
| 444 |
+
# Maximum sequence length for chords
|
| 445 |
+
max_sequence_length = 100 # Define this globally or as a parameter
|
| 446 |
+
|
| 447 |
+
# Truncate or pad chord sequences
|
| 448 |
+
if len(encoded_chords) > max_sequence_length:
|
| 449 |
+
# Truncate to max length
|
| 450 |
+
encoded_chords = encoded_chords[:max_sequence_length]
|
| 451 |
+
encoded_chords_root = encoded_chords_root[:max_sequence_length]
|
| 452 |
+
encoded_chords_attr = encoded_chords_attr[:max_sequence_length]
|
| 453 |
+
|
| 454 |
+
else:
|
| 455 |
+
# Pad with zeros (padding value for chords)
|
| 456 |
+
padding = [0] * (max_sequence_length - len(encoded_chords))
|
| 457 |
+
encoded_chords = np.concatenate([encoded_chords, padding])
|
| 458 |
+
encoded_chords_root = np.concatenate([encoded_chords_root, padding])
|
| 459 |
+
encoded_chords_attr = np.concatenate([encoded_chords_attr, padding])
|
| 460 |
+
|
| 461 |
+
# Convert to tensor
|
| 462 |
+
chords_tensor = torch.tensor(encoded_chords, dtype=torch.long).to(self.device)
|
| 463 |
+
chords_root_tensor = torch.tensor(encoded_chords_root, dtype=torch.long).to(self.device)
|
| 464 |
+
chords_attr_tensor = torch.tensor(encoded_chords_attr, dtype=torch.long).to(self.device)
|
| 465 |
+
|
| 466 |
+
model_input_dic = {
|
| 467 |
+
"x_mert": final_embedding_mert.unsqueeze(0),
|
| 468 |
+
"x_chord": chords_tensor.unsqueeze(0),
|
| 469 |
+
"x_chord_root": chords_root_tensor.unsqueeze(0),
|
| 470 |
+
"x_chord_attr": chords_attr_tensor.unsqueeze(0),
|
| 471 |
+
"x_key": mode_tensor.unsqueeze(0)
|
| 472 |
+
}
|
| 473 |
+
|
| 474 |
+
model_input_dic = {k: v.to(self.device) for k, v in model_input_dic.items()}
|
| 475 |
+
classification_output, regression_output = self.music2emo_model(model_input_dic)
|
| 476 |
+
probs = torch.sigmoid(classification_output)
|
| 477 |
+
|
| 478 |
+
tag_list = np.load ( "./inference/data/tag_list.npy")
|
| 479 |
+
tag_list = tag_list[127:]
|
| 480 |
+
mood_list = [t.replace("mood/theme---", "") for t in tag_list]
|
| 481 |
+
threshold = threshold
|
| 482 |
+
predicted_moods = [mood_list[i] for i, p in enumerate(probs.squeeze().tolist()) if p > threshold]
|
| 483 |
+
valence, arousal = regression_output.squeeze().tolist()
|
| 484 |
+
|
| 485 |
+
model_output_dic = {
|
| 486 |
+
"valence": valence,
|
| 487 |
+
"arousal": arousal,
|
| 488 |
+
"predicted_moods": predicted_moods
|
| 489 |
+
}
|
| 490 |
+
|
| 491 |
+
return model_output_dic
|
| 492 |
+
|
| 493 |
+
# Initialize Mustango
|
| 494 |
+
if torch.cuda.is_available():
|
| 495 |
+
music2emo = Music2emo()
|
| 496 |
+
else:
|
| 497 |
+
music2emo = Music2emo(device="cpu")
|
| 498 |
+
|
| 499 |
+
|
| 500 |
+
def format_prediction(model_output_dic):
|
| 501 |
+
"""Format the model output in a more readable and attractive format"""
|
| 502 |
+
valence = model_output_dic["valence"]
|
| 503 |
+
arousal = model_output_dic["arousal"]
|
| 504 |
+
moods = model_output_dic["predicted_moods"]
|
| 505 |
+
|
| 506 |
+
# Create a formatted string with emojis and proper formatting
|
| 507 |
+
output_text = """
|
| 508 |
+
π΅ **Music Emotion Recognition Results** π΅
|
| 509 |
+
--------------------------------------------------
|
| 510 |
+
π **Predicted Mood Tags:** {}
|
| 511 |
+
π **Valence:** {:.2f} (Scale: 1-9)
|
| 512 |
+
β‘ **Arousal:** {:.2f} (Scale: 1-9)
|
| 513 |
+
--------------------------------------------------
|
| 514 |
+
""".format(
|
| 515 |
+
', '.join(moods) if moods else 'None',
|
| 516 |
+
valence,
|
| 517 |
+
arousal
|
| 518 |
+
)
|
| 519 |
+
|
| 520 |
+
return output_text
|
| 521 |
+
|
| 522 |
+
title = "Music2Emo: Towards Unified Music Emotion Recognition across Dimensional and Categorical Models"
|
| 523 |
+
description_text = """
|
| 524 |
+
<p>
|
| 525 |
+
Upload an audio file to analyze its emotional characteristics using Music2Emo.
|
| 526 |
+
The model will predict:
|
| 527 |
+
β’ Mood tags describing the emotional content
|
| 528 |
+
β’ Valence score (1-9 scale, representing emotional positivity)
|
| 529 |
+
β’ Arousal score (1-9 scale, representing emotional intensity)
|
| 530 |
+
</p>
|
| 531 |
+
"""
|
| 532 |
+
|
| 533 |
+
css = """
|
| 534 |
+
#output-text {
|
| 535 |
+
font-family: monospace;
|
| 536 |
+
white-space: pre-wrap;
|
| 537 |
+
font-size: 16px;
|
| 538 |
+
background-color: #333333;
|
| 539 |
+
padding: 20px;
|
| 540 |
+
border-radius: 10px;
|
| 541 |
+
margin: 10px 0;
|
| 542 |
+
}
|
| 543 |
+
.gradio-container {
|
| 544 |
+
font-family: 'Inter', -apple-system, system-ui, sans-serif;
|
| 545 |
+
}
|
| 546 |
+
.gr-button {
|
| 547 |
+
color: white;
|
| 548 |
+
background: #1565c0;
|
| 549 |
+
border-radius: 100vh;
|
| 550 |
+
}
|
| 551 |
+
"""
|
| 552 |
+
|
| 553 |
+
|
| 554 |
+
|
| 555 |
+
|
| 556 |
+
# Initialize Music2Emo
|
| 557 |
+
if torch.cuda.is_available():
|
| 558 |
+
music2emo = Music2emo()
|
| 559 |
+
else:
|
| 560 |
+
music2emo = Music2emo(device="cpu")
|
| 561 |
+
|
| 562 |
+
with gr.Blocks(css=css) as demo:
|
| 563 |
+
gr.HTML(f"<h1><center>{title}</center></h1>")
|
| 564 |
+
gr.Markdown(description_text)
|
| 565 |
+
|
| 566 |
+
with gr.Row():
|
| 567 |
+
with gr.Column(scale=1):
|
| 568 |
+
input_audio = gr.Audio(
|
| 569 |
+
label="Upload Audio File",
|
| 570 |
+
type="filepath" # Removed 'source' parameter
|
| 571 |
+
)
|
| 572 |
+
threshold = gr.Slider(
|
| 573 |
+
minimum=0.0,
|
| 574 |
+
maximum=1.0,
|
| 575 |
+
value=0.5,
|
| 576 |
+
step=0.01,
|
| 577 |
+
label="Mood Detection Threshold",
|
| 578 |
+
info="Adjust threshold for mood detection (0.0 to 1.0)"
|
| 579 |
+
)
|
| 580 |
+
predict_btn = gr.Button("π Analyze Emotions", variant="primary")
|
| 581 |
+
|
| 582 |
+
with gr.Column(scale=1):
|
| 583 |
+
output_text = gr.Markdown(
|
| 584 |
+
label="Analysis Results",
|
| 585 |
+
elem_id="output-text"
|
| 586 |
+
)
|
| 587 |
+
|
| 588 |
+
# Add example usage
|
| 589 |
+
gr.Examples(
|
| 590 |
+
examples=["inference/input/test.mp3"],
|
| 591 |
+
inputs=input_audio,
|
| 592 |
+
outputs=output_text,
|
| 593 |
+
fn=lambda x: format_prediction(music2emo.predict(x, 0.5)),
|
| 594 |
+
cache_examples=True
|
| 595 |
+
)
|
| 596 |
+
|
| 597 |
+
predict_btn.click(
|
| 598 |
+
fn=lambda audio, thresh: format_prediction(music2emo.predict(audio, thresh)),
|
| 599 |
+
inputs=[input_audio, threshold],
|
| 600 |
+
outputs=output_text
|
| 601 |
+
)
|
| 602 |
+
|
| 603 |
+
gr.Markdown("""
|
| 604 |
+
### π Notes:
|
| 605 |
+
- Supported audio formats: MP3, WAV
|
| 606 |
+
- For best results, use high-quality audio files
|
| 607 |
+
- Processing may take a few moments depending on file size
|
| 608 |
+
""")
|
| 609 |
+
|
| 610 |
+
# Launch the demo
|
| 611 |
+
demo.queue().launch()
|