Upload 14 files
- model/__init__.py +3 -0
- model/__pycache__/__init__.cpython-310.pyc +0 -0
- model/__pycache__/crnn.cpython-310.pyc +0 -0
- model/__pycache__/linear.cpython-310.pyc +0 -0
- model/__pycache__/linear_mt.cpython-310.pyc +0 -0
- model/__pycache__/linear_mt_multitask.cpython-310.pyc +0 -0
- model/__pycache__/linear_multitask.cpython-310.pyc +0 -0
- model/__pycache__/linear_small.cpython-310.pyc +0 -0
- model/__pycache__/linear_small_multitask.cpython-310.pyc +0 -0
- model/__pycache__/transformer.cpython-310.pyc +0 -0
- model/__pycache__/transformer_multitask.cpython-310.pyc +0 -0
- model/linear.py +39 -0
- model/linear_attn_ck.py +92 -0
- model/linear_mt_attn_ck.py +100 -0
model/__init__.py
ADDED
@@ -0,0 +1,3 @@
+"Import all submodules"
+
+# from model import
model/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (171 Bytes)

model/__pycache__/crnn.cpython-310.pyc
ADDED
Binary file (2.01 kB)

model/__pycache__/linear.cpython-310.pyc
ADDED
Binary file (1.17 kB)

model/__pycache__/linear_mt.cpython-310.pyc
ADDED
Binary file (989 Bytes)

model/__pycache__/linear_mt_multitask.cpython-310.pyc
ADDED
Binary file (1.27 kB)

model/__pycache__/linear_multitask.cpython-310.pyc
ADDED
Binary file (1.41 kB)

model/__pycache__/linear_small.cpython-310.pyc
ADDED
Binary file (1.07 kB)

model/__pycache__/linear_small_multitask.cpython-310.pyc
ADDED
Binary file (1.34 kB)

model/__pycache__/transformer.cpython-310.pyc
ADDED
Binary file (1.62 kB)

model/__pycache__/transformer_multitask.cpython-310.pyc
ADDED
Binary file (2.07 kB)
model/linear.py
ADDED
@@ -0,0 +1,39 @@
+import os
+import torch
+import torch.nn as nn
+import pytorch_lightning as pl
+from sklearn import metrics
+from transformers import AutoModelForAudioClassification
+import numpy as np
+
+class FeedforwardModel(nn.Module):
+    def __init__(self, input_size, output_size):
+        super(FeedforwardModel, self).__init__()
+        self.model = nn.Sequential(
+            nn.Linear(input_size, 1024),
+            nn.BatchNorm1d(1024),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+
+            nn.Linear(1024, 512),
+            nn.BatchNorm1d(512),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+
+            nn.Linear(512, 256),
+            nn.BatchNorm1d(256),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+
+            nn.Linear(256, 128),
+            nn.BatchNorm1d(128),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+
+            nn.Linear(128, output_size),
+        )
+
+    def forward(self, x):
+        logit = self.model(x)
+        return logit
+
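A minimal usage sketch for the feedforward classifier above; the sizes here (a 1024-dim pooled feature vector and 56 output classes) are illustrative assumptions, not values taken from the commit:

    import torch
    from model.linear import FeedforwardModel

    # Assumed sizes, for illustration only.
    model = FeedforwardModel(input_size=1024, output_size=56)
    model.eval()  # BatchNorm1d needs eval mode (or batch size > 1) at inference
    with torch.no_grad():
        logits = model(torch.randn(4, 1024))  # (batch, input_size) -> (batch, output_size)
    print(logits.shape)  # torch.Size([4, 56])
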
model/linear_attn_ck.py
ADDED
@@ -0,0 +1,92 @@
+import os
+import torch
+import torch.nn as nn
+import pytorch_lightning as pl
+from sklearn import metrics
+from transformers import AutoModelForAudioClassification
+import numpy as np
+
+class PositionalEncoding(nn.Module):
+    def __init__(self, d_model, max_len=100):
+        super().__init__()
+        self.encoding = torch.zeros(max_len, d_model)
+        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
+        self.encoding[:, 0::2] = torch.sin(position * div_term)
+        self.encoding[:, 1::2] = torch.cos(position * div_term)
+        self.encoding = self.encoding.unsqueeze(0)  # Shape: (1, max_len, d_model)
+
+    def forward(self, x):
+        seq_len = x.size(1)
+        return x + self.encoding[:, :seq_len, :].to(x.device)
+
+class FeedforwardModelAttnCK(nn.Module):
+    def __init__(self, input_size, output_size, nhead=8, num_layers=1, dropout_rate=0.1,
+                 num_key=2, num_chords=158, num_chords_root=14, num_chords_attr=14,
+                 key_emb_dim=4, chord_emb_dim=8, chord_root_emb_dim=4, chord_attr_emb_dim=4):
+        super().__init__()
+        self.d_model = 512
+
+        self.d_model_transformer = chord_root_emb_dim + chord_attr_emb_dim
+
+        # Embedding layers for chords and keys
+        self.chord_root_embedding = nn.Embedding(num_chords_root, chord_root_emb_dim)
+        self.chord_attr_embedding = nn.Embedding(num_chords_attr, chord_attr_emb_dim)
+
+        nn.init.xavier_uniform_(self.chord_root_embedding.weight)
+        nn.init.xavier_uniform_(self.chord_attr_embedding.weight)
+
+        # Positional encoding for chord progression
+        self.positional_encoding = PositionalEncoding(self.d_model_transformer)
+
+        # Transformer for chord progression modeling
+        self.chord_transformer = nn.TransformerEncoder(
+            nn.TransformerEncoderLayer(d_model=self.d_model_transformer, nhead=nhead, dim_feedforward=64, dropout=0.1, batch_first=True),
+            num_layers=2
+        )
+        # Input projection for latent features
+        self.input_proj = nn.Sequential(
+            nn.Linear(input_size + self.d_model_transformer + 1, self.d_model),
+            nn.ReLU(),
+        )
+
+        # Output projection
+        self.output_proj = nn.Sequential(
+            nn.Linear(self.d_model, 256),
+            nn.ReLU(),
+            nn.Linear(256, output_size),
+        )
+
+    def forward(self, model_input_dic):
+        x_mert = model_input_dic["x_mert"]
+        x_chord_root = model_input_dic["x_chord_root"]
+        x_chord_attr = model_input_dic["x_chord_attr"]
+        x_key = model_input_dic["x_key"]
+
+        key_embedding = x_key.float()
+
+        chord_root_embedding = self.chord_root_embedding(x_chord_root)  # Shape: (batch_size, seq_len, chord_root_emb_dim)
+        chord_attr_embedding = self.chord_attr_embedding(x_chord_attr)  # Shape: (batch_size, seq_len, chord_attr_emb_dim)
+
+        # Concatenate root and attribute embeddings
+        chord_combined_embedding = torch.cat(
+            (chord_root_embedding, chord_attr_embedding), dim=-1
+        )  # Shape: (batch_size, seq_len, chord_root_emb_dim + chord_attr_emb_dim)
+
+        # Positional encoding and chord transformer
+        chord_combined_embedding = self.positional_encoding(chord_combined_embedding)
+
+        cls_token = torch.zeros_like(chord_combined_embedding[:, :1, :])
+
+        chord_embedding_with_cls = torch.cat([cls_token, chord_combined_embedding], dim=1)  # Add CLS at the start
+        chord_embedding_transformed = self.chord_transformer(chord_embedding_with_cls)  # Shape: (batch_size, seq_len+1, d_model_transformer) since batch_first=True
+
+        chord_embedding_cls = chord_embedding_transformed[:, 0, :]  # Shape: (batch_size, d_model_transformer)
+
+        # Combine all features
+        combined_features = torch.cat((x_mert, chord_embedding_cls, key_embedding), dim=1)
+        # Input projection
+        combined_features = self.input_proj(combined_features)  # Shape: (batch_size, d_model)
+
+        output = self.output_proj(combined_features)  # Shape: (batch_size, output_size)
+        return output
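A hedged sketch of driving FeedforwardModelAttnCK end to end. The shapes are assumptions inferred from the code: the `+ 1` in input_proj implies x_key contributes a single scalar per example, and the chord root/attribute indices must stay below num_chords_root/num_chords_attr (14). Note also that PositionalEncoding keeps its table as a plain tensor rather than a registered buffer, so it is moved to the input's device on every forward call and is not saved in the state dict.

    import torch
    from model.linear_attn_ck import FeedforwardModelAttnCK

    # Assumed shapes: 1024-dim MERT features, 16-step chord sequences, batch of 4.
    model = FeedforwardModelAttnCK(input_size=1024, output_size=56)
    batch = {
        "x_mert": torch.randn(4, 1024),                 # (batch, input_size)
        "x_chord_root": torch.randint(0, 14, (4, 16)),  # (batch, seq_len) chord root ids
        "x_chord_attr": torch.randint(0, 14, (4, 16)),  # (batch, seq_len) chord attribute ids
        "x_key": torch.randint(0, 2, (4, 1)),           # (batch, 1) scalar key feature
    }
    model.eval()
    with torch.no_grad():
        out = model(batch)  # (4, 56)
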
model/linear_mt_attn_ck.py
ADDED
@@ -0,0 +1,100 @@
+import os
+import torch
+import torch.nn as nn
+import pytorch_lightning as pl
+from sklearn import metrics
+from transformers import AutoModelForAudioClassification
+import numpy as np
+
+class PositionalEncoding(nn.Module):
+    def __init__(self, d_model, max_len=100):
+        super().__init__()
+        self.encoding = torch.zeros(max_len, d_model)
+        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
+        self.encoding[:, 0::2] = torch.sin(position * div_term)
+        self.encoding[:, 1::2] = torch.cos(position * div_term)
+        self.encoding = self.encoding.unsqueeze(0)  # Shape: (1, max_len, d_model)
+
+    def forward(self, x):
+        seq_len = x.size(1)
+        return x + self.encoding[:, :seq_len, :].to(x.device)
+
+class FeedforwardModelMTAttnCK(nn.Module):
+    def __init__(self, input_size, output_size_classification, output_size_regression, nhead=8, num_layers=1, dropout_rate=0.1,
+                 num_key=2, num_chords=158, num_chords_root=14, num_chords_attr=14,
+                 key_emb_dim=4, chord_emb_dim=8, chord_root_emb_dim=4, chord_attr_emb_dim=4):
+        super().__init__()
+        self.d_model = 512
+
+        self.d_model_transformer = chord_root_emb_dim + chord_attr_emb_dim
+
+        # Embedding layers for chords and keys
+        self.chord_root_embedding = nn.Embedding(num_chords_root, chord_root_emb_dim)
+        self.chord_attr_embedding = nn.Embedding(num_chords_attr, chord_attr_emb_dim)
+
+        nn.init.xavier_uniform_(self.chord_root_embedding.weight)
+        nn.init.xavier_uniform_(self.chord_attr_embedding.weight)
+
+        # Positional encoding for chord progression
+        self.positional_encoding = PositionalEncoding(self.d_model_transformer)
+
+        # Transformer for chord progression modeling
+        self.chord_transformer = nn.TransformerEncoder(
+            nn.TransformerEncoderLayer(d_model=self.d_model_transformer, nhead=nhead, dim_feedforward=64, dropout=0.1, batch_first=True),
+            num_layers=2
+        )
+
+        self.input_proj = nn.Sequential(
+            nn.Linear(input_size + self.d_model_transformer + 1, self.d_model),
+            nn.ReLU(),
+        )
+
+        # Classification branch (e.g., Jamendo - mood classification with 56 classes)
+        self.classification_branch = nn.Sequential(
+            nn.Linear(self.d_model, 256),
+            nn.ReLU(),
+            nn.Linear(256, output_size_classification)  # Output: 56 classes
+        )
+
+        # Regression branch (e.g., DMDD - valence-arousal prediction, including std values)
+        self.regression_branch = nn.Sequential(
+            nn.Linear(self.d_model, 256),
+            nn.ReLU(),
+            nn.Linear(256, output_size_regression)  # Output: [mean, std] for valence-arousal
+        )
+
+
+    def forward(self, model_input_dic):
+        x_mert = model_input_dic["x_mert"]
+        x_chord_root = model_input_dic["x_chord_root"]
+        x_chord_attr = model_input_dic["x_chord_attr"]
+
+        x_key = model_input_dic["x_key"]
+        key_embedding = x_key.float()
+
+        chord_root_embedding = self.chord_root_embedding(x_chord_root)  # Shape: (batch_size, seq_len, chord_root_emb_dim)
+        chord_attr_embedding = self.chord_attr_embedding(x_chord_attr)  # Shape: (batch_size, seq_len, chord_attr_emb_dim)
+
+        # Concatenate root and attribute embeddings
+        chord_combined_embedding = torch.cat(
+            (chord_root_embedding, chord_attr_embedding), dim=-1
+        )  # Shape: (batch_size, seq_len, chord_root_emb_dim + chord_attr_emb_dim)
+
+        chord_combined_embedding = self.positional_encoding(chord_combined_embedding)
+        cls_token = torch.zeros_like(chord_combined_embedding[:, :1, :])
+
+        chord_embedding_with_cls = torch.cat([cls_token, chord_combined_embedding], dim=1)  # Add CLS at the start
+        chord_embedding_transformed = self.chord_transformer(chord_embedding_with_cls)  # Shape: (batch_size, seq_len+1, d_model_transformer) since batch_first=True
+
+        chord_embedding_cls = chord_embedding_transformed[:, 0, :]  # Shape: (batch_size, d_model_transformer)
+
+        # Combine all features
+        combined_features = torch.cat((x_mert, chord_embedding_cls, key_embedding), dim=1)
+        # Input projection
+        combined_features = self.input_proj(combined_features)  # Shape: (batch_size, d_model)
+
+        classification_output = self.classification_branch(combined_features)
+        regression_output = self.regression_branch(combined_features)
+
+        return classification_output, regression_output
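The multitask variant shares its encoder with FeedforwardModelAttnCK and differs only in its two output heads, so it can be exercised the same way. A sketch under the same assumed shapes, with output_size_regression=4 standing in for valence/arousal means and stds as the inline comments suggest:

    import torch
    from model.linear_mt_attn_ck import FeedforwardModelMTAttnCK

    # Assumed sizes: 56 mood classes and 4 regression targets (illustrative).
    model = FeedforwardModelMTAttnCK(input_size=1024,
                                     output_size_classification=56,
                                     output_size_regression=4)
    batch = {
        "x_mert": torch.randn(4, 1024),
        "x_chord_root": torch.randint(0, 14, (4, 16)),
        "x_chord_attr": torch.randint(0, 14, (4, 16)),
        "x_key": torch.randint(0, 2, (4, 1)),
    }
    model.eval()
    with torch.no_grad():
        cls_out, reg_out = model(batch)  # (4, 56), (4, 4)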