In [None]:
pip install transformers

In [None]:
from transformers import WhisperForAudioClassification
# Fetch the pre-trained Whisper checkpoint wrapped for audio classification
model = WhisperForAudioClassification.from_pretrained(
    pretrained_model_name_or_path="openai/whisper-medium",
)

In [None]:
import pandas as pd

# Read the dataset CSV into a DataFrame
csv_path = 'dataset.csv'
df = pd.read_csv(csv_path)

In [None]:
from transformers import WhisperProcessor

# Build the Whisper processor from the same checkpoint name used for the model
processor = WhisperProcessor.from_pretrained(
    pretrained_model_name_or_path="openai/whisper-medium",
)

In [None]:
import librosa
import torch

# Create a custom dataset class
class LispDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding Whisper input features and a label per audio file.

    Each row of ``df`` must provide a ``file_path`` column (path of an audio
    file readable by ``librosa.load``) and a ``label`` column.

    Features are produced by a Whisper feature extractor rather than by a
    hand-rolled mel-spectrogram, so framing, scaling and padding match what the
    pre-trained encoder expects.

    Args:
        df: DataFrame with ``file_path`` and ``label`` columns.
        feature_extractor: Optional callable invoked as
            ``feature_extractor(audio, sampling_rate=..., return_tensors="pt")``
            and returning a mapping with an ``"input_features"`` entry that has
            a leading batch dimension. An object exposing a
            ``feature_extractor`` attribute (e.g. a processor) is also accepted.
            When omitted, the extractor of ``DEFAULT_CHECKPOINT`` is loaded.
    """

    # Sampling rate the audio is loaded at and reported to the extractor.
    TARGET_SR = 16000
    # Checkpoint whose feature extractor is used when none is supplied.
    DEFAULT_CHECKPOINT = "openai/whisper-medium"

    def __init__(self, df, feature_extractor=None):
        self.df = df
        if feature_extractor is None:
            # Imported lazily so the class can be defined (and used with an
            # injected extractor) without transformers being importable.
            from transformers import WhisperFeatureExtractor

            feature_extractor = WhisperFeatureExtractor.from_pretrained(self.DEFAULT_CHECKPOINT)
        # Unwrap a processor-like object; a bare extractor is used as-is.
        self.feature_extractor = getattr(feature_extractor, "feature_extractor", feature_extractor)

    def __len__(self):
        """Return the number of rows (audio files) in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{'input_features': tensor, 'labels': label}`` for row ``idx``."""
        row = self.df.iloc[idx]
        audio = self._load_audio(row['file_path'])

        extracted = self.feature_extractor(
            audio,
            sampling_rate=self.TARGET_SR,
            return_tensors="pt",
        )
        # The extractor returns a batch of one; drop the batch dimension so the
        # collator can stack individual examples.
        input_features = extracted["input_features"][0]

        return {'input_features': input_features, 'labels': row['label']}

    def _load_audio(self, audio_path):
        """Load ``audio_path`` as a waveform sampled at ``TARGET_SR``."""
        # Loading directly at the target rate avoids resampling twice
        # (file rate -> 44.1 kHz -> 16 kHz).
        audio, _ = librosa.load(audio_path, sr=self.TARGET_SR)
        return audio
 
# Wrap the DataFrame in the custom dataset (a Dataset, not a DataLoader);
# it is handed to the Trainer further down.
train_dataset = LispDataset(df=df)

In [None]:
from transformers import TrainingArguments

# Training arguments (adjust learning rate as needed).
# fp16 mixed precision needs an accelerator, so requesting it together with
# use_cpu=True is contradictory and is rejected when training is set up.
# Both flags therefore follow CUDA availability: GPU + fp16 when a GPU is
# present, plain fp32 on CPU otherwise.
cuda_available = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=2,
    learning_rate=5e-5,
    fp16=cuda_available,
    use_cpu=not cuda_available,
    warmup_ratio=0.1,
    # NOTE(review): no eval dataset or compute_metrics is passed to the Trainer
    # in this notebook, so this setting presumably has no effect yet - confirm.
    metric_for_best_model="accuracy",
    gradient_accumulation_steps=1,  # update weights after every batch (no accumulation)
)

In [None]:
from torch.optim import AdamW # Import AdamW from PyTorch

# AdamW over every model parameter, reusing the learning rate from the training arguments
base_lr = training_args.learning_rate
optimizer = AdamW(params=model.parameters(), lr=base_lr)

In [None]:
from torch.optim.lr_scheduler import LambdaLR

import math

# The Trainer steps the scheduler once per optimizer update, not once per
# epoch. The previous multiplier (step // 30) kept the learning rate at zero
# for the first 30 updates and then grew it without bound. This schedule
# instead warms up linearly over `warmup_ratio` of training and then decays
# linearly to zero.
# NOTE: the update count is an estimate for single-device training.
batches_per_epoch = math.ceil(len(train_dataset) / training_args.per_device_train_batch_size)
updates_per_epoch = max(1, batches_per_epoch // training_args.gradient_accumulation_steps)
total_steps = math.ceil(updates_per_epoch * training_args.num_train_epochs)
warmup_steps = math.ceil(total_steps * training_args.warmup_ratio)


def linear_warmup_decay(step):
    """Return the multiplier applied to the base learning rate at `step`."""
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    remaining = total_steps - step
    return max(0.0, remaining / max(1, total_steps - warmup_steps))


scheduler = LambdaLR(optimizer, lr_lambda=linear_warmup_decay)

# (optimizer, scheduler) pair in the form the Trainer's `optimizers` argument expects
optimizertuple = (optimizer, scheduler)

In [None]:
from transformers import Trainer

# Assemble the Trainer from the objects built in the previous cells
trainer_config = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    optimizers=optimizertuple,  # (optimizer, scheduler) pair
)
trainer = Trainer(**trainer_config)

# Run the training loop
trainer.train()