from transformers import (
    WhisperForAudioClassification,
    WhisperProcessor,
    TrainingArguments,
    Trainer,
)
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import librosa
import torch

# Load the pre-trained Whisper model with a 2-class classification head
# (0 = normal, 1 = lisp)
model = WhisperForAudioClassification.from_pretrained("openai/whisper-medium", num_labels=2)

# Load the CSV file listing audio paths and labels
df = pd.read_csv('dataset.csv')

# Initialize the Whisper processor (feature extractor + tokenizer)
processor = WhisperProcessor.from_pretrained("openai/whisper-medium")


# Custom dataset: one CSV row -> (log-mel input features, label)
class LispDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        audio_path = row['file_path']
        label = row['label']

        # Load at the file's native sampling rate
        audio, original_sr = librosa.load(audio_path, sr=None)

        # Resample to the 16 kHz rate Whisper expects (if needed)
        target_sr = 16000
        if original_sr != target_sr:
            audio = librosa.resample(audio, orig_sr=original_sr, target_sr=target_sr)

        # Extract log-mel features with the Whisper processor; it pads/truncates
        # the clip to the 30 s window the model expects, giving a (80, 3000) matrix
        features = processor(audio, sampling_rate=target_sr, return_tensors="pt")
        input_features = features.input_features.squeeze(0)

        # Return the keys the model and Trainer expect
        return {'input_features': input_features, 'labels': int(label)}


# Build the training dataset
train_dataset = LispDataset(df)

# Training arguments (adjust learning rate as needed)
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=2,
    learning_rate=5e-5,
    # fp16=True,  # mixed precision requires a CUDA GPU; leave it off when use_cpu=True
    use_cpu=True,
    warmup_ratio=0.1,
    metric_for_best_model="accuracy",  # only takes effect once an eval dataset and metrics are added
    gradient_accumulation_steps=1,  # no gradient accumulation
)

# Create the optimizer (adjust other hyperparameters as needed)
optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)

# LambdaLR multiplies the base learning rate by whatever the lambda returns;
# the Trainer steps the scheduler once per optimization step.
lambda1 = lambda step: 1.0  # constant learning rate (or e.g. 0.95 ** step for exponential decay)
scheduler = LambdaLR(optimizer, lr_lambda=lambda1)

# Custom (optimizer, scheduler) pair for the Trainer; passing one makes the
# Trainer skip its own schedule, so warmup_ratio above is not applied.
optimizertuple = (optimizer, scheduler)
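# Optional: metric_for_best_model="accuracy" above only matters once the Trainer
# actually evaluates something. Below is a minimal sketch of such a metric
# callback; it assumes you also build a held-out split (e.g. eval_df ->
# LispDataset(eval_df)) and pass eval_dataset= and compute_metrics= to the
# Trainer, neither of which this script sets up.
import numpy as np

def compute_metrics(eval_pred):
    # The Trainer passes a (logits, labels) pair of numpy arrays
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}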
# Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    optimizers=optimizertuple,  # (optimizer, lr_scheduler) tuple
)

# Start training
trainer.train()

# import soundfile as sf

# Earlier experiments, kept below for reference (not executed).

"""
# Define a custom collate function to handle variable-length audio samples
def collate_fn(batch):
    # Pad audio samples to the same length
    input_lengths = [len(sample[0]) for sample in batch]
    max_length = max(input_lengths)
    padded_inputs = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(sample[0]) for sample in batch], batch_first=True, padding_value=0
    )
    attention_mask = torch.tensor(
        [[1] * length + [0] * (max_length - length) for length in input_lengths]
    )
    return {
        "inputs": padded_inputs,
        "attention_mask": attention_mask,
        "labels": torch.tensor([sample[1] for sample in batch]),
    }
"""

"""
def collate_fn(batch):
    # Pad audio samples to the same length
    input_lengths = [len(sample[0]) for sample in batch]
    max_length = max(input_lengths)
    padded_inputs = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(sample[0]) for sample in batch], batch_first=True, padding_value=0
    )
    attention_mask = torch.tensor(
        [[1] * length + [0] * (max_length - length) for length in input_lengths]
    )
    # Convert each element in batch to a dictionary
    batch = [
        {'inputs': inp, 'attention_mask': mask, 'labels': sample[1]}
        for inp, mask, sample in zip(padded_inputs, attention_mask, batch)
    ]
    print(batch)
    return batch
"""

"""
# train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
# lambda2 = lambda epoch: 0.95 ** epoch

# Load the audio file
audio, original_sr = librosa.load("dataset/lisp/sample_01.wav", sr=44100)

# Target sample rate
target_sr = 16000

# Resample the audio
audio_resampled = librosa.resample(audio, orig_sr=original_sr, target_sr=target_sr)
"""

"""
inputs = processor(
    audio_resampled,
    sampling_rate=target_sr,
    return_tensors="pt"
)

# Forward pass
with torch.no_grad():
    logits = model(**inputs).logits

# Predict the class (0 for normal, 1 for lisp)
predicted_class = torch.argmax(logits, dim=1).item()
"""
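# A minimal live version of the inference sketch above, assuming training has
# finished and a test clip exists at the path used in the scratch code
# ("dataset/lisp/sample_01.wav"). predict_file is an illustrative helper name,
# not part of the training pipeline above.
def predict_file(path):
    # Load at 16 kHz, extract Whisper log-mel features, and take the argmax class
    audio, _ = librosa.load(path, sr=16000)
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    return torch.argmax(logits, dim=-1).item()  # 0 = normal, 1 = lisp

# Example:
# print(predict_file("dataset/lisp/sample_01.wav"))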