from transformers import WhisperForAudioClassification
# Load pre-trained Whisper model
model = WhisperForAudioClassification.from_pretrained("openai/whisper-medium")
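# Note: the classification head is newly initialized; by default it has
# config.num_labels outputs. For a binary lisp/normal task this can be made
# explicit (a sketch; the label names here are assumptions):
# model = WhisperForAudioClassification.from_pretrained(
#     "openai/whisper-medium",
#     num_labels=2,
#     id2label={0: "normal", 1: "lisp"},
#     label2id={"normal": 0, "lisp": 1},
# )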
import pandas as pd
# Load the CSV file
df = pd.read_csv('dataset.csv')
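# Assumed CSV schema (matching the dataset class below): a 'file_path' column
# with paths to audio files and an integer 'label' column (0 = normal, 1 = lisp)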
from transformers import WhisperProcessor
# Initialize the Whisper processor
processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
import librosa
import torch
# Create a custom dataset class
class LispDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        audio_path = row['file_path']
        label = row['label']
        # Load and resample to Whisper's expected 16 kHz in one step
        audio, _ = librosa.load(audio_path, sr=16000)
        # Let the Whisper processor compute the log-mel features: it applies
        # Whisper's own mel filterbank and normalization and pads/truncates to
        # 30 s, i.e. 80 mel bins x 3000 frames. A hand-rolled mel spectrogram
        # (e.g. with hop_length=512) would not match the features the
        # pretrained encoder was trained on.
        inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
        input_features = inputs.input_features.squeeze(0)  # shape (80, 3000)
        # Return the keys the Trainer and model expect
        return {'input_features': input_features, 'labels': torch.tensor(label, dtype=torch.long)}
# Create the training dataset (the Trainer builds its own DataLoader)
train_dataset = LispDataset(df)
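# Optional sanity check: one item should yield Whisper-shaped features,
# i.e. input_features of shape (80, 3000) plus a scalar label.
# sample = train_dataset[0]
# print(sample['input_features'].shape, sample['labels'])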
from transformers import TrainingArguments
# Training arguments (adjust learning rate as needed)
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=2,
    learning_rate=5e-5,
    use_cpu=True,  # fp16 removed: mixed precision requires a GPU and errors out with use_cpu=True
    warmup_ratio=0.1,
    metric_for_best_model="accuracy",
    gradient_accumulation_steps=1  # No gradient accumulation
)
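# Note: metric_for_best_model only has an effect together with an eval split,
# a compute_metrics function, and load_best_model_at_end=True. A minimal
# sketch, assuming a held-out eval set (names here are illustrative):
# import evaluate
# accuracy = evaluate.load("accuracy")
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     return accuracy.compute(predictions=logits.argmax(axis=-1), references=labels)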
from torch.optim import AdamW  # Import AdamW from PyTorch
# Create the optimizer (adjust other hyperparameters as needed)
optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)
from torch.optim.lr_scheduler import LambdaLR
# Multiplicative LR factor. The earlier `epoch // 30` evaluated to 0 for the
# first 30 calls, which would have frozen the learning rate at zero for this
# 10-epoch run. Note also that the Trainer steps the scheduler once per
# optimizer step, not once per epoch, so the decay below is applied per step.
lambda1 = lambda epoch: 0.95 ** epoch
scheduler = LambdaLR(optimizer, lr_lambda=lambda1)
# Supplying a custom (optimizer, scheduler) pair makes the Trainer skip its
# own defaults, so warmup_ratio above has no effect here.
optimizer_tuple = (optimizer, scheduler)
from transformers import Trainer
# Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    optimizers=optimizer_tuple,  # custom (optimizer, scheduler) pair
)
# Start training
trainer.train()
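# Optionally save the fine-tuned model and processor for later inference
# (output path is illustrative):
# trainer.save_model("./results/final")
# processor.save_pretrained("./results/final")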
""" # Define a custom collate function to handle variable-length audio samples
def collate_fn(batch):
# Pad audio samples to the same length
input_lengths = [len(sample[0]) for sample in batch]
max_length = max(input_lengths)
padded_inputs = torch.nn.utils.rnn.pad_sequence([torch.tensor(sample[0]) for sample in batch], batch_first=True, padding_value=0)
attention_mask = torch.tensor([[1] * length + [0] * (max_length - length) for length in input_lengths])
return {
"inputs": padded_inputs,
"attention_mask": attention_mask,
"labels": torch.tensor([sample[1] for sample in batch])
}
"""
"""
def collate_fn(batch):
# Pad audio samples to the same length
input_lengths = [len(sample[0]) for sample in batch]
max_length = max(input_lengths)
padded_inputs = torch.nn.utils.rnn.pad_sequence([torch.tensor(sample[0]) for sample in batch], batch_first=True, padding_value=0)
attention_mask = torch.tensor([[1] * length + [0] * (max_length - length) for length in input_lengths])
# Convert each element in batch to a dictionary
batch = [{'inputs': padded_inputs, 'attention_mask': attention_mask, 'labels': label} for inp, mask, label in zip(padded_inputs, attention_mask, batch)]
print (batch)
return batch """
"""
# train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
# lambda2 = lambda epoch: 0.95 ** epoch
# Load the audio file
audio, original_sr = librosa.load("dataset/lisp/sample_01.wav", sr=44100)
# Target sample rate
target_sr = 16000
# Resample the audio
audio_resampled = librosa.resample(audio, orig_sr=original_sr, target_sr=target_sr) """
""" inputs = processor(
audio_resampled, sampling_rate=target_sr, return_tensors="pt"
)
# Forward pass
with torch.no_grad():
logits = model(**inputs).logits
# Predict the class (0 for normal, 1 for lisp)
predicted_class = torch.argmax(logits, dim=1).item() """ |