Upload 9 files

Browse files

Files changed (10) hide show

.gitattributes +6 -0
dataset.csv +7 -0
dataset/lisp/sample_01.wav +3 -0
dataset/lisp/sample_02.wav +3 -0
dataset/lisp/sample_03.wav +3 -0
dataset/normal/sample_01.wav +3 -0
dataset/normal/sample_02.wav +3 -0
dataset/normal/sample_03.wav +3 -0
detect.py +47 -0
train.py +154 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+dataset/lisp/sample_01.wav filter=lfs diff=lfs merge=lfs -text
+dataset/lisp/sample_02.wav filter=lfs diff=lfs merge=lfs -text
+dataset/lisp/sample_03.wav filter=lfs diff=lfs merge=lfs -text
+dataset/normal/sample_01.wav filter=lfs diff=lfs merge=lfs -text
+dataset/normal/sample_02.wav filter=lfs diff=lfs merge=lfs -text
+dataset/normal/sample_03.wav filter=lfs diff=lfs merge=lfs -text

dataset.csv ADDED Viewed

	@@ -0,0 +1,7 @@

+file_path,label
+dataset/lisp/sample_01.wav,1
+dataset/normal/sample_01.wav,0
+dataset/lisp/sample_02.wav,1
+dataset/normal/sample_02.wav,0
+dataset/lisp/sample_03.wav,1
+dataset/normal/sample_03.wav,0

dataset/lisp/sample_01.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e8f40dae49c7b3edd939d4f240465b35bb43c08953f5d2e28dc3642809d99f2c
+size 1153196

dataset/lisp/sample_02.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6c54fb29e02a3083375eb172a6444733cb5b44706892a29809ac586659f45928
+size 1491060

dataset/lisp/sample_03.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5ef7f6af78d81a3368791d209847cae8b449f98c8530d52c2ada1ce138785ba8
+size 2064500

dataset/normal/sample_01.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:60bdcad2a236b4a94f230c9013e0c28315d4cc27536f0f41d5ebadeb777b1fb6
+size 1065132

dataset/normal/sample_02.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b35ef83c6885777c03897f28fe73f46c7a4749d1df42cdc33908406c6e2c9608
+size 2625652

dataset/normal/sample_03.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d29c4c050a60620896c3812151daacbd076e5f45e6b0bd385d322e63ec8bf986
+size 2400372

detect.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import librosa
+from transformers import WhisperForAudioClassification
+# Load the trained model
+model = WhisperForAudioClassification.from_pretrained("results/checkpoint-30")
+# Load audio file
+audio_path = "dataset/lisp/sample_01.wav"
+audio, original_sr = librosa.load(audio_path, sr=44100)
+# Resample to target sample rate (if needed)
+target_sr = 16000
+if original_sr != target_sr:
+    audio = librosa.resample(audio, orig_sr=original_sr, target_sr=target_sr)
+# Extract features
+mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=target_sr, n_mels=80, hop_length=512)
+mel_spectrogram_db = librosa.power_to_db(mel_spectrogram)
+import torch
+ # Pad mel spectrogram to fixed length (assuming max_len is pre-defined)
+max_len = 3000
+pad_width = (0, max_len - mel_spectrogram_db.shape[1])  # Calculate padding width
+mel_spectrogram_db_padded = torch.nn.functional.pad(torch.from_numpy(mel_spectrogram_db).float().unsqueeze(1),
+                                                    pad_width, mode='constant', value=0)
+# print(mel_spectrogram_db_padded.shape)
+input_features = mel_spectrogram_db_padded
+# Permute dimensions to match expected format
+input_features = input_features.permute(1, 0, 2)  # Permute dimensions to (batch_size, feature_dimension, sequence_length)
+# print(input_features.shape)
+# Create input dictionary with expected key
+inputs = {'input_features': input_features}
+# Make prediction
+with torch.no_grad():
+    outputs = model(**inputs)
+    logits = outputs.logits
+    predicted_class_ids = torch.argmax(logits).item()
+    predicted_label = model.config.id2label[predicted_class_ids]
+print("Predicted label:", predicted_label)

train.py ADDED Viewed

	@@ -0,0 +1,154 @@

+from transformers import WhisperForAudioClassification
+# Load pre-trained Whisper model
+model = WhisperForAudioClassification.from_pretrained("openai/whisper-medium")
+import pandas as pd
+# Load the CSV file
+df = pd.read_csv('dataset.csv')
+from transformers import WhisperProcessor
+# Initialize the Whisper processor
+processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
+import librosa
+import torch
+# Create a custom dataset class
+class LispDataset(torch.utils.data.Dataset):
+  def __init__(self, df):
+    self.df = df
+  def __len__(self):
+    return len(self.df)
+  def __getitem__(self, idx):
+    row = self.df.iloc[idx]
+    audio_path = row['file_path']
+    label = row['label']
+    audio, original_sr = librosa.load(audio_path, sr=44100)
+    # Resample to target sample rate (if needed)
+    target_sr = 16000
+    if original_sr != target_sr:
+        audio = librosa.resample(audio, orig_sr=original_sr, target_sr=target_sr)
+    # Extract mel features
+    mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=target_sr, n_mels=80, hop_length=512)
+    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram)  # Convert to decibels
+    # Pad mel spectrogram to fixed length (assuming max_len is pre-defined)
+    max_len = 3000  # Replace with your desired maximum length
+    pad_width = (0, max_len - mel_spectrogram_db.shape[1])  # Calculate padding width
+    mel_spectrogram_db_padded = torch.nn.functional.pad(torch.from_numpy(mel_spectrogram_db).float(),
+                                                        pad_width, mode='constant', value=0)
+   # Convert to tensor
+    input_features = mel_spectrogram_db_padded
+    # # Convert to tensor
+    # input_features = torch.from_numpy(mel_spectrogram_db_padded).float()
+    # Create dictionary with expected key
+    return {'input_features': input_features, 'labels': label}
+# Create a DataLoader
+train_dataset = LispDataset(df)
+from transformers import TrainingArguments
+# Training arguments (adjust learning rate as needed)
+training_args = TrainingArguments(
+    output_dir="./results",
+    num_train_epochs=10,
+    per_device_train_batch_size=2,
+    learning_rate=5e-5,
+    fp16=True,
+    use_cpu=True,
+    warmup_ratio=0.1,
+    metric_for_best_model="accuracy",
+    gradient_accumulation_steps=1  # No gradient accumulation (equivalent to no_auto_optimize=True)
+)
+from torch.optim import AdamW  # Import AdamW from PyTorch
+# Create the optimizer (adjust other hyperparameters as needed)
+optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)
+from torch.optim.lr_scheduler import LambdaLR
+lambda1 = lambda epoch: epoch // 30
+scheduler = LambdaLR(optimizer, lr_lambda=[lambda1,])
+optimizertuple = (optimizer,scheduler)
+from transformers import Trainer
+# Trainer instance
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    optimizers=optimizertuple,  # Wrap optimizer in a tuple
+)
+# Start training
+trainer.train()
+# NOTE: everything from here to the end of the script is inert. The triple-quoted blocks
+# are bare string literals holding earlier experiments (two collate_fn drafts and a
+# single-file inference snippet); they are evaluated as strings and never executed.
+# import soundfile as sf
+""" # Define a custom collate function to handle variable-length audio samples
+def collate_fn(batch):
+    # Pad audio samples to the same length
+    input_lengths = [len(sample[0]) for sample in batch]
+    max_length = max(input_lengths)
+    padded_inputs = torch.nn.utils.rnn.pad_sequence([torch.tensor(sample[0]) for sample in batch], batch_first=True, padding_value=0)
+    attention_mask = torch.tensor([[1] * length + [0] * (max_length - length) for length in input_lengths])
+    return {
+        "inputs": padded_inputs,
+        "attention_mask": attention_mask,
+        "labels": torch.tensor([sample[1] for sample in batch])
+    }
+ """
+"""
+def collate_fn(batch):
+  # Pad audio samples to the same length
+  input_lengths = [len(sample[0]) for sample in batch]
+  max_length = max(input_lengths)
+  padded_inputs = torch.nn.utils.rnn.pad_sequence([torch.tensor(sample[0]) for sample in batch], batch_first=True, padding_value=0)
+  attention_mask = torch.tensor([[1] * length + [0] * (max_length - length) for length in input_lengths])
+  # Convert each element in batch to a dictionary
+  batch = [{'inputs': padded_inputs, 'attention_mask': attention_mask, 'labels': label} for inp, mask, label in zip(padded_inputs, attention_mask, batch)]
+  print (batch)
+  return batch """
+"""
+# train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
+# lambda2 = lambda epoch: 0.95 ** epoch
+# Load the audio file
+audio, original_sr = librosa.load("dataset/lisp/sample_01.wav", sr=44100)
+# Target sample rate
+target_sr = 16000
+# Resample the audio
+audio_resampled = librosa.resample(audio, orig_sr=original_sr, target_sr=target_sr) """
+""" inputs = processor(
+    audio_resampled, sampling_rate=target_sr, return_tensors="pt"
+)
+# Forward pass
+with torch.no_grad():
+    logits = model(**inputs).logits
+# Predict the class (0 for normal, 1 for lisp)
+predicted_class = torch.argmax(logits, dim=1).item() """