rawag committed
Commit 18f2c0e · verified · 1 parent: 68c2667

Upload 9 files
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ dataset/lisp/sample_01.wav filter=lfs diff=lfs merge=lfs -text
+ dataset/lisp/sample_02.wav filter=lfs diff=lfs merge=lfs -text
+ dataset/lisp/sample_03.wav filter=lfs diff=lfs merge=lfs -text
+ dataset/normal/sample_01.wav filter=lfs diff=lfs merge=lfs -text
+ dataset/normal/sample_02.wav filter=lfs diff=lfs merge=lfs -text
+ dataset/normal/sample_03.wav filter=lfs diff=lfs merge=lfs -text
dataset.csv ADDED
@@ -0,0 +1,7 @@
+ file_path,label
+ dataset/lisp/sample_01.wav,1
+ dataset/normal/sample_01.wav,0
+ dataset/lisp/sample_02.wav,1
+ dataset/normal/sample_02.wav,0
+ dataset/lisp/sample_03.wav,1
+ dataset/normal/sample_03.wav,0
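The manifest pairs each clip with a binary label: 1 for the lisp recordings, 0 for the normal ones, three clips per class. A minimal sketch, not part of the commit, of loading and sanity-checking it with pandas (assuming it runs from the repository root):

import pandas as pd

# Load the manifest added above; label 1 = lisp, 0 = normal
df = pd.read_csv("dataset.csv")
print(df.groupby("label").size())   # expect 3 clips per class
assert df["file_path"].str.endswith(".wav").all()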
dataset/lisp/sample_01.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e8f40dae49c7b3edd939d4f240465b35bb43c08953f5d2e28dc3642809d99f2c
+ size 1153196
dataset/lisp/sample_02.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c54fb29e02a3083375eb172a6444733cb5b44706892a29809ac586659f45928
+ size 1491060
dataset/lisp/sample_03.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5ef7f6af78d81a3368791d209847cae8b449f98c8530d52c2ada1ce138785ba8
+ size 2064500
dataset/normal/sample_01.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:60bdcad2a236b4a94f230c9013e0c28315d4cc27536f0f41d5ebadeb777b1fb6
+ size 1065132
dataset/normal/sample_02.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b35ef83c6885777c03897f28fe73f46c7a4749d1df42cdc33908406c6e2c9608
+ size 2625652
dataset/normal/sample_03.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d29c4c050a60620896c3812151daacbd076e5f45e6b0bd385d322e63ec8bf986
+ size 2400372
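Each of the six wav entries above is stored as a Git LFS pointer (a version line, a sha256 oid, and the byte size) rather than the audio itself, so the real files have to be fetched before train.py or detect.py can load them. A minimal sketch, not part of the commit, that flags any file still left as a pointer:

from pathlib import Path

# LFS pointer files start with the spec line shown in the diffs above
for wav in sorted(Path("dataset").rglob("*.wav")):
    if wav.read_bytes().startswith(b"version https://git-lfs.github.com/spec/v1"):
        print(f"{wav} is still an LFS pointer; fetch the audio with git-lfs before loading it")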
detect.py ADDED
@@ -0,0 +1,38 @@
+ import librosa
+ import torch
+ from transformers import WhisperForAudioClassification
+
+ # Load the fine-tuned classifier from the local training checkpoint
+ model = WhisperForAudioClassification.from_pretrained("results/checkpoint-30")
+
+ # Load the audio at 44.1 kHz, then resample to the 16 kHz rate Whisper expects
+ audio_path = "dataset/lisp/sample_01.wav"
+ audio, original_sr = librosa.load(audio_path, sr=44100)
+
+ target_sr = 16000
+ if original_sr != target_sr:
+     audio = librosa.resample(audio, orig_sr=original_sr, target_sr=target_sr)
+
+ # Extract an 80-bin mel spectrogram and convert it to decibels (same features as train.py)
+ mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=target_sr, n_mels=80, hop_length=512)
+ mel_spectrogram_db = librosa.power_to_db(mel_spectrogram)
+
+ # Pad the time axis to a fixed length of 3000 frames
+ max_len = 3000
+ pad_width = (0, max_len - mel_spectrogram_db.shape[1])
+ mel_spectrogram_db_padded = torch.nn.functional.pad(
+     torch.from_numpy(mel_spectrogram_db).float().unsqueeze(1),
+     pad_width, mode='constant', value=0)
+
+ # Rearrange to (batch_size, n_mels, n_frames) = (1, 80, 3000), the shape the model expects
+ input_features = mel_spectrogram_db_padded.permute(1, 0, 2)
+
+ # Run inference and map the predicted class id back to its label
+ inputs = {'input_features': input_features}
+ with torch.no_grad():
+     outputs = model(**inputs)
+     logits = outputs.logits
+     predicted_class_id = torch.argmax(logits).item()
+     predicted_label = model.config.id2label[predicted_class_id]
+
+ print("Predicted label:", predicted_label)
train.py ADDED
@@ -0,0 +1,99 @@
+ import librosa
+ import pandas as pd
+ import torch
+ from torch.optim import AdamW
+ from transformers import Trainer, TrainingArguments, WhisperForAudioClassification, WhisperProcessor
+
+ # Load the pre-trained Whisper encoder with a fresh audio-classification head
+ model = WhisperForAudioClassification.from_pretrained("openai/whisper-medium")
+
+ # Initialize the Whisper processor (unused below; features are computed manually with librosa)
+ processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
+
+ # Load the training manifest (file_path, label)
+ df = pd.read_csv('dataset.csv')
+
+ # Custom dataset that turns each wav file into padded log-mel features
+ class LispDataset(torch.utils.data.Dataset):
+     def __init__(self, df):
+         self.df = df
+
+     def __len__(self):
+         return len(self.df)
+
+     def __getitem__(self, idx):
+         row = self.df.iloc[idx]
+         audio_path = row['file_path']
+         label = row['label']
+
+         # Load at 44.1 kHz, then resample to the 16 kHz rate Whisper expects
+         audio, original_sr = librosa.load(audio_path, sr=44100)
+         target_sr = 16000
+         if original_sr != target_sr:
+             audio = librosa.resample(audio, orig_sr=original_sr, target_sr=target_sr)
+
+         # Extract an 80-bin mel spectrogram and convert it to decibels
+         mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=target_sr, n_mels=80, hop_length=512)
+         mel_spectrogram_db = librosa.power_to_db(mel_spectrogram)
+
+         # Pad the time axis to a fixed length of 3000 frames
+         max_len = 3000
+         pad_width = (0, max_len - mel_spectrogram_db.shape[1])
+         input_features = torch.nn.functional.pad(torch.from_numpy(mel_spectrogram_db).float(),
+                                                  pad_width, mode='constant', value=0)
+
+         # Return the keys the Trainer expects
+         return {'input_features': input_features, 'labels': label}
+
+ train_dataset = LispDataset(df)
+
+ # Training arguments (adjust the learning rate as needed)
+ training_args = TrainingArguments(
+     output_dir="./results",
+     num_train_epochs=10,
+     per_device_train_batch_size=2,
+     learning_rate=5e-5,
+     use_cpu=True,  # full-precision CPU training; fp16 would require a GPU
+     warmup_ratio=0.1,
+     metric_for_best_model="accuracy",
+     gradient_accumulation_steps=1  # no gradient accumulation
+ )
+
+ # Create the optimizer; the scheduler is left to the Trainer, which builds its
+ # linear warmup/decay schedule from warmup_ratio
+ optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)
+
+ # Trainer instance
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=train_dataset,
+     optimizers=(optimizer, None),  # (optimizer, scheduler); None lets the Trainer create the scheduler
+ )
+
+ # Start training
+ trainer.train()
+
+ # Earlier experiment, kept for reference: a collate_fn that pads variable-length
+ # raw audio and builds an attention mask. It is unused, since the dataset already
+ # pads every spectrogram to a fixed length.
+ # def collate_fn(batch):
+ #     input_lengths = [len(sample[0]) for sample in batch]
+ #     max_length = max(input_lengths)
+ #     padded_inputs = torch.nn.utils.rnn.pad_sequence([torch.tensor(sample[0]) for sample in batch],
+ #                                                     batch_first=True, padding_value=0)
+ #     attention_mask = torch.tensor([[1] * length + [0] * (max_length - length) for length in input_lengths])
+ #     return {
+ #         "inputs": padded_inputs,
+ #         "attention_mask": attention_mask,
+ #         "labels": torch.tensor([sample[1] for sample in batch]),
+ #     }
+
+ # Earlier experiment, kept for reference: a quick single-file check using the
+ # WhisperProcessor features (label 0 = normal, 1 = lisp).
+ # audio, original_sr = librosa.load("dataset/lisp/sample_01.wav", sr=44100)
+ # audio_resampled = librosa.resample(audio, orig_sr=original_sr, target_sr=16000)
+ # inputs = processor(audio_resampled, sampling_rate=16000, return_tensors="pt")
+ # with torch.no_grad():
+ #     logits = model(**inputs).logits
+ # predicted_class = torch.argmax(logits, dim=1).item()
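One note on the TrainingArguments above: metric_for_best_model="accuracy" only takes effect when the Trainer is also given an evaluation split and a compute_metrics hook. A minimal sketch, not part of the commit, of such a hook (the eval_dataset split is hypothetical):

import numpy as np

def compute_metrics(eval_pred):
    # eval_pred is (logits, labels); report plain accuracy over the eval split
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": float((preds == labels).mean())}

# trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset,
#                   eval_dataset=eval_dataset, compute_metrics=compute_metrics,
#                   optimizers=(optimizer, None))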