Update app.py
app.py CHANGED
@@ -6,35 +6,44 @@ import numpy as np
 import librosa
 from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
 
-#
-#
-
-
+# ------------------------------------------------
+# 1. Load base Wav2Vec2 model + classification head
+# ------------------------------------------------
+model_name = "facebook/wav2vec2-base-960h"
+
+# We specify num_labels=8 to create a random classification head on top
+model = Wav2Vec2ForSequenceClassification.from_pretrained(
+    model_name,
+    num_labels=8
+)
 feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
 
 model.eval()
 
+# ------------------------------------------------
+# 2. Define inference function
+# ------------------------------------------------
 def classify_accuracy(audio):
     """
-
-    We'll
+    Receives a tuple (sample_rate, data) from Gradio when type='numpy'.
+    We'll resample if needed, run a forward pass, and return a 'level'.
     """
     if audio is None:
         return "No audio provided."
 
     sample_rate, data = audio
 
-    # Ensure
+    # Ensure we have a NumPy array
     if not isinstance(data, np.ndarray):
         data = np.array(data)
 
-    # Resample if
+    # Resample if the model expects 16kHz
     target_sr = 16000
     if sample_rate != target_sr:
         data = librosa.resample(data, orig_sr=sample_rate, target_sr=target_sr)
         sample_rate = target_sr
 
-    #
+    # Extract features
     inputs = feature_extractor(
         data,
         sampling_rate=sample_rate,
@@ -47,28 +56,29 @@ def classify_accuracy(audio):
     logits = outputs.logits
     predicted_id = torch.argmax(logits, dim=-1).item()
 
-    # Map
+    # Map 0..7 → 3..10 if you want a "level" in that range
     accuracy_level = predicted_id + 3
 
     return f"Predicted Accuracy Level: {accuracy_level}"
 
-#
-
+# ------------------------------------------------
+# 3. Build Gradio interface
+# ------------------------------------------------
+title = "Speech Accuracy Classifier (Base Wav2Vec2)"
 description = (
     "Upload an audio file (or record audio) on the left. "
-    "The model
+    "The base model is NOT fine-tuned for classification, so results may be random. "
+    "This demo simply illustrates how to attach a classification head."
 )
 
-# Gradio Interface:
 demo = gr.Interface(
     fn=classify_accuracy,
-    inputs=gr.Audio(source="upload", type="numpy"),
-    outputs="text",
+    inputs=gr.Audio(source="upload", type="numpy"),
+    outputs="text",
     title=title,
     description=description,
-    allow_flagging="never"
+    allow_flagging="never"
 )
 
-# 3. Launch Gradio App
 if __name__ == "__main__":
     demo.launch()
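Since facebook/wav2vec2-base-960h ships no sequence-classification weights, the num_labels=8 head added in this commit is randomly initialized on load. A minimal sanity-check sketch of the wiring, assuming it runs in the same session as app.py (so model and feature_extractor are already defined):

import numpy as np
import torch

# One second of 16 kHz silence, already at the rate the model expects,
# so no resampling is involved in this check.
dummy = np.zeros(16000, dtype=np.float32)
inputs = feature_extractor(dummy, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# Eight labels -> logits of shape (batch, 8); argmax gives an id in 0..7,
# which classify_accuracy shifts by +3 into the displayed 3..10 range.
print(outputs.logits.shape)  # torch.Size([1, 8])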
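And a hypothetical end-to-end smoke test that mimics the (sample_rate, data) tuple Gradio's type="numpy" audio component hands the function, without launching the interface. The 440 Hz tone and 22050 Hz rate are made up for illustration; the samples are cast to float32 because librosa.resample expects floating-point audio (Gradio itself typically delivers int16 arrays, so a real deployment may want that cast inside classify_accuracy as well):

import numpy as np

# Synthesize one second of a 440 Hz sine at a non-target rate so the
# resampling branch (22050 Hz -> 16000 Hz) actually executes.
sr = 22050
t = np.linspace(0.0, 1.0, sr, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 440.0 * t).astype(np.float32)

print(classify_accuracy((sr, tone)))
# -> e.g. "Predicted Accuracy Level: 7"; the value is arbitrary until the
#    classification head is fine-tuned, as the description string notes.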