Update app.py
app.py CHANGED
@@ -1,5 +1,7 @@
 import gradio as gr
 import torchaudio
+import soundfile as sf
+import torch
 from transformers import pipeline
 
 # Preload both models
@@ -8,14 +10,28 @@ models = {
     "moulsot_v0.2_1000": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.2_1000")
 }
 
-# Adjust generation configs
+# Adjust generation configs
 for m in models.values():
     m.model.generation_config.input_ids = m.model.generation_config.forced_decoder_ids
     m.model.generation_config.forced_decoder_ids = None
 
+
+def load_audio(audio_path):
+    """Robustly load any audio file into (waveform, sr)"""
+    try:
+        waveform, sr = torchaudio.load(audio_path)
+    except Exception:
+        # fallback for unknown backends
+        data, sr = sf.read(audio_path)
+        waveform = torch.tensor(data, dtype=torch.float32).T
+        if waveform.ndim == 1:
+            waveform = waveform.unsqueeze(0)
+    return waveform, sr
+
+
 def ensure_mono_16k(audio_path):
     """Convert audio to mono + 16 kHz"""
-    waveform, sr = torchaudio.load(audio_path)
+    waveform, sr = load_audio(audio_path)
     if waveform.shape[0] > 1:
         waveform = waveform.mean(dim=0, keepdim=True)
     if sr != 16000:
@@ -26,11 +42,7 @@ def ensure_mono_16k(audio_path):
 
 
 def trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01):
-    """
-    Remove leading silence from waveform, keeping at most `keep_ms` milliseconds.
-    `threshold` controls what is considered silence.
-    """
-    # Compute energy-based mask
+    """Trim leading silence, keep ≤ keep_ms ms"""
     energy = waveform.abs().mean(dim=0)
     non_silence_idx = (energy > threshold).nonzero(as_tuple=True)[0]
     if len(non_silence_idx) == 0:
@@ -38,8 +50,7 @@ def trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01):
     first_non_silence = non_silence_idx[0].item()
     keep_samples = int(sr * (keep_ms / 1000.0))
     start = max(0, first_non_silence - keep_samples)
-    trimmed = waveform[:, start:]
-    return trimmed
+    return waveform[:, start:]
 
 
 def preprocess_audio(audio_path):
@@ -49,33 +60,34 @@ def preprocess_audio(audio_path):
     torchaudio.save(tmp_path, waveform, sr)
     return tmp_path
 
+
 def transcribe(audio, selected_model):
     if audio is None:
         return "Please record or upload an audio file.", "Please record or upload an audio file."
 
-    # Convert uploaded/recorded audio to mono 16kHz
     processed_audio = preprocess_audio(audio)
-
-    # Selected + other model
+
     pipe_selected = models[selected_model]
     other_model = [k for k in models if k != selected_model][0]
     pipe_other = models[other_model]
-
-    # Run inference
+
     result_selected = pipe_selected(processed_audio)["text"]
     result_other = pipe_other(processed_audio)["text"]
-
+
     return result_selected, result_other
 
 
-title = "🎙️
+title = "🎙️ Moulsot Whisper ASR Comparison"
 description = """
-Compare two fine-tuned models for **
+Compare two fine-tuned Whisper models for **Arabic ASR**:
 - 🟩 **moulsot_v0.1_2500**
 - 🟦 **moulsot_v0.2_1000**
 
-You can **record** or **upload** an audio sample
-
+You can **record** or **upload** an audio sample.
+The app automatically:
+- converts to **16 kHz mono**
+- **removes leading silence** (≤ 0.1 s)
+Then both models transcribe the result side by side.
 """
 
 with gr.Blocks(title=title) as demo:
@@ -85,7 +97,7 @@ with gr.Blocks(title=title) as demo:
     audio_input = gr.Audio(
         sources=["microphone", "upload"],
         type="filepath",
-        label="🎤 Record or Upload Audio
+        label="🎤 Record or Upload Audio"
     )
     model_choice = gr.Radio(
         ["moulsot_v0.1_2500", "moulsot_v0.2_1000"],
@@ -105,6 +117,5 @@ with gr.Blocks(title=title) as demo:
         outputs=[output_selected, output_other]
     )
 
-# Local launch
 if __name__ == "__main__":
     demo.launch()
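A note on the `generation_config` lines kept at the top of the file: this appears to be the common workaround for newer `transformers` releases, which complain when a Whisper fine-tune still carries `forced_decoder_ids`. A hedged alternative sketch, not part of this commit, is to pass the decoding options per call instead (the `"ar"`/`"transcribe"` values are assumptions about these fine-tunes):

```python
# Sketch only: pin language/task at call time via generate_kwargs
# instead of mutating generation_config. "ar" and "transcribe" are
# assumptions about these fine-tunes, not taken from the commit.
result = models["moulsot_v0.1_2500"](
    "sample.wav",  # placeholder audio path
    generate_kwargs={"language": "ar", "task": "transcribe"},
)["text"]
```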
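Since the launch call stays behind the `if __name__ == "__main__":` guard, the module can also be imported for a quick check without starting the UI. A minimal smoke-test sketch, assuming the file above is saved as `app.py` (`sample.mp3` is a placeholder input file):

```python
# Run both models on one file without launching Gradio.
# Importing app is slow the first time: it downloads both checkpoints.
from app import preprocess_audio, models

wav_path = preprocess_audio("sample.mp3")  # placeholder input file
for name, pipe in models.items():
    print(f"{name}: {pipe(wav_path)['text']}")
```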