import gradio as gr
import torchaudio
import soundfile as sf
import torch
from transformers import pipeline

# Preload both models
models = {
    "moulsot_v0.1_2500": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.1_2500"),
    "moulsot_v0.2_1000": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.2_1000")
}

# Adjust generation configs: recent transformers releases deprecate
# forced_decoder_ids, so stash the IDs and clear the attribute to keep
# generate() from raising.
for m in models.values():
    m.model.generation_config.input_ids = m.model.generation_config.forced_decoder_ids
    m.model.generation_config.forced_decoder_ids = None


def load_audio(audio_path):
    """Robustly load any audio file into (waveform, sr)."""
    try:
        waveform, sr = torchaudio.load(audio_path)
    except Exception:
        # Fallback for backends torchaudio cannot handle
        data, sr = sf.read(audio_path)
        waveform = torch.tensor(data, dtype=torch.float32).T
        if waveform.ndim == 1:
            waveform = waveform.unsqueeze(0)
    return waveform, sr


def ensure_mono_16k(audio_path):
    """Convert audio to mono + 16 kHz."""
    waveform, sr = load_audio(audio_path)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform = resampler(waveform)
        sr = 16000
    return waveform, sr


def trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01):
    """Trim leading silence, keeping at most keep_ms ms of it."""
    energy = waveform.abs().mean(dim=0)
    non_silence_idx = (energy > threshold).nonzero(as_tuple=True)[0]
    if len(non_silence_idx) == 0:
        return waveform  # all silence
    first_non_silence = non_silence_idx[0].item()
    keep_samples = int(sr * (keep_ms / 1000.0))
    start = max(0, first_non_silence - keep_samples)
    return waveform[:, start:]


def preprocess_audio(audio_path):
    waveform, sr = ensure_mono_16k(audio_path)
    waveform = trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01)
    tmp_path = "/tmp/processed_trimmed.wav"
    torchaudio.save(tmp_path, waveform, sr)
    return tmp_path


def transcribe(audio, selected_model):
    if audio is None:
        return "Please record or upload an audio file.", "Please record or upload an audio file."
    processed_audio = preprocess_audio(audio)
    pipe_selected = models[selected_model]
    other_model = [k for k in models if k != selected_model][0]
    pipe_other = models[other_model]
    result_selected = pipe_selected(processed_audio)["text"]
    result_other = pipe_other(processed_audio)["text"]
    return result_selected, result_other


title = "🎙️ Moulsot ASR Comparison"
description = """
Compare two fine-tuned Whisper models for **Darija ASR**:
- 🟩 **moulsot_v0.1_2500**
- 🟦 **moulsot_v0.2_1000**

You can **record** or **upload** an audio sample. The app automatically:
- converts to **16 kHz mono**
- **removes leading silence** (keeping at most 0.1 s of it)

Then both models transcribe the result side by side.
"""

with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}\n{description}")

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="🎤 Record or Upload Audio"
        )
        model_choice = gr.Radio(
            ["moulsot_v0.1_2500", "moulsot_v0.2_1000"],
            label="Choose Primary Model",
            value="moulsot_v0.1_2500"
        )

    transcribe_btn = gr.Button("🚀 Transcribe")

    with gr.Row():
        output_selected = gr.Textbox(label="🟩 Selected Model Output")
        output_other = gr.Textbox(label="🟦 Other Model Output")

    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input, model_choice],
        outputs=[output_selected, output_other]
    )

if __name__ == "__main__":
    demo.launch()
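
# To run this app locally (an assumed setup, not part of the original script:
# package names are inferred from the imports above, and "app.py" is a
# placeholder filename for this file):
#
#   pip install gradio torch torchaudio soundfile transformers
#   python app.py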