import gradio as gr
import torchaudio
import soundfile as sf
import torch
from transformers import pipeline

						
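# Load both fine-tuned Whisper checkpoints as ASR pipelines (done once at startup).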
models = {
    "moulsot_v0.1_2500": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.1_2500"),
    "moulsot_v0.2_1000": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.2_1000")
}

						
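# forced_decoder_ids is deprecated for Whisper generation configs; stash the old value and clear it before inference.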
for m in models.values():
    m.model.generation_config.input_ids = m.model.generation_config.forced_decoder_ids
    m.model.generation_config.forced_decoder_ids = None


def load_audio(audio_path):
    """Robustly load any audio file into (waveform, sr)"""
    try:
        waveform, sr = torchaudio.load(audio_path)
    except Exception:
						
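        # Fall back to soundfile when torchaudio cannot decode the file.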
        data, sr = sf.read(audio_path)
        waveform = torch.tensor(data, dtype=torch.float32).T
        if waveform.ndim == 1:
            waveform = waveform.unsqueeze(0)
    return waveform, sr


def ensure_mono_16k(audio_path):
    """Convert audio to mono + 16 kHz"""
    waveform, sr = load_audio(audio_path)
						
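    # Down-mix multi-channel audio to a single channel.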
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
						
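    # Resample to the 16 kHz rate the Whisper pipelines expect.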
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform = resampler(waveform)
        sr = 16000
    return waveform, sr


def trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01):
    """Trim leading silence, keep ≤ keep_ms ms"""
						
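    # Mean absolute amplitude per sample; values above the threshold count as speech.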
    energy = waveform.abs().mean(dim=0)
    non_silence_idx = (energy > threshold).nonzero(as_tuple=True)[0]
    if len(non_silence_idx) == 0:
        return waveform
						
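    # Keep up to keep_ms of audio before the first non-silent sample.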
    first_non_silence = non_silence_idx[0].item()
    keep_samples = int(sr * (keep_ms / 1000.0))
    start = max(0, first_non_silence - keep_samples)
    return waveform[:, start:]


def preprocess_audio(audio_path):
    """Convert to mono 16 kHz, trim leading silence, and return a temp wav path."""
    waveform, sr = ensure_mono_16k(audio_path)
    waveform = trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01)
						
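    # Write the cleaned clip to disk so the pipelines can read it by file path.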
    tmp_path = "/tmp/processed_trimmed.wav"
    torchaudio.save(tmp_path, waveform, sr)
    return tmp_path


def transcribe(audio, selected_model):
    if audio is None:
        return "Please record or upload an audio file.", "Please record or upload an audio file."

    processed_audio = preprocess_audio(audio)
						
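    # Look up the selected pipeline and the remaining one for comparison.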
    pipe_selected = models[selected_model]
    other_model = [k for k in models if k != selected_model][0]
    pipe_other = models[other_model]
						
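    # Transcribe the same preprocessed clip with both models.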
    result_selected = pipe_selected(processed_audio)["text"]
    result_other = pipe_other(processed_audio)["text"]

    return result_selected, result_other


title = "🎙️ Moulsot ASR Comparison"
description = """
Compare two fine-tuned Whisper models for **Darija ASR**:
- 🟩 **moulsot_v0.1_2500**
- 🟦 **moulsot_v0.2_1000**

You can **record** or **upload** an audio sample.
The app automatically:
- converts to **16 kHz mono**
- **removes leading silence**, keeping at most 0.1 s of it

Then both models transcribe the result side by side.
"""
						
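# Build the Gradio UI: audio input, model selector, and side-by-side outputs.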
with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}\n{description}")

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="🎤 Record or Upload Audio"
        )
        model_choice = gr.Radio(
            ["moulsot_v0.1_2500", "moulsot_v0.2_1000"],
            label="Choose Primary Model",
            value="moulsot_v0.1_2500"
        )

    transcribe_btn = gr.Button("🚀 Transcribe")

    with gr.Row():
        output_selected = gr.Textbox(label="🟩 Selected Model Output")
        output_other = gr.Textbox(label="🟦 Other Model Output")
						
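    # Run transcription when the button is clicked.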
    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input, model_choice],
        outputs=[output_selected, output_other]
    )

if __name__ == "__main__":
    demo.launch()