import gradio as gr
import torchaudio
import soundfile as sf
import torch
from transformers import pipeline
# Preload both models
models = {
    "moulsot_v0.1_2500": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.1_2500"),
    "moulsot_v0.2_1000": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.2_1000"),
}

# Adjust generation configs: newer transformers versions warn when
# forced_decoder_ids is set alongside task/language arguments, so stash the
# ids elsewhere and clear the field before the pipelines call generate().
for m in models.values():
    m.model.generation_config.input_ids = m.model.generation_config.forced_decoder_ids
    m.model.generation_config.forced_decoder_ids = None
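
# Loading both checkpoints can take a while on CPU. If a GPU is available,
# passing a device argument to each transformers.pipeline call above
# (e.g. device=0) would speed up inference; the original app does not set it.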

def load_audio(audio_path):
    """Robustly load any audio file into (waveform, sr)."""
    try:
        waveform, sr = torchaudio.load(audio_path)
    except Exception:
        # Fallback for formats torchaudio's backend cannot read
        data, sr = sf.read(audio_path)
        waveform = torch.tensor(data, dtype=torch.float32).T
    if waveform.ndim == 1:
        waveform = waveform.unsqueeze(0)
    return waveform, sr

def ensure_mono_16k(audio_path):
    """Convert audio to mono + 16 kHz (Whisper's expected input format)."""
    waveform, sr = load_audio(audio_path)
    if waveform.shape[0] > 1:
        # Average all channels down to mono
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform = resampler(waveform)
        sr = 16000
    return waveform, sr

def trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01):
    """Trim leading silence, keeping ≤ keep_ms ms of it."""
    energy = waveform.abs().mean(dim=0)
    non_silence_idx = (energy > threshold).nonzero(as_tuple=True)[0]
    if len(non_silence_idx) == 0:
        return waveform  # all silence
    first_non_silence = non_silence_idx[0].item()
    keep_samples = int(sr * (keep_ms / 1000.0))
    start = max(0, first_non_silence - keep_samples)
    return waveform[:, start:]
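
# A possible alternative to the fixed-threshold trim above (a sketch, not
# wired into the app): torchaudio ships a Sox-style voice activity detector
# that adapts to the noise floor instead of using a hard 0.01 cutoff. The
# function name and the fallback behaviour here are assumptions.
def trim_leading_silence_vad(waveform, sr):
    """Trim leading non-speech with torchaudio.functional.vad."""
    trimmed = torchaudio.functional.vad(waveform, sample_rate=sr)
    # If VAD classifies the whole clip as silence, keep the original audio
    return trimmed if trimmed.numel() > 0 else waveform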

def preprocess_audio(audio_path):
    waveform, sr = ensure_mono_16k(audio_path)
    waveform = trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01)
    tmp_path = "/tmp/processed_trimmed.wav"
    torchaudio.save(tmp_path, waveform, sr)
    return tmp_path
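
# The fixed /tmp path above is fine for a single-user demo, but concurrent
# requests would overwrite each other's file. A sketch of a per-request
# variant using the standard-library tempfile module (the function name is
# an assumption, not part of the original app):
def preprocess_audio_unique(audio_path):
    """Same preprocessing, but each call writes its own temp file."""
    import os
    import tempfile
    waveform, sr = ensure_mono_16k(audio_path)
    waveform = trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01)
    fd, tmp_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # torchaudio.save opens the path itself
    torchaudio.save(tmp_path, waveform, sr)
    return tmp_path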

def transcribe(audio, selected_model):
    if audio is None:
        msg = "Please record or upload an audio file."
        return msg, msg
    processed_audio = preprocess_audio(audio)
    pipe_selected = models[selected_model]
    other_model = [k for k in models if k != selected_model][0]
    pipe_other = models[other_model]
    result_selected = pipe_selected(processed_audio)["text"]
    result_other = pipe_other(processed_audio)["text"]
    return result_selected, result_other
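
# The wav round-trip through disk is not strictly required: the ASR pipeline
# also accepts an in-memory {"raw": ndarray, "sampling_rate": int} dict.
# A sketch of that variant (the function name is an assumption):
def transcribe_in_memory(audio_path, selected_model):
    waveform, sr = ensure_mono_16k(audio_path)
    waveform = trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01)
    sample = {"raw": waveform.squeeze(0).numpy(), "sampling_rate": sr}
    return models[selected_model](sample)["text"]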

title = "🎙️ Moulsot ASR Comparison"
description = """
Compare two fine-tuned Whisper models for **Darija ASR**:
- 🟩 **moulsot_v0.1_2500**
- 🟦 **moulsot_v0.2_1000**

You can **record** or **upload** an audio sample.
The app automatically:
- converts to **16 kHz mono**
- **removes leading silence** (keeps ≤ 0.1 s)

Then both models transcribe the result side by side.
"""

with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}\n{description}")
    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="🎤 Record or Upload Audio"
        )
        model_choice = gr.Radio(
            ["moulsot_v0.1_2500", "moulsot_v0.2_1000"],
            label="Choose Primary Model",
            value="moulsot_v0.1_2500"
        )
    transcribe_btn = gr.Button("🚀 Transcribe")
    with gr.Row():
        output_selected = gr.Textbox(label="🟩 Selected Model Output")
        output_other = gr.Textbox(label="🟦 Other Model Output")
    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input, model_choice],
        outputs=[output_selected, output_other]
    )

if __name__ == "__main__":
    demo.launch()
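    # Note: for a public Space with several simultaneous users, enabling
    # Gradio's request queue (demo.queue().launch()) is a common choice so
    # long transcriptions don't time out; left as-is to preserve the
    # original behaviour.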