import gradio as gr
import torchaudio
from transformers import pipeline
# Preload both models
models = {
    "moulsot_v0.1_2500": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.1_2500"),
    "moulsot_v0.2_1000": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.2_1000"),
}
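
# Loading both pipelines once at startup avoids a per-request model load,
# at the cost of holding both checkpoints in memory for the app's lifetime.
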
# Adjust generation configs for both
for m in models.values():
    m.model.generation_config.input_ids = m.model.generation_config.forced_decoder_ids
    m.model.generation_config.forced_decoder_ids = None
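
# Note (an assumption about intent): newer transformers releases deprecate
# `forced_decoder_ids` on Whisper-style generation configs, so the ids are
# copied aside and the attribute cleared to avoid the deprecation warning
# at generation time.
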
def ensure_mono_16k(audio_path):
    """Load audio, convert it to mono 16 kHz, and return (waveform, sample_rate)."""
    waveform, sr = torchaudio.load(audio_path)
    # Convert to mono if necessary
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Resample to 16 kHz if necessary
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform = resampler(waveform)
        sr = 16000
    return waveform, sr
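
# Illustrative check (hypothetical file path): a stereo 44.1 kHz clip comes
# back as a single-channel tensor at 16 kHz.
#   wav, sr = ensure_mono_16k("sample_stereo_44k.wav")
#   assert wav.shape[0] == 1 and sr == 16000
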
def trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01):
    """
    Remove leading silence from the waveform, keeping at most `keep_ms`
    milliseconds of it. `threshold` controls what is considered silence.
    """
    # Compute a per-sample energy, averaged across channels
    energy = waveform.abs().mean(dim=0)
    non_silence_idx = (energy > threshold).nonzero(as_tuple=True)[0]
    if len(non_silence_idx) == 0:
        return waveform  # all silence
    first_non_silence = non_silence_idx[0].item()
    keep_samples = int(sr * (keep_ms / 1000.0))
    start = max(0, first_non_silence - keep_samples)
    trimmed = waveform[:, start:]
    return trimmed
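
# Sanity-check sketch (assumes `import torch`; values are illustrative):
# prepend 0.5 s of silence to a 440 Hz tone and confirm that all but
# roughly `keep_ms` of the padding is trimmed away.
#   sr = 16000
#   t = torch.arange(sr) / sr
#   tone = 0.5 * torch.sin(2 * torch.pi * 440 * t).unsqueeze(0)
#   padded = torch.cat([torch.zeros(1, sr // 2), tone], dim=1)
#   trimmed = trim_leading_silence(padded, sr, keep_ms=100)
#   assert padded.shape[1] - trimmed.shape[1] >= int(0.35 * sr)
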
def preprocess_audio(audio_path):
    waveform, sr = ensure_mono_16k(audio_path)
    waveform = trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01)
    tmp_path = "/tmp/processed_trimmed.wav"
    torchaudio.save(tmp_path, waveform, sr)
    return tmp_path
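
# Design note: the fixed /tmp path is overwritten on every request; that is
# fine for a single-user demo but would race under concurrent requests.
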
def transcribe(audio, selected_model):
    if audio is None:
        return "Please record or upload an audio file.", "Please record or upload an audio file."
    # Convert uploaded/recorded audio to mono 16 kHz and trim leading silence
    processed_audio = preprocess_audio(audio)
    # Selected + other model
    pipe_selected = models[selected_model]
    other_model = [k for k in models if k != selected_model][0]
    pipe_other = models[other_model]
    # Run inference on the same processed clip with both models
    result_selected = pipe_selected(processed_audio)["text"]
    result_other = pipe_other(processed_audio)["text"]
    return result_selected, result_other
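
# Direct call outside the UI (illustrative; "clip.wav" is a placeholder path):
#   selected_text, other_text = transcribe("clip.wav", "moulsot_v0.2_1000")
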
title = "🎙️ Moulsot Comparison"
description = """
Compare two fine-tuned models for **Darija ASR**:
- 🟩 **moulsot_v0.1_2500**
- 🟦 **moulsot_v0.2_1000**

You can **record** or **upload** an audio sample (automatically resampled to 16 kHz mono),
then view transcriptions from both models side by side.
"""

with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}\n{description}")
    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="🎤 Record or Upload Audio (auto 16 kHz mono)"
        )
        model_choice = gr.Radio(
            ["moulsot_v0.1_2500", "moulsot_v0.2_1000"],
            label="Choose Primary Model",
            value="moulsot_v0.1_2500"
        )
    transcribe_btn = gr.Button("📝 Transcribe")
    with gr.Row():
        output_selected = gr.Textbox(label="🟩 Selected Model Output")
        output_other = gr.Textbox(label="🟦 Other Model Output")
    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input, model_choice],
        outputs=[output_selected, output_other]
    )

# Local launch
if __name__ == "__main__":
    demo.launch()