import gradio as gr
import torchaudio
from transformers import pipeline

# Preload both models
models = {
    "moulsot_v0.1_2500": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.1_2500"),
    "moulsot_v0.2_1000": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.2_1000"),
}
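
# Note (assumption): both checkpoints appear to be Whisper-style seq2seq models, for which
# recent Transformers releases deprecate `forced_decoder_ids`. The loop below copies that
# prompt into the generation config and clears the deprecated field before inference.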
# Adjust generation configs for both
for m in models.values():
    m.model.generation_config.input_ids = m.model.generation_config.forced_decoder_ids
    m.model.generation_config.forced_decoder_ids = None


def ensure_mono_16k(audio_path):
    """Load audio, convert to mono + 16 kHz, and save a temp version."""
    waveform, sr = torchaudio.load(audio_path)
    # Convert to mono if necessary
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Resample to 16 kHz if necessary
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform = resampler(waveform)
        sr = 16000
    tmp_path = "/tmp/processed_16k.wav"
    torchaudio.save(tmp_path, waveform, sr)
    return tmp_path


def trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01):
    """
    Remove leading silence from the waveform, keeping at most `keep_ms` milliseconds
    of silence before the first voiced sample. `threshold` is in normalized amplitude
    (torchaudio loads float waveforms in [-1, 1]) and controls what counts as silence.
    """
    # Energy per sample: mean absolute amplitude across channels
    energy = waveform.abs().mean(dim=0)
    non_silence_idx = (energy > threshold).nonzero(as_tuple=True)[0]
    if len(non_silence_idx) == 0:
        return waveform  # all silence
    first_non_silence = non_silence_idx[0].item()
    keep_samples = int(sr * (keep_ms / 1000.0))
    start = max(0, first_non_silence - keep_samples)
    trimmed = waveform[:, start:]
    return trimmed


def preprocess_audio(audio_path):
    # ensure_mono_16k returns a file path, so reload it to get the waveform for trimming
    mono_path = ensure_mono_16k(audio_path)
    waveform, sr = torchaudio.load(mono_path)
    waveform = trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01)
    # Fixed /tmp paths are shared across requests; fine for a single-user demo
    tmp_path = "/tmp/processed_trimmed.wav"
    torchaudio.save(tmp_path, waveform, sr)
    return tmp_path
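
# Example (hypothetical quick check, not wired into the app): preprocess a local clip and
# inspect the result. "sample.wav" is a placeholder path, not a file shipped with this Space.
#   out_path = preprocess_audio("sample.wav")
#   wav, sr = torchaudio.load(out_path)
#   print(wav.shape, sr, wav.shape[1] / sr, "seconds after trimming")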


def transcribe(audio, selected_model):
    if audio is None:
        return "Please record or upload an audio file.", "Please record or upload an audio file."
    # Convert uploaded/recorded audio to mono 16 kHz and trim leading silence
    processed_audio = preprocess_audio(audio)
    # Selected model plus the other one, for side-by-side comparison
    pipe_selected = models[selected_model]
    other_model = [k for k in models if k != selected_model][0]
    pipe_other = models[other_model]
    # Run inference with both pipelines
    result_selected = pipe_selected(processed_audio)["text"]
    result_other = pipe_other(processed_audio)["text"]
    return result_selected, result_other
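
# Example (hypothetical direct call, bypassing the UI; "sample.wav" is a placeholder path):
#   primary_text, other_text = transcribe("sample.wav", "moulsot_v0.1_2500")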

title = "🎙️ Moulsot Comparison"
description = """
Compare two fine-tuned models for **Darija ASR**:
- 🟩 **moulsot_v0.1_2500**
- 🟦 **moulsot_v0.2_1000**
You can **record** or **upload** an audio sample (automatically resampled to 16 kHz mono),
then view transcriptions from both models side by side.
"""

with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}\n{description}")
    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="🎤 Record or Upload Audio (auto 16 kHz mono)"
        )
        model_choice = gr.Radio(
            ["moulsot_v0.1_2500", "moulsot_v0.2_1000"],
            label="Choose Primary Model",
            value="moulsot_v0.1_2500"
        )
    transcribe_btn = gr.Button("🚀 Transcribe")
    with gr.Row():
        output_selected = gr.Textbox(label="🟩 Selected Model Output")
        output_other = gr.Textbox(label="🟦 Other Model Output")
    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input, model_choice],
        outputs=[output_selected, output_other]
    )

# Local launch
if __name__ == "__main__":
    demo.launch()
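
# If a public demo link is needed outside Spaces (assumption, not part of the original app),
# Gradio's share flag can be used instead:
#   demo.launch(share=True)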