import gradio as gr
import torchaudio
from transformers import pipeline
import soundfile as sf
import torch

# Load the Darija ASR pipeline from the Hugging Face Hub.
asr_pipeline = pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.2_1000")

# Workaround for newer transformers releases, which raise an error when
# forced_decoder_ids is still set on the generation config: park the old
# value and clear the field before generation.
asr_pipeline.model.generation_config.input_ids = asr_pipeline.model.generation_config.forced_decoder_ids
asr_pipeline.model.generation_config.forced_decoder_ids = None
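
# The pipeline defaults to CPU. A minimal sketch for using a GPU when one is
# available, via pipeline()'s device argument:
#
#   asr_pipeline = pipeline(
#       "automatic-speech-recognition",
#       model="01Yassine/moulsot_v0.2_1000",
#       device=0 if torch.cuda.is_available() else -1,
#   )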


def load_audio(audio_path):
    """Robustly load any audio file into (waveform, sr)."""
    try:
        waveform, sr = torchaudio.load(audio_path)
    except Exception:
        # Fall back to soundfile for formats torchaudio cannot decode.
        data, sr = sf.read(audio_path)
        # soundfile returns (frames, channels); transpose to (channels, frames).
        waveform = torch.tensor(data, dtype=torch.float32).T
    # Guarantee a 2-D (channels, frames) tensor even for mono input.
    if waveform.ndim == 1:
        waveform = waveform.unsqueeze(0)
    return waveform, sr
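
# Example (hypothetical file name): load_audio("clip.wav") returns a float32
# tensor of shape (channels, num_samples) together with the file's native
# sample rate.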


def ensure_mono_16k(audio_path):
    """Convert audio to mono + 16 kHz."""
    waveform, sr = load_audio(audio_path)
    # Downmix multi-channel audio by averaging the channels.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Resample anything that is not already at 16 kHz.
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform = resampler(waveform)
        sr = 16000
    return waveform, sr
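
# Design note: a Resample transform is rebuilt on every call above; since its
# kernel is precomputed at construction, a long-running app could cache one
# instance per source sample rate.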


def trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01):
    """Trim leading silence, keeping at most keep_ms ms of it."""
    # Per-sample energy, averaged across channels.
    energy = waveform.abs().mean(dim=0)
    non_silence_idx = (energy > threshold).nonzero(as_tuple=True)[0]
    # All-silence input: return it unchanged.
    if len(non_silence_idx) == 0:
        return waveform
    first_non_silence = non_silence_idx[0].item()
    keep_samples = int(sr * (keep_ms / 1000.0))
    start = max(0, first_non_silence - keep_samples)
    return waveform[:, start:]
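
# Worked example: at sr = 16000 and keep_ms = 100, keep_samples = 1600, so at
# most 0.1 s of audio is kept ahead of the first sample whose mean absolute
# amplitude exceeds the 0.01 threshold.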


def preprocess_audio(audio_path):
    """Full preprocessing chain: mono 16 kHz, then leading-silence trim."""
    waveform, sr = ensure_mono_16k(audio_path)
    waveform = trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01)
    # Write the processed audio to a temporary file for the pipeline.
    tmp_path = "/tmp/processed_trimmed.wav"
    torchaudio.save(tmp_path, waveform, sr)
    return tmp_path
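
# The fixed /tmp path is shared by all requests, so concurrent users could
# overwrite each other's files. A minimal sketch of a safer variant using the
# standard library's tempfile module:
#
#   import tempfile
#   tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
#   torchaudio.save(tmp.name, waveform, sr)
#   return tmp.name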


def transcribe(audio):
    if audio is None:
        return "Please record or upload an audio file."
    # Preprocess (mono, 16 kHz, leading-silence trim) and run the ASR pipeline.
    processed_audio = preprocess_audio(audio)
    result = asr_pipeline(processed_audio)["text"]
    return result
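
# Sketch of an alternative that skips the temp-file round trip, assuming the
# ASR pipeline's documented dict input format:
#
#   waveform, sr = ensure_mono_16k(audio)
#   waveform = trim_leading_silence(waveform, sr)
#   result = asr_pipeline(
#       {"raw": waveform.squeeze(0).numpy(), "sampling_rate": sr}
#   )["text"]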


title = "🎙️ MoulSot0.1 ASR 🇲🇦"

description = """
**MoulSot0.1** model for Darija ASR 🇲🇦.
You can record or upload an audio sample (it will be automatically resampled to 16 kHz mono),
and view the transcription result below.

MoulSot0.1 is an automatic speech recognition (ASR) model designed specifically for
Moroccan Darija. It lets you record or upload an audio clip, automatically converts the
audio to 16 kHz mono, and then gives you the written text of what you said. In other
words, it turns your speech into written Darija.
"""

# Example clips; these files must ship alongside the app for the demo to find them.
examples = [
    ["audio1.wav"],
    ["audio2.wav"],
]


with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}\n{description}")

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",  # hand the component's temp file path to transcribe()
            label="🎤 Record or Upload Audio (auto 16 kHz mono)",
        )

    transcribe_btn = gr.Button("🚀 Transcribe")

    output_text = gr.Textbox(label="📩 Transcription Output", lines=6)

    gr.Examples(
        examples=examples,
        inputs=[audio_input],
        outputs=[output_text],
        fn=transcribe,
        run_on_click=True,
        label="Example Audios",
    )

    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input],
        outputs=[output_text],
    )


if __name__ == "__main__":
    demo.launch()