import gradio as gr
import torchaudio
from transformers import pipeline
import soundfile as sf
import torch
# Load the MoulSot Darija ASR model (moulsot_v0.2_1000)
asr_pipeline = pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.2_1000")

# Work around the forced_decoder_ids deprecation in recent transformers releases:
# move the forced ids into generation_config.input_ids and clear the old field.
asr_pipeline.model.generation_config.input_ids = asr_pipeline.model.generation_config.forced_decoder_ids
asr_pipeline.model.generation_config.forced_decoder_ids = None
def load_audio(audio_path):
    """Robustly load any audio file into (waveform, sr)."""
    try:
        waveform, sr = torchaudio.load(audio_path)
    except Exception:
        # Fall back to soundfile for formats torchaudio's backend can't read
        data, sr = sf.read(audio_path)
        waveform = torch.tensor(data, dtype=torch.float32).T
    if waveform.ndim == 1:
        waveform = waveform.unsqueeze(0)
    return waveform, sr
def ensure_mono_16k(audio_path):
    """Convert audio to mono, 16 kHz."""
    waveform, sr = load_audio(audio_path)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform = resampler(waveform)
        sr = 16000
    return waveform, sr
def trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01):
    """Trim leading silence, keeping at most keep_ms ms of it."""
    energy = waveform.abs().mean(dim=0)
    non_silence_idx = (energy > threshold).nonzero(as_tuple=True)[0]
    if len(non_silence_idx) == 0:
        return waveform  # all silence
    first_non_silence = non_silence_idx[0].item()
    keep_samples = int(sr * (keep_ms / 1000.0))
    start = max(0, first_non_silence - keep_samples)
    return waveform[:, start:]
def preprocess_audio(audio_path):
    """Full preprocessing chain: mono, 16 kHz, leading silence trimmed."""
    waveform, sr = ensure_mono_16k(audio_path)
    waveform = trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01)
    tmp_path = "/tmp/processed_trimmed.wav"  # Spaces run on Linux, so /tmp is available
    torchaudio.save(tmp_path, waveform, sr)
    return tmp_path
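# A minimal sanity check one could run locally to verify the preprocessing chain
# before wiring it into the UI; "sample.wav" is a hypothetical test file, and this
# sketch is not part of the app flow:
#
#   wav, sr = ensure_mono_16k("sample.wav")
#   assert sr == 16000 and wav.shape[0] == 1    # mono, 16 kHz
#   trimmed = trim_leading_silence(wav, sr)
#   assert trimmed.shape[1] <= wav.shape[1]     # never longer than the input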
def transcribe(audio):
    if audio is None:
        return "Please record or upload an audio file."
    # Preprocess, then run ASR
    processed_audio = preprocess_audio(audio)
    result = asr_pipeline(processed_audio)["text"]
    return result
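# Quick local test sketch (assumes a Darija clip on disk, e.g. the "audio1.wav"
# example listed below):
#
#   print(transcribe("audio1.wav"))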
title = "🎙️ MoulSot0.1 ASR 🇲🇦"
description = """
**MoulSot0.1** model for Darija ASR 🇲🇦.
You can record or upload an audio sample (it will be automatically resampled to 16 kHz mono),
and view the transcription result below.
MoulSot0.1 is an automatic speech recognition (ASR) model built specifically for Moroccan
Darija: it converts your recorded or uploaded audio to 16 kHz mono, then returns the
written text of what you said — in other words, it turns your speech into written Darija.
"""
examples = [
["audio1.wav"],
["audio2.wav"],
]
with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}\n{description}")
    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="🎤 Record or Upload Audio (auto 16 kHz mono)"
        )
    transcribe_btn = gr.Button("🚀 Transcribe")
    output_text = gr.Textbox(label="📩 Transcription Output", lines=6)
    gr.Examples(
        examples=examples,
        inputs=[audio_input],
        outputs=[output_text],
        fn=transcribe,
        run_on_click=True,
        label="Example Audios"
    )
    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input],
        outputs=[output_text]
    )
# Local launch
if __name__ == "__main__":
    demo.launch()