import gradio as gr
import torchaudio
from transformers import pipeline
import soundfile as sf
import torch

# Load the MoulSot Darija ASR model
asr_pipeline = pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.2_1000")
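# Optional (not used here): pin the pipeline to a GPU when one is available, e.g.
# pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.2_1000",
#          device=0 if torch.cuda.is_available() else -1)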

# Recent transformers versions reject forced_decoder_ids left in the generation config,
# so stash them and clear the field before running inference.
asr_pipeline.model.generation_config.input_ids = asr_pipeline.model.generation_config.forced_decoder_ids
asr_pipeline.model.generation_config.forced_decoder_ids = None


def load_audio(audio_path):
    """Robustly load any audio file into (waveform, sr)"""
    try:
        waveform, sr = torchaudio.load(audio_path)
    except Exception:
        # Fall back to soundfile for formats torchaudio's backend cannot read.
        # sf.read returns (frames, channels), so transpose to (channels, frames).
        data, sr = sf.read(audio_path)
        waveform = torch.tensor(data, dtype=torch.float32).T
        if waveform.ndim == 1:
            waveform = waveform.unsqueeze(0)
    return waveform, sr


def ensure_mono_16k(audio_path):
    """Convert audio to mono + 16 kHz"""
    waveform, sr = load_audio(audio_path)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform = resampler(waveform)
        sr = 16000
    return waveform, sr


def trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01):
    """Trim leading silence, keep โ‰ค keep_ms ms"""
    energy = waveform.abs().mean(dim=0)
    non_silence_idx = (energy > threshold).nonzero(as_tuple=True)[0]
    if len(non_silence_idx) == 0:
        return waveform  # all silence
    first_non_silence = non_silence_idx[0].item()
    keep_samples = int(sr * (keep_ms / 1000.0))
    start = max(0, first_non_silence - keep_samples)
    return waveform[:, start:]


def preprocess_audio(audio_path):
    waveform, sr = ensure_mono_16k(audio_path)
    waveform = trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01)
    tmp_path = "/tmp/processed_trimmed.wav"
    torchaudio.save(tmp_path, waveform, sr)
    return tmp_path


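# Note: preprocess_audio writes to a fixed /tmp path, so two concurrent requests could
# overwrite each other's file. A per-request temp file is one alternative; a minimal
# sketch (not wired in below; preprocess_audio_tmp is a hypothetical name):
#
#     import tempfile
#
#     def preprocess_audio_tmp(audio_path):
#         waveform, sr = ensure_mono_16k(audio_path)
#         waveform = trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01)
#         tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
#         torchaudio.save(tmp.name, waveform, sr)
#         return tmp.name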

def transcribe(audio):
    if audio is None:
        return "Please record or upload an audio file."
    
    # Process and transcribe
    processed_audio = preprocess_audio(audio)
    result = asr_pipeline(processed_audio)["text"]
    
    return result

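# Note: the transformers ASR pipeline also accepts in-memory audio as a dict with
# "raw" and "sampling_rate" keys, which would avoid the temp-file round trip.
# A minimal sketch of that variant (an assumption, not wired into the UI below):
#
#     waveform, sr = ensure_mono_16k(audio)
#     waveform = trim_leading_silence(waveform, sr)
#     text = asr_pipeline({"raw": waveform.squeeze(0).numpy(), "sampling_rate": sr})["text"]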

title = "๐ŸŽ™๏ธ MoulSot0.1 ASR ๐Ÿ‡ฒ๐Ÿ‡ฆ"
description = """
**MoulSot0.1** model for Darija ASR 🇲🇦.
You can record or upload an audio sample (it will be automatically resampled to 16 kHz mono),
and view the transcription result below.

MoulSot0.1 هو واحد النموذج ديال التعرف التلقائي على الكلام (ASR) لي مصمم خصيصًا للدارجة المغربية. هاد النموذج كيمكنك تسجل شي مقطع صوتي، أو تيليشارجيه، و هو أوتوماتيكيا كيحول الصوت لـ 16 كيلو هرتز مونو (أحادي)، و من بعد كيعطيك النص المكتوب ديال داكشي لي قلتي. يعني، كيحول الهضرة ديالك للدارجة المكتوبة.
"""
examples = [
    ["audio1.wav"],
    ["audio2.wav"],
]
with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}\n{description}")

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="๐ŸŽค Record or Upload Audio (auto 16 kHz mono)"
        )

    transcribe_btn = gr.Button("🚀 Transcribe")

    output_text = gr.Textbox(label="🟩 Transcription Output", lines=6)
    gr.Examples(
        examples=examples,
        inputs=[audio_input],
        outputs=[output_text],
        fn=transcribe,
        run_on_click=True,
        label="Example Audios"
    )
    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input],
        outputs=[output_text]
    )


# Local launch
if __name__ == "__main__":
    demo.launch()