import tempfile

import gradio as gr
import torchaudio
import soundfile as sf
import torch
from transformers import pipeline

# Preload both models
models = {
    "moulsot_v0.1_2500": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.1_2500"),
    "moulsot_v0.2_1000": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.2_1000")
}
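
# Assumption: CPU inference (the pipeline default). With a GPU available, each
# pipeline could instead be built with a device argument, e.g.
#   pipeline("automatic-speech-recognition",
#            model="01Yassine/moulsot_v0.1_2500", device=0)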

# Whisper checkpoints ship forced_decoder_ids in their generation config, and
# recent transformers releases refuse to generate while this deprecated field
# is set; mirror it into input_ids and clear it (a common workaround).
for m in models.values():
    m.model.generation_config.input_ids = m.model.generation_config.forced_decoder_ids
    m.model.generation_config.forced_decoder_ids = None
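
# Alternative sketch (not used here): leave the generation config untouched
# and pass decoding options per call via generate_kwargs, e.g.
#   models["moulsot_v0.1_2500"]("clip.wav",
#                               generate_kwargs={"task": "transcribe"})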


def load_audio(audio_path):
    """Load audio as a (channels, samples) float32 tensor plus its sample rate."""
    try:
        waveform, sr = torchaudio.load(audio_path)
    except Exception:
        # Fallback for formats torchaudio's backend can't read: soundfile
        # returns (frames, channels), so transpose to (channels, frames).
        data, sr = sf.read(audio_path)
        waveform = torch.tensor(data, dtype=torch.float32).T
        if waveform.ndim == 1:
            waveform = waveform.unsqueeze(0)
    return waveform, sr


def ensure_mono_16k(audio_path):
    """Convert audio to mono + 16 kHz"""
    waveform, sr = load_audio(audio_path)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform = resampler(waveform)
        sr = 16000
    return waveform, sr
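
# Contract sketch (hypothetical file name): whatever comes in, stereo or
# 44.1 kHz, a single-channel 16 kHz tensor comes out.
#   wav, sr = ensure_mono_16k("stereo_44k.wav")
#   assert wav.shape[0] == 1 and sr == 16000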


def trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01):
    """Trim leading silence, keeping at most keep_ms ms before the first non-silent sample."""
    energy = waveform.abs().mean(dim=0)
    non_silence_idx = (energy > threshold).nonzero(as_tuple=True)[0]
    if len(non_silence_idx) == 0:
        return waveform  # all silence
    first_non_silence = non_silence_idx[0].item()
    keep_samples = int(sr * (keep_ms / 1000.0))
    start = max(0, first_non_silence - keep_samples)
    return waveform[:, start:]
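
# Sanity sketch with synthetic audio (assumed values): half a second of
# silence followed by a tone shrinks to ~100 ms of lead-in plus the tone.
#   silence = torch.zeros(1, 8000)                                 # 0.5 s @ 16 kHz
#   tone = 0.5 * torch.sin(torch.linspace(0, 800, 8000)).unsqueeze(0)
#   trimmed = trim_leading_silence(torch.cat([silence, tone], dim=1), 16000)
#   trimmed.shape[1]  # ~9600: 1600 kept samples (100 ms) + the 8000-sample tone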


def preprocess_audio(audio_path):
    waveform, sr = ensure_mono_16k(audio_path)
    waveform = trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01)
    # Write to a unique temp file (not a fixed path) so concurrent requests
    # can't overwrite each other's processed audio.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()
    torchaudio.save(tmp.name, waveform, sr)
    return tmp.name


def transcribe(audio, selected_model):
    if audio is None:
        return "Please record or upload an audio file.", "Please record or upload an audio file."

    processed_audio = preprocess_audio(audio)

    pipe_selected = models[selected_model]
    other_model = [k for k in models if k != selected_model][0]
    pipe_other = models[other_model]

    result_selected = pipe_selected(processed_audio)["text"]
    result_other = pipe_other(processed_audio)["text"]

    return result_selected, result_other
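
# Smoke test outside the UI (hypothetical path; uncomment to try):
#   sel, other = transcribe("sample_darija.wav", "moulsot_v0.1_2500")
#   print("selected:", sel)
#   print("other:", other)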


title = "🎙️ Moulsot ASR Comparison"
description = """
Compare two fine-tuned Whisper models for **Darija ASR**:
- 🟩 **moulsot_v0.1_2500**
- 🟦 **moulsot_v0.2_1000**

You can **record** or **upload** an audio sample. The app automatically:
- converts the input to **16 kHz mono**
- **trims leading silence**, keeping at most 0.1 s of lead-in

Both models then transcribe the result side by side.
"""

with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}\n{description}")

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="🎀 Record or Upload Audio"
        )
        model_choice = gr.Radio(
            ["moulsot_v0.1_2500", "moulsot_v0.2_1000"],
            label="Choose Primary Model",
            value="moulsot_v0.1_2500"
        )

    transcribe_btn = gr.Button("🚀 Transcribe")

    with gr.Row():
        output_selected = gr.Textbox(label="🟩 Selected Model Output")
        output_other = gr.Textbox(label="🟦 Other Model Output")

    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input, model_choice],
        outputs=[output_selected, output_other]
    )

if __name__ == "__main__":
    demo.launch()