import gradio as gr
import torchaudio
from transformers import pipeline

# Preload both models
models = {
    "moulsot_v0.1_2500": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.1_2500"),
    "moulsot_v0.2_1000": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.2_1000")
}

# Adjust generation configs for both
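# Note (assumption about intent): stashing the forced ids and then clearing
# forced_decoder_ids mirrors a common workaround for the deprecation warning that
# newer transformers versions raise for Whisper checkpoints whose generation config
# still ships forced_decoder_ids; exact behavior may vary by transformers version.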
for m in models.values():
    m.model.generation_config.input_ids = m.model.generation_config.forced_decoder_ids
    m.model.generation_config.forced_decoder_ids = None


def ensure_mono_16k(audio_path):
    """Load audio, convert it to mono 16 kHz, and return (waveform, sample_rate)."""
    # torchaudio.load returns a (channels, samples) tensor plus the original sample rate
    waveform, sr = torchaudio.load(audio_path)

    # Downmix to mono if necessary
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample to 16 kHz if necessary
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform = resampler(waveform)
        sr = 16000

    return waveform, sr
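# Illustrative usage (hypothetical file name):
#   waveform, sr = ensure_mono_16k("clip.wav")
#   -> waveform has shape (1, num_samples) and sr == 16000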

def trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01):
    """
    Remove leading silence from waveform, keeping at most `keep_ms` milliseconds.
    `threshold` controls what is considered silence.
    """
    # Compute energy-based mask
    energy = waveform.abs().mean(dim=0)
    non_silence_idx = (energy > threshold).nonzero(as_tuple=True)[0]
    if len(non_silence_idx) == 0:
        return waveform  # all silence
    first_non_silence = non_silence_idx[0].item()
    keep_samples = int(sr * (keep_ms / 1000.0))
    start = max(0, first_non_silence - keep_samples)
    trimmed = waveform[:, start:]
    return trimmed
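# Note: for audio normalized to [-1, 1], threshold=0.01 corresponds to roughly -40 dBFS,
# so only very quiet leading samples count as silence; keep_ms leaves a short pad of
# audio before the first detected non-silent sample.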


def preprocess_audio(audio_path):
    waveform, sr = ensure_mono_16k(audio_path)
    waveform = trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01)
    tmp_path = "/tmp/processed_trimmed.wav"
    torchaudio.save(tmp_path, waveform, sr)
    return tmp_path

def transcribe(audio, selected_model):
    if audio is None:
        return "Please record or upload an audio file.", "Please record or upload an audio file."

    # Convert uploaded/recorded audio to mono 16kHz
    processed_audio = preprocess_audio(audio)
    
    # Selected + other model
    pipe_selected = models[selected_model]
    other_model = [k for k in models if k != selected_model][0]
    pipe_other = models[other_model]
    
    # Run inference
    result_selected = pipe_selected(processed_audio)["text"]
    result_other = pipe_other(processed_audio)["text"]
    
    return result_selected, result_other
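# Illustrative call (hypothetical file path):
#   selected, other = transcribe("darija_sample.wav", "moulsot_v0.1_2500")
#   -> selected holds the v0.1 transcript, other holds the v0.2 transcript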


title = "πŸŽ™οΈ Moulsot Comparison"
description = """
Compare two fine-tuned models for **Darija ASR**:
- 🟩 **moulsot_v0.1_2500**
- 🟦 **moulsot_v0.2_1000**

You can **record** or **upload** an audio sample (automatically resampled to 16 kHz mono),
then view transcriptions from both models side by side.
"""

with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}\n{description}")

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="🎀 Record or Upload Audio (auto 16 kHz mono)"
        )
        model_choice = gr.Radio(
            ["moulsot_v0.1_2500", "moulsot_v0.2_1000"],
            label="Choose Primary Model",
            value="moulsot_v0.1_2500"
        )

    transcribe_btn = gr.Button("πŸš€ Transcribe")

    with gr.Row():
        output_selected = gr.Textbox(label="🟩 Selected Model Output")
        output_other = gr.Textbox(label="🟦 Other Model Output")

    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input, model_choice],
        outputs=[output_selected, output_other]
    )

# Local launch
if __name__ == "__main__":
    demo.launch()
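
# Minimal local-run sketch (assumed dependencies; pin versions as needed):
#   pip install gradio transformers torchaudio torch
#   python app.py
# Gradio serves the UI on http://127.0.0.1:7860 by default.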