import gradio as gr
import torchaudio
import soundfile as sf
import torch
from transformers import pipeline

# Preload both models
models = {
    "moulsot_v0.1_2500": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.1_2500"),
    "moulsot_v0.2_1000": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.2_1000")
}

# Adjust generation configs: recent transformers releases deprecate
# forced_decoder_ids, so stash the IDs and clear the attribute to keep
# generate() from raising.
for m in models.values():
    m.model.generation_config.input_ids = m.model.generation_config.forced_decoder_ids
    m.model.generation_config.forced_decoder_ids = None


def load_audio(audio_path):
    """Robustly load any audio file into (waveform, sr)."""
    try:
        waveform, sr = torchaudio.load(audio_path)
    except Exception:
        # Fallback for backends torchaudio cannot handle
        data, sr = sf.read(audio_path)
        waveform = torch.tensor(data, dtype=torch.float32).T
        if waveform.ndim == 1:
            waveform = waveform.unsqueeze(0)
    return waveform, sr


def ensure_mono_16k(audio_path):
    """Convert audio to mono + 16 kHz."""
    waveform, sr = load_audio(audio_path)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform = resampler(waveform)
        sr = 16000
    return waveform, sr


def trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01):
    """Trim leading silence, keeping at most keep_ms ms of it."""
    energy = waveform.abs().mean(dim=0)
    non_silence_idx = (energy > threshold).nonzero(as_tuple=True)[0]
    if len(non_silence_idx) == 0:
        return waveform  # all silence
    first_non_silence = non_silence_idx[0].item()
    keep_samples = int(sr * (keep_ms / 1000.0))
    start = max(0, first_non_silence - keep_samples)
    return waveform[:, start:]


def preprocess_audio(audio_path):
    waveform, sr = ensure_mono_16k(audio_path)
    waveform = trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01)
    tmp_path = "/tmp/processed_trimmed.wav"
    torchaudio.save(tmp_path, waveform, sr)
    return tmp_path


def transcribe(audio, selected_model):
    if audio is None:
        return "Please record or upload an audio file.", "Please record or upload an audio file."
    processed_audio = preprocess_audio(audio)
    pipe_selected = models[selected_model]
    other_model = [k for k in models if k != selected_model][0]
    pipe_other = models[other_model]
    result_selected = pipe_selected(processed_audio)["text"]
    result_other = pipe_other(processed_audio)["text"]
    return result_selected, result_other


title = "🎙️ Moulsot ASR Comparison"
description = """
Compare two fine-tuned Whisper models for **Darija ASR**:
- 🟩 **moulsot_v0.1_2500**
- 🟦 **moulsot_v0.2_1000**

You can **record** or **upload** an audio sample. The app automatically:
- converts to **16 kHz mono**
- **removes leading silence** (keeping at most 0.1 s of it)

Then both models transcribe the result side by side.
"""

with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}\n{description}")

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="🎤 Record or Upload Audio"
        )
        model_choice = gr.Radio(
            ["moulsot_v0.1_2500", "moulsot_v0.2_1000"],
            label="Choose Primary Model",
            value="moulsot_v0.1_2500"
        )

    transcribe_btn = gr.Button("🚀 Transcribe")

    with gr.Row():
        output_selected = gr.Textbox(label="🟩 Selected Model Output")
        output_other = gr.Textbox(label="🟦 Other Model Output")

    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input, model_choice],
        outputs=[output_selected, output_other]
    )

if __name__ == "__main__":
    demo.launch()
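
# To run this app locally (an assumed setup, not part of the original script:
# package names are inferred from the imports above, and "app.py" is a
# placeholder filename for this file):
#
#   pip install gradio torch torchaudio soundfile transformers
#   python app.py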