import gradio as gr
import torchaudio
import soundfile as sf
import torch
from transformers import pipeline

# Preload both models
models = {
    "moulsot_v0.1_2500": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.1_2500"),
    "moulsot_v0.2_1000": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.2_1000"),
}
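
# The fine-tuned checkpoints ship legacy forced_decoder_ids in their
# generation config, which newer transformers versions warn about or reject
# at generation time, so they are cleared below (the original value is
# copied onto generation_config.input_ids first).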
# Adjust generation configs
for m in models.values():
    m.model.generation_config.input_ids = m.model.generation_config.forced_decoder_ids
    m.model.generation_config.forced_decoder_ids = None


def load_audio(audio_path):
    """Robustly load any audio file into (waveform, sr)."""
    try:
        waveform, sr = torchaudio.load(audio_path)
    except Exception:
        # Fallback for formats/backends torchaudio cannot read
        data, sr = sf.read(audio_path)
        waveform = torch.tensor(data, dtype=torch.float32).T
    # Ensure a (channels, samples) layout even for mono fallbacks
    if waveform.ndim == 1:
        waveform = waveform.unsqueeze(0)
    return waveform, sr


def ensure_mono_16k(audio_path):
    """Convert audio to mono + 16 kHz."""
    waveform, sr = load_audio(audio_path)
    # Downmix multi-channel audio to a single channel
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Resample to the 16 kHz rate Whisper expects
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform = resampler(waveform)
        sr = 16000
    return waveform, sr


def trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01):
    """Trim leading silence, keeping at most keep_ms ms of it before speech."""
    # Mean absolute amplitude across channels as a simple energy measure
    energy = waveform.abs().mean(dim=0)
    non_silence_idx = (energy > threshold).nonzero(as_tuple=True)[0]
    if len(non_silence_idx) == 0:
        return waveform  # all silence, nothing to trim against
    first_non_silence = non_silence_idx[0].item()
    keep_samples = int(sr * (keep_ms / 1000.0))
    start = max(0, first_non_silence - keep_samples)
    return waveform[:, start:]
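# Example: for 16 kHz audio with keep_ms=100, at most 1600 samples (0.1 s) of
# silence survive ahead of the first sample whose mean-channel amplitude
# exceeds the 0.01 threshold.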


def preprocess_audio(audio_path):
    """Downmix/resample the input and trim leading silence, returning a temp WAV path."""
    waveform, sr = ensure_mono_16k(audio_path)
    waveform = trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01)
    # Write the processed clip to a temp file both pipelines can read from disk
    tmp_path = "/tmp/processed_trimmed.wav"
    torchaudio.save(tmp_path, waveform, sr)
    return tmp_path


def transcribe(audio, selected_model):
    if audio is None:
        msg = "Please record or upload an audio file."
        return msg, msg
    processed_audio = preprocess_audio(audio)
    # Run the chosen model first, then the other model on the same clip
    pipe_selected = models[selected_model]
    other_model = [k for k in models if k != selected_model][0]
    pipe_other = models[other_model]
    result_selected = pipe_selected(processed_audio)["text"]
    result_other = pipe_other(processed_audio)["text"]
    return result_selected, result_other
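
# Example usage outside Gradio (hypothetical local file):
#   primary_text, other_text = transcribe("sample.wav", "moulsot_v0.1_2500")
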
title = "πŸŽ™οΈ Moulsot ASR Comparison"
description = """
Compare two fine-tuned Whisper models for **Darija ASR**:
- 🟩 **moulsot_v0.1_2500**
- 🟦 **moulsot_v0.2_1000**

You can **record** or **upload** an audio sample.
The app automatically:

- converts to **16 kHz mono**
- **trims leading silence** (keeping at most 0.1 s of it)

Both models then transcribe the result side by side.
"""


with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}\n{description}")
    with gr.Row():
        # type="filepath" hands transcribe() a path on disk rather than raw samples
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="🎀 Record or Upload Audio"
        )
        model_choice = gr.Radio(
            ["moulsot_v0.1_2500", "moulsot_v0.2_1000"],
            label="Choose Primary Model",
            value="moulsot_v0.1_2500"
        )
    transcribe_btn = gr.Button("πŸš€ Transcribe")
    with gr.Row():
        output_selected = gr.Textbox(label="🟩 Selected Model Output")
        output_other = gr.Textbox(label="🟦 Other Model Output")
    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input, model_choice],
        outputs=[output_selected, output_other]
    )

if __name__ == "__main__":
    demo.launch()