01Yassine committed on
Commit b5458cd · verified · 1 Parent(s): e04057a

Update app.py

Files changed (1)
  1. app.py +32 -21
app.py CHANGED
@@ -1,5 +1,7 @@
 import gradio as gr
 import torchaudio
+import soundfile as sf
+import torch
 from transformers import pipeline
 
 # Preload both models
@@ -8,14 +10,28 @@ models = {
     "moulsot_v0.2_1000": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.2_1000")
 }
 
-# Adjust generation configs for both
+# Adjust generation configs
 for m in models.values():
     m.model.generation_config.input_ids = m.model.generation_config.forced_decoder_ids
     m.model.generation_config.forced_decoder_ids = None
 
+
+def load_audio(audio_path):
+    """Robustly load any audio file into (waveform, sr)"""
+    try:
+        waveform, sr = torchaudio.load(audio_path)
+    except Exception:
+        # fallback for unknown backends
+        data, sr = sf.read(audio_path)
+        waveform = torch.tensor(data, dtype=torch.float32).T
+        if waveform.ndim == 1:
+            waveform = waveform.unsqueeze(0)
+    return waveform, sr
+
+
 def ensure_mono_16k(audio_path):
     """Convert audio to mono + 16 kHz"""
-    waveform, sr = torchaudio.load(audio_path)
+    waveform, sr = load_audio(audio_path)
     if waveform.shape[0] > 1:
         waveform = waveform.mean(dim=0, keepdim=True)
     if sr != 16000:
@@ -26,11 +42,7 @@ def ensure_mono_16k(audio_path):
 
 
 def trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01):
-    """
-    Remove leading silence from waveform, keeping at most `keep_ms` milliseconds.
-    `threshold` controls what is considered silence.
-    """
-    # Compute energy-based mask
+    """Trim leading silence, keep ≤ keep_ms ms"""
     energy = waveform.abs().mean(dim=0)
     non_silence_idx = (energy > threshold).nonzero(as_tuple=True)[0]
     if len(non_silence_idx) == 0:
@@ -38,8 +50,7 @@ def trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01):
     first_non_silence = non_silence_idx[0].item()
     keep_samples = int(sr * (keep_ms / 1000.0))
     start = max(0, first_non_silence - keep_samples)
-    trimmed = waveform[:, start:]
-    return trimmed
+    return waveform[:, start:]
 
 
 def preprocess_audio(audio_path):
@@ -49,33 +60,34 @@ def preprocess_audio(audio_path):
     torchaudio.save(tmp_path, waveform, sr)
     return tmp_path
 
+
 def transcribe(audio, selected_model):
     if audio is None:
         return "Please record or upload an audio file.", "Please record or upload an audio file."
 
-    # Convert uploaded/recorded audio to mono 16kHz
     processed_audio = preprocess_audio(audio)
-
-    # Selected + other model
+
     pipe_selected = models[selected_model]
     other_model = [k for k in models if k != selected_model][0]
     pipe_other = models[other_model]
-
-    # Run inference
+
     result_selected = pipe_selected(processed_audio)["text"]
     result_other = pipe_other(processed_audio)["text"]
-
+
     return result_selected, result_other
 
 
-title = "🎙️ MoulSot Comparison"
+title = "🎙️ Moulsot Whisper ASR Comparison"
 description = """
-Compare two fine-tuned models for **Darija ASR**:
+Compare two fine-tuned Whisper models for **Arabic ASR**:
 - 🟩 **moulsot_v0.1_2500**
 - 🟦 **moulsot_v0.2_1000**
 
-You can **record** or **upload** an audio sample (automatically resampled to 16 kHz mono),
-then view transcriptions from both models side by side.
+You can **record** or **upload** an audio sample.
+The app automatically:
+- converts to **16 kHz mono**
+- **removes leading silence** (≤ 0.1 s)
+Then both models transcribe the result side by side.
 """
 
 with gr.Blocks(title=title) as demo:
@@ -85,7 +97,7 @@ with gr.Blocks(title=title) as demo:
     audio_input = gr.Audio(
         sources=["microphone", "upload"],
         type="filepath",
-        label="🎤 Record or Upload Audio (auto 16 kHz mono)"
+        label="🎤 Record or Upload Audio"
     )
     model_choice = gr.Radio(
        ["moulsot_v0.1_2500", "moulsot_v0.2_1000"],
@@ -105,6 +117,5 @@ with gr.Blocks(title=title) as demo:
        outputs=[output_selected, output_other]
    )
 
-# Local launch
 if __name__ == "__main__":
     demo.launch()
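
Not part of the commit, for reference only: a minimal sketch of calling one of the Space's checkpoints outside the Gradio UI, assuming transformers is installed with ffmpeg available for audio decoding, and "sample.wav" is a hypothetical local recording.

# Minimal sketch (assumptions: transformers installed, ffmpeg on PATH,
# "sample.wav" is a hypothetical local file).
from transformers import pipeline

# Same checkpoint the Space preloads; the ASR pipeline decodes the file at the
# model's expected sampling rate before transcription.
asr = pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.2_1000")
print(asr("sample.wav")["text"])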