Spaces:

KDM999
/

asr-multimodel-comparison

Running

App Files Files Community

KDM999 committed on Apr 6

Commit

a635c25

verified ·

1 Parent(s): b21ecd7

Update app.py

Browse files

Files changed (1) hide show

app.py +95 -50

app.py CHANGED Viewed

@@ -1,31 +1,38 @@
 import gradio as gr
 import random
 import json
-import os
 from difflib import SequenceMatcher
 from jiwer import wer
 import torchaudio
 from transformers import pipeline
 # Load metadata
 with open("common_voice_en_validated_249_hf_ready.json") as f:
     data = json.load(f)
-# Available filter values
 ages = sorted(set(entry["age"] for entry in data))
 genders = sorted(set(entry["gender"] for entry in data))
 accents = sorted(set(entry["accent"] for entry in data))
-# Load pipelines
-device = 0  # 0 for CUDA/GPU, -1 for CPU
-pipe_whisper = pipeline("automatic-speech-recognition", model="openai/whisper-medium", device=device)
-pipe_wav2vec2 = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=device)
-pipe_hubert = pipeline("automatic-speech-recognition", model="facebook/hubert-base-ls960", device=device)
-def load_audio(file_path):
-    waveform, sr = torchaudio.load(file_path)
-    return torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)[0].numpy()
 def transcribe(pipe, file_path):
     result = pipe(file_path)
@@ -35,62 +42,100 @@ def highlight_differences(ref, hyp):
     sm = SequenceMatcher(None, ref.split(), hyp.split())
     result = []
     for opcode, i1, i2, j1, j2 in sm.get_opcodes():
-        if opcode == 'equal':
             result.extend(hyp.split()[j1:j2])
-        elif opcode in ('replace', 'insert', 'delete'):
             wrong = hyp.split()[j1:j2]
             result.extend([f"<span style='color:red'>{w}</span>" for w in wrong])
     return " ".join(result)
-def run_demo(age, gender, accent):
     filtered = [
         entry for entry in data
         if entry["age"] == age and entry["gender"] == gender and entry["accent"] == accent
     ]
     if not filtered:
-        return "No matching sample.", None, "", "", "", "", "", ""
     sample = random.choice(filtered)
     file_path = os.path.join("common_voice_en_validated_249", sample["path"])
-    gold = sample["sentence"].strip().lower()
-    whisper_text = transcribe(pipe_whisper, file_path)
-    wav2vec_text = transcribe(pipe_wav2vec2, file_path)
-    hubert_text = transcribe(pipe_hubert, file_path)
-    table = f"""
-    <table border="1" style="width:100%">
-        <tr><th>Model</th><th>Transcription</th><th>WER</th></tr>
-        <tr><td><b>Gold</b></td><td>{gold}</td><td>0.00</td></tr>
-        <tr><td>Whisper</td><td>{highlight_differences(gold, whisper_text)}</td><td>{wer(gold, whisper_text):.2f}</td></tr>
-        <tr><td>Wav2Vec2</td><td>{highlight_differences(gold, wav2vec_text)}</td><td>{wer(gold, wav2vec_text):.2f}</td></tr>
-        <tr><td>HuBERT</td><td>{highlight_differences(gold, hubert_text)}</td><td>{wer(gold, hubert_text):.2f}</td></tr>
-    </table>
-    """
-    return sample["sentence"], file_path, gold, whisper_text, wav2vec_text, hubert_text, table, f"Audio path: {file_path}"
 with gr.Blocks() as demo:
-    gr.Markdown("# ASR Model Comparison on ESL Audio")
-    gr.Markdown("Filter by age, gender, and accent. Then generate a random ESL learner's audio to compare how Whisper, Wav2Vec2, and HuBERT transcribe it.")
     with gr.Row():
         age = gr.Dropdown(choices=ages, label="Age")
         gender = gr.Dropdown(choices=genders, label="Gender")
         accent = gr.Dropdown(choices=accents, label="Accent")
-    btn = gr.Button("Generate and Transcribe")
-    audio = gr.Audio(label="Audio", type="filepath")
-    wer_output = gr.HTML()
-    btn.click(fn=run_demo, inputs=[age, gender, accent], outputs=[
-        gr.Textbox(label="Gold (Correct)"),
-        audio,
-        gr.Textbox(label="Whisper Output"),
-        gr.Textbox(label="Wav2Vec2 Output"),
-        gr.Textbox(label="HuBERT Output"),
-        wer_output,
-        gr.Textbox(label="Path")
-    ])
 demo.launch()

import json
import os
import random
import string
import unicodedata
from difflib import SequenceMatcher

import gradio as gr
import torch
import torchaudio
from jiwer import wer
from transformers import pipeline
 # Load metadata
 with open("common_voice_en_validated_249_hf_ready.json") as f:
     data = json.load(f)
+# Prepare dropdown options
 ages = sorted(set(entry["age"] for entry in data))
 genders = sorted(set(entry["gender"] for entry in data))
 accents = sorted(set(entry["accent"] for entry in data))
+# Load ASR pipelines
+device = 0
+pipe_whisper_medium = pipeline("automatic-speech-recognition", model="openai/whisper-medium", device=device, generate_kwargs={"language": "en"})
+pipe_whisper_base = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device, generate_kwargs={"language": "en"})
+pipe_whisper_tiny = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=device, generate_kwargs={"language": "en"})
+pipe_wav2vec2_base_960h = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=device)
+pipe_hubert_large_ls960_ft = pipeline("automatic-speech-recognition", model="facebook/hubert-large-ls960-ft", device=device)
+# Functions
def convert_to_wav(file_path):
    """Return the path of a single-channel WAV copy of *file_path*.

    Only a trailing ``.mp3`` extension is swapped for ``.wav``; a path
    without that extension is used as its own target. The result is cached
    on disk: when the target file already exists it is returned without
    reading the source again.

    Args:
        file_path: Path to the source audio file.

    Returns:
        Path to the WAV file (equal to *file_path* when it does not end in
        ``.mp3``).
    """
    # Rewrite the extension only. A blanket str.replace would also rewrite
    # ".mp3" inside directory names and point at a location that does not
    # exist.
    if file_path.endswith(".mp3"):
        wav_path = file_path[: -len(".mp3")] + ".wav"
    else:
        wav_path = file_path

    if not os.path.exists(wav_path):
        waveform, sample_rate = torchaudio.load(file_path)
        # Average over dim 0 and keep that axis, collapsing the loaded
        # tensor to one row (one channel in torchaudio's default layout).
        waveform = waveform.mean(dim=0, keepdim=True)
        torchaudio.save(wav_path, waveform, sample_rate)
    return wav_path
 def transcribe(pipe, file_path):
     result = pipe(file_path)
     sm = SequenceMatcher(None, ref.split(), hyp.split())
     result = []
     for opcode, i1, i2, j1, j2 in sm.get_opcodes():
+        if opcode == "equal":
             result.extend(hyp.split()[j1:j2])
+        else:
             wrong = hyp.split()[j1:j2]
             result.extend([f"<span style='color:red'>{w}</span>" for w in wrong])
     return " ".join(result)
def normalize(text):
    """Return *text* lower-cased, without punctuation, single-spaced.

    Besides the ASCII characters in ``string.punctuation``, every character
    whose Unicode category is a punctuation class is removed. This covers
    typographic quotes, apostrophes and dashes, so a reference written with
    them compares equal to a transcript written with their ASCII forms.
    Runs of whitespace are collapsed to one space and the ends are trimmed.

    Args:
        text: The sentence or transcript to normalize.

    Returns:
        The normalized string; empty when nothing but punctuation and
        whitespace was given.
    """
    kept = (
        ch
        for ch in text.lower()
        if ch not in string.punctuation
        and not unicodedata.category(ch).startswith("P")
    )
    # split()/join collapses internal whitespace runs and strips both ends.
    return " ".join("".join(kept).split())
+# Generate Audio
def generate_audio(age, gender, accent):
    """Pick a random clip matching the speaker filters and return its path.

    Args:
        age: Required value of an entry's ``"age"`` field.
        gender: Required value of an entry's ``"gender"`` field.
        accent: Required value of an entry's ``"accent"`` field.

    Returns:
        A pair ``(audio_path, file_path)``. Both items are the path of the
        WAV version of the chosen clip. When no entry matches all three
        filters the pair is ``(None, "")``.
    """
    filtered = [
        entry for entry in data
        if entry["age"] == age and entry["gender"] == gender and entry["accent"] == accent
    ]
    if not filtered:
        # The second item is consumed as a file path, so it must not carry
        # a human-readable message; an empty string means "nothing selected".
        return None, ""
    sample = random.choice(filtered)
    file_path = os.path.join("common_voice_en_validated_249", sample["path"])
    wav_file_path = convert_to_wav(file_path)
    return wav_file_path, wav_file_path
+# Transcribe & Compare
def transcribe_audio(file_path):
    """Transcribe one clip with every model and report WER per model.

    The reference sentence is looked up in ``data`` by matching the clip's
    file name (with ``.wav`` mapped back to ``.mp3``) against the end of each
    entry's ``"path"``. The reference and each transcript are passed through
    ``normalize`` before the word error rate is computed.

    Args:
        file_path: Path of the WAV clip to transcribe; may be empty or None.

    Returns:
        A 6-tuple: the normalized reference text followed by one HTML
        snippet per model, in the order Whisper medium, Whisper base,
        Whisper tiny, Wav2Vec2 base, HuBERT large. When no file is given or
        no reference is found, the first item is a status message and the
        five model slots are empty strings.
    """
    # One placeholder per model so the early returns have the same number
    # of values as the normal return.
    empty_panels = ("",) * 5

    if not file_path:
        return ("No file selected.", *empty_panels)

    filename_mp3 = os.path.basename(file_path).replace(".wav", ".mp3")
    gold = ""
    for entry in data:
        if entry["path"].endswith(filename_mp3):
            gold = normalize(entry["sentence"])
            break
    if not gold:
        return ("Reference not found.", *empty_panels)

    # Insertion order defines the order of the returned HTML snippets.
    models = {
        "openai/whisper-medium": pipe_whisper_medium,
        "openai/whisper-base": pipe_whisper_base,
        "openai/whisper-tiny": pipe_whisper_tiny,
        "facebook/wav2vec2-base-960h": pipe_wav2vec2_base_960h,
        "facebook/hubert-large-ls960-ft": pipe_hubert_large_ls960_ft,
    }
    outputs = {}
    for name, model in models.items():
        text = transcribe(model, file_path)
        clean = normalize(text)
        wer_score = wer(gold, clean)
        outputs[name] = f"<b>{name} (WER: {wer_score:.2f}):</b><br>{highlight_differences(gold, clean)}"
    return (gold, *outputs.values())
+# Gradio Interface
 with gr.Blocks() as demo:
+    gr.Markdown("# Comparing ASR Models on Diverse English Speech Samples")
+    gr.Markdown("
+        This demo compares the transcription performance of six automatic speech recognition (ASR) models on audio samples from English learners. "
+        "Users can select speaker metadata (age, gender, accent) to explore how models handle diverse speech profiles. "
+        "All samples are drawn from the validated subset (n=249) of the English dataset in the Common Voice Delta Segment 21.0 release.")
     with gr.Row():
         age = gr.Dropdown(choices=ages, label="Age")
         gender = gr.Dropdown(choices=genders, label="Gender")
         accent = gr.Dropdown(choices=accents, label="Accent")
+    generate_btn = gr.Button("Get Audio")
+    audio_output = gr.Audio(label="Audio", type="filepath", interactive=False)
+    file_path_output = gr.Textbox(label="Audio File Path", visible=False)
+    generate_btn.click(generate_audio, [age, gender, accent], [audio_output, file_path_output])
+    transcribe_btn = gr.Button("Transcribe with All Models")
+    gold_text = gr.Textbox(label="Reference (Gold Standard)")
+    whisper_medium_html = gr.HTML(label="Whisper Medium")
+    whisper_base_html = gr.HTML(label="Whisper Base")
+    whisper_tiny_html = gr.HTML(label="Whisper Tiny")
+    wav2vec_html = gr.HTML(label="Wav2Vec2 Base")
+    hubert_html = gr.HTML(label="HuBERT Large")
+    transcribe_btn.click(
+        transcribe_audio,
+        inputs=[file_path_output],
+        outputs=[
+            gold_text,
+            whisper_medium_html,
+            whisper_base_html,
+            whisper_tiny_html,
+            wav2vec_html,
+            hubert_html,
+        ],
+    )
 demo.launch()