Spaces:

Aditigo
/

DetectAccent

Build error

App Files Files Community

Aditigo committed on May 24

Commit

da1806c

verified ·

1 Parent(s): b24597e

Added Gradio app.py

Browse files

Files changed (1) hide show

app.py +230 -0

app.py ADDED Viewed

	@@ -0,0 +1,230 @@

+#!/usr/bin/env python3
+"""
+English Accent Detector - Analyzes speaker's accent from video URLs
+"""
+from __future__ import annotations
+import argparse
+import random
+import tempfile
+from collections import Counter
+from pathlib import Path
+import time
+import torch
+import torchaudio
+import gradio as gr
+from speechbrain.inference.classifiers import EncoderClassifier
+from yt_dlp import YoutubeDL
+from huggingface_hub.utils import LocalEntryNotFoundError
+# ─────────────── Model setup (with retry) ───────────────
+# Model source handed to EncoderClassifier.from_hparams for the accent classifier.
+ACCENT_MODEL_ID = "Jzuluaga/accent-id-commonaccent_ecapa"
+# Model source handed to EncoderClassifier.from_hparams for the language identifier.
+LANG_MODEL_ID   = "speechbrain/lang-id-voxlingua107-ecapa"
+# Passed as run_opts["device"] when each classifier is created.
+DEVICE = "cpu"  # force CPU; Spaces' free tier has no GPU
+def load_with_retry(model_id: str, tries: int = 5, backoff: int = 5):
+    """Create an ``EncoderClassifier`` for *model_id*, retrying failed downloads.
+
+    The wait between attempts grows linearly: ``backoff * attempt`` seconds
+    (5s, 10s, 15s, ... with the defaults). Only ``LocalEntryNotFoundError``
+    is retried; any other exception propagates immediately.
+
+    Args:
+        model_id: Model source passed to ``EncoderClassifier.from_hparams``.
+        tries: Maximum number of attempts; must be at least 1.
+        backoff: Base delay in seconds, multiplied by the attempt number.
+
+    Returns:
+        The classifier returned by ``EncoderClassifier.from_hparams``.
+
+    Raises:
+        ValueError: If ``tries`` is smaller than 1.
+        LocalEntryNotFoundError: If the final attempt still fails.
+    """
+    # Without this guard the loop body never runs and the caller gets None.
+    if tries < 1:
+        raise ValueError("tries must be at least 1")
+    for attempt in range(1, tries + 1):
+        try:
+            return EncoderClassifier.from_hparams(
+                source=model_id,
+                run_opts={"device": DEVICE},
+            )
+        except LocalEntryNotFoundError:
+            # Out of attempts: surface the original error to the caller.
+            if attempt == tries:
+                raise
+            wait = backoff * attempt
+            print(f"[{model_id}] download failed (try {attempt}/{tries}), retrying in {wait}s")
+            time.sleep(wait)
+# Both classifiers are created once, at import time, and shared by the
+# classify_* helpers through these module-level names.
+accent_clf = load_with_retry(ACCENT_MODEL_ID)
+lang_clf   = load_with_retry(LANG_MODEL_ID)
+# ─────────────── Helpers ───────────────
+def sec_to_hms(sec: int) -> str:
+    """Render a whole number of seconds as a zero-padded ``HH:MM:SS`` string."""
+    total_minutes, seconds = divmod(sec, 60)
+    hours, minutes = divmod(total_minutes, 60)
+    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
+def download_audio(url: str, out_path: Path) -> Path:
+    """Download the audio of *url* with yt-dlp and return the saved file's path.
+
+    The output template is *out_path* with its suffix replaced by yt-dlp's
+    ``%(ext)s`` placeholder. No post-processors are configured.
+    """
+    output_template = str(out_path.with_suffix(".%(ext)s"))
+    ydl_options = {
+        "format": "bestaudio/best",
+        "outtmpl": output_template,
+        "postprocessors": [],
+        "quiet": True,
+    }
+    with YoutubeDL(ydl_options) as downloader:
+        metadata = downloader.extract_info(url, download=True)
+        # Ask yt-dlp which filename it used for this download.
+        saved_name = downloader.prepare_filename(metadata)
+    return Path(saved_name)
+def extract_wav(src: Path, dst: Path, start: int, dur: int = 8) -> None:
+    """Cut a clip out of *src* and save it to *dst* as 16 kHz, 16-bit PCM.
+
+    Args:
+        src: Audio file to read from.
+        dst: Destination path of the clip.
+        start: Clip start, in seconds from the beginning of *src*.
+        dur: Clip length in seconds.
+    """
+    target_sr = 16000
+    # frame_offset / num_frames count frames of the *source* file, so they
+    # must be derived from its native sample rate, not from the 16 kHz target;
+    # otherwise a 44.1/48 kHz source is cut at the wrong place and too short.
+    src_sr = torchaudio.info(str(src)).sample_rate
+    wav, orig_sr = torchaudio.load(
+        str(src),
+        frame_offset=start * src_sr,
+        num_frames=dur * src_sr,
+    )
+    if orig_sr != target_sr:
+        wav = torchaudio.transforms.Resample(orig_sr, target_sr)(wav)
+    torchaudio.save(str(dst), wav, target_sr, encoding="PCM_S", bits_per_sample=16)
+def pick_random_offsets(total_s: int, n: int) -> list[int]:
+    """Pick up to *n* distinct random start seconds for 8-second clips.
+
+    Candidate starts are ``0 .. total_s - 8`` inclusive. When fewer than *n*
+    candidates exist, every candidate is returned (in random order).
+    """
+    last_start = total_s - 8
+    candidates = range(last_start + 1)
+    how_many = min(n, len(candidates))
+    return random.sample(candidates, how_many)
+# ─────────────── Classification ───────────────
+def classify_language(wav: Path) -> tuple[str, float]:
+    """Return the language label for *wav* and its score as a percentage.
+
+    The second value from ``lang_clf.classify_batch`` is exponentiated before
+    being scaled by 100.
+    """
+    signal = lang_clf.load_audio(str(wav))
+    _probs, log_score, _index, labels = lang_clf.classify_batch(signal)
+    percent = float(log_score.exp().item()) * 100
+    return labels[0], percent
+def classify_accent(wav: Path) -> tuple[str, float]:
+    """Return the accent label for *wav* and its score scaled by 100.
+
+    NOTE(review): unlike classify_language, the score is used without
+    ``exp()`` -- confirm against the accent model's output scale.
+    """
+    signal = accent_clf.load_audio(str(wav))
+    _probs, score, _index, labels = accent_clf.classify_batch(signal)
+    percent = float(score.item()) * 100
+    return labels[0], percent
+def calculate_english_confidence(lang: str, lang_conf: float, accent_conf: float) -> float:
+    """Blend language and accent confidence into one 0-100 English score.
+
+    Labels that do not start with "en" (case-insensitive) score 0.0.
+    Otherwise the result is 70% language confidence plus 30% accent
+    confidence, clamped to the range [0, 100].
+    """
+    if lang.lower().startswith("en"):
+        weighted = (lang_conf * 0.7) + (accent_conf * 0.3)
+        floored = max(0.0, weighted)
+        return min(100.0, floored)
+    return 0.0
+# ─────────────── Core pipeline ───────────────
+def analyse_accent(url: str, n_samples: int = 4) -> dict:
+    """Download a video's audio, check it is English, and vote on the accent.
+
+    Steps: download the audio, classify the language on an 8-second clip
+    taken from the middle, then classify the accent on up to *n_samples*
+    random 8-second clips and report the most frequent label.
+
+    Args:
+        url: Public video URL; surrounding whitespace is ignored.
+        n_samples: Number of clips requested; coerced to ``int``.
+
+    Returns:
+        A result dict. On failure it holds a single ``"error"`` key; for
+        non-English audio the accent fields are "N/A" / 0.0; otherwise it
+        carries the winning accent, confidences and per-clip details.
+    """
+    url = url.strip() if url else ""
+    if not url:
+        return {"error": "Please provide a video URL."}
+    # UI widgets may hand over a float (e.g. 4.0); random.sample needs an int.
+    n_samples = int(n_samples)
+    if n_samples < 1:
+        return {"error": "Number of samples must be at least 1."}
+    with tempfile.TemporaryDirectory() as td:
+        td = Path(td)
+        try:
+            # 1) Download audio
+            audio_file = download_audio(url, td / "audio")
+            info = torchaudio.info(str(audio_file))
+            total_s = int(info.num_frames / info.sample_rate)
+            if total_s < 8:
+                return {"error": "Audio shorter than 8 seconds."}
+            # 2) Language detection on a clip centred on the midpoint
+            mid_start = max(0, total_s // 2 - 4)
+            lang_wav = td / "lang_check.wav"
+            extract_wav(audio_file, lang_wav, start=mid_start)
+            lang, lang_conf = classify_language(lang_wav)
+            is_english = lang.lower().startswith("en")
+            if not is_english:
+                return {
+                    "is_english_speaker": False,
+                    "detected_language": lang,
+                    "language_confidence": round(lang_conf, 1),
+                    "accent_classification": "N/A",
+                    "english_confidence_score": 0.0,
+                    "summary": f"Non-English language detected: {lang} ({lang_conf:.1f}%)"
+                }
+            # 3) Accent analysis (short audio may yield fewer clips than requested)
+            offsets = pick_random_offsets(total_s, n_samples)
+            accent_results = []
+            for i, start in enumerate(sorted(offsets)):
+                clip_wav = td / f"clip_{i}.wav"
+                extract_wav(audio_file, clip_wav, start=start)
+                acc, conf = classify_accent(clip_wav)
+                accent_results.append({
+                    "clip": i + 1,
+                    "time_range": f"{sec_to_hms(start)} - {sec_to_hms(start + 8)}",
+                    "accent": acc,
+                    "confidence": round(conf, 1),
+                })
+            # 4) Aggregate results: majority label, mean confidence of its clips
+            labels = [r["accent"] for r in accent_results]
+            most_common_accent, count = Counter(labels).most_common(1)[0]
+            confs = [r["confidence"] for r in accent_results if r["accent"] == most_common_accent]
+            avg_conf = sum(confs) / len(confs)
+            eng_conf = calculate_english_confidence(lang, lang_conf, avg_conf)
+            analysed = len(accent_results)
+            return {
+                "is_english_speaker": True,
+                "detected_language": "English",
+                "language_confidence": round(lang_conf, 1),
+                "accent_classification": most_common_accent,
+                "accent_confidence": round(avg_conf, 1),
+                "english_confidence_score": round(eng_conf, 1),
+                "samples_analyzed": analysed,
+                # Denominator is the number of clips actually classified,
+                # which can be lower than the number requested.
+                "consensus": f"{count}/{analysed} samples",
+                "detailed_results": accent_results,
+                "summary": (
+                    f"English speaker detected with {most_common_accent} accent "
+                    f"(confidence: {eng_conf:.1f}%)"
+                )
+            }
+        except Exception as e:
+            # UI boundary: report any failure as a result instead of raising.
+            return {"error": f"Processing failed: {e}"}
+# ─────────────── Gradio UI ───────────────
+def app():
+    """Build and return the Gradio Blocks interface for the accent detector.
+
+    Layout: an intro text, a row with the inputs (URL box, sample-count
+    slider, analyze button) next to a JSON result panel, then clickable
+    example URLs. The button runs ``analyse_accent`` on the two inputs.
+    """
+    intro_text = (
+        "# 🎙️ English Accent Detector\n"
+        "**Analyze speaker's accent from video URLs**\n\n"
+        "This tool:\n"
+        "1. Accepts public video URLs (YouTube, Loom, direct MP4 links)\n"
+        "2. Extracts audio from the video\n"
+        "3. Analyzes if the speaker is an English language candidate\n"
+        "4. Classifies the accent type and provides confidence scores\n"
+    )
+    example_rows = [
+        ["https://www.youtube.com/watch?v=dQw4w9WgXcQ", 4],
+        ["https://www.youtube.com/shorts/VO6n9GTzSqU", 4],
+    ]
+    with gr.Blocks(title="English Accent Detector") as demo:
+        gr.Markdown(intro_text)
+        with gr.Row():
+            # Left column: inputs and the trigger button.
+            with gr.Column():
+                url_input = gr.Text(
+                    label="Video URL",
+                    placeholder="Enter public video URL (YouTube, Loom, etc.)",
+                    lines=1,
+                )
+                samples_input = gr.Slider(
+                    minimum=1,
+                    maximum=10,
+                    value=4,
+                    step=1,
+                    label="Number of audio samples to analyze",
+                    info="More samples = more accurate but slower",
+                )
+                analyze_btn = gr.Button("🔍 Analyze Accent", variant="primary")
+            # Right column: the analysis result rendered as JSON.
+            with gr.Column():
+                result_output = gr.JSON(label="Analysis Results")
+        pipeline_inputs = [url_input, samples_input]
+        gr.Markdown("### Example URLs to try:")
+        gr.Examples(
+            examples=example_rows,
+            inputs=pipeline_inputs,
+            label="Click to load example",
+        )
+        analyze_btn.click(
+            fn=analyse_accent,
+            inputs=pipeline_inputs,
+            outputs=result_output,
+        )
+    return demo
+if __name__ == "__main__":
+    # Command-line entry point: parse the port, build the UI, start the server.
+    cli_parser = argparse.ArgumentParser(description="English Accent Detector")
+    cli_parser.add_argument("--port", type=int, default=7860, help="Port to run the server on")
+    cli_options = cli_parser.parse_args()
+    demo = app()
+    # On Hugging Face Spaces, a public URL is provided automatically
+    demo.launch(server_port=cli_options.port)