Update app.py
app.py CHANGED
@@ -1,5 +1,8 @@
-# app.py — Voice Clarity Booster with
-# Modes: MetricGAN+ (denoise)
+# app.py — Voice Clarity Booster with clear A/B comparison & loudness match
+# - Modes: MetricGAN+ (denoise), SepFormer (dereverb+denoise), Bypass
+# - Dry/Wet, Presence, Low-cut
+# - Loudness Match (optional)
+# - Outputs: Enhanced, A/B alternating (2s O/E flip), Delta (Original−Enhanced), Metrics
 
 import os
 import io
@@ -24,6 +27,13 @@ import soundfile as sf
 import torch
 import torchaudio
 
+# Optional: pyloudnorm for true LUFS matching; fallback to RMS if not available
+try:
+    import pyloudnorm as pyln
+    _HAVE_PYLN = True
+except Exception:
+    _HAVE_PYLN = False
+
 # Prefer new SpeechBrain API; fall back for older versions
 try:
     from speechbrain.inference import SpectralMaskEnhancement
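
Note: the new LUFS branch only activates when pyloudnorm imports successfully at runtime; on Spaces that means listing it as a dependency. A likely requirements.txt addition (assumed, not shown in this commit):

    pyloudnorm   # enables the LUFS path; without it the app silently falls back to RMS matching
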
@@ -31,7 +41,6 @@ except Exception: # < 1.0
     from speechbrain.pretrained import SpectralMaskEnhancement  # type: ignore
 
 try:
-    # SepFormer enhancement model (WHAMR) via separation interface
     from speechbrain.inference import SepformerSeparation
 except Exception:
     from speechbrain.pretrained import SepformerSeparation  # type: ignore
@@ -87,12 +96,11 @@ def _to_mono(wav: np.ndarray) -> np.ndarray:
     if t <= 8:  # [C, T]
         return wav.mean(axis=0).astype(np.float32)
     return wav.mean(axis=1).astype(np.float32)
-    # higher dims: fall back
     return wav.reshape(-1).astype(np.float32)
 
 
-def _sanitize(
-    return np.nan_to_num(
+def _sanitize(x: np.ndarray) -> np.ndarray:
+    return np.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
 
 
 def _resample_torch(wav: torch.Tensor, sr_in: int, sr_out: int) -> torch.Tensor:
@@ -124,36 +132,79 @@ def _limit_peak(wav: torch.Tensor, target_dbfs: float = -1.0) -> torch.Tensor:
 
 
 def _align_lengths(a: np.ndarray, b: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
-    """Pad/crop to same length so we can mix dry/wet safely."""
     n = min(len(a), len(b))
     return a[:n], b[:n]
 
 
+def _loudness_match_to_ref(ref: np.ndarray, cand: np.ndarray, sr: int) -> Tuple[np.ndarray, str]:
+    """
+    Match cand loudness to ref, returning adjusted signal and a short description.
+    Uses LUFS (pyloudnorm) if available, else RMS.
+    """
+    if len(ref) < sr // 10 or len(cand) < sr // 10:
+        return cand, "skipped (clip too short)"
+
+    if _HAVE_PYLN:
+        try:
+            meter = pyln.Meter(sr)  # EBUR128 meter
+            l_ref = meter.integrated_loudness(ref.astype(np.float64))
+            l_cand = meter.integrated_loudness(cand.astype(np.float64))
+            gain_db = l_ref - l_cand
+            cand_adj = cand * (10.0 ** (gain_db / 20.0))
+            return cand_adj.astype(np.float32), f"LUFS matched (Δ {gain_db:+.2f} dB)"
+        except Exception:
+            pass
+
+    # Fallback: RMS match
+    eps = 1e-9
+    rms_ref = np.sqrt(np.mean(ref**2) + eps)
+    rms_cand = np.sqrt(np.mean(cand**2) + eps)
+    gain = rms_ref / (rms_cand + eps)
+    cand_adj = cand * gain
+    gain_db = 20.0 * np.log10(gain + eps)
+    return cand_adj.astype(np.float32), f"RMS matched (Δ {gain_db:+.2f} dB)"
+
+
+def _make_ab_alternating(orig: np.ndarray, enh: np.ndarray, sr: int, seg_sec: float = 2.0) -> np.ndarray:
+    """
+    Build an A/B track that alternates: seg of Original, seg of Enhanced, repeated.
+    """
+    seg_n = max(1, int(seg_sec * sr))
+    orig, enh = _align_lengths(orig, enh)
+    n = len(orig)
+    out = []
+    pos = 0
+    flag = True  # True=orig, False=enh
+    while pos < n:
+        end = min(pos + seg_n, n)
+        out.append(orig[pos:end] if flag else enh[pos:end])
+        pos = end
+        flag = not flag
+    return np.concatenate(out, axis=0).astype(np.float32)
+
+
 # -----------------------------
 # Core pipeline
 # -----------------------------
-def _run_metricgan(
+def _run_metricgan(path_16k: str) -> torch.Tensor:
     enh = _get_metricgan()
     with torch.no_grad():
-        out = enh.enhance_file(
+        out = enh.enhance_file(path_16k)  # [1, T]
     return out
 
 
-def _run_sepformer(
+def _run_sepformer(path_16k: str) -> torch.Tensor:
     sep = _get_sepformer()
     with torch.no_grad():
-
-        out = sep.separate_file(path=clean_16k_path)
-    # Normalize shape to [1, T]
+        out = sep.separate_file(path=path_16k)
     if isinstance(out, torch.Tensor):
         if out.dim() == 1:
             out = out.unsqueeze(0)
         elif out.dim() == 2 and out.shape[0] > 1:
-            out = out[:1, :]
+            out = out[:1, :]
         return out
-    # If older API returns numpy or list, convert:
     if hasattr(out, "numpy"):
-        t = torch.from_numpy(out)
+        t = torch.from_numpy(out.numpy())
         if t.dim() == 1:
             t = t.unsqueeze(0)
         elif t.dim() == 2 and t.shape[0] > 1:
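
A quick sanity check of the two helpers added above; a minimal sketch, assuming the trailing launch call in app.py is guarded or commented out locally so the module can be imported:

    # sketch: exercise _loudness_match_to_ref / _make_ab_alternating on synthetic tones
    import numpy as np
    from app import _loudness_match_to_ref, _make_ab_alternating

    sr = 16000
    t = np.arange(sr * 4) / sr                                      # 4 s clip
    ref = (0.20 * np.sin(2 * np.pi * 220 * t)).astype(np.float32)   # stand-in "original"
    cand = (0.05 * np.sin(2 * np.pi * 220 * t)).astype(np.float32)  # quieter "enhanced"

    matched, how = _loudness_match_to_ref(ref, cand, sr)
    print(how)  # without pyloudnorm: "RMS matched (Δ +12.04 dB)"; with it, the LUFS variant
    print(np.sqrt(np.mean(matched**2)), np.sqrt(np.mean(ref**2)))   # RMS levels now ~equal

    ab = _make_ab_alternating(ref, matched, sr, seg_sec=2.0)
    assert len(ab) == len(ref)  # alternates 2 s of original with 2 s of enhanced
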
@@ -174,17 +225,22 @@ def _enhance_numpy_audio(
     presence_db: float = 0.0,  # default 0 for safer tone
     lowcut_hz: float = 0.0,  # default 0 (off)
     out_sr: Optional[int] = None,
-
+    loudness_match: bool = True,
+) -> Tuple[int, np.ndarray, np.ndarray, str]:
     """
     Input: (sr, np.float32 [T] or [T,C])
-    Returns: (sr_out,
+    Returns: (sr_out, enhanced, delta, metrics_text)
+      - enhanced: final output (after dry/wet, polish, loudness match)
+      - delta: original - enhanced (at output SR & length-matched)
     """
     sr_in, wav_np = audio
     wav_mono = _sanitize(_to_mono(wav_np))
 
     # Guard: tiny input
     if wav_mono.size < 32:
-
+        sr_out = sr_in if sr_in else 16000
+        silence = np.zeros(int(sr_out * 1.0), dtype=np.float32)
+        return sr_out, silence, silence, "Input too short; returned silence."
 
     dry_t = torch.from_numpy(wav_mono).unsqueeze(0)  # [1, T @ sr_in]
     # Prepare 16k mono file for models
@@ -216,19 +272,38 @@ def _enhance_numpy_audio(
     # Resample both to output rate for mixing & export
     sr_out = sr_in if (out_sr is None or out_sr <= 0) else int(out_sr)
     proc_out = _resample_torch(proc, 16000, sr_out).squeeze(0).numpy().astype(np.float32)
-    dry_out
+    dry_out = _resample_torch(dry_t, sr_in, sr_out).squeeze(0).numpy().astype(np.float32)
 
     # Align and mix
     proc_out, dry_out = _align_lengths(proc_out, dry_out)
     dry_wet = float(np.clip(dry_wet, 0.0, 1.0))
-
-
-
-
-    if
-
+    enhanced = proc_out * dry_wet + dry_out * (1.0 - dry_wet)
+
+    # Loudness match enhanced back to original (optional)
+    loud_text = "off"
+    if loudness_match:
+        enhanced, loud_text = _loudness_match_to_ref(dry_out, enhanced, sr_out)
+
+    enhanced = _sanitize(enhanced)
+
+    # Delta (what changed)
+    delta = dry_out - enhanced
+    delta = _sanitize(delta)
+
+    # Basic metrics
+    eps = 1e-9
+    rms_dry = np.sqrt(np.mean(dry_out**2) + eps)
+    rms_enh = np.sqrt(np.mean(enhanced**2) + eps)
+    rms_delta = np.sqrt(np.mean(delta**2) + eps)
+    change_db = 20 * np.log10((rms_dry + eps) / (rms_delta + eps))
+    metrics = (
+        f"Mode: {mode} | Dry/Wet: {dry_wet*100:.0f}% | Presence: {presence_db:+.1f} dB | "
+        f"Low-cut: {lowcut_hz:.0f} Hz | Loudness match: {loud_text}\n"
+        f"Dur: {len(enhanced)/sr_out:.2f}s | Δ (original−enhanced) RMS: {20*np.log10(rms_delta+eps):+.2f} dBFS | "
+        f'Approx. "noise removed" ratio: {change_db:.2f} dB'
+    )
 
-    return sr_out,
+    return sr_out, enhanced, delta, metrics
 
 
 # -----------------------------
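
The change_db reported above is a signal-to-difference ratio (original vs. what processing removed or altered), so smaller values mean a more audible change. A standalone check of the arithmetic, plain NumPy:

    # the metric is 20*log10(RMS(original) / RMS(original - enhanced))
    import numpy as np

    rng = np.random.default_rng(0)
    orig = (0.1 * rng.standard_normal(16000)).astype(np.float32)
    enhanced = 0.5 * orig                  # pretend processing halved the signal
    delta = orig - enhanced                # delta == 0.5 * orig
    ratio_db = 20 * np.log10(np.sqrt(np.mean(orig**2)) / np.sqrt(np.mean(delta**2)))
    print(f"{ratio_db:.2f} dB")            # ~6.02 dB; gentler processing gives larger values
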
@@ -241,25 +316,33 @@ def gradio_enhance(
     presence_db: float,
     lowcut_hz: float,
     output_sr: str,
+    loudness_match: bool,
 ):
     if audio is None:
-        return None
+        return None, None, None, "No audio provided."
     out_sr = None
     if output_sr in {"44100", "48000"}:
         out_sr = int(output_sr)
-    sr_out, enhanced = _enhance_numpy_audio(
+    sr_out, enhanced, delta, metrics = _enhance_numpy_audio(
         audio,
         mode=mode,
         dry_wet=dry_wet_pct / 100.0,
         presence_db=float(presence_db),
         lowcut_hz=float(lowcut_hz),
         out_sr=out_sr,
+        loudness_match=bool(loudness_match),
     )
-
+    # Build A/B alternating track
+    sr_in, wav_np = audio
+    orig_mono = _sanitize(_to_mono(wav_np))
+    orig_at_out = _resample_torch(torch.from_numpy(orig_mono).unsqueeze(0), sr_in, sr_out).squeeze(0).numpy().astype(np.float32)
+    orig_at_out, enhanced = _align_lengths(orig_at_out, enhanced)
+    ab_alt = _make_ab_alternating(orig_at_out, enhanced, sr_out, seg_sec=2.0)
+    return (sr_out, enhanced), (sr_out, ab_alt), (sr_out, delta), metrics
 
 
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("## Voice Clarity Booster")
+    gr.Markdown("## Voice Clarity Booster — with A/B and Delta listening")
     with gr.Row():
         with gr.Column():
             in_audio = gr.Audio(
@@ -282,6 +365,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             lowcut = gr.Slider(
                 minimum=0, maximum=200, value=0, step=5, label="Low-Cut (Hz)"
             )
+            loudmatch = gr.Checkbox(value=True, label="Loudness-match enhanced to original")
             out_sr = gr.Radio(
                 choices=["Original", "44100", "48000"],
                 value="Original",
@@ -289,12 +373,15 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             )
             btn = gr.Button("Enhance")
         with gr.Column():
-            out_audio = gr.Audio(type="numpy", label="Enhanced", autoplay=True)
+            out_audio = gr.Audio(type="numpy", label="Enhanced (autoplay)", autoplay=True)
+            ab_audio = gr.Audio(type="numpy", label="A/B Alternating (2s O → 2s E)", autoplay=False)
+            delta_audio = gr.Audio(type="numpy", label="Delta: Original − Enhanced", autoplay=False)
+            metrics = gr.Markdown("")
 
     btn.click(
         gradio_enhance,
-        inputs=[in_audio, mode, dry_wet, presence, lowcut, out_sr],
-        outputs=[out_audio],
+        inputs=[in_audio, mode, dry_wet, presence, lowcut, out_sr, loudmatch],
+        outputs=[out_audio, ab_audio, delta_audio, metrics],
     )
 
 # Start server (Hugging Face Spaces expects this unguarded)
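
For an end-to-end check of the new four-output callback, a sketch under two assumptions: the mode Radio includes a "Bypass" choice (per the new header comment), and the unguarded launch call is disabled so app imports cleanly:

    # sketch: drive gradio_enhance directly with a synthetic clip
    import numpy as np
    from app import gradio_enhance

    sr = 16000
    t = np.arange(sr * 3) / sr
    tone = (0.1 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)

    enhanced, ab, delta, metrics = gradio_enhance(
        (sr, tone),  # audio as (sample_rate, float32 samples)
        "Bypass",    # assumed mode label
        100.0,       # dry/wet %
        0.0,         # presence dB
        0.0,         # low-cut Hz
        "Original",  # keep the input sample rate
        True,        # loudness match
    )
    print(metrics)
    print(enhanced[0], ab[0], delta[0])  # the three (sr, array) outputs share one rate
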