Spaces:

Diggz10
/

voiceclear

Running

App Files Files Community

Diggz10 committed on Aug 21

Commit

b67ceda

verified ·

1 Parent(s): 4497e6e

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -28

app.py CHANGED Viewed

@@ -3,14 +3,34 @@ import os
 import tempfile
 from typing import Tuple, Optional
 import gradio as gr
 import numpy as np
 import soundfile as sf
 import torch
 import torchaudio
-from fastapi import FastAPI, File, UploadFile, Query, Response
 from fastapi.responses import StreamingResponse
-from speechbrain.pretrained import SpectralMaskEnhancement
 # -----------------------------
 # Model: SpeechBrain MetricGAN+
@@ -22,7 +42,6 @@ _DEVICE = "cpu"
 def _get_enhancer() -> SpectralMaskEnhancement:
     global _ENHANCER
     if _ENHANCER is None:
-        # Downloads once and caches in the Space
         _ENHANCER = SpectralMaskEnhancement.from_hparams(
             source="speechbrain/metricgan-plus-voicebank",
             savedir="pretrained/metricgan_plus_voicebank",
@@ -35,16 +54,13 @@ def _get_enhancer() -> SpectralMaskEnhancement:
 # Audio helpers
 # -----------------------------
 def _to_mono(wav: np.ndarray) -> np.ndarray:
-    """Ensure mono shape [T]."""
     if wav.ndim == 1:
         return wav.astype(np.float32)
-    # shape [T, C] or [C, T]
     if wav.shape[0] < wav.shape[1]:
-        # likely [T, C]
         return wav.mean(axis=1).astype(np.float32)
-    else:
-        # likely [C, T]
-        return wav.mean(axis=0).astype(np.float32)
 def _resample_torch(wav: torch.Tensor, sr_in: int, sr_out: int) -> torch.Tensor:
@@ -56,21 +72,19 @@ def _resample_torch(wav: torch.Tensor, sr_in: int, sr_out: int) -> torch.Tensor:
 def _highpass(wav: torch.Tensor, sr: int, cutoff_hz: float) -> torch.Tensor:
     if cutoff_hz is None or cutoff_hz <= 0:
         return wav
-    # 2nd-order Butterworth-ish highpass via biquad
     return torchaudio.functional.highpass_biquad(wav, sr, cutoff_hz)
 def _presence_boost(wav: torch.Tensor, sr: int, gain_db: float) -> torch.Tensor:
-    """Simple presence (peaking) EQ around 4.5 kHz."""
     if abs(gain_db) < 1e-6:
         return wav
-    center = 4500.0  # presence band
-    q = 0.707       # wide-ish
     return torchaudio.functional.equalizer_biquad(wav, sr, center, q, gain_db)
 def _limit_peak(wav: torch.Tensor, target_dbfs: float = -1.0) -> torch.Tensor:
-    """Peak-normalize to target dBFS (default -1 dB)."""
     target_amp = 10.0 ** (target_dbfs / 20.0)
     peak = torch.max(torch.abs(wav)).item()
     if peak > 0:
@@ -98,35 +112,33 @@ def _enhance_numpy_audio(
     enh = _get_enhancer()
     wav_16k = _resample_torch(wav_t, sr_in, 16000)
-    # Enhance via file path API for maximum compatibility
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_in:
         sf.write(tmp_in.name, wav_16k.squeeze(0).numpy(), 16000, subtype="PCM_16")
         tmp_in.flush()
-        # Enhance; returns torch.Tensor [1, T]
-        clean = enh.enhance_file(tmp_in.name)
     try:
         os.remove(tmp_in.name)
     except Exception:
         pass
-    # Optional polish: high-pass & presence EQ
     clean = _highpass(clean, 16000, lowcut_hz)
     clean = _presence_boost(clean, 16000, presence_db)
-    # Peak limiting to avoid inter-sample clip
     clean = _limit_peak(clean, target_dbfs=-1.0)
-    # Resample back
     sr_out = sr_in if (out_sr is None or out_sr <= 0) else int(out_sr)
-    clean_out = _resample_torch(clean, 16000, sr_out).squeeze(0).numpy().astype(
-        np.float32
     )
     return sr_out, clean_out
 def _wav_bytes(sr: int, mono_f32: np.ndarray) -> bytes:
-    """Encode a mono float32 array as 16-bit PCM WAV into bytes."""
     buf = io.BytesIO()
     sf.write(buf, mono_f32, sr, subtype="PCM_16", format="WAV")
     buf.seek(0)
@@ -136,7 +148,7 @@ def _wav_bytes(sr: int, mono_f32: np.ndarray) -> bytes:
 # -----------------------------
 # FastAPI app with raw endpoint
 # -----------------------------
-app = FastAPI(title="Voice Clarity Booster (MetricGAN+)", version="1.0.0")
 @app.post("/enhance")
@@ -148,7 +160,6 @@ async def enhance_endpoint(
 ):
     """Raw REST endpoint. Returns enhanced audio as audio/wav bytes."""
     data = await file.read()
-    # Decode with soundfile
     wav_np, sr_in = sf.read(io.BytesIO(data), always_2d=False, dtype="float32")
     sr_out, enhanced = _enhance_numpy_audio(
         (sr_in, wav_np),
@@ -157,7 +168,9 @@ async def enhance_endpoint(
         out_sr=output_sr if output_sr > 0 else None,
     )
     wav_bytes = _wav_bytes(sr_out, enhanced)
-    headers = {"Content-Disposition": f'attachment; filename="{os.path.splitext(file.filename or "audio")[0]}_enhanced.wav"'}
     return StreamingResponse(io.BytesIO(wav_bytes), media_type="audio/wav", headers=headers)
@@ -175,7 +188,6 @@ def gradio_enhance(
     out_sr = None
     if output_sr in {"44100", "48000"}:
         out_sr = int(output_sr)
-    # "Original" -> None
     sr_out, enhanced = _enhance_numpy_audio(
         audio, presence_db=float(presence_db), lowcut_hz=float(lowcut_hz), out_sr=out_sr
     )

 import tempfile
 from typing import Tuple, Optional
+# ---- tame noisy deprecation warnings (optional but nice) ----
+import warnings
+warnings.filterwarnings(
+    "ignore",
+    message=".*torchaudio._backend.list_audio_backends has been deprecated.*",
+)
+warnings.filterwarnings(
+    "ignore",
+    module=r"speechbrain\..*",
+    category=UserWarning,
+)
 import gradio as gr
 import numpy as np
 import soundfile as sf
 import torch
 import torchaudio
+from fastapi import FastAPI, File, UploadFile, Query
 from fastapi.responses import StreamingResponse
+# ---- SpeechBrain import: prefer new API, fall back if older version ----
+try:
+    # SpeechBrain >= 1.0
+    from speechbrain.inference import SpectralMaskEnhancement
+except Exception:  # pragma: no cover
+    # Older SpeechBrain (<1.0)
+    from speechbrain.pretrained import SpectralMaskEnhancement  # type: ignore
 # -----------------------------
 # Model: SpeechBrain MetricGAN+
 def _get_enhancer() -> SpectralMaskEnhancement:
     global _ENHANCER
     if _ENHANCER is None:
         _ENHANCER = SpectralMaskEnhancement.from_hparams(
             source="speechbrain/metricgan-plus-voicebank",
             savedir="pretrained/metricgan_plus_voicebank",
 # Audio helpers
 # -----------------------------
 def _to_mono(wav: np.ndarray) -> np.ndarray:
+    """Ensure mono shape [T] float32."""
     if wav.ndim == 1:
         return wav.astype(np.float32)
+    # [T, C] or [C, T]
     if wav.shape[0] < wav.shape[1]:
         return wav.mean(axis=1).astype(np.float32)
+    return wav.mean(axis=0).astype(np.float32)
 def _resample_torch(wav: torch.Tensor, sr_in: int, sr_out: int) -> torch.Tensor:
 def _highpass(wav: torch.Tensor, sr: int, cutoff_hz: float) -> torch.Tensor:
     if cutoff_hz is None or cutoff_hz <= 0:
         return wav
     return torchaudio.functional.highpass_biquad(wav, sr, cutoff_hz)
 def _presence_boost(wav: torch.Tensor, sr: int, gain_db: float) -> torch.Tensor:
+    """Simple presence EQ around ~4.5 kHz."""
     if abs(gain_db) < 1e-6:
         return wav
+    center = 4500.0
+    q = 0.707
     return torchaudio.functional.equalizer_biquad(wav, sr, center, q, gain_db)
 def _limit_peak(wav: torch.Tensor, target_dbfs: float = -1.0) -> torch.Tensor:
     target_amp = 10.0 ** (target_dbfs / 20.0)
     peak = torch.max(torch.abs(wav)).item()
     if peak > 0:
     enh = _get_enhancer()
     wav_16k = _resample_torch(wav_t, sr_in, 16000)
+    # Enhance via file path API for broad compatibility
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_in:
         sf.write(tmp_in.name, wav_16k.squeeze(0).numpy(), 16000, subtype="PCM_16")
         tmp_in.flush()
+        clean = enh.enhance_file(tmp_in.name)  # torch.Tensor [1, T]
     try:
         os.remove(tmp_in.name)
     except Exception:
         pass
+    # Optional polish: high-pass & presence EQ + peak limit
     clean = _highpass(clean, 16000, lowcut_hz)
     clean = _presence_boost(clean, 16000, presence_db)
     clean = _limit_peak(clean, target_dbfs=-1.0)
+    # Resample to requested output rate (or original)
     sr_out = sr_in if (out_sr is None or out_sr <= 0) else int(out_sr)
+    clean_out = (
+        _resample_torch(clean, 16000, sr_out).squeeze(0).numpy().astype(np.float32)
     )
     return sr_out, clean_out
 def _wav_bytes(sr: int, mono_f32: np.ndarray) -> bytes:
+    """Encode mono float32 array as 16-bit PCM WAV bytes."""
     buf = io.BytesIO()
     sf.write(buf, mono_f32, sr, subtype="PCM_16", format="WAV")
     buf.seek(0)
 # -----------------------------
 # FastAPI app with raw endpoint
 # -----------------------------
+app = FastAPI(title="Voice Clarity Booster (MetricGAN+)", version="1.0.1")
 @app.post("/enhance")
 ):
     """Raw REST endpoint. Returns enhanced audio as audio/wav bytes."""
     data = await file.read()
     wav_np, sr_in = sf.read(io.BytesIO(data), always_2d=False, dtype="float32")
     sr_out, enhanced = _enhance_numpy_audio(
         (sr_in, wav_np),
         out_sr=output_sr if output_sr > 0 else None,
     )
     wav_bytes = _wav_bytes(sr_out, enhanced)
+    headers = {
+        "Content-Disposition": f'attachment; filename="{os.path.splitext(file.filename or "audio")[0]}_enhanced.wav"'
+    }
     return StreamingResponse(io.BytesIO(wav_bytes), media_type="audio/wav", headers=headers)
     out_sr = None
     if output_sr in {"44100", "48000"}:
         out_sr = int(output_sr)
     sr_out, enhanced = _enhance_numpy_audio(
         audio, presence_db=float(presence_db), lowcut_hz=float(lowcut_hz), out_sr=out_sr
     )