Spaces:

varunkul
/

Voice-guard

Sleeping

File size: 1,853 Bytes

6a6d12b


import io
import os
import numpy as np
import soundfile as sf
import librosa

# Default sample rate (Hz) used by the audio helpers in this module.
TARGET_SR = 16000

def load_audio(file_or_bytes, target_sr: int = TARGET_SR):
    """Load audio as a mono, peak-normalized float32 signal at ``target_sr``.

    Args:
        file_or_bytes: A filesystem path (``str`` / ``os.PathLike``), a
            bytes-like object holding an encoded audio file, or an open
            binary file-like object (anything exposing ``read``).
        target_sr: Sample rate in Hz that the returned signal is resampled to.

    Returns:
        A ``(y, sr)`` tuple. ``y`` is a 1-D ``float32`` array divided by its
        largest absolute sample; it is left unscaled when that peak is not
        positive (silence) or when the signal has no samples. ``sr`` is the
        sample rate of ``y``.
    """
    if isinstance(file_or_bytes, (str, os.PathLike)):
        y, sr = librosa.load(file_or_bytes, sr=target_sr, mono=True)
    else:
        # soundfile reads from file-like objects directly; only raw
        # bytes-like payloads need to be wrapped in an in-memory stream.
        if hasattr(file_or_bytes, "read"):
            stream = file_or_bytes
        else:
            stream = io.BytesIO(file_or_bytes)
        y, sr0 = sf.read(stream)
        if y.ndim > 1:
            # Down-mix multi-channel audio by averaging across channels.
            y = y.mean(axis=1)
        # Nothing to resample when the decoded signal has no samples.
        if sr0 != target_sr and y.size:
            y = librosa.resample(y, orig_sr=sr0, target_sr=target_sr)
        sr = target_sr

    # Peak-normalize. np.max raises on a zero-size array, so guard the empty
    # case explicitly and compute the peak only once.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y = y / peak
    return y.astype(np.float32), sr

def pad_or_trim(y: np.ndarray, duration_s: float = 3.0, sr: int = TARGET_SR):
    """Force ``y`` to a fixed length of ``int(duration_s * sr)`` samples.

    Longer signals are cut at the end, shorter ones are zero-padded at the
    end, and a signal that already has the target length is returned as is.
    """
    target_len = int(duration_s * sr)
    current_len = len(y)

    if current_len > target_len:
        # Too long: keep only the leading samples.
        return y[:target_len]
    if current_len < target_len:
        # Too short: append trailing zeros up to the target length.
        return np.pad(y, (0, target_len - current_len))
    return y

def logmel(y: np.ndarray, sr: int = TARGET_SR, n_mels: int = 64, *,
           n_fft: int = 1024, hop_length: int = 256, fmin: float = 20):
    """Compute a log-compressed mel spectrogram of ``y``.

    Args:
        y: Audio signal.
        sr: Sample rate of ``y`` in Hz; the mel filter bank tops out at
            ``sr // 2``.
        n_mels: Number of mel bands.
        n_fft: FFT window size passed to ``librosa.feature.melspectrogram``.
        hop_length: Hop between successive frames, in samples.
        fmin: Lowest frequency (Hz) covered by the mel filter bank.

    Returns:
        ``float32`` array of shape ``(n_mels, T)`` holding ``log1p`` of the
        mel spectrogram.
    """
    mel_spec = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        fmin=fmin,
        fmax=sr // 2,
    )
    # log1p keeps zero-energy bins finite (log1p(0) == 0).
    return np.log1p(mel_spec).astype(np.float32)  # (n_mels, T)

def heuristic_features(y: np.ndarray, sr: int = TARGET_SR):
    """Summarize a signal as a small float32 vector for a heuristic classifier.

    The vector holds the frame-averaged zero-crossing rate, spectral
    centroid, spectral flatness, spectral rolloff and RMS energy (in that
    order), followed by the per-coefficient means of 13 MFCCs.
    """
    # Frame-level descriptors; each is reduced to a single mean below.
    frame_zcr = librosa.feature.zero_crossing_rate(y, hop_length=256)
    frame_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    frame_flatness = librosa.feature.spectral_flatness(y=y)
    frame_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    frame_rms = librosa.feature.rms(y=y)
    frame_mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)

    scalar_summaries = [
        frames.mean()
        for frames in (
            frame_zcr,
            frame_centroid,
            frame_flatness,
            frame_rolloff,
            frame_rms,
        )
    ]
    # One mean per MFCC coefficient, averaged over time frames.
    mfcc_means = frame_mfcc.mean(axis=1)

    return np.array([*scalar_summaries, *mfcc_means], dtype=np.float32)