import io import os import numpy as np import soundfile as sf import librosa TARGET_SR = 16000 def load_audio(file_or_bytes, target_sr: int = TARGET_SR): """Load audio from path or bytes, mono, target SR.""" if isinstance(file_or_bytes, (str, os.PathLike)): y, sr = librosa.load(file_or_bytes, sr=target_sr, mono=True) else: # assume bytes-like y, sr0 = sf.read(io.BytesIO(file_or_bytes)) if y.ndim > 1: y = y.mean(axis=1) if sr0 != target_sr: y = librosa.resample(y, orig_sr=sr0, target_sr=target_sr) sr = target_sr # normalize if np.max(np.abs(y)) > 0: y = y / np.max(np.abs(y)) return y.astype(np.float32), sr def pad_or_trim(y: np.ndarray, duration_s: float = 3.0, sr: int = TARGET_SR): """Pad with zeros or trim to a fixed duration (seconds).""" n = int(duration_s * sr) if len(y) < n: pad = n - len(y) y = np.pad(y, (0, pad)) elif len(y) > n: y = y[:n] return y def logmel(y: np.ndarray, sr: int = TARGET_SR, n_mels: int = 64): S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024, hop_length=256, n_mels=n_mels, fmin=20, fmax=sr//2) S = np.log1p(S).astype(np.float32) return S # (n_mels, T) def heuristic_features(y: np.ndarray, sr: int = TARGET_SR): """Lightweight features for a quick heuristic classifier.""" zcr = librosa.feature.zero_crossing_rate(y, hop_length=256).mean() cent = librosa.feature.spectral_centroid(y=y, sr=sr).mean() flat = librosa.feature.spectral_flatness(y=y).mean() roll = librosa.feature.spectral_rolloff(y=y, sr=sr).mean() rms = librosa.feature.rms(y=y).mean() mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).mean(axis=1) feats = np.array([zcr, cent, flat, roll, rms, *mfcc], dtype=np.float32) return feats