Spaces:
Sleeping
Sleeping
File size: 1,853 Bytes
6a6d12b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
import io
import os
import numpy as np
import soundfile as sf
import librosa
# Default sampling rate used by the functions in this module whenever the
# caller does not pass an explicit `sr` / `target_sr`.
TARGET_SR = 16000
def load_audio(file_or_bytes, target_sr: int = TARGET_SR):
    """Load audio as a mono, peak-normalized float32 signal.

    Args:
        file_or_bytes: A filesystem path (``str`` / ``os.PathLike``), a
            binary file-like object exposing ``read``, or a bytes-like
            object holding an encoded audio file.
        target_sr: Sampling rate the signal is resampled to.

    Returns:
        Tuple ``(y, sr)``. ``y`` is a ``float32`` array scaled so that its
        largest absolute sample is 1.0; silent or empty signals are
        returned unscaled. ``sr`` is the sampling rate of ``y``.
    """
    if isinstance(file_or_bytes, (str, os.PathLike)):
        # librosa handles decoding, downmixing and resampling in one call.
        y, sr = librosa.load(file_or_bytes, sr=target_sr, mono=True)
    else:
        # Already-open streams are handed to soundfile as-is; raw bytes are
        # wrapped so soundfile can seek through them.
        if hasattr(file_or_bytes, "read"):
            stream = file_or_bytes
        else:
            stream = io.BytesIO(file_or_bytes)
        y, sr0 = sf.read(stream)
        if y.ndim > 1:
            # Multi-channel data: average the channels down to mono.
            y = y.mean(axis=1)
        if sr0 != target_sr:
            y = librosa.resample(y, orig_sr=sr0, target_sr=target_sr)
        sr = target_sr

    # Peak-normalize. np.max raises on zero-size input, so treat an empty
    # signal like a silent one and skip the division.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y = y / peak
    return y.astype(np.float32), sr
def pad_or_trim(y: np.ndarray, duration_s: float = 3.0, sr: int = TARGET_SR):
    """Force ``y`` to a fixed length of ``int(duration_s * sr)`` samples.

    Shorter inputs are zero-padded at the end, longer inputs are cut off
    at the end, and inputs that already have the target length are
    returned unchanged.
    """
    target_len = int(duration_s * sr)
    shortfall = target_len - len(y)

    if shortfall > 0:
        # Too short: append trailing zeros.
        return np.pad(y, (0, shortfall))
    if shortfall < 0:
        # Too long: keep only the leading samples.
        return y[:target_len]
    return y
def logmel(
    y: np.ndarray,
    sr: int = TARGET_SR,
    n_mels: int = 64,
    *,
    n_fft: int = 1024,
    hop_length: int = 256,
    fmin: float = 20,
    fmax=None,
):
    """Compute a log-compressed mel spectrogram of ``y``.

    Args:
        y: Audio signal.
        sr: Sampling rate of ``y``.
        n_mels: Number of mel bands.
        n_fft: FFT window size passed to librosa.
        hop_length: Hop between successive frames, in samples.
        fmin: Lowest frequency of the mel filter bank.
        fmax: Highest frequency of the mel filter bank; ``None`` selects
            ``sr // 2``.

    Returns:
        ``float32`` array of ``log1p(mel)`` values, shape ``(n_mels, T)``.
    """
    if fmax is None:
        fmax = sr // 2
    mel = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
    )
    # log1p keeps zero-energy bins finite (log1p(0) == 0).
    return np.log1p(mel).astype(np.float32)
def heuristic_features(y: np.ndarray, sr: int = TARGET_SR):
    """Summarize ``y`` as a small float32 feature vector.

    The vector holds, in order: the mean zero-crossing rate, spectral
    centroid, spectral flatness, spectral roll-off and RMS, followed by
    the per-coefficient means of 13 MFCCs.
    """
    scalar_means = [
        librosa.feature.zero_crossing_rate(y, hop_length=256).mean(),
        librosa.feature.spectral_centroid(y=y, sr=sr).mean(),
        librosa.feature.spectral_flatness(y=y).mean(),
        librosa.feature.spectral_rolloff(y=y, sr=sr).mean(),
        librosa.feature.rms(y=y).mean(),
    ]
    # Average each MFCC coefficient across frames.
    mfcc_means = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).mean(axis=1)
    return np.array(scalar_means + list(mfcc_means), dtype=np.float32)
|