Spaces:

varunkul
/

Voice-guard

Running

App Files Files Community

Voice-guard / app /utils /audio.py

varunkul

Upload 4 files

6a6d12b verified about 1 month ago

raw

history blame contribute delete

1.85 kB


	import io
	import os
	import numpy as np
	import soundfile as sf
	import librosa

	# Default sampling rate; used as the default `sr` / `target_sr` argument of the helpers in this module.
	TARGET_SR = 16000

	def load_audio(file_or_bytes, target_sr: int = TARGET_SR):
	    """Load audio as a mono, peak-normalized float32 signal.

	    Args:
	        file_or_bytes: A filesystem path (``str`` / ``os.PathLike``), a
	            bytes-like object holding an encoded audio file, or a
	            file-like object exposing ``read`` (e.g. an open binary file).
	        target_sr: Sampling rate the signal is resampled to.

	    Returns:
	        A ``(y, sr)`` tuple: ``y`` is a float32 array scaled so that its
	        largest absolute sample is 1 (silent or empty signals are returned
	        unscaled), and ``sr`` is the sampling rate of ``y``.
	    """
	    if isinstance(file_or_bytes, (str, os.PathLike)):
	        y, sr = librosa.load(file_or_bytes, sr=target_sr, mono=True)
	    else:
	        # soundfile can read from a file-like object directly; only raw
	        # bytes-like input needs to be wrapped in an in-memory stream.
	        if hasattr(file_or_bytes, "read"):
	            stream = file_or_bytes
	        else:
	            stream = io.BytesIO(file_or_bytes)
	        y, sr0 = sf.read(stream)
	        if y.ndim > 1:
	            # Multi-channel data: average the channels down to mono.
	            y = y.mean(axis=1)
	        if sr0 != target_sr:
	            y = librosa.resample(y, orig_sr=sr0, target_sr=target_sr)
	        sr = target_sr
	    # Peak-normalize. np.max raises on a zero-length array, so treat an
	    # empty signal as having peak 0 and skip the division.
	    peak = np.max(np.abs(y)) if y.size else 0.0
	    if peak > 0:
	        y = y / peak
	    return y.astype(np.float32), sr

	def pad_or_trim(y: np.ndarray, duration_s: float = 3.0, sr: int = TARGET_SR):
	    """Force ``y`` to a fixed length of ``int(duration_s * sr)`` samples.

	    Inputs that are too short are zero-padded at the end, inputs that are
	    too long are cut off at the end, and inputs that already have the
	    requested length are returned as-is.
	    """
	    target_len = int(duration_s * sr)
	    shortfall = target_len - len(y)
	    if shortfall > 0:
	        # Too short: append the missing samples as zeros.
	        return np.pad(y, (0, shortfall))
	    if shortfall < 0:
	        # Too long: keep only the leading samples.
	        return y[:target_len]
	    return y

	def logmel(y: np.ndarray, sr: int = TARGET_SR, n_mels: int = 64):
	    """Return the ``log1p``-compressed mel spectrogram of ``y`` as float32.

	    The spectrogram uses a 1024-point FFT, a hop of 256 samples, and
	    ``n_mels`` bands spanning 20 Hz up to ``sr // 2``.
	    """
	    mel_settings = {
	        "n_fft": 1024,
	        "hop_length": 256,
	        "n_mels": n_mels,
	        "fmin": 20,
	        "fmax": sr // 2,
	    }
	    mel = librosa.feature.melspectrogram(y=y, sr=sr, **mel_settings)
	    log_mel = np.log1p(mel)
	    return log_mel.astype(np.float32)  # (n_mels, T)

	def heuristic_features(y: np.ndarray, sr: int = TARGET_SR):
	    """Summarize ``y`` as a small float32 vector for a quick heuristic classifier.

	    The vector holds, in order: mean zero-crossing rate, mean spectral
	    centroid, mean spectral flatness, mean spectral rolloff, mean RMS, and
	    the per-coefficient means of 13 MFCCs.
	    """
	    feat = librosa.feature
	    # Scalar descriptors: each one is averaged over all values librosa returns.
	    scalars = [
	        feat.zero_crossing_rate(y, hop_length=256).mean(),
	        feat.spectral_centroid(y=y, sr=sr).mean(),
	        feat.spectral_flatness(y=y).mean(),
	        feat.spectral_rolloff(y=y, sr=sr).mean(),
	        feat.rms(y=y).mean(),
	    ]
	    # One mean per MFCC coefficient (averaged along axis 1).
	    mfcc_means = feat.mfcc(y=y, sr=sr, n_mfcc=13).mean(axis=1)
	    return np.array([*scalars, *mfcc_means], dtype=np.float32)