"""
Train DTLN model using Hugging Face datasets
Uses real speech and noise datasets for production-quality training
"""
import tensorflow as tf
import tensorflow_model_optimization as tfmot
import numpy as np
import argparse
from dtln_ethos_u55 import DTLN_Ethos_U55
import os
from datasets import load_dataset, Audio
class HuggingFaceAudioDataGenerator(tf.keras.utils.Sequence):
"""
Data generator using Hugging Face datasets
Loads clean speech and noise from HF Hub
"""
def __init__(
self,
clean_dataset_name="librispeech_asr",
noise_dataset_name="dns-challenge/dns-challenge-4",
clean_split="train.clean.100",
noise_split="train",
batch_size=16,
samples_per_epoch=1000,
frame_len=512,
frame_shift=128,
sampling_rate=16000,
snr_range=(0, 20),
shuffle=True,
cache_dir=None
):
"""
Args:
clean_dataset_name: HF dataset for clean speech (default: LibriSpeech)
noise_dataset_name: HF dataset for noise (default: DNS Challenge)
clean_split: Split to use from clean dataset
noise_split: Split to use from noise dataset
batch_size: Batch size for training
samples_per_epoch: Number of samples per epoch
frame_len: Frame length in samples
frame_shift: Frame shift in samples
sampling_rate: Target sampling rate
snr_range: Range of SNR for mixing (min, max) in dB
shuffle: Whether to shuffle data each epoch
cache_dir: Directory to cache datasets
"""
print(f"\n{'='*60}")
print("Initializing Hugging Face Dataset Generator")
print(f"{'='*60}")
self.batch_size = batch_size
self.samples_per_epoch = samples_per_epoch
self.frame_len = frame_len
self.frame_shift = frame_shift
self.sampling_rate = sampling_rate
self.snr_range = snr_range
self.shuffle = shuffle
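        # Note: `shuffle` is stored for API parity but currently has no
        # effect; the streaming datasets below are consumed sequentially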
# Segment length for training (1 second)
self.segment_len = sampling_rate
# Load datasets
print(f"\n1. Loading clean speech dataset: {clean_dataset_name}")
print(f" Split: {clean_split}")
try:
self.clean_dataset = load_dataset(
clean_dataset_name,
split=clean_split,
streaming=True, # Stream for large datasets
cache_dir=cache_dir
)
# Cast audio to correct sampling rate
self.clean_dataset = self.clean_dataset.cast_column(
"audio",
Audio(sampling_rate=sampling_rate)
)
print(f" ✓ Clean speech dataset loaded")
except Exception as e:
print(f" ⚠ Error loading clean dataset: {e}")
print(f" Using fallback: common_voice")
self.clean_dataset = load_dataset(
"mozilla-foundation/common_voice_11_0",
"en",
split="train",
streaming=True,
cache_dir=cache_dir
)
self.clean_dataset = self.clean_dataset.cast_column(
"audio",
Audio(sampling_rate=sampling_rate)
)
print(f"\n2. Loading noise dataset: {noise_dataset_name}")
print(f" Split: {noise_split}")
try:
self.noise_dataset = load_dataset(
noise_dataset_name,
split=noise_split,
streaming=True,
cache_dir=cache_dir
)
self.noise_dataset = self.noise_dataset.cast_column(
"audio",
Audio(sampling_rate=sampling_rate)
)
print(f" ✓ Noise dataset loaded")
except Exception as e:
print(f" ⚠ Error loading noise dataset: {e}")
print(f" Using synthetic noise instead")
self.noise_dataset = None
# Create iterators
self.clean_iter = iter(self.clean_dataset)
if self.noise_dataset:
self.noise_iter = iter(self.noise_dataset)
self.on_epoch_end()
print(f"\n{'='*60}")
print(f"Dataset Generator Ready")
print(f" Batch size: {batch_size}")
print(f" Samples per epoch: {samples_per_epoch}")
print(f" Batches per epoch: {len(self)}")
print(f"{'='*60}\n")
def __len__(self):
"""Return number of batches per epoch"""
return self.samples_per_epoch // self.batch_size
def __getitem__(self, index):
"""Generate one batch of data"""
batch_clean = []
batch_noisy = []
for _ in range(self.batch_size):
try:
# Get clean audio
clean_audio = self._load_next_clean_audio()
# Get noise
if self.noise_dataset:
noise_audio = self._load_next_noise_audio()
else:
# Generate synthetic noise
noise_audio = np.random.randn(self.segment_len).astype(np.float32) * 0.1
                # Mix clean and noise at a random SNR; _mix_audio returns both
                # signals with a shared gain so the clean target stays on the
                # same scale as the noisy input
                noisy_audio, clean_audio = self._mix_audio(clean_audio, noise_audio)
                batch_clean.append(clean_audio)
                batch_noisy.append(noisy_audio)
except Exception as e:
                # On error, fall back to a low-level white-noise pair
                # (identity target: input and target are the same signal)
print(f" Warning: Error loading sample: {e}")
noise = np.random.randn(self.segment_len).astype(np.float32) * 0.01
batch_clean.append(noise)
batch_noisy.append(noise)
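        # Keras expects (inputs, targets): the noisy mixtures are the inputs,
        # the clean speech is the target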
return np.array(batch_noisy), np.array(batch_clean)
    def on_epoch_end(self):
        """Epoch-end hook; the streaming iterators persist across epochs, so there is nothing to reset"""
        pass
def _load_next_clean_audio(self):
"""Load next clean audio sample"""
try:
sample = next(self.clean_iter)
audio = sample['audio']['array']
except StopIteration:
# Restart iterator
self.clean_iter = iter(self.clean_dataset)
sample = next(self.clean_iter)
audio = sample['audio']['array']
return self._preprocess_audio(audio)
def _load_next_noise_audio(self):
"""Load next noise sample"""
try:
sample = next(self.noise_iter)
if 'audio' in sample:
audio = sample['audio']['array']
elif 'noise' in sample:
audio = sample['noise']['array']
else:
# Fallback to white noise
audio = np.random.randn(self.segment_len).astype(np.float32) * 0.1
        except StopIteration:
            # Restart the streaming iterator, handling both possible audio keys
            self.noise_iter = iter(self.noise_dataset)
            sample = next(self.noise_iter)
            audio = sample['audio']['array'] if 'audio' in sample else sample['noise']['array']
return self._preprocess_audio(audio)
def _preprocess_audio(self, audio):
"""Preprocess audio to target length and format"""
# Convert to float32
audio = audio.astype(np.float32)
# Trim or pad to segment length
if len(audio) > self.segment_len:
start = np.random.randint(0, len(audio) - self.segment_len)
audio = audio[start:start + self.segment_len]
else:
audio = np.pad(audio, (0, self.segment_len - len(audio)))
        # Peak-normalize so clean and noise are on comparable scales before
        # SNR-based mixing in _mix_audio()
max_val = np.max(np.abs(audio))
if max_val > 1e-8:
audio = audio / max_val
return audio
    def _mix_audio(self, clean, noise):
        """Mix clean audio with noise at a random SNR; returns (noisy, clean) with a shared gain"""
        snr = np.random.uniform(*self.snr_range)
        # Calculate signal powers
        clean_power = np.mean(clean ** 2)
        noise_power = np.mean(noise ** 2)
        # Noise scaling factor, derived from
        # SNR_dB = 10*log10(clean_power / (scale^2 * noise_power))
        # => scale = sqrt(clean_power / (10^(SNR_dB/10) * noise_power))
        if noise_power > 1e-8:
            snr_linear = 10 ** (snr / 10)
            noise_scale = np.sqrt(clean_power / (snr_linear * noise_power))
        else:
            noise_scale = 0.1
        # Mix
        noisy = clean + noise_scale * noise
        # Rescale both signals by the same factor to prevent clipping while
        # keeping the clean target on the same scale as the noisy input
        max_val = np.max(np.abs(noisy))
        if max_val > 1e-8:
            gain = 0.95 / max_val
            noisy = noisy * gain
            clean = clean * gain
        return noisy.astype(np.float32), clean.astype(np.float32)
def create_loss_function():
"""
Create custom loss function combining time and frequency domain losses
"""
def combined_loss(y_true, y_pred):
# Time domain MSE
time_loss = tf.reduce_mean(tf.square(y_true - y_pred))
# Frequency domain loss (STFT-based)
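        # Assumption: y_true and y_pred are time-domain waveforms shaped
        # (batch, samples); tf.signal.stft transforms the last axis, so a
        # trailing channel dimension (batch, samples, 1) would need squeezing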
stft_true = tf.signal.stft(
y_true,
frame_length=512,
frame_step=128
)
stft_pred = tf.signal.stft(
y_pred,
frame_length=512,
frame_step=128
)
mag_true = tf.abs(stft_true)
mag_pred = tf.abs(stft_pred)
freq_loss = tf.reduce_mean(tf.square(mag_true - mag_pred))
# Combined loss (weighted)
return 0.7 * time_loss + 0.3 * freq_loss
return combined_loss
def train_model(
clean_dataset="librispeech_asr",
noise_dataset="dns-challenge/dns-challenge-4",
clean_split="train.clean.100",
noise_split="train",
output_dir='./models',
epochs=50,
batch_size=16,
samples_per_epoch=1000,
lstm_units=128,
learning_rate=0.001,
use_qat=True,
cache_dir=None
):
"""
Main training function using HF datasets
Args:
clean_dataset: HF dataset name for clean speech
noise_dataset: HF dataset name for noise
clean_split: Split for clean dataset
noise_split: Split for noise dataset
output_dir: Directory to save models
epochs: Number of training epochs
batch_size: Training batch size
samples_per_epoch: Samples per epoch
lstm_units: Number of LSTM units
learning_rate: Learning rate for Adam optimizer
use_qat: Whether to use quantization-aware training
cache_dir: Directory to cache datasets
"""
# Create output directory
os.makedirs(output_dir, exist_ok=True)
print("="*60)
print("Training DTLN with Hugging Face Datasets")
print("="*60)
# Create model
print("\n1. Building DTLN model...")
dtln = DTLN_Ethos_U55(
frame_len=512,
frame_shift=128,
lstm_units=lstm_units,
sampling_rate=16000
)
model = dtln.build_model()
print(" ✓ Model built")
print(f" Parameters: {model.count_params():,}")
# Apply QAT if requested
if use_qat:
print("\n2. Applying Quantization-Aware Training...")
quantize_model = tfmot.quantization.keras.quantize_model
model = quantize_model(model)
print(" ✓ QAT applied (INT8 optimized)")
# Compile model
print("\n3. Compiling model...")
model.compile(
optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
loss=create_loss_function(),
metrics=['mae']
)
print(" ✓ Model compiled")
# Create data generator
print("\n4. Creating Hugging Face data generator...")
train_generator = HuggingFaceAudioDataGenerator(
clean_dataset_name=clean_dataset,
noise_dataset_name=noise_dataset,
clean_split=clean_split,
noise_split=noise_split,
batch_size=batch_size,
samples_per_epoch=samples_per_epoch,
frame_len=512,
frame_shift=128,
sampling_rate=16000,
snr_range=(0, 20),
shuffle=True,
cache_dir=cache_dir
)
# Callbacks
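    # No validation generator is passed to fit(), so all callbacks monitor
    # the training loss ('loss') rather than 'val_loss'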
callbacks = [
tf.keras.callbacks.ModelCheckpoint(
filepath=os.path.join(output_dir, 'best_model.h5'),
monitor='loss',
save_best_only=True,
verbose=1
),
tf.keras.callbacks.ReduceLROnPlateau(
monitor='loss',
factor=0.5,
patience=5,
min_lr=1e-6,
verbose=1
),
tf.keras.callbacks.EarlyStopping(
monitor='loss',
patience=10,
restore_best_weights=True,
verbose=1
),
tf.keras.callbacks.TensorBoard(
log_dir=os.path.join(output_dir, 'logs'),
histogram_freq=1
)
]
# Train
print("\n5. Starting training...")
print("="*60)
history = model.fit(
train_generator,
epochs=epochs,
callbacks=callbacks,
verbose=1
)
# Save final model
final_model_path = os.path.join(output_dir, 'dtln_final.h5')
model.save(final_model_path)
print(f"\n{'='*60}")
print("✓ Training Complete!")
print(f"{'='*60}")
print(f"Final loss: {history.history['loss'][-1]:.4f}")
print(f"Best loss: {min(history.history['loss']):.4f}")
print(f"Model saved to: {final_model_path}")
print(f"{'='*60}\n")
return model, history
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Train DTLN model using Hugging Face datasets'
)
# Dataset arguments
parser.add_argument(
'--clean-dataset',
type=str,
default='librispeech_asr',
help='HF dataset for clean speech (default: librispeech_asr)'
)
parser.add_argument(
'--noise-dataset',
type=str,
default='dns-challenge/dns-challenge-4',
help='HF dataset for noise (default: dns-challenge)'
)
parser.add_argument(
'--clean-split',
type=str,
default='train.clean.100',
help='Split for clean dataset'
)
parser.add_argument(
'--noise-split',
type=str,
default='train',
help='Split for noise dataset'
)
# Training arguments
parser.add_argument(
'--output-dir',
type=str,
default='./models',
help='Output directory for models'
)
parser.add_argument(
'--epochs',
type=int,
default=50,
help='Number of training epochs'
)
parser.add_argument(
'--batch-size',
type=int,
default=16,
help='Training batch size'
)
parser.add_argument(
'--samples-per-epoch',
type=int,
default=1000,
help='Number of samples per epoch'
)
parser.add_argument(
'--lstm-units',
type=int,
default=128,
help='Number of LSTM units'
)
parser.add_argument(
'--learning-rate',
type=float,
default=0.001,
help='Learning rate'
)
parser.add_argument(
'--no-qat',
action='store_true',
help='Disable quantization-aware training'
)
parser.add_argument(
'--cache-dir',
type=str,
default=None,
help='Directory to cache HF datasets'
)
args = parser.parse_args()
# Train model
model, history = train_model(
clean_dataset=args.clean_dataset,
noise_dataset=args.noise_dataset,
clean_split=args.clean_split,
noise_split=args.noise_split,
output_dir=args.output_dir,
epochs=args.epochs,
batch_size=args.batch_size,
samples_per_epoch=args.samples_per_epoch,
lstm_units=args.lstm_units,
learning_rate=args.learning_rate,
use_qat=not args.no_qat,
cache_dir=args.cache_dir
)
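    # --- Optional: INT8 TFLite export (a minimal sketch, not run by default) ---
    # The QAT path above targets INT8 deployment (e.g., Ethos-U55). This is a
    # hedged sketch using tf.lite.TFLiteConverter: the random calibration data
    # is a placeholder (real noisy batches give better quantization scales),
    # and the 16000-sample input length assumes the generator's 1-second
    # segments match the model's input shape. Flip EXPORT_INT8 to True to try it.
    EXPORT_INT8 = False
    if EXPORT_INT8:
        def representative_data_gen():
            # Placeholder calibration batches; substitute real noisy audio
            for _ in range(10):
                yield [np.random.randn(1, 16000).astype(np.float32)]
        converter = tf.lite.TFLiteConverter.from_keras_model(model)
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.representative_dataset = representative_data_gen
        converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
        converter.inference_input_type = tf.int8
        converter.inference_output_type = tf.int8
        with open(os.path.join(args.output_dir, 'dtln_int8.tflite'), 'wb') as f:
            f.write(converter.convert())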