"""
Hugging Face Space: DTLN Voice Denoising
Real-time speech denoising optimized for edge deployment
"""

import os
import tempfile
import zipfile
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf
from scipy import signal

# Sample rate the demo pipeline works at (DTLN's native rate).
TARGET_SAMPLE_RATE = 16000
# STFT segment length in samples (32 ms at 16 kHz).
STFT_SEGMENT = 512
# Length of the leading stretch of audio used to estimate the noise spectrum.
NOISE_ESTIMATE_SECONDS = 0.3


# Note: In production, you would load a trained model
# For this demo, we'll use a simple spectral subtraction approach
def spectral_subtraction_denoise(audio, sample_rate, noise_reduction_db=10):
    """
    Simple spectral subtraction for demonstration.
    In production, this would use the trained DTLN model.

    The noise magnitude spectrum is estimated from the first
    ``NOISE_ESTIMATE_SECONDS`` of the signal, scaled by an over-subtraction
    factor derived from ``noise_reduction_db`` and subtracted from every
    frame. The result is floored at 10% of the original magnitude.

    Args:
        audio: Input audio array (1-D)
        sample_rate: Sampling rate in Hz
        noise_reduction_db: Amount of noise reduction in dB

    Returns:
        Denoised audio array with the same length as the input,
        peak-normalized to 0.95
    """
    n_samples = len(audio)
    if n_samples == 0:
        # Nothing to transform; avoid STFT/normalization on an empty array.
        return np.zeros(0, dtype=np.float64)

    # Clips shorter than one segment get a shortened segment. The same value
    # is passed to istft below because it cannot be inferred from the
    # spectrum shape when the segment length is odd.
    nperseg = min(STFT_SEGMENT, n_samples)

    # Compute STFT
    _, frame_times, spectrum = signal.stft(audio, fs=sample_rate, nperseg=nperseg)

    # Estimate noise from the first 0.3 seconds. Frames are selected by their
    # time stamp (not as a fraction of the clip), and at least one frame is
    # always used so the mean is never taken over an empty slice.
    noise_frames = int(
        np.searchsorted(frame_times, NOISE_ESTIMATE_SECONDS, side="right")
    )
    noise_frames = max(1, min(noise_frames, len(frame_times)))
    noise_estimate = np.mean(
        np.abs(spectrum[:, :noise_frames]), axis=1, keepdims=True
    )

    # Spectral subtraction
    magnitude = np.abs(spectrum)
    phase = np.angle(spectrum)

    # Subtract noise estimate (with floor)
    alpha = 10 ** (noise_reduction_db / 20)
    magnitude_cleaned = np.maximum(
        magnitude - alpha * noise_estimate, 0.1 * magnitude
    )

    # Reconstruct complex spectrum
    spectrum_cleaned = magnitude_cleaned * np.exp(1j * phase)

    # Inverse STFT
    _, audio_cleaned = signal.istft(
        spectrum_cleaned, fs=sample_rate, nperseg=nperseg
    )

    # Ensure output length matches input (trim or pad if needed)
    if len(audio_cleaned) > n_samples:
        audio_cleaned = audio_cleaned[:n_samples]
    elif len(audio_cleaned) < n_samples:
        audio_cleaned = np.pad(
            audio_cleaned, (0, n_samples - len(audio_cleaned)), mode='constant'
        )

    # Normalize (epsilon guards against division by zero on silence)
    audio_cleaned = audio_cleaned / (np.max(np.abs(audio_cleaned)) + 1e-8) * 0.95

    return audio_cleaned


def process_audio(audio_file, noise_reduction):
    """
    Process uploaded audio file

    Args:
        audio_file: Path to uploaded audio file
        noise_reduction: Noise reduction strength (0-20 dB)

    Returns:
        Tuple of (audio, info): audio is (sample_rate, denoised_audio) on
        success or None on failure; info is a status/report message.
    """
    if audio_file is None:
        return None, "Please upload an audio file"

    try:
        # Load audio
        audio, sample_rate = sf.read(audio_file)

        # Convert to mono if stereo
        if audio.ndim > 1:
            audio = np.mean(audio, axis=1)

        # Resample to 16kHz if needed (DTLN's native sample rate)
        if sample_rate != TARGET_SAMPLE_RATE:
            num_samples = int(len(audio) * TARGET_SAMPLE_RATE / sample_rate)
            audio = signal.resample(audio, num_samples)
            sample_rate = TARGET_SAMPLE_RATE

        # Normalize input
        audio = audio / (np.max(np.abs(audio)) + 1e-8) * 0.95

        # Apply denoising
        # Note: In production, this would use the trained DTLN model
        denoised = spectral_subtraction_denoise(audio, sample_rate, noise_reduction)

        # Calculate improvement metrics.
        # NOTE: without a clean reference this is only a proxy: the ratio of
        # input power to the power of the component that was removed.
        noise = audio - denoised
        signal_power = np.mean(audio ** 2)
        noise_power = np.mean(noise ** 2)
        snr_improvement = 10 * np.log10(signal_power / (noise_power + 1e-10))

        info = f"""
✅ Processing Complete!

📊 Audio Info:
- Duration: {len(audio)/sample_rate:.2f}s
- Sample Rate: {sample_rate} Hz
- Length: {len(audio):,} samples

📈 Quality Metrics:
- SNR Improvement: {snr_improvement:.2f} dB
- Noise Reduction: {noise_reduction} dB

⚠️ Note: This demo uses spectral subtraction for demonstration.
The actual DTLN model provides superior quality when trained!
"""

        return (sample_rate, denoised.astype(np.float32)), info

    except Exception as e:
        # UI boundary: report the failure in the info box instead of crashing.
        return None, f"❌ Error processing audio: {str(e)}"


def generate_demo_audio():
    """
    Generate demo noisy audio

    Returns:
        Path to a temporary WAV file containing 3 seconds of a synthetic
        "speech" tone mixture with additive white noise at 16 kHz.
    """
    sample_rate = TARGET_SAMPLE_RATE
    duration = 3.0
    # Sample instants spaced exactly 1/sample_rate apart.
    t = np.arange(int(duration * sample_rate)) / sample_rate

    # Generate synthetic speech
    speech = (
        0.3 * np.sin(2 * np.pi * 200 * t) +
        0.2 * np.sin(2 * np.pi * 400 * t) +
        0.15 * np.sin(2 * np.pi * 600 * t)
    )

    # Add speech-like envelope
    envelope = 0.5 + 0.5 * np.sin(2 * np.pi * 2 * t)
    speech = speech * envelope

    # Add noise
    noise = np.random.randn(len(t)) * 0.2
    noisy = speech + noise

    # Normalize
    noisy = noisy / (np.max(np.abs(noisy)) + 1e-8) * 0.95

    # Save to temporary file. The descriptor is closed before writing so no
    # handle is leaked and the path can be reopened on every platform.
    fd, wav_path = tempfile.mkstemp(suffix='.wav')
    os.close(fd)
    sf.write(wav_path, noisy.astype(np.float32), sample_rate)

    return wav_path


def start_training(clean_zip, noise_zip, epochs, batch_size, lstm_units):
    """
    Start training process with uploaded datasets

    The archives are extracted into a temporary directory only to count the
    WAV files they contain; the directory is removed before returning.

    Args:
        clean_zip: Uploaded ZIP file with clean speech
        noise_zip: Uploaded ZIP file with noise samples
        epochs: Number of training epochs
        batch_size: Batch size
        lstm_units: Number of LSTM units

    Returns:
        Status message
    """
    if clean_zip is None or noise_zip is None:
        return "❌ Please upload both clean speech and noise datasets as ZIP files"

    try:
        # Slider values may arrive as floats; the generated CLI flags must be
        # integers.
        epochs = int(epochs)
        batch_size = int(batch_size)
        lstm_units = int(lstm_units)

        # Create temporary directories (cleaned up automatically on exit)
        with tempfile.TemporaryDirectory() as temp_dir:
            clean_dir = os.path.join(temp_dir, 'clean')
            noise_dir = os.path.join(temp_dir, 'noise')
            os.makedirs(clean_dir, exist_ok=True)
            os.makedirs(noise_dir, exist_ok=True)

            # Extract ZIP files
            # NOTE(review): archives are user-supplied; no size limit is
            # enforced here, so a very large archive can fill the disk.
            with zipfile.ZipFile(clean_zip, 'r') as zip_ref:
                zip_ref.extractall(clean_dir)

            with zipfile.ZipFile(noise_zip, 'r') as zip_ref:
                zip_ref.extractall(noise_dir)

            # Count files
            clean_count = len(list(Path(clean_dir).glob('**/*.wav')))
            noise_count = len(list(Path(noise_dir).glob('**/*.wav')))

        status = f"""
📦 Dataset Extracted Successfully!

📊 Dataset Info:
- Clean speech files: {clean_count}
- Noise files: {noise_count}
- Training epochs: {epochs}
- Batch size: {batch_size}
- LSTM units: {lstm_units}

⚠️ Training on Hugging Face Spaces:
Due to the computational requirements and limited resources on Hugging Face Spaces,
training cannot be run directly in this demo environment.

📥 To train your own model:

1. Download the training files from the "Files" tab:
   - train_dtln.py
   - dtln_ethos_u55.py
   - convert_to_tflite.py

2. Run training locally or on a GPU instance:
   ```bash
   python train_dtln.py \\
       --clean-dir ./data/clean_speech \\
       --noise-dir ./data/noise \\
       --epochs {epochs} \\
       --batch-size {batch_size} \\
       --lstm-units {lstm_units}
   ```

3. Convert to TFLite INT8:
   ```bash
   python convert_to_tflite.py \\
       --model ./models/best_model.h5 \\
       --output ./models/dtln.tflite \\
       --calibration-dir ./data/clean_speech
   ```

💡 Recommended Training Environment:
- GPU: NVIDIA RTX 3060 or better
- RAM: 16GB+
- Storage: 10GB+ for datasets
- Time: 2-4 hours for 50 epochs

For detailed instructions, see the deployment guide in the Files tab!
"""

        return status

    except Exception as e:
        # UI boundary: surface bad archives etc. as a message.
        return f"❌ Error processing datasets: {str(e)}"


# Custom CSS
custom_css = """
.gradio-container {
    font-family: 'IBM Plex Sans', sans-serif;
}
.gr-button {
    background: linear-gradient(90deg, #4CAF50, #45a049);
    border: none;
}
.gr-button:hover {
    background: linear-gradient(90deg, #45a049, #4CAF50);
}
#component-0 {
    max-width: 1200px;
    margin: auto;
    padding: 20px;
}
"""

# Build Gradio interface
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎙️ DTLN Voice Denoising

    Real-time speech enhancement optimized for edge deployment with **TensorFlow Lite**.

    ### 🚀 Features:
    - **Optimized for Edge AI**: Lightweight model with <100KB size
    - **Real-time Processing**: Low latency for streaming audio
    - **INT8 Quantization**: Efficient deployment with 8-bit precision
    - **TensorFlow Lite**: Ready for microcontroller deployment

    ---
    """)

    with gr.Tabs():
        # Demo Tab
        with gr.Tab("🎵 Demo"):
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### 📤 Input")
                    audio_input = gr.Audio(
                        label="Upload Noisy Audio",
                        type="filepath"
                    )

                    noise_reduction = gr.Slider(
                        minimum=0,
                        maximum=20,
                        value=10,
                        step=1,
                        label="Noise Reduction Strength (dB)",
                        info="Higher values remove more noise but may affect speech quality"
                    )

                    with gr.Row():
                        process_btn = gr.Button("🔄 Denoise Audio", variant="primary", size="lg")
                        demo_btn = gr.Button("🎵 Try Demo Audio", variant="secondary")

                with gr.Column():
                    gr.Markdown("### 📥 Output")
                    audio_output = gr.Audio(
                        label="Denoised Audio",
                        type="numpy"
                    )

                    info_output = gr.Textbox(
                        label="Processing Info",
                        lines=12,
                        max_lines=12
                    )

        # Training Tab
        with gr.Tab("🔬 Training"):
            gr.Markdown("""
            ### Train Your Own DTLN Model

            Upload your datasets and configure training parameters.

            ⚠️ **Note**: Training requires significant computational resources and cannot run directly on Hugging Face Spaces.
            This interface helps you prepare your data and provides the exact commands to run training locally.
            """)

            with gr.Row():
                with gr.Column():
                    gr.Markdown("#### 📦 Datasets")

                    clean_upload = gr.File(
                        label="Clean Speech Dataset (ZIP)",
                        file_types=[".zip"],
                        type="filepath"
                    )
                    gr.Markdown("*Upload a ZIP file containing clean speech WAV files*")

                    noise_upload = gr.File(
                        label="Noise Dataset (ZIP)",
                        file_types=[".zip"],
                        type="filepath"
                    )
                    gr.Markdown("*Upload a ZIP file containing noise WAV files*")

                with gr.Column():
                    gr.Markdown("#### ⚙️ Training Parameters")

                    epochs_slider = gr.Slider(
                        minimum=10,
                        maximum=200,
                        value=50,
                        step=10,
                        label="Training Epochs"
                    )

                    batch_slider = gr.Slider(
                        minimum=4,
                        maximum=64,
                        value=16,
                        step=4,
                        label="Batch Size"
                    )

                    lstm_slider = gr.Slider(
                        minimum=64,
                        maximum=256,
                        value=128,
                        step=32,
                        label="LSTM Units"
                    )

            train_btn = gr.Button("📊 Prepare Training", variant="primary", size="lg")

            training_output = gr.Textbox(
                label="Training Instructions",
                lines=25,
                max_lines=30
            )

    # About section
    with gr.Accordion("📖 About This Model", open=False):
        gr.Markdown("""
        ### DTLN Architecture

        **Dual-signal Transformation LSTM Network** is a real-time speech enhancement model:

        - **Two-stage processing**: Magnitude estimation → Final enhancement
        - **LSTM-based**: Captures temporal dependencies in speech
        - **<1M parameters**: Lightweight for edge deployment
        - **Frequency + Time domain**: Processes both domains for better quality

        ### Edge Hardware Acceleration

        Compatible with various edge AI accelerators:

        - **NPU**: Arm Ethos-U series
        - **CPU**: ARM Cortex-M series
        - **Quantization**: 8-bit and 16-bit integer operations
        - **Memory**: Optimized for constrained devices

        ### Performance Targets

        | Metric | Value |
        |--------|-------|
        | Model Size | ~100 KB (INT8) |
        | Latency | 3-6 ms |
        | Power | 30-40 mW |
        | SNR Improvement | 10-15 dB |

        ---

        ⚠️ **Demo Note**: This Space uses spectral subtraction for demonstration.
        Download the full implementation to train and deploy the actual DTLN model!
        """)

    # Deployment guide section
    with gr.Accordion("🛠️ Training & Deployment Guide", open=False):
        gr.Markdown("""
        ### Quick Start

        ```bash
        # 1. Install dependencies
        pip install -r requirements.txt

        # 2. Train model
        python train_dtln.py \\
            --clean-dir ./data/clean_speech \\
            --noise-dir ./data/noise \\
            --epochs 50 \\
            --batch-size 16

        # 3. Convert to TFLite INT8
        python convert_to_tflite.py \\
            --model ./models/best_model.h5 \\
            --output ./models/dtln_ethos_u55.tflite \\
            --calibration-dir ./data/clean_speech

        # 4. (Optional) Optimize for hardware accelerator
        vela --accelerator-config ethos-u55-256 \\
            --system-config Ethos_U55_High_End_Embedded \\
            ./models/dtln_ethos_u55.tflite
        ```

        ### Download Full Implementation

        The complete training and deployment code is available in the Files tab →

        Includes:
        - `dtln_ethos_u55.py` - Model architecture
        - `train_dtln.py` - Training with QAT
        - `convert_to_tflite.py` - TFLite conversion
        - `alif_e7_voice_denoising_guide.md` - Complete guide
        - `example_usage.py` - Usage examples

        ### Resources

        - [TensorFlow Lite Micro](https://www.tensorflow.org/lite/microcontrollers)
        - [Arm Ethos-U NPU](https://developer.arm.com/ip-products/processors/machine-learning/arm-ethos-u)
        - [DTLN Paper (Interspeech 2020)](https://arxiv.org/abs/2005.07551)
        """)

    # Tech specs section
    with gr.Accordion("⚙️ Technical Specifications", open=False):
        gr.Markdown("""
        ### Model Architecture Details

        **Input**: Raw audio waveform @ 16kHz
        - Frame length: 512 samples (32ms)
        - Frame shift: 128 samples (8ms)
        - Frequency bins: 257 (FFT size 512)

        **Network Structure**:
        ```
        Input Audio (16kHz)
            ↓
        STFT (512-point)
            ↓
        [Stage 1] LSTM (128 units) → Dense (sigmoid) → Magnitude Mask 1
            ↓
        Enhanced Magnitude 1
            ↓
        [Stage 2] LSTM (128 units) → Dense (sigmoid) → Magnitude Mask 2
            ↓
        Enhanced Magnitude
            ↓
        ISTFT
            ↓
        Output Audio (16kHz)
        ```

        **Training Configuration**:
        - Loss: Combined time + frequency domain MSE
        - Optimizer: Adam (lr=0.001)
        - Batch size: 16
        - Epochs: 50
        - Quantization: INT8 post-training quantization

        **Memory Footprint**:
        - Model weights: ~80 KB (INT8)
        - Tensor arena: ~100 KB
        - Audio buffers: ~2 KB
        - **Total**: ~200 KB

        ### Edge Device Deployment

        **Hardware Utilization**:
        - NPU/CPU: For LSTM inference
        - CPU: For FFT operations (CMSIS-DSP)
        - Memory: Optimized buffer management
        - Peripherals: I2S/PDM for audio I/O

        **Power Profile**:
        - Active inference: 30-40 mW
        - Idle: <1 mW
        - Average (50% duty): ~15-20 mW

        **Real-time Constraints**:
        - Frame processing: 8ms available
        - FFT: ~1ms
        - NPU inference: ~4ms
        - IFFT + overhead: ~2ms
        - **Margin**: ~1ms
        """)

    # Event handlers
    process_btn.click(
        fn=process_audio,
        inputs=[audio_input, noise_reduction],
        outputs=[audio_output, info_output]
    )

    demo_btn.click(
        fn=generate_demo_audio,
        inputs=[],
        outputs=[audio_input]
    )

    train_btn.click(
        fn=start_training,
        inputs=[clean_upload, noise_upload, epochs_slider, batch_slider, lstm_slider],
        outputs=[training_output]
    )

    # Footer
    gr.Markdown("""
    ---

    ### 📚 Citation

    If you use this model in your research, please cite:

    ```bibtex
    @inproceedings{westhausen2020dtln,
      title={Dual-signal transformation LSTM network for real-time noise suppression},
      author={Westhausen, Nils L and Meyer, Bernd T},
      booktitle={Interspeech},
      year={2020}
    }
    ```

    ---

    Built for Edge AI • Optimized for Microcontrollers • [Original DTLN](https://github.com/breizhn/DTLN)
    """)

# Launch configuration
if __name__ == "__main__":
    demo.launch()