# Source: Hugging Face Space "grgsaliba/voice-denoising" (status: Sleeping)
# File size: 12,419 bytes

"""
Hugging Face Space: DTLN Voice Denoising
Real-time speech denoising optimized for edge deployment
"""

import gradio as gr
import numpy as np
import soundfile as sf
import tempfile
import os
from scipy import signal

# Note: In production, you would load a trained model
# For this demo, we'll use a simple spectral subtraction approach

def spectral_subtraction_denoise(audio, sample_rate, noise_reduction_db=10,
                                 noise_duration=0.3):
    """
    Simple spectral subtraction for demonstration
    In production, this would use the trained DTLN model

    The noise spectrum is estimated from the leading ``noise_duration``
    seconds of the signal (assumed to contain noise only), scaled by the
    requested reduction and subtracted from every frame's magnitude. The
    original phase is reused for reconstruction.

    Args:
        audio: Input audio array (1-D, mono)
        sample_rate: Sampling rate in Hz
        noise_reduction_db: Amount of noise reduction in dB
        noise_duration: Length in seconds of the leading segment used to
            estimate the noise spectrum

    Returns:
        Denoised audio array with the same number of samples as ``audio``,
        peak-normalized to 0.95
    """
    audio = np.asarray(audio, dtype=np.float64)
    if audio.size == 0:
        # Nothing to transform; avoid feeding an empty array to the STFT.
        return audio.copy()

    # Use the same segment length for the forward and inverse transform.
    # Clamp it for clips shorter than one frame so SciPy does not have to
    # silently shrink it (and so istft does not have to guess it back).
    nperseg = min(512, audio.size)
    hop = nperseg - nperseg // 2  # SciPy's default overlap is nperseg // 2

    # Compute STFT
    _, _, Zxx = signal.stft(audio, fs=sample_rate, nperseg=nperseg)

    # Estimate noise from the first `noise_duration` seconds. Frames are
    # `hop` samples apart and the first one is centred on t=0, hence the +1.
    # Always keep at least one frame so the mean is never taken over an
    # empty slice (which would yield NaN).
    noise_frames = int(noise_duration * sample_rate / hop) + 1
    noise_frames = min(max(noise_frames, 1), Zxx.shape[1])
    noise_estimate = np.mean(np.abs(Zxx[:, :noise_frames]), axis=1, keepdims=True)

    # Spectral subtraction
    magnitude = np.abs(Zxx)
    phase = np.angle(Zxx)

    # Subtract noise estimate (with floor): never attenuate a bin below 10%
    # of its original magnitude, which limits "musical noise" artifacts.
    alpha = 10 ** (noise_reduction_db / 20)
    magnitude_cleaned = np.maximum(magnitude - alpha * noise_estimate, 0.1 * magnitude)

    # Reconstruct complex spectrum
    Zxx_cleaned = magnitude_cleaned * np.exp(1j * phase)

    # Inverse STFT
    _, audio_cleaned = signal.istft(Zxx_cleaned, fs=sample_rate, nperseg=nperseg)

    # The STFT zero-pads the input to a whole number of frames, so the
    # reconstruction is usually a little longer than the input. Trim it so
    # callers can compare input and output sample-by-sample.
    audio_cleaned = audio_cleaned[:audio.size]

    # Normalize
    audio_cleaned = audio_cleaned / (np.max(np.abs(audio_cleaned)) + 1e-8) * 0.95

    return audio_cleaned


def process_audio(audio_file, noise_reduction):
    """
    Process uploaded audio file

    Loads the file, down-mixes to mono, resamples to 16 kHz, peak-normalizes,
    denoises it and builds a short text report.

    Args:
        audio_file: Path to uploaded audio file (or None if nothing uploaded)
        noise_reduction: Noise reduction strength (0-20 dB)

    Returns:
        Tuple of (audio, info):
        - audio: (sample_rate, float32 denoised samples) on success,
          None on failure or when no file was given
        - info: human-readable report, or an error/help message
    """
    if audio_file is None:
        return None, "Please upload an audio file"

    # Broad catch is deliberate: this is the UI boundary, and any failure
    # (unreadable file, unsupported format, ...) should surface as a message
    # in the info box instead of an unhandled exception.
    try:
        # Load audio
        audio, sample_rate = sf.read(audio_file)

        # Convert to mono if stereo
        if audio.ndim > 1:
            audio = np.mean(audio, axis=1)

        # Resample to 16kHz if needed (DTLN's native sample rate)
        if sample_rate != 16000:
            num_samples = int(len(audio) * 16000 / sample_rate)
            audio = signal.resample(audio, num_samples)
            sample_rate = 16000

        # Normalize input
        audio = audio / (np.max(np.abs(audio)) + 1e-8) * 0.95

        # Apply denoising
        # Note: In production, this would use the trained DTLN model
        denoised = spectral_subtraction_denoise(audio, sample_rate, noise_reduction)

        # STFT-based processing may return a signal padded to a whole number
        # of frames. Align both signals to a common length so the residual
        # below can be computed sample-by-sample instead of raising a
        # broadcasting error.
        common_len = min(len(audio), len(denoised))
        audio = audio[:common_len]
        denoised = denoised[:common_len]

        # Calculate improvement metrics: ratio of input power to the power of
        # what was removed. The small constants keep the ratio finite for
        # silent input or an unchanged signal.
        noise = audio - denoised
        signal_power = np.mean(audio ** 2)
        noise_power = np.mean(noise ** 2)
        snr_improvement = 10 * np.log10((signal_power + 1e-10) / (noise_power + 1e-10))

        info = f"""
        ✅ Processing Complete!
        
        📊 Audio Info:
        - Duration: {len(audio)/sample_rate:.2f}s
        - Sample Rate: {sample_rate} Hz
        - Length: {len(audio):,} samples
        
        📈 Quality Metrics:
        - SNR Improvement: {snr_improvement:.2f} dB
        - Noise Reduction: {noise_reduction} dB
        
        ⚠️ Note: This demo uses spectral subtraction for demonstration.
        The actual DTLN model provides superior quality when trained!
        """

        return (sample_rate, denoised.astype(np.float32)), info

    except Exception as e:
        return None, f"❌ Error processing audio: {str(e)}"


def generate_demo_audio():
    """
    Generate demo noisy audio

    Builds a 3-second, 16 kHz synthetic "speech-like" tone complex
    (200/400/600 Hz with a 2 Hz amplitude envelope), adds white Gaussian
    noise, peak-normalizes to 0.95 and writes the result to a WAV file.

    Returns:
        Path to the generated temporary WAV file. The file is created with
        delete=False, so it is not removed automatically.
    """
    sample_rate = 16000
    duration = 3.0
    num_samples = int(duration * sample_rate)

    # Sample instants spaced exactly 1/sample_rate apart. (linspace with its
    # default endpoint=True would stretch the spacing to duration/(N-1) and
    # slightly detune the tones below.)
    t = np.arange(num_samples) / sample_rate

    # Generate synthetic speech
    speech = (
        0.3 * np.sin(2 * np.pi * 200 * t) +
        0.2 * np.sin(2 * np.pi * 400 * t) +
        0.15 * np.sin(2 * np.pi * 600 * t)
    )

    # Add speech-like envelope
    envelope = 0.5 + 0.5 * np.sin(2 * np.pi * 2 * t)
    speech = speech * envelope

    # Add noise
    noise = np.random.randn(len(t)) * 0.2
    noisy = speech + noise

    # Normalize
    noisy = noisy / (np.max(np.abs(noisy)) + 1e-8) * 0.95

    # Reserve a temporary file name, then close the handle before writing:
    # this avoids leaking a file descriptor on every call and avoids having
    # the file open twice (which fails on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
        demo_path = temp_file.name
    sf.write(demo_path, noisy.astype(np.float32), sample_rate)

    return demo_path


# Custom CSS handed to gr.Blocks(css=custom_css) when the interface is built.
# It sets the container font, gives buttons a green gradient (reversed on
# hover) and centers/limits the width of the element with id "component-0".
# NOTE(review): ".gr-button" and "#component-0" are presumably names generated
# by Gradio itself — verify they still match the installed Gradio version.
custom_css = """
.gradio-container {
    font-family: 'IBM Plex Sans', sans-serif;
}
.gr-button {
    background: linear-gradient(90deg, #4CAF50, #45a049);
    border: none;
}
.gr-button:hover {
    background: linear-gradient(90deg, #45a049, #4CAF50);
}
#component-0 {
    max-width: 900px;
    margin: auto;
    padding: 20px;
}
"""

# Build Gradio interface
# Layout: header text, a two-column row (input controls | output widgets),
# three collapsed accordions with static documentation, the button event
# wiring, and a footer. The resulting Blocks object is bound to `demo`.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    # Page header (static Markdown)
    gr.Markdown("""
    # 🎙️ DTLN Voice Denoising

    Real-time speech enhancement optimized for edge deployment with **TensorFlow Lite**.

    ### 🚀 Features:
    - **Optimized for Edge AI**: Lightweight model with <100KB size
    - **Real-time Processing**: Low latency for streaming audio
    - **INT8 Quantization**: Efficient deployment with 8-bit precision
    - **TensorFlow Lite**: Ready for microcontroller deployment

    ---
    """)
    
    with gr.Row():
        # Left column: audio source, strength slider and action buttons
        with gr.Column():
            gr.Markdown("### 📤 Input")
            # type="filepath": the handler receives a path string, which is
            # what process_audio passes to sf.read
            audio_input = gr.Audio(
                label="Upload Noisy Audio",
                type="filepath",
                sources=["upload", "microphone"]
            )
            
            # Value is forwarded to process_audio as `noise_reduction`
            noise_reduction = gr.Slider(
                minimum=0,
                maximum=20,
                value=10,
                step=1,
                label="Noise Reduction Strength (dB)",
                info="Higher values remove more noise but may affect speech quality"
            )
            
            with gr.Row():
                process_btn = gr.Button("🔄 Denoise Audio", variant="primary", size="lg")
                demo_btn = gr.Button("🎵 Try Demo Audio", variant="secondary")
        
        # Right column: denoised audio player and text report
        with gr.Column():
            gr.Markdown("### 📥 Output")
            # type="numpy": fed by the (sample_rate, samples) tuple that
            # process_audio returns
            audio_output = gr.Audio(
                label="Denoised Audio",
                type="numpy"
            )
            
            info_output = gr.Textbox(
                label="Processing Info",
                lines=12,
                max_lines=12
            )
    
    # About section (static Markdown, collapsed by default)
    with gr.Accordion("📖 About This Model", open=False):
        gr.Markdown("""
        ### DTLN Architecture
        
        **Dual-signal Transformation LSTM Network** is a real-time speech enhancement model:
        
        - **Two-stage processing**: Magnitude estimation → Final enhancement
        - **LSTM-based**: Captures temporal dependencies in speech
        - **<1M parameters**: Lightweight for edge deployment
        - **Frequency + Time domain**: Processes both domains for better quality
        
        ### Edge Hardware Acceleration

        Compatible with various edge AI accelerators:
        - **NPU**: Arm Ethos-U series
        - **CPU**: ARM Cortex-M series
        - **Quantization**: 8-bit and 16-bit integer operations
        - **Memory**: Optimized for constrained devices
        
        ### Performance Targets
        
        | Metric | Value |
        |--------|-------|
        | Model Size | ~100 KB (INT8) |
        | Latency | 3-6 ms |
        | Power | 30-40 mW |
        | SNR Improvement | 10-15 dB |
        
        ---
        
        ⚠️ **Demo Note**: This Space uses spectral subtraction for demonstration.
        Download the full implementation to train and deploy the actual DTLN model!
        """)
    
    # Training guide section (static Markdown, collapsed by default).
    # The doubled backslashes below are Python string escapes; each one is a
    # single shell line-continuation backslash in the rendered text.
    with gr.Accordion("🛠️ Training & Deployment Guide", open=False):
        gr.Markdown("""
        ### Quick Start
        
        ```bash
        # 1. Install dependencies
        pip install -r requirements.txt
        
        # 2. Train model
        python train_dtln.py \\
            --clean-dir ./data/clean_speech \\
            --noise-dir ./data/noise \\
            --epochs 50 \\
            --batch-size 16
        
        # 3. Convert to TFLite INT8
        python convert_to_tflite.py \\
            --model ./models/best_model.h5 \\
            --output ./models/dtln_ethos_u55.tflite \\
            --calibration-dir ./data/clean_speech
        
        # 4. (Optional) Optimize for hardware accelerator
        vela --accelerator-config ethos-u55-256 \\
             --system-config Ethos_U55_High_End_Embedded \\
             ./models/dtln_ethos_u55.tflite
        ```
        
        ### Download Full Implementation
        
        The complete training and deployment code is available in the Files tab →
        
        Includes:
        - `dtln_ethos_u55.py` - Model architecture
        - `train_dtln.py` - Training with QAT
        - `convert_to_tflite.py` - TFLite conversion
        - `alif_e7_voice_denoising_guide.md` - Complete guide
        - `example_usage.py` - Usage examples
        
        ### Resources

        - [TensorFlow Lite Micro](https://www.tensorflow.org/lite/microcontrollers)
        - [Arm Ethos-U NPU](https://developer.arm.com/ip-products/processors/machine-learning/arm-ethos-u)
        - [DTLN Paper (Interspeech 2020)](https://arxiv.org/abs/2005.07551)
        """)
    
    # Tech specs section (static Markdown, collapsed by default)
    with gr.Accordion("⚙️ Technical Specifications", open=False):
        gr.Markdown("""
        ### Model Architecture Details
        
        **Input**: Raw audio waveform @ 16kHz
        - Frame length: 512 samples (32ms)
        - Frame shift: 128 samples (8ms)
        - Frequency bins: 257 (FFT size 512)
        
        **Network Structure**:
        ```
        Input Audio (16kHz)
            ↓
        STFT (512-point)
            ↓
        [Stage 1]
        LSTM (128 units) → Dense (sigmoid) → Magnitude Mask 1
            ↓
        Enhanced Magnitude 1
            ↓
        [Stage 2]
        LSTM (128 units) → Dense (sigmoid) → Magnitude Mask 2
            ↓
        Enhanced Magnitude
            ↓
        ISTFT
            ↓
        Output Audio (16kHz)
        ```
        
        **Training Configuration**:
        - Loss: Combined time + frequency domain MSE
        - Optimizer: Adam (lr=0.001)
        - Batch size: 16
        - Epochs: 50
        - Quantization: INT8 post-training quantization
        
        **Memory Footprint**:
        - Model weights: ~80 KB (INT8)
        - Tensor arena: ~100 KB
        - Audio buffers: ~2 KB
        - **Total**: ~200 KB
        
        ### Edge Device Deployment

        **Hardware Utilization**:
        - NPU/CPU: For LSTM inference
        - CPU: For FFT operations (CMSIS-DSP)
        - Memory: Optimized buffer management
        - Peripherals: I2S/PDM for audio I/O
        
        **Power Profile**:
        - Active inference: 30-40 mW
        - Idle: <1 mW
        - Average (50% duty): ~15-20 mW
        
        **Real-time Constraints**:
        - Frame processing: 8ms available
        - FFT: ~1ms
        - NPU inference: ~4ms
        - IFFT + overhead: ~2ms
        - **Margin**: ~1ms
        """)
    
    # Event handlers
    # "Denoise Audio": run process_audio on the selected file and slider
    # value; its two return values fill the audio player and the info box.
    process_btn.click(
        fn=process_audio,
        inputs=[audio_input, noise_reduction],
        outputs=[audio_output, info_output],
        api_name="denoise"
    )

    # "Try Demo Audio": generate a synthetic noisy clip and load its file
    # path into the input audio component (it is not denoised automatically).
    demo_btn.click(
        fn=generate_demo_audio,
        inputs=[],
        outputs=[audio_input],
        api_name="demo"
    )
    
    # Footer (citation and credits, static Markdown/HTML)
    gr.Markdown("""
    ---
    
    ### 📚 Citation
    
    If you use this model in your research, please cite:
    
    ```bibtex
    @inproceedings{westhausen2020dtln,
      title={Dual-signal transformation LSTM network for real-time noise suppression},
      author={Westhausen, Nils L and Meyer, Bernd T},
      booktitle={Interspeech},
      year={2020}
    }
    ```
    
    ---
    
    <div style="text-align: center; color: #666;">
        Built for <b>Edge AI</b> • Optimized for <b>Microcontrollers</b> •
        <a href="https://github.com/breizhn/DTLN">Original DTLN</a>
    </div>
    """)

# Launch configuration
# Start the Gradio app with default settings, but only when this file is run
# as a script (not when it is imported as a module).
if __name__ == "__main__":
    demo.launch()