Spaces:

grgsaliba
/

voice-denoising

Sleeping

File size: 17,914 Bytes

b6c9ef9
9e750ac
b6c9ef9
 
 
 
 
 
 
 
 
f7fb413
 
b6c9ef9
 
 
 
 
 
 
 
f7fb413
b6c9ef9
 
 
 
f7fb413
b6c9ef9
 
 
 
 
f7fb413
b6c9ef9
 
 
f7fb413
b6c9ef9
 
 
f7fb413
b6c9ef9
 
 
f7fb413
b6c9ef9
 
f7fb413
b6c9ef9
 
92858ca
 
 
 
 
 
 
b6c9ef9
 
92858ca
b6c9ef9
 
 
 
 
 
f7fb413
b6c9ef9
 
 
f7fb413
b6c9ef9
 
 
 
 
f7fb413
b6c9ef9
 
 
f7fb413
b6c9ef9
 
 
f7fb413
b6c9ef9
 
dea61f6
b6c9ef9
dea61f6
b6c9ef9
f7fb413
b6c9ef9
 
f7fb413
b6c9ef9
 
 
f7fb413
b6c9ef9
 
 
 
 
f7fb413
b6c9ef9
 
f7fb413
b6c9ef9
 
 
 
f7fb413
b6c9ef9
 
 
f7fb413
b6c9ef9
 
 
f7fb413
b6c9ef9
f7fb413
b6c9ef9
 
 
 
 
 
 
 
 
f7fb413
b6c9ef9
 
 
 
 
 
f7fb413
b6c9ef9
 
 
f7fb413
b6c9ef9
 
 
f7fb413
b6c9ef9
 
f7fb413
b6c9ef9
 
 
f7fb413
b6c9ef9
 
 
f7fb413
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b6c9ef9
 
 
 
 
 
 
 
 
 
 
 
 
f7fb413
b6c9ef9
 
 
 
 
 
 
 
9e750ac
 
 
 
b6c9ef9
9e750ac
 
b6c9ef9
 
9e750ac
b6c9ef9
 
f7fb413
 
 
 
b6c9ef9
f7fb413
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b6c9ef9
f7fb413
b6c9ef9
 
 
 
f7fb413
b6c9ef9
f7fb413
b6c9ef9
 
 
 
f7fb413
9e750ac
 
 
 
 
b6c9ef9
9e750ac
f7fb413
b6c9ef9
f7fb413
b6c9ef9
 
 
 
 
 
f7fb413
b6c9ef9
f7fb413
b6c9ef9
 
 
f7fb413
 
b6c9ef9
 
 
f7fb413
b6c9ef9
 
 
f7fb413
b6c9ef9
 
 
 
 
 
f7fb413
b6c9ef9
 
 
 
 
f7fb413
9e750ac
b6c9ef9
 
 
 
f7fb413
b6c9ef9
f7fb413
b6c9ef9
f7fb413
b6c9ef9
 
 
 
 
 
f7fb413
b6c9ef9
9e750ac
b6c9ef9
9e750ac
 
b6c9ef9
f7fb413
b6c9ef9
 
 
 
f7fb413
b6c9ef9
 
 
 
f7fb413
b6c9ef9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f7fb413
b6c9ef9
 
 
 
 
 
f7fb413
b6c9ef9
 
 
 
 
f7fb413
9e750ac
 
b6c9ef9
9e750ac
 
 
b6c9ef9
f7fb413
b6c9ef9
 
 
 
f7fb413
b6c9ef9
 
 
 
 
 
 
f7fb413
b6c9ef9
 
 
 
402ea25
b6c9ef9
08b14e1
b6c9ef9
 
 
402ea25
b6c9ef9
f7fb413
 
 
 
 
 
 
b6c9ef9
 
 
f7fb413
b6c9ef9
f7fb413
b6c9ef9
f7fb413
b6c9ef9
 
 
 
 
 
 
 
f7fb413
b6c9ef9
f7fb413
b6c9ef9
9e750ac
b6c9ef9

"""
Hugging Face Space: DTLN Voice Denoising
Real-time speech denoising optimized for edge deployment
"""

import gradio as gr
import numpy as np
import soundfile as sf
import tempfile
import os
from scipy import signal
import zipfile
from pathlib import Path

# Note: In production, you would load a trained model
# For this demo, we'll use a simple spectral subtraction approach

def spectral_subtraction_denoise(audio, sample_rate, noise_reduction_db=10,
                                 noise_estimate_seconds=0.3):
    """
    Simple spectral subtraction for demonstration
    In production, this would use the trained DTLN model

    The leading part of the clip is treated as noise-only: its mean
    magnitude spectrum is scaled and subtracted from every frame, with a
    floor at 10% of the original magnitude. The original phase is reused.

    Args:
        audio: Input audio array (1-D)
        sample_rate: Sampling rate in Hz
        noise_reduction_db: Amount of noise reduction in dB; converted to a
            linear over-subtraction factor 10 ** (dB / 20)
        noise_estimate_seconds: Length of the leading segment (in seconds)
            used to estimate the noise spectrum

    Returns:
        Denoised audio array with the same length as the input,
        peak-normalized to 0.95
    """
    audio = np.asarray(audio)
    if audio.size == 0:
        # Nothing to transform; scipy cannot build an STFT from zero samples.
        return audio.astype(np.float64)

    # Clamp the segment length for clips shorter than one 512-sample frame
    # and reuse the same value for the inverse transform, so both directions
    # always agree (the inferred default is wrong for odd segment lengths).
    nperseg = min(512, len(audio))

    # Compute STFT; frame_times holds the centre time of each frame in seconds
    _, frame_times, Zxx = signal.stft(audio, fs=sample_rate, nperseg=nperseg)

    # Estimate noise from the frames within the first `noise_estimate_seconds`.
    # Always keep at least one frame: averaging an empty slice yields NaN,
    # which would poison the whole output.
    noise_frames = max(
        1, int(np.searchsorted(frame_times, noise_estimate_seconds, side='right'))
    )
    noise_estimate = np.mean(np.abs(Zxx[:, :noise_frames]), axis=1, keepdims=True)

    # Spectral subtraction
    magnitude = np.abs(Zxx)
    phase = np.angle(Zxx)

    # Subtract noise estimate (with floor at 10% of the original magnitude
    # so bins are attenuated rather than zeroed)
    alpha = 10 ** (noise_reduction_db / 20)
    magnitude_cleaned = np.maximum(magnitude - alpha * noise_estimate, 0.1 * magnitude)

    # Reconstruct complex spectrum
    Zxx_cleaned = magnitude_cleaned * np.exp(1j * phase)

    # Inverse STFT
    _, audio_cleaned = signal.istft(Zxx_cleaned, fs=sample_rate, nperseg=nperseg)

    # Ensure output length matches input (trim or pad if needed)
    if len(audio_cleaned) > len(audio):
        audio_cleaned = audio_cleaned[:len(audio)]
    elif len(audio_cleaned) < len(audio):
        audio_cleaned = np.pad(audio_cleaned, (0, len(audio) - len(audio_cleaned)), mode='constant')

    # Normalize (epsilon avoids division by zero on silent output)
    audio_cleaned = audio_cleaned / (np.max(np.abs(audio_cleaned)) + 1e-8) * 0.95

    return audio_cleaned


def process_audio(audio_file, noise_reduction):
    """
    Process uploaded audio file

    Loads the file, downmixes to mono, resamples to 16 kHz, peak-normalizes,
    denoises it and builds a human-readable report.

    Args:
        audio_file: Path to uploaded audio file (or None if nothing uploaded)
        noise_reduction: Noise reduction strength (0-20 dB)

    Returns:
        Tuple of (audio_output, info):
        - audio_output: (sample_rate, denoised float32 array) on success,
          None when no file was given or processing failed
        - info: Status / report text to show to the user
    """
    if audio_file is None:
        return None, "Please upload an audio file"

    try:
        # Load audio
        audio, sample_rate = sf.read(audio_file)

        # Convert to mono if stereo
        if audio.ndim > 1:
            audio = np.mean(audio, axis=1)

        # An empty file would otherwise fail below with a cryptic numpy
        # "zero-size array" reduction error.
        if len(audio) == 0:
            return None, "❌ Error processing audio: the file contains no audio samples"

        # Resample to 16kHz if needed (DTLN's native sample rate)
        if sample_rate != 16000:
            num_samples = int(len(audio) * 16000 / sample_rate)
            audio = signal.resample(audio, num_samples)
            sample_rate = 16000

        # Normalize input
        audio = audio / (np.max(np.abs(audio)) + 1e-8) * 0.95

        # Apply denoising
        # Note: In production, this would use the trained DTLN model
        denoised = spectral_subtraction_denoise(audio, sample_rate, noise_reduction)

        # Calculate improvement metrics.
        # The reported figure is the ratio of input power to the power of the
        # removed component (input - output), expressed in dB.
        noise = audio - denoised
        signal_power = np.mean(audio ** 2)
        noise_power = np.mean(noise ** 2)
        snr_improvement = 10 * np.log10(signal_power / (noise_power + 1e-10))

        info = f"""
        ✅ Processing Complete!

        📊 Audio Info:
        - Duration: {len(audio)/sample_rate:.2f}s
        - Sample Rate: {sample_rate} Hz
        - Length: {len(audio):,} samples

        📈 Quality Metrics:
        - SNR Improvement: {snr_improvement:.2f} dB
        - Noise Reduction: {noise_reduction} dB

        ⚠️ Note: This demo uses spectral subtraction for demonstration.
        The actual DTLN model provides superior quality when trained!
        """

        return (sample_rate, denoised.astype(np.float32)), info

    except Exception as e:
        # UI boundary: report any failure as text instead of crashing the app
        return None, f"❌ Error processing audio: {str(e)}"


def generate_demo_audio():
    """
    Generate demo noisy audio

    Builds a 3-second, 16 kHz synthetic "speech" signal (three harmonics
    at 200/400/600 Hz with a 2 Hz amplitude envelope), adds Gaussian noise,
    peak-normalizes it and writes it to a temporary WAV file.

    Returns:
        Path to the generated WAV file. The file is not deleted
        automatically; the caller owns it.
    """
    sample_rate = 16000
    duration = 3.0
    num_samples = int(duration * sample_rate)
    # Sample instants spaced exactly 1 / sample_rate apart. (linspace with the
    # endpoint included would stretch the spacing to duration / (N - 1).)
    t = np.arange(num_samples) / sample_rate

    # Generate synthetic speech
    speech = (
        0.3 * np.sin(2 * np.pi * 200 * t) +
        0.2 * np.sin(2 * np.pi * 400 * t) +
        0.15 * np.sin(2 * np.pi * 600 * t)
    )

    # Add speech-like envelope
    envelope = 0.5 + 0.5 * np.sin(2 * np.pi * 2 * t)
    speech = speech * envelope

    # Add noise
    noise = np.random.randn(len(t)) * 0.2
    noisy = speech + noise

    # Normalize
    noisy = noisy / (np.max(np.abs(noisy)) + 1e-8) * 0.95

    # Reserve a temporary file name, closing the handle right away so the
    # descriptor is not leaked and the path can be reopened for writing.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
        output_path = temp_file.name
    sf.write(output_path, noisy.astype(np.float32), sample_rate)

    return output_path


def start_training(clean_zip, noise_zip, epochs, batch_size, lstm_units):
    """
    Inspect uploaded datasets and return training instructions

    No training is performed here: both archives are extracted into a
    scratch directory, the WAV files are counted, and a text report with the
    commands to run training elsewhere is returned.

    Args:
        clean_zip: Uploaded ZIP file with clean speech
        noise_zip: Uploaded ZIP file with noise samples
        epochs: Number of training epochs
        batch_size: Batch size
        lstm_units: Number of LSTM units

    Returns:
        Status message (report on success, error text on failure)
    """
    if clean_zip is None or noise_zip is None:
        return "❌ Please upload both clean speech and noise datasets as ZIP files"

    try:
        # The extracted data is only needed for counting, so use a scratch
        # directory that is removed as soon as the block exits (even on
        # error) instead of leaking it on disk.
        with tempfile.TemporaryDirectory() as temp_dir:
            clean_dir = os.path.join(temp_dir, 'clean')
            noise_dir = os.path.join(temp_dir, 'noise')
            os.makedirs(clean_dir, exist_ok=True)
            os.makedirs(noise_dir, exist_ok=True)

            # Extract ZIP files
            # NOTE(review): archives come from untrusted uploads; extraction
            # size is not limited here (zip-bomb risk) — consider a cap.
            with zipfile.ZipFile(clean_zip, 'r') as zip_ref:
                zip_ref.extractall(clean_dir)

            with zipfile.ZipFile(noise_zip, 'r') as zip_ref:
                zip_ref.extractall(noise_dir)

            # Count files (recursively, *.wav only)
            clean_files = list(Path(clean_dir).glob('**/*.wav'))
            noise_files = list(Path(noise_dir).glob('**/*.wav'))

        status = f"""
        📦 Dataset Extracted Successfully!

        📊 Dataset Info:
        - Clean speech files: {len(clean_files)}
        - Noise files: {len(noise_files)}
        - Training epochs: {epochs}
        - Batch size: {batch_size}
        - LSTM units: {lstm_units}

        ⚠️ Training on Hugging Face Spaces:

        Due to the computational requirements and limited resources on Hugging Face Spaces,
        training cannot be run directly in this demo environment.

        📥 To train your own model:

        1. Download the training files from the "Files" tab:
           - train_dtln.py
           - dtln_ethos_u55.py
           - convert_to_tflite.py

        2. Run training locally or on a GPU instance:

           ```bash
           python train_dtln.py \\
               --clean-dir ./data/clean_speech \\
               --noise-dir ./data/noise \\
               --epochs {epochs} \\
               --batch-size {batch_size} \\
               --lstm-units {lstm_units}
           ```

        3. Convert to TFLite INT8:

           ```bash
           python convert_to_tflite.py \\
               --model ./models/best_model.h5 \\
               --output ./models/dtln.tflite \\
               --calibration-dir ./data/clean_speech
           ```

        💡 Recommended Training Environment:
        - GPU: NVIDIA RTX 3060 or better
        - RAM: 16GB+
        - Storage: 10GB+ for datasets
        - Time: 2-4 hours for 50 epochs

        For detailed instructions, see the deployment guide in the Files tab!
        """

        return status

    except Exception as e:
        # UI boundary: bad archives etc. are reported as text
        return f"❌ Error processing datasets: {str(e)}"


# Custom CSS passed to gr.Blocks(css=...) below: sets the page font, gives
# buttons a green gradient (reversed on hover) and centres/limits the width
# of the element with id "component-0".
# NOTE(review): "#component-0" and ".gr-button" look like Gradio-generated
# selectors — confirm they still match the installed Gradio version.
custom_css = """
.gradio-container {
    font-family: 'IBM Plex Sans', sans-serif;
}
.gr-button {
    background: linear-gradient(90deg, #4CAF50, #45a049);
    border: none;
}
.gr-button:hover {
    background: linear-gradient(90deg, #45a049, #4CAF50);
}
#component-0 {
    max-width: 1200px;
    margin: auto;
    padding: 20px;
}
"""

# Build Gradio interface
# Layout: intro Markdown, two tabs ("Demo" and "Training"), three collapsed
# accordions with static documentation, the button event wiring, and a footer.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎙️ DTLN Voice Denoising

    Real-time speech enhancement optimized for edge deployment with **TensorFlow Lite**.

    ### 🚀 Features:
    - **Optimized for Edge AI**: Lightweight model with <100KB size
    - **Real-time Processing**: Low latency for streaming audio
    - **INT8 Quantization**: Efficient deployment with 8-bit precision
    - **TensorFlow Lite**: Ready for microcontroller deployment

    ---
    """)

    with gr.Tabs():
        # Demo Tab: upload (or generate) noisy audio on the left,
        # denoised audio and a text report on the right
        with gr.Tab("🎵 Demo"):
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### 📤 Input")
                    # type="filepath": the handler receives a path on disk,
                    # which process_audio reads with soundfile
                    audio_input = gr.Audio(
                        label="Upload Noisy Audio",
                        type="filepath"
                    )

                    # Passed to process_audio as the noise reduction strength
                    noise_reduction = gr.Slider(
                        minimum=0,
                        maximum=20,
                        value=10,
                        step=1,
                        label="Noise Reduction Strength (dB)",
                        info="Higher values remove more noise but may affect speech quality"
                    )

                    with gr.Row():
                        process_btn = gr.Button("🔄 Denoise Audio", variant="primary", size="lg")
                        demo_btn = gr.Button("🎵 Try Demo Audio", variant="secondary")

                with gr.Column():
                    gr.Markdown("### 📥 Output")
                    # type="numpy": filled with the (sample_rate, array) tuple
                    # returned by process_audio
                    audio_output = gr.Audio(
                        label="Denoised Audio",
                        type="numpy"
                    )

                    info_output = gr.Textbox(
                        label="Processing Info",
                        lines=12,
                        max_lines=12
                    )

        # Training Tab: dataset uploads plus hyper-parameter sliders; the
        # button only produces instructions (see start_training), it does
        # not train anything
        with gr.Tab("🔬 Training"):
            gr.Markdown("""
            ### Train Your Own DTLN Model

            Upload your datasets and configure training parameters.

            ⚠️ **Note**: Training requires significant computational resources and cannot run
            directly on Hugging Face Spaces. This interface helps you prepare your data and
            provides the exact commands to run training locally.
            """)

            with gr.Row():
                with gr.Column():
                    gr.Markdown("#### 📦 Datasets")

                    clean_upload = gr.File(
                        label="Clean Speech Dataset (ZIP)",
                        file_types=[".zip"],
                        type="filepath"
                    )
                    gr.Markdown("*Upload a ZIP file containing clean speech WAV files*")

                    noise_upload = gr.File(
                        label="Noise Dataset (ZIP)",
                        file_types=[".zip"],
                        type="filepath"
                    )
                    gr.Markdown("*Upload a ZIP file containing noise WAV files*")

                with gr.Column():
                    gr.Markdown("#### ⚙️ Training Parameters")

                    epochs_slider = gr.Slider(
                        minimum=10,
                        maximum=200,
                        value=50,
                        step=10,
                        label="Training Epochs"
                    )

                    batch_slider = gr.Slider(
                        minimum=4,
                        maximum=64,
                        value=16,
                        step=4,
                        label="Batch Size"
                    )

                    lstm_slider = gr.Slider(
                        minimum=64,
                        maximum=256,
                        value=128,
                        step=32,
                        label="LSTM Units"
                    )

                    train_btn = gr.Button("📊 Prepare Training", variant="primary", size="lg")

            training_output = gr.Textbox(
                label="Training Instructions",
                lines=25,
                max_lines=30
            )

    # About section (static documentation, collapsed by default)
    with gr.Accordion("📖 About This Model", open=False):
        gr.Markdown("""
        ### DTLN Architecture

        **Dual-signal Transformation LSTM Network** is a real-time speech enhancement model:

        - **Two-stage processing**: Magnitude estimation → Final enhancement
        - **LSTM-based**: Captures temporal dependencies in speech
        - **<1M parameters**: Lightweight for edge deployment
        - **Frequency + Time domain**: Processes both domains for better quality

        ### Edge Hardware Acceleration

        Compatible with various edge AI accelerators:
        - **NPU**: Arm Ethos-U series
        - **CPU**: ARM Cortex-M series
        - **Quantization**: 8-bit and 16-bit integer operations
        - **Memory**: Optimized for constrained devices

        ### Performance Targets

        | Metric | Value |
        |--------|-------|
        | Model Size | ~100 KB (INT8) |
        | Latency | 3-6 ms |
        | Power | 30-40 mW |
        | SNR Improvement | 10-15 dB |

        ---

        ⚠️ **Demo Note**: This Space uses spectral subtraction for demonstration.
        Download the full implementation to train and deploy the actual DTLN model!
        """)

    # Deployment guide section (static documentation, collapsed by default)
    with gr.Accordion("🛠️ Training & Deployment Guide", open=False):
        gr.Markdown("""
        ### Quick Start

        ```bash
        # 1. Install dependencies
        pip install -r requirements.txt

        # 2. Train model
        python train_dtln.py \\
            --clean-dir ./data/clean_speech \\
            --noise-dir ./data/noise \\
            --epochs 50 \\
            --batch-size 16

        # 3. Convert to TFLite INT8
        python convert_to_tflite.py \\
            --model ./models/best_model.h5 \\
            --output ./models/dtln_ethos_u55.tflite \\
            --calibration-dir ./data/clean_speech

        # 4. (Optional) Optimize for hardware accelerator
        vela --accelerator-config ethos-u55-256 \\
             --system-config Ethos_U55_High_End_Embedded \\
             ./models/dtln_ethos_u55.tflite
        ```

        ### Download Full Implementation

        The complete training and deployment code is available in the Files tab →

        Includes:
        - `dtln_ethos_u55.py` - Model architecture
        - `train_dtln.py` - Training with QAT
        - `convert_to_tflite.py` - TFLite conversion
        - `alif_e7_voice_denoising_guide.md` - Complete guide
        - `example_usage.py` - Usage examples

        ### Resources

        - [TensorFlow Lite Micro](https://www.tensorflow.org/lite/microcontrollers)
        - [Arm Ethos-U NPU](https://developer.arm.com/ip-products/processors/machine-learning/arm-ethos-u)
        - [DTLN Paper (Interspeech 2020)](https://arxiv.org/abs/2005.07551)
        """)

    # Tech specs section (static documentation, collapsed by default)
    with gr.Accordion("⚙️ Technical Specifications", open=False):
        gr.Markdown("""
        ### Model Architecture Details

        **Input**: Raw audio waveform @ 16kHz
        - Frame length: 512 samples (32ms)
        - Frame shift: 128 samples (8ms)
        - Frequency bins: 257 (FFT size 512)

        **Network Structure**:
        ```
        Input Audio (16kHz)
            ↓
        STFT (512-point)
            ↓
        [Stage 1]
        LSTM (128 units) → Dense (sigmoid) → Magnitude Mask 1
            ↓
        Enhanced Magnitude 1
            ↓
        [Stage 2]
        LSTM (128 units) → Dense (sigmoid) → Magnitude Mask 2
            ↓
        Enhanced Magnitude
            ↓
        ISTFT
            ↓
        Output Audio (16kHz)
        ```

        **Training Configuration**:
        - Loss: Combined time + frequency domain MSE
        - Optimizer: Adam (lr=0.001)
        - Batch size: 16
        - Epochs: 50
        - Quantization: INT8 post-training quantization

        **Memory Footprint**:
        - Model weights: ~80 KB (INT8)
        - Tensor arena: ~100 KB
        - Audio buffers: ~2 KB
        - **Total**: ~200 KB

        ### Edge Device Deployment

        **Hardware Utilization**:
        - NPU/CPU: For LSTM inference
        - CPU: For FFT operations (CMSIS-DSP)
        - Memory: Optimized buffer management
        - Peripherals: I2S/PDM for audio I/O

        **Power Profile**:
        - Active inference: 30-40 mW
        - Idle: <1 mW
        - Average (50% duty): ~15-20 mW

        **Real-time Constraints**:
        - Frame processing: 8ms available
        - FFT: ~1ms
        - NPU inference: ~4ms
        - IFFT + overhead: ~2ms
        - **Margin**: ~1ms
        """)

    # Event handlers
    # Denoise: (uploaded path, strength) -> (denoised audio, report text)
    process_btn.click(
        fn=process_audio,
        inputs=[audio_input, noise_reduction],
        outputs=[audio_output, info_output]
    )

    # Demo audio: the generated WAV path is loaded into the input widget,
    # so the user still has to press "Denoise Audio" afterwards
    demo_btn.click(
        fn=generate_demo_audio,
        inputs=[],
        outputs=[audio_input]
    )

    # Prepare training: both ZIP uploads plus the three sliders -> instructions
    train_btn.click(
        fn=start_training,
        inputs=[clean_upload, noise_upload, epochs_slider, batch_slider, lstm_slider],
        outputs=[training_output]
    )

    # Footer
    gr.Markdown("""
    ---

    ### 📚 Citation

    If you use this model in your research, please cite:

    ```bibtex
    @inproceedings{westhausen2020dtln,
      title={Dual-signal transformation LSTM network for real-time noise suppression},
      author={Westhausen, Nils L and Meyer, Bernd T},
      booktitle={Interspeech},
      year={2020}
    }
    ```

    ---

    <div style="text-align: center; color: #666;">
        Built for <b>Edge AI</b> • Optimized for <b>Microcontrollers</b> •
        <a href="https://github.com/breizhn/DTLN">Original DTLN</a>
    </div>
    """)

# Launch configuration
# Start the Gradio server with default settings only when this file is run
# as a script; importing the module just builds `demo` without serving it.
if __name__ == "__main__":
    demo.launch()