Spaces:

Luigi
/

ZipVoice-DEMO

Paused

Luigi committed on Sep 25

Commit

ed290ee

0 Parent(s):

Clean Spaces deployment - Gradio interface only

Contains only the essential files for HuggingFace Spaces:
- app.py: Gradio web interface
- requirements.txt: Python dependencies
- README.md: Spaces documentation

Removed all training code, examples, and binary files for clean deployment.

Files changed (3) hide show

README.md +52 -0
app.py +304 -0
requirements.txt +22 -0

README.md ADDED Viewed

	@@ -0,0 +1,52 @@

+---
+title: ZipVoice
+emoji: 🎵
+colorFrom: blue
+colorTo: green
+sdk: gradio
+sdk_version: "4.0.0"
+app_file: app.py
+pinned: false
+license: apache-2.0
+---
+# ZipVoice - Zero-Shot Text-to-Speech
+A Gradio web interface for ZipVoice, enabling easy voice cloning and text-to-speech synthesis through your browser.
+## Features
+- 🎵 Zero-shot voice cloning with audio prompts
+- 🌐 Multilingual support (Chinese & English)
+- ⚡ Fast inference with flow matching
+- 🎛️ Interactive web UI
+- 📱 Mobile-friendly interface
+## Usage
+1. Enter text to synthesize
+2. Upload a short audio prompt (1-3 seconds recommended)
+3. Provide the transcription of the prompt audio
+4. Choose your preferred model and speed
+5. Click "Generate Speech"!
+## Models
+- **zipvoice**: Higher quality synthesis
+- **zipvoice_distill**: Faster inference
+## Tips for Best Results
+- Use short, clear audio prompts (1-3 seconds)
+- Ensure transcription exactly matches the audio
+- Try different speed settings
+- Both Chinese and English text are supported
+## Technical Details
+- **Backend**: PyTorch with HuggingFace integration
+- **Vocoder**: Vocos for high-quality audio
+- **Architecture**: Flow matching for fast TTS
+- **Models**: Automatically downloaded from HuggingFace
+For more information, visit the [GitHub repository](https://github.com/k2-fsa/ZipVoice).

app.py ADDED Viewed

	@@ -0,0 +1,304 @@

+#!/usr/bin/env python3
+"""
+ZipVoice Gradio Web Interface for HuggingFace Spaces
+"""
+import os
+import tempfile
+import gradio as gr
+import torch
+from pathlib import Path
+# Import ZipVoice components
+from zipvoice.models.zipvoice import ZipVoice
+from zipvoice.models.zipvoice_distill import ZipVoiceDistill
+from zipvoice.tokenizer.tokenizer import EmiliaTokenizer
+from zipvoice.utils.checkpoint import load_checkpoint
+from zipvoice.utils.feature import VocosFbank
+from zipvoice.bin.infer_zipvoice import generate_sentence
+from lhotse.utils import fix_random_seed
# Process-wide caches so repeated requests reuse already-loaded components.
_models_cache = {}
# Sampling rate read from each model's model.json, keyed by model name. Kept
# separately so it is still available when the model itself is a cache hit.
_sampling_rate_cache = {}
_tokenizer_cache = None
_vocoder_cache = None
_feature_extractor_cache = None


def load_models_and_components(model_name: str):
    """Load and cache the model, tokenizer, vocoder, and feature extractor.

    The first call for a given ``model_name`` downloads ``model.pt``,
    ``model.json`` and ``tokens.txt`` from the ``k2-fsa/ZipVoice`` repository
    on the HuggingFace Hub, builds the model, and stores it in the module
    caches. Later calls for the same name return the cached objects.

    The tokenizer is built once, from the token file of whichever model is
    loaded first, and is then shared by every model.

    Args:
        model_name: Either ``"zipvoice"`` or ``"zipvoice_distill"``.

    Returns:
        A tuple ``(model, tokenizer, vocoder, feature_extractor,
        sampling_rate)``, where ``sampling_rate`` is the value of
        ``model.json["feature"]["sampling_rate"]`` for the requested model.

    Raises:
        ValueError: If ``model_name`` is not one of the supported models.
    """
    global _models_cache, _tokenizer_cache, _vocoder_cache, _feature_extractor_cache

    # Sub-directory of the Hub repository that holds each model's files.
    model_dir_map = {
        "zipvoice": "zipvoice",
        "zipvoice_distill": "zipvoice_distill",
    }
    if model_name not in model_dir_map:
        raise ValueError(
            f"Unknown model name: {model_name!r}; "
            f"expected one of {sorted(model_dir_map)}"
        )

    # Set device (CPU for Spaces, but could be adapted for GPU)
    device = torch.device("cpu")

    if model_name not in _models_cache:
        print(f"Loading {model_name} model...")
        huggingface_repo = "k2-fsa/ZipVoice"
        model_dir = model_dir_map[model_name]

        # Download model files from HuggingFace
        from huggingface_hub import hf_hub_download

        model_ckpt = hf_hub_download(
            huggingface_repo, filename=f"{model_dir}/model.pt"
        )
        model_config_path = hf_hub_download(
            huggingface_repo, filename=f"{model_dir}/model.json"
        )
        token_file = hf_hub_download(
            huggingface_repo, filename=f"{model_dir}/tokens.txt"
        )

        # Load tokenizer (cache it)
        if _tokenizer_cache is None:
            _tokenizer_cache = EmiliaTokenizer(token_file=token_file)
        tokenizer = _tokenizer_cache
        tokenizer_config = {
            "vocab_size": tokenizer.vocab_size,
            "pad_id": tokenizer.pad_id,
        }

        # Load model configuration
        import json

        with open(model_config_path, "r", encoding="utf-8") as f:
            model_config = json.load(f)
        sampling_rate = model_config["feature"]["sampling_rate"]

        # Create model
        if model_name == "zipvoice":
            model = ZipVoice(**model_config["model"], **tokenizer_config)
        else:
            model = ZipVoiceDistill(**model_config["model"], **tokenizer_config)

        # Load model weights
        load_checkpoint(filename=model_ckpt, model=model, strict=True)
        model = model.to(device)
        model.eval()

        # Publish to both caches only after loading has fully succeeded, so a
        # failed load never leaves a model cached without its sampling rate.
        _sampling_rate_cache[model_name] = sampling_rate
        _models_cache[model_name] = model

    # Load vocoder (cache it)
    if _vocoder_cache is None:
        from vocos import Vocos

        _vocoder_cache = Vocos.from_pretrained("charactr/vocos-mel-24khz")
        _vocoder_cache = _vocoder_cache.to(device)
        _vocoder_cache.eval()

    # Load feature extractor (cache it)
    if _feature_extractor_cache is None:
        _feature_extractor_cache = VocosFbank()

    return (
        _models_cache[model_name],
        _tokenizer_cache,
        _vocoder_cache,
        _feature_extractor_cache,
        _sampling_rate_cache[model_name],
    )
def synthesize_speech_gradio(
    text: str,
    prompt_audio_file,
    prompt_text: str,
    model_name: str,
    speed: float
):
    """Synthesize speech using ZipVoice for the Gradio interface.

    Args:
        text: Text to synthesize.
        prompt_audio_file: Raw bytes of the uploaded prompt audio (the UI
            component is declared with ``type="binary"``), or ``None`` when
            nothing was uploaded.
        prompt_text: Transcription of the prompt audio.
        model_name: ``"zipvoice"`` or ``"zipvoice_distill"``.
        speed: Speed factor forwarded to ``generate_sentence``.

    Returns:
        A tuple ``(audio_bytes, status_message)``. ``audio_bytes`` is the
        content of the generated WAV file, or ``None`` when validation or
        synthesis failed; in that case ``status_message`` describes the error.
        Exceptions raised during synthesis are caught and reported through the
        status message rather than propagated.
    """
    # Guard against None as well as empty/whitespace-only input.
    if not text or not text.strip():
        return None, "Error: Please enter text to synthesize."
    if prompt_audio_file is None:
        return None, "Error: Please upload a prompt audio file."
    if not prompt_text or not prompt_text.strip():
        return None, "Error: Please enter the transcription of the prompt audio."

    # Track temp paths outside the try so the finally clause can remove them
    # no matter where a failure happens.
    temp_audio_path = None
    output_path = None
    try:
        # Set random seed for reproducibility
        fix_random_seed(666)
        # Load models and components
        model, tokenizer, vocoder, feature_extractor, sampling_rate = load_models_and_components(model_name)
        device = torch.device("cpu")

        # Save uploaded audio to temporary file. Write through the handle we
        # already hold instead of re-opening the same path a second time.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
            temp_audio_path = temp_audio.name
            temp_audio.write(prompt_audio_file)

        # Create temporary output file (only the path is needed; the
        # synthesis call writes the audio to it).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_output:
            output_path = temp_output.name

        print(f"Synthesizing: '{text}' using {model_name}")
        print(f"Prompt: {prompt_text}")
        print(f"Speed: {speed}")

        # Generate speech
        with torch.inference_mode():
            metrics = generate_sentence(
                save_path=output_path,
                prompt_text=prompt_text,
                prompt_wav=temp_audio_path,
                text=text,
                model=model,
                vocoder=vocoder,
                tokenizer=tokenizer,
                feature_extractor=feature_extractor,
                device=device,
                num_step=16 if model_name == "zipvoice" else 8,
                guidance_scale=1.0 if model_name == "zipvoice" else 3.0,
                speed=speed,
                t_shift=0.5,
                target_rms=0.1,
                feat_scale=0.1,
                sampling_rate=sampling_rate,
                max_duration=100,
                remove_long_sil=False,
            )

        # Read the generated audio file
        with open(output_path, "rb") as f:
            audio_data = f.read()

        success_msg = f"Synthesis completed! Duration: {metrics['wav_seconds']:.2f}s, RTF: {metrics['rtf']:.2f}"
        return audio_data, success_msg
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        error_msg = f"Error during synthesis: {str(e)}"
        print(error_msg)
        return None, error_msg
    finally:
        # Clean up temporary files on success and on failure alike.
        for path in (temp_audio_path, output_path):
            if path is None:
                continue
            try:
                os.unlink(path)
            except OSError:
                # Best-effort cleanup: a file that is already gone or cannot
                # be removed must not mask the real result.
                pass
def create_gradio_interface():
    """Create the Gradio web interface.

    Builds a ``gr.Blocks`` layout with the synthesis inputs (text, model,
    speed, prompt audio, prompt transcription) in the left column and the
    generated audio, status box, and quick examples in the right column. The
    "Generate Speech" button is wired to ``synthesize_speech_gradio``.

    Returns:
        The constructed ``gr.Blocks`` instance; the caller launches it.
    """
    # Custom CSS for better styling
    css = """
    .gradio-container {
        max-width: 1200px;
        margin: auto;
    }
    .title {
        text-align: center;
        color: #2563eb;
        font-size: 2.5em;
        font-weight: bold;
        margin-bottom: 1em;
    }
    .subtitle {
        text-align: center;
        color: #64748b;
        font-size: 1.2em;
        margin-bottom: 2em;
    }
    """
    with gr.Blocks(title="ZipVoice - Zero-Shot Text-to-Speech", css=css) as interface:
        gr.HTML("""
        <div class="title">🎵 ZipVoice</div>
        <div class="subtitle">Fast and High-Quality Zero-Shot Text-to-Speech with Flow Matching</div>
        """)
        with gr.Row():
            with gr.Column(scale=2):
                text_input = gr.Textbox(
                    label="Text to Synthesize",
                    placeholder="Enter the text you want to convert to speech...",
                    lines=3,
                    value="這是一則語音測試"
                )
                with gr.Row():
                    model_dropdown = gr.Dropdown(
                        choices=["zipvoice", "zipvoice_distill"],
                        value="zipvoice",
                        label="Model",
                        info="zipvoice_distill is faster but slightly less accurate"
                    )
                    speed_slider = gr.Slider(
                        minimum=0.5,
                        maximum=2.0,
                        value=1.0,
                        step=0.1,
                        label="Speed",
                        info="1.0 = normal speed, >1.0 = faster, <1.0 = slower"
                    )
                # NOTE(review): `info` is not passed to gr.File because the
                # File component does not appear to accept that keyword in
                # Gradio 4.x - confirm against the installed version. The hint
                # is rendered as a Markdown line below the upload box instead.
                prompt_audio = gr.File(
                    label="Prompt Audio",
                    file_types=["audio"],
                    type="binary"
                )
                gr.Markdown(
                    "Upload a short audio clip (1-3 seconds recommended) to mimic the voice style"
                )
                prompt_text = gr.Textbox(
                    label="Prompt Transcription",
                    placeholder="Enter the exact transcription of the prompt audio...",
                    lines=2,
                    info="This should match what is spoken in the audio file"
                )
                generate_btn = gr.Button(
                    "🎵 Generate Speech",
                    variant="primary",
                    size="lg"
                )
            with gr.Column(scale=1):
                output_audio = gr.Audio(
                    label="Generated Speech",
                    type="filepath"
                )
                status_text = gr.Textbox(
                    label="Status",
                    interactive=False,
                    lines=3
                )
                # The examples pre-fill every input except the prompt audio
                # (None), which the user still has to upload.
                gr.Examples(
                    examples=[
                        ["Hello world! This is a test of ZipVoice.", None, "Hello world! This is a test.", "zipvoice", 1.0],
                        ["今天天氣真好，我們去公園散步吧！", None, "今天天氣真好", "zipvoice", 1.0],
                        ["The quick brown fox jumps over the lazy dog.", None, "The quick brown fox", "zipvoice_distill", 1.2],
                    ],
                    inputs=[text_input, prompt_audio, prompt_text, model_dropdown, speed_slider],
                    label="Quick Examples"
                )
        # Event handling
        generate_btn.click(
            fn=synthesize_speech_gradio,
            inputs=[text_input, prompt_audio, prompt_text, model_dropdown, speed_slider],
            outputs=[output_audio, status_text]
        )
        # Footer
        gr.HTML("""
        <div style="text-align: center; margin-top: 2em; color: #64748b; font-size: 0.9em;">
            <p>Powered by <a href="https://github.com/k2-fsa/ZipVoice" target="_blank">ZipVoice</a> |
            Built with <a href="https://gradio.app" target="_blank">Gradio</a></p>
            <p>Upload a short audio clip as prompt, and ZipVoice will synthesize speech in that voice style!</p>
        </div>
        """)
    return interface
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on all network
    # interfaces. The port comes from the PORT environment variable and
    # falls back to 7860 when it is unset.
    app = create_gradio_interface()
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": int(os.environ.get("PORT", 7860)),
        "show_error": True,
    }
    app.launch(**launch_options)

requirements.txt ADDED Viewed

	@@ -0,0 +1,22 @@

+--find-links https://k2-fsa.github.io/icefall/piper_phonemize.html
+torch
+torchaudio
+numpy
+lhotse
+huggingface_hub
+safetensors
+tensorboard
+vocos
+pydub
+gradio
+# Normalization
+cn2an
+inflect
+# Tokenization
+jieba
+piper_phonemize
+pypinyin
+setuptools<81