Luigi committed on
Commit
ed290ee
·
0 Parent(s):

Clean Spaces deployment - Gradio interface only

Browse files

Contains only the essential files for HuggingFace Spaces:
- app.py: Gradio web interface
- requirements.txt: Python dependencies
- README.md: Spaces documentation

Removed all training code, examples, and binary files for clean deployment.

Files changed (3) hide show
  1. README.md +52 -0
  2. app.py +304 -0
  3. requirements.txt +22 -0
README.md ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: ZipVoice
3
+ emoji: 🎵
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: "4.0.0"
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ ---
12
+
13
+ # ZipVoice - Zero-Shot Text-to-Speech
14
+
15
+ A Gradio web interface for ZipVoice, enabling easy voice cloning and text-to-speech synthesis through your browser.
16
+
17
+ ## Features
18
+
19
+ - 🎵 Zero-shot voice cloning with audio prompts
20
+ - 🌐 Multi-lingual support (Chinese & English)
21
+ - ⚡ Fast inference with flow matching
22
+ - 🎛️ Interactive web UI
23
+ - 📱 Mobile-friendly interface
24
+
25
+ ## Usage
26
+
27
+ 1. Enter text to synthesize
28
+ 2. Upload a short audio prompt (1-3 seconds recommended)
29
+ 3. Provide the transcription of the prompt audio
30
+ 4. Choose your preferred model and speed
31
+ 5. Click "Generate Speech"!
32
+
33
+ ## Models
34
+
35
+ - **zipvoice**: Higher quality synthesis
36
+ - **zipvoice_distill**: Faster inference
37
+
38
+ ## Tips for Best Results
39
+
40
+ - Use short, clear audio prompts (1-3 seconds)
41
+ - Ensure transcription exactly matches the audio
42
+ - Try different speed settings
43
+ - Both Chinese and English text are supported
44
+
45
+ ## Technical Details
46
+
47
+ - **Backend**: PyTorch with HuggingFace integration
48
+ - **Vocoder**: Vocos for high-quality audio
49
+ - **Architecture**: Flow matching for fast TTS
50
+ - **Models**: Automatically downloaded from HuggingFace
51
+
52
+ For more information, visit the [GitHub repository](https://github.com/k2-fsa/ZipVoice).
app.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ ZipVoice Gradio Web Interface for HuggingFace Spaces
4
+ """
5
+
6
+ import os
7
+ import tempfile
8
+ import gradio as gr
9
+ import torch
10
+ from pathlib import Path
11
+
12
+ # Import ZipVoice components
13
+ from zipvoice.models.zipvoice import ZipVoice
14
+ from zipvoice.models.zipvoice_distill import ZipVoiceDistill
15
+ from zipvoice.tokenizer.tokenizer import EmiliaTokenizer
16
+ from zipvoice.utils.checkpoint import load_checkpoint
17
+ from zipvoice.utils.feature import VocosFbank
18
+ from zipvoice.bin.infer_zipvoice import generate_sentence
19
+ from lhotse.utils import fix_random_seed
20
+
21
+ # Global variables for caching models
22
+ _models_cache = {}
23
+ _tokenizer_cache = None
24
+ _vocoder_cache = None
25
+ _feature_extractor_cache = None
26
+
27
+
28
def load_models_and_components(model_name: str):
    """Load and cache the model, tokenizer, vocoder, and feature extractor.

    The model checkpoint, its JSON config, and the token file are downloaded
    from the ``k2-fsa/ZipVoice`` HuggingFace repo the first time a given
    ``model_name`` is requested. Later calls reuse the cached objects.

    Args:
        model_name: Either ``"zipvoice"`` or ``"zipvoice_distill"``.

    Returns:
        A 5-tuple ``(model, tokenizer, vocoder, feature_extractor,
        sampling_rate)``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported models.
    """
    global _models_cache, _tokenizer_cache, _vocoder_cache, _feature_extractor_cache

    # Model directory mapping (sub-directory of the HuggingFace repo).
    model_dir_map = {
        "zipvoice": "zipvoice",
        "zipvoice_distill": "zipvoice_distill",
    }
    if model_name not in model_dir_map:
        raise ValueError(
            f"Unknown model '{model_name}'; expected one of {sorted(model_dir_map)}"
        )

    # Set device (CPU for Spaces, but could be adapted for GPU)
    device = torch.device("cpu")

    if model_name not in _models_cache:
        print(f"Loading {model_name} model...")

        huggingface_repo = "k2-fsa/ZipVoice"
        model_dir = model_dir_map[model_name]

        # Download model files from HuggingFace
        from huggingface_hub import hf_hub_download

        model_ckpt = hf_hub_download(
            huggingface_repo, filename=f"{model_dir}/model.pt"
        )
        model_config_path = hf_hub_download(
            huggingface_repo, filename=f"{model_dir}/model.json"
        )
        token_file = hf_hub_download(
            huggingface_repo, filename=f"{model_dir}/tokens.txt"
        )

        # Load tokenizer (cache it).
        # NOTE(review): a single tokenizer is shared by both models, built
        # from whichever model's tokens.txt was loaded first -- this assumes
        # the two token files are identical; confirm against the repo.
        if _tokenizer_cache is None:
            _tokenizer_cache = EmiliaTokenizer(token_file=token_file)
        tokenizer = _tokenizer_cache
        tokenizer_config = {
            "vocab_size": tokenizer.vocab_size,
            "pad_id": tokenizer.pad_id,
        }

        # Load model configuration
        import json
        with open(model_config_path, "r") as f:
            model_config = json.load(f)

        # Create model
        if model_name == "zipvoice":
            model = ZipVoice(**model_config["model"], **tokenizer_config)
        else:
            model = ZipVoiceDistill(**model_config["model"], **tokenizer_config)

        # Load model weights
        load_checkpoint(filename=model_ckpt, model=model, strict=True)
        model = model.to(device)
        model.eval()

        # Cache the sampling rate together with the model: the config is only
        # parsed on the first load, so reading it on a cache hit would fail.
        _models_cache[model_name] = (
            model,
            model_config["feature"]["sampling_rate"],
        )

    # Load vocoder (cache it)
    if _vocoder_cache is None:
        from vocos import Vocos
        _vocoder_cache = Vocos.from_pretrained("charactr/vocos-mel-24khz")
        _vocoder_cache = _vocoder_cache.to(device)
        _vocoder_cache.eval()

    # Load feature extractor (cache it)
    if _feature_extractor_cache is None:
        _feature_extractor_cache = VocosFbank()

    cached_model, sampling_rate = _models_cache[model_name]
    return (cached_model, _tokenizer_cache,
            _vocoder_cache, _feature_extractor_cache,
            sampling_rate)
97
+
98
+
99
def synthesize_speech_gradio(
    text: str,
    prompt_audio_file,
    prompt_text: str,
    model_name: str,
    speed: float
):
    """Synthesize speech using ZipVoice for the Gradio interface.

    Args:
        text: Text to synthesize.
        prompt_audio_file: Raw bytes of the uploaded prompt audio, or ``None``
            when nothing was uploaded.
        prompt_text: Transcription of the prompt audio.
        model_name: Model identifier passed to ``load_models_and_components``.
        speed: Speed factor forwarded to ``generate_sentence``.

    Returns:
        A ``(audio_bytes, status_message)`` tuple. ``audio_bytes`` is ``None``
        when validation or synthesis fails; the message then describes the
        error. Exceptions raised during synthesis are caught and reported
        through the status message rather than propagated.
    """
    if not text or not text.strip():
        return None, "Error: Please enter text to synthesize."

    if prompt_audio_file is None:
        return None, "Error: Please upload a prompt audio file."

    if not prompt_text or not prompt_text.strip():
        return None, "Error: Please enter the transcription of the prompt audio."

    # Tracked outside the try block so the finally clause can always clean up.
    temp_audio_path = None
    output_path = None

    try:
        # Set random seed for reproducibility
        fix_random_seed(666)

        # Load models and components
        model, tokenizer, vocoder, feature_extractor, sampling_rate = load_models_and_components(model_name)

        device = torch.device("cpu")

        # Save uploaded audio to a temporary file. Write through the open
        # handle instead of re-opening the path while it is still held open.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
            temp_audio_path = temp_audio.name
            temp_audio.write(prompt_audio_file)

        # Reserve a temporary output path for the generated audio.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_output:
            output_path = temp_output.name

        print(f"Synthesizing: '{text}' using {model_name}")
        print(f"Prompt: {prompt_text}")
        print(f"Speed: {speed}")

        # Generate speech
        with torch.inference_mode():
            metrics = generate_sentence(
                save_path=output_path,
                prompt_text=prompt_text,
                prompt_wav=temp_audio_path,
                text=text,
                model=model,
                vocoder=vocoder,
                tokenizer=tokenizer,
                feature_extractor=feature_extractor,
                device=device,
                num_step=16 if model_name == "zipvoice" else 8,
                guidance_scale=1.0 if model_name == "zipvoice" else 3.0,
                speed=speed,
                t_shift=0.5,
                target_rms=0.1,
                feat_scale=0.1,
                sampling_rate=sampling_rate,
                max_duration=100,
                remove_long_sil=False,
            )

        # Read the generated audio file
        with open(output_path, "rb") as f:
            audio_data = f.read()

        success_msg = f"Synthesis completed! Duration: {metrics['wav_seconds']:.2f}s, RTF: {metrics['rtf']:.2f}"
        return audio_data, success_msg

    except Exception as e:
        # Top-level UI boundary: report the failure in the status box.
        error_msg = f"Error during synthesis: {str(e)}"
        print(error_msg)
        return None, error_msg

    finally:
        # Clean up temporary files on both success and failure paths.
        for leftover in (temp_audio_path, output_path):
            if leftover is None:
                continue
            try:
                os.unlink(leftover)
            except OSError as cleanup_err:
                # Best-effort cleanup: never mask the real result or error.
                print(f"Warning: could not remove temporary file {leftover}: {cleanup_err}")
177
+
178
+
179
def create_gradio_interface():
    """Create the Gradio web interface.

    Builds a ``gr.Blocks`` layout with text, model, speed, prompt-audio and
    prompt-transcription inputs, wires the generate button to
    ``synthesize_speech_gradio``, and returns the (not yet launched) app.

    Returns:
        The constructed ``gr.Blocks`` instance.
    """

    # Custom CSS for better styling
    css = """
    .gradio-container {
        max-width: 1200px;
        margin: auto;
    }
    .title {
        text-align: center;
        color: #2563eb;
        font-size: 2.5em;
        font-weight: bold;
        margin-bottom: 1em;
    }
    .subtitle {
        text-align: center;
        color: #64748b;
        font-size: 1.2em;
        margin-bottom: 2em;
    }
    """

    with gr.Blocks(title="ZipVoice - Zero-Shot Text-to-Speech", css=css) as interface:

        gr.HTML("""
        <div class="title">🎵 ZipVoice</div>
        <div class="subtitle">Fast and High-Quality Zero-Shot Text-to-Speech with Flow Matching</div>
        """)

        with gr.Row():
            with gr.Column(scale=2):
                text_input = gr.Textbox(
                    label="Text to Synthesize",
                    placeholder="Enter the text you want to convert to speech...",
                    lines=3,
                    value="這是一則語音測試"
                )

                with gr.Row():
                    model_dropdown = gr.Dropdown(
                        choices=["zipvoice", "zipvoice_distill"],
                        value="zipvoice",
                        label="Model",
                        info="zipvoice_distill is faster but slightly less accurate"
                    )

                    speed_slider = gr.Slider(
                        minimum=0.5,
                        maximum=2.0,
                        value=1.0,
                        step=0.1,
                        label="Speed",
                        info="1.0 = normal speed, >1.0 = faster, <1.0 = slower"
                    )

                # gr.File is given no `info` argument here: the hint is
                # rendered as Markdown right below the upload widget so the
                # layout builds even where File rejects that keyword.
                prompt_audio = gr.File(
                    label="Prompt Audio",
                    file_types=["audio"],
                    type="binary"
                )
                gr.Markdown(
                    "Upload a short audio clip (1-3 seconds recommended) to mimic the voice style"
                )

                prompt_text = gr.Textbox(
                    label="Prompt Transcription",
                    placeholder="Enter the exact transcription of the prompt audio...",
                    lines=2,
                    info="This should match what is spoken in the audio file"
                )

                generate_btn = gr.Button(
                    "🎵 Generate Speech",
                    variant="primary",
                    size="lg"
                )

            with gr.Column(scale=1):
                output_audio = gr.Audio(
                    label="Generated Speech",
                    type="filepath"
                )

                status_text = gr.Textbox(
                    label="Status",
                    interactive=False,
                    lines=3
                )

        # Examples pre-fill the text fields only; the prompt audio slot is
        # left empty (None) and must still be uploaded by the user.
        gr.Examples(
            examples=[
                ["Hello world! This is a test of ZipVoice.", None, "Hello world! This is a test.", "zipvoice", 1.0],
                ["今天天氣真好,我們去公園散步吧!", None, "今天天氣真好", "zipvoice", 1.0],
                ["The quick brown fox jumps over the lazy dog.", None, "The quick brown fox", "zipvoice_distill", 1.2],
            ],
            inputs=[text_input, prompt_audio, prompt_text, model_dropdown, speed_slider],
            label="Quick Examples"
        )

        # Event handling
        generate_btn.click(
            fn=synthesize_speech_gradio,
            inputs=[text_input, prompt_audio, prompt_text, model_dropdown, speed_slider],
            outputs=[output_audio, status_text]
        )

        # Footer
        gr.HTML("""
        <div style="text-align: center; margin-top: 2em; color: #64748b; font-size: 0.9em;">
        <p>Powered by <a href="https://github.com/k2-fsa/ZipVoice" target="_blank">ZipVoice</a> |
        Built with <a href="https://gradio.app" target="_blank">Gradio</a></p>
        <p>Upload a short audio clip as prompt, and ZipVoice will synthesize speech in that voice style!</p>
        </div>
        """)

    return interface
295
+
296
+
297
if __name__ == "__main__":
    # Build the UI, then serve it on all network interfaces.
    interface = create_gradio_interface()

    # The port is read from the PORT environment variable, defaulting to 7860.
    listen_port = int(os.environ.get("PORT", 7860))

    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": listen_port,
        "show_error": True,
    }
    interface.launch(**launch_options)
requirements.txt ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --find-links https://k2-fsa.github.io/icefall/piper_phonemize.html
2
+
3
+ torch
4
+ torchaudio
5
+ numpy
6
+ lhotse
7
+ huggingface_hub
8
+ safetensors
9
+ tensorboard
10
+ vocos
11
+ pydub
12
+ gradio
13
+
14
+ # Normalization
15
+ cn2an
16
+ inflect
17
+
18
+ # Tokenization
19
+ jieba
20
+ piper_phonemize
21
+ pypinyin
22
+ setuptools<81