Spaces:
Paused
Paused
| #!/usr/bin/env python3 | |
| """ | |
| ZipVoice Gradio Web Interface for HuggingFace Spaces | |
| Updated for Gradio 5.47.0 compatibility | |
| """ | |
| import os | |
| import sys | |
| import json | |
| import tempfile | |
| import gradio as gr | |
| import torch | |
| from pathlib import Path | |
| import spaces | |
| import whisper | |
| # Add current directory to Python path for local zipvoice package | |
| sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) | |
| # Import ZipVoice components | |
| from zipvoice.models.zipvoice import ZipVoice | |
| from zipvoice.models.zipvoice_distill import ZipVoiceDistill | |
| from zipvoice.tokenizer.tokenizer import EmiliaTokenizer | |
| from zipvoice.utils.checkpoint import load_checkpoint | |
| from zipvoice.utils.feature import VocosFbank | |
| from zipvoice.bin.infer_zipvoice import generate_sentence | |
| from lhotse.utils import fix_random_seed | |
# Global caches for lazy loading.
# All four are filled in by load_models_and_components() on first use and then
# reused by later calls in the same process.

# Maps a model name to {"model": <loaded model>, "sampling_rate": <value read from model.json>}.
_models_cache: dict[str, dict[str, object]] = {}
# Tokenizer built from the first tokens.txt that was downloaded; None until then.
_tokenizer_cache: EmiliaTokenizer | None = None
# Vocos vocoder loaded from "charactr/vocos-mel-24khz"; None until first use.
_vocoder_cache = None
# VocosFbank feature extractor; None until first use.
_feature_extractor_cache = None
def load_models_and_components(model_name: str):
    """Return the model, tokenizer, vocoder, feature extractor and sampling rate.

    Everything is created lazily and stored in the module-level caches, so the
    downloads and checkpoint loading only happen the first time a given
    ``model_name`` is requested.

    Args:
        model_name: Either "zipvoice" or "zipvoice_distill". Any other value
            raises ``KeyError`` when the repository sub-directory is looked up.

    Returns:
        A 5-tuple ``(model, tokenizer, vocoder, feature_extractor, sampling_rate)``.
    """
    global _models_cache, _tokenizer_cache, _vocoder_cache, _feature_extractor_cache

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if _models_cache.get(model_name) is None:
        print(f"Loading {model_name} model…")

        from huggingface_hub import hf_hub_download

        repo_id = "k2-fsa/ZipVoice"
        subdir = {
            "zipvoice": "zipvoice",
            "zipvoice_distill": "zipvoice_distill",
        }[model_name]

        # Fetch checkpoint, config and token table, in that order.
        checkpoint_path, config_path, tokens_path = [
            hf_hub_download(repo_id, filename=f"{subdir}/{leaf}")
            for leaf in ("model.pt", "model.json", "tokens.txt")
        ]

        # The tokenizer is shared: only the first requested model builds it.
        if _tokenizer_cache is None:
            _tokenizer_cache = EmiliaTokenizer(token_file=tokens_path)
        vocab_kwargs = {
            "vocab_size": _tokenizer_cache.vocab_size,
            "pad_id": _tokenizer_cache.pad_id,
        }

        with open(config_path, "r") as config_file:
            config = json.load(config_file)

        model_cls = ZipVoice if model_name == "zipvoice" else ZipVoiceDistill
        network = model_cls(**config["model"], **vocab_kwargs)
        load_checkpoint(filename=checkpoint_path, model=network, strict=True)
        network = network.to(device)
        network.eval()

        _models_cache[model_name] = {
            "model": network,
            "sampling_rate": config["feature"]["sampling_rate"],
        }

    if _vocoder_cache is None:
        from vocos import Vocos

        _vocoder_cache = Vocos.from_pretrained("charactr/vocos-mel-24khz")
        _vocoder_cache = _vocoder_cache.to(device)
        _vocoder_cache.eval()

    if _feature_extractor_cache is None:
        _feature_extractor_cache = VocosFbank()

    cached = _models_cache[model_name]
    return (
        cached["model"],
        _tokenizer_cache,
        _vocoder_cache,
        _feature_extractor_cache,
        cached["sampling_rate"],
    )
def transcribe_audio_whisper(audio_file):
    """Transcribe an uploaded audio clip with Whisper.

    Args:
        audio_file: Raw bytes of the uploaded audio, or ``None`` when nothing
            has been uploaded yet.

    Returns:
        The stripped transcription text on success. On failure, a message that
        starts with "Error: " (nothing uploaded) or "Error during
        transcription: " (anything that went wrong while transcribing).
    """
    if audio_file is None:
        return "Error: Please upload an audio file first."

    temp_audio_path = None
    try:
        model = whisper.load_model("small")
        # delete=False: the file is handed to Whisper by path after this
        # handle is closed, so it must outlive the ``with`` block.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
            temp_audio_path = temp_audio.name
            temp_audio.write(audio_file)
        result = model.transcribe(temp_audio_path)
        return result["text"].strip()
    except Exception as exc:  # pylint: disable=broad-except
        return f"Error during transcription: {exc}"
    finally:
        # Always remove the temp file, including when transcription failed.
        if temp_audio_path is not None:
            try:
                os.unlink(temp_audio_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file must not mask the result.
                pass
def synthesize_speech_gradio(
    text: str,
    prompt_audio_file,
    prompt_text: str,
    model_name: str,
    speed: float,
):
    """Synthesize speech using ZipVoice for the Gradio interface.

    Args:
        text: Text to be spoken.
        prompt_audio_file: Raw bytes of the reference (prompt) audio, or ``None``.
        prompt_text: Transcription of the prompt audio.
        model_name: "zipvoice" or "zipvoice_distill"; selects the model and the
            step count / guidance scale used for generation.
        speed: Speed value forwarded to ``generate_sentence``.

    Returns:
        ``(audio_bytes, status_message)`` on success, or
        ``(None, error_message)`` when validation or synthesis fails. This
        function never raises; every failure is reported through the message.
    """
    # Treat None like an empty string so a missing value yields the normal
    # validation message instead of an AttributeError.
    if not text or not text.strip():
        return None, "Error: Please enter text to synthesize."
    if prompt_audio_file is None:
        return None, "Error: Please upload a prompt audio file."
    if not prompt_text or not prompt_text.strip():
        return None, "Error: Please enter the transcription of the prompt audio."

    temp_audio_path = None
    output_path = None
    try:
        fix_random_seed(666)
        model, tokenizer, vocoder, feature_extractor, sampling_rate = load_models_and_components(model_name)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # delete=False: both files are accessed by path after their handles close.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
            temp_audio_path = temp_audio.name
            temp_audio.write(prompt_audio_file)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_output:
            output_path = temp_output.name

        print(f"Synthesizing: '{text}' using {model_name}")
        print(f"Prompt: {prompt_text}")
        print(f"Speed: {speed}")

        with torch.inference_mode():
            metrics = generate_sentence(
                save_path=output_path,
                prompt_text=prompt_text,
                prompt_wav=temp_audio_path,
                text=text,
                model=model,
                vocoder=vocoder,
                tokenizer=tokenizer,
                feature_extractor=feature_extractor,
                device=device,
                num_step=16 if model_name == "zipvoice" else 8,
                guidance_scale=1.0 if model_name == "zipvoice" else 3.0,
                speed=speed,
                t_shift=0.5,
                target_rms=0.1,
                feat_scale=0.1,
                sampling_rate=sampling_rate,
                max_duration=100,
                remove_long_sil=False,
            )

        with open(output_path, "rb") as f:
            audio_data = f.read()

        success_msg = f"Synthesis completed! Duration: {metrics['wav_seconds']:.2f}s, RTF: {metrics['rtf']:.2f}"
        return audio_data, success_msg
    except Exception as exc:  # pylint: disable=broad-except
        error_msg = f"Error during synthesis: {exc}"
        print(error_msg)
        return None, error_msg
    finally:
        # Remove both temp files on every exit path so failed runs do not
        # accumulate files in the temp directory.
        for leftover in (temp_audio_path, output_path):
            if leftover is None:
                continue
            try:
                os.unlink(leftover)
            except OSError:
                # Best-effort cleanup; never replace the real result with a cleanup error.
                pass
def create_gradio_interface():
    """Create the Gradio web interface.

    Builds the page (header, step hints, input card, output card, examples,
    footer) and wires the three buttons to their handlers:

    * "Auto Transcribe" runs Whisper on the uploaded prompt and fills the
      prompt-text box, reporting success or failure in the status area.
    * "Reset" clears the prompt audio, prompt text and progress indicator.
    * "Generate Voice" shows the progress indicator, runs synthesis, then
      hides the indicator again.

    Returns:
        The ``gr.Blocks`` app, not yet launched.
    """
    gpu_available = torch.cuda.is_available()
    css = """
    :root {
        --primary-gradient: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        --accent-gradient: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
        --success-gradient: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
        --warning-gradient: linear-gradient(135deg, #fa709a 0%, #fee140 100%);
        --surface: #ffffff;
        --surface-muted: #f8fafc;
        --surface-soft: #f1f5f9;
        --text-strong: #0f172a;
        --text: #1f2937;
        --text-muted: #64748b;
        --border: #e2e8f0;
        --shadow-sm: 0 1px 3px rgba(15, 23, 42, 0.08);
        --shadow-md: 0 8px 24px rgba(15, 23, 42, 0.08);
        --radius-sm: 8px;
        --radius-md: 14px;
        --radius-lg: 20px;
    }
    body {
        background: var(--surface-muted);
    }
    .gradio-container {
        max-width: 1180px;
        margin: 0 auto;
        padding: 0 24px 48px;
        font-family: "Inter", -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
        color: var(--text-strong);
    }
    .header-section {
        background: var(--surface);
        border-radius: var(--radius-lg);
        padding: 2.4rem;
        margin: 2.5rem 0 2rem;
        box-shadow: var(--shadow-md);
        border: 1px solid var(--border);
    }
    .logo-section {
        display: flex;
        align-items: center;
        gap: 1rem;
    }
    .logo-icon {
        font-size: 3rem;
        background: var(--primary-gradient);
        -webkit-background-clip: text;
        color: transparent;
    }
    .title {
        font-size: 2.6rem;
        font-weight: 800;
        background: var(--primary-gradient);
        -webkit-background-clip: text;
        color: transparent;
        margin: 0;
        letter-spacing: -0.03em;
    }
    .subtitle {
        margin: 0.35rem 0 0;
        font-size: 1.05rem;
        color: var(--text-muted);
        font-weight: 500;
    }
    .status-badge {
        display: inline-flex;
        align-items: center;
        gap: 0.5rem;
        padding: 0.55rem 1.2rem;
        border-radius: 999px;
        font-size: 0.85rem;
        font-weight: 600;
        text-transform: uppercase;
        letter-spacing: 0.08em;
        color: #fff;
        box-shadow: var(--shadow-sm);
    }
    .status-badge.gpu {
        background: var(--success-gradient);
    }
    .status-badge.cpu {
        background: var(--warning-gradient);
    }
    .steps-row {
        display: grid;
        grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
        gap: 1rem;
        margin-bottom: 2rem;
    }
    .step-chip {
        background: var(--surface);
        border-radius: var(--radius-md);
        padding: 1rem 1.2rem;
        display: flex;
        flex-direction: column;
        gap: 0.35rem;
        box-shadow: var(--shadow-sm);
        border: 1px solid var(--border);
    }
    .step-chip span {
        font-size: 0.75rem;
        font-weight: 700;
        text-transform: uppercase;
        letter-spacing: 0.12em;
        color: var(--text-muted);
    }
    .step-chip strong {
        font-size: 0.95rem;
        color: var(--text-strong);
    }
    .layout-grid {
        display: grid;
        grid-template-columns: minmax(0, 3fr) minmax(0, 2fr);
        gap: 2rem;
        align-items: start;
        margin-bottom: 2.5rem;
    }
    .input-card,
    .output-card {
        background: var(--surface);
        border-radius: var(--radius-lg);
        padding: 1.8rem;
        box-shadow: var(--shadow-md);
        border: 1px solid var(--border);
        display: flex;
        flex-direction: column;
        gap: 1.25rem;
    }
    .section-title {
        font-size: 1.2rem;
        font-weight: 700;
        display: flex;
        align-items: center;
        gap: 0.6rem;
        color: var(--text-strong);
    }
    .section-subtitle {
        font-size: 0.95rem;
        font-weight: 600;
        text-transform: uppercase;
        letter-spacing: 0.1em;
        color: var(--text-muted);
    }
    .helper-text {
        font-size: 0.85rem;
        color: var(--text-muted);
        margin-top: -0.35rem;
    }
    .file-drop {
        border: 2px dashed var(--border) !important;
        border-radius: var(--radius-md) !important;
        background: var(--surface-soft) !important;
        transition: all 0.25s ease;
        padding: 1rem;
    }
    .file-drop:hover {
        border-color: #667eea !important;
        background: rgba(102, 126, 234, 0.08) !important;
    }
    .button-row {
        display: flex;
        gap: 0.6rem;
        flex-wrap: wrap;
    }
    .btn-primary {
        background: var(--primary-gradient) !important;
        color: #fff !important;
        border: none !important;
        border-radius: var(--radius-md) !important;
        font-weight: 600 !important;
        letter-spacing: 0.05em;
        padding: 0.9rem 1.6rem !important;
        box-shadow: var(--shadow-md);
        transition: transform 0.2s ease, box-shadow 0.2s ease;
    }
    .btn-secondary {
        background: var(--surface-soft) !important;
        color: var(--text-strong) !important;
        border-radius: var(--radius-md) !important;
        border: 1px solid var(--border) !important;
        font-weight: 600 !important;
        padding: 0.75rem 1.4rem !important;
        transition: transform 0.2s ease, box-shadow 0.2s ease;
    }
    .btn-danger {
        background: var(--warning-gradient) !important;
        color: #fff !important;
        border-radius: var(--radius-md) !important;
        border: none !important;
        font-weight: 600 !important;
        padding: 0.75rem 1.2rem !important;
        transition: transform 0.2s ease, box-shadow 0.2s ease;
    }
    .btn-primary:hover,
    .btn-secondary:hover,
    .btn-danger:hover {
        transform: translateY(-1px);
        box-shadow: var(--shadow-md);
    }
    .divider {
        height: 1px;
        width: 100%;
        background: var(--border);
        margin: 0.5rem 0 0.75rem;
    }
    .text-area textarea,
    .text-input textarea,
    .text-input input {
        background: var(--surface-soft);
        border: 1.5px solid var(--border);
        border-radius: var(--radius-md);
        transition: border-color 0.25s ease, box-shadow 0.25s ease;
        font-size: 1rem;
    }
    .text-area textarea:focus,
    .text-input textarea:focus,
    .text-input input:focus {
        border-color: #667eea;
        box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.15);
        background: var(--surface);
    }
    .advanced-settings {
        border-radius: var(--radius-md);
        background: var(--surface-soft);
        border: 1px solid var(--border);
        box-shadow: var(--shadow-sm);
    }
    .status-box {
        background: var(--surface-soft);
        border: 1px solid rgba(102, 126, 234, 0.25);
        border-radius: var(--radius-md);
        padding: 1rem;
        font-size: 0.95rem;
        color: #334155;
        box-shadow: inset 0 1px 2px rgba(15, 23, 42, 0.05);
        min-height: 82px;
    }
    .status-box pre {
        white-space: pre-wrap;
    }
    .progress-indicator {
        display: none;
    }
    .progress-indicator.active {
        display: flex;
        align-items: center;
        gap: 0.85rem;
        background: rgba(102, 126, 234, 0.1);
        border: 1px solid rgba(102, 126, 234, 0.25);
        border-radius: var(--radius-md);
        padding: 0.85rem 1.1rem;
        color: #4c51bf;
        font-weight: 600;
    }
    .progress-indicator .spinner {
        width: 18px;
        height: 18px;
        border-radius: 50%;
        border: 3px solid rgba(102, 126, 234, 0.25);
        border-top-color: #6366f1;
        animation: spin 1s linear infinite;
    }
    @keyframes spin {
        to { transform: rotate(360deg); }
    }
    .audio-player {
        background: var(--surface-soft);
        border-radius: var(--radius-md);
        border: 1px solid var(--border);
        padding: 1rem;
    }
    .audio-player button.download {
        background: var(--primary-gradient) !important;
        color: #fff !important;
        border-radius: var(--radius-sm) !important;
        border: none !important;
        font-weight: 600 !important;
        margin-top: 0.75rem;
        box-shadow: var(--shadow-sm);
    }
    .examples-deck {
        background: var(--surface);
        border-radius: var(--radius-lg);
        padding: 1.6rem;
        box-shadow: var(--shadow-md);
        border: 1px solid var(--border);
    }
    .examples-deck .section-title {
        margin-bottom: 1rem;
    }
    .footer {
        text-align: center;
        margin-top: 2.5rem;
        padding: 1.5rem;
        background: var(--surface);
        border-radius: var(--radius-lg);
        border: 1px solid var(--border);
        box-shadow: var(--shadow-sm);
        color: var(--text-muted);
        font-size: 0.9rem;
    }
    .footer-links {
        margin-top: 0.75rem;
        display: flex;
        justify-content: center;
        gap: 1.75rem;
    }
    .footer-link {
        color: var(--text-muted);
        text-decoration: none;
        font-weight: 600;
    }
    .footer-link:hover {
        color: #6366f1;
    }
    @media (max-width: 1024px) {
        .layout-grid {
            grid-template-columns: 1fr;
        }
    }
    @media (max-width: 768px) {
        .gradio-container {
            padding: 0 16px 32px;
        }
        .header-section {
            padding: 1.8rem;
        }
        .logo-section {
            flex-direction: column;
            text-align: center;
            gap: 0.6rem;
        }
        .title {
            font-size: 2.1rem;
        }
        .steps-row {
            grid-template-columns: 1fr;
        }
        .button-row {
            flex-direction: column;
        }
    }
    @media (prefers-color-scheme: dark) {
        :root {
            --surface: #1f2937;
            --surface-muted: #0f172a;
            --surface-soft: #273549;
            --text-strong: #f8fafc;
            --text: #e2e8f0;
            --text-muted: #94a3b8;
            --border: #324155;
        }
        .status-box {
            border-color: rgba(99, 102, 241, 0.45);
            color: #cbd5f5;
        }
        .progress-indicator.active {
            background: rgba(99, 102, 241, 0.2);
            border-color: rgba(99, 102, 241, 0.4);
            color: #cbd5f5;
        }
    }
    """
    with gr.Blocks(title="ZipVoice — Zero-Shot TTS", css=css, theme=gr.themes.Soft()) as interface:
        # ---- Header: logo/title on the left, hardware badge on the right ----
        with gr.Column(elem_classes="header-section"):
            with gr.Row():
                with gr.Column(scale=3):
                    gr.HTML("""
                    <div class='logo-section'>
                    <div class='logo-icon'>🎵</div>
                    <div>
                    <h1 class='title'>ZipVoice</h1>
                    <p class='subtitle'>Zero-shot text-to-speech with instant voice cloning</p>
                    </div>
                    </div>
                    """)
                with gr.Column(scale=1, min_width=160):
                    if gpu_available:
                        gr.HTML("<div class='status-badge gpu'>⚡ GPU Ready</div>")
                    else:
                        gr.HTML("<div class='status-badge cpu'>💻 CPU Mode</div>")

        # ---- Three-step usage hint ----
        gr.HTML("""
        <div class='steps-row'>
        <div class='step-chip'>
        <span>Step 1 / 步驟一</span>
        <strong>Drop your reference voice (1–3 s) / 拖放 1–3 秒的參考語音</strong>
        </div>
        <div class='step-chip'>
        <span>Step 2 / 步驟二</span>
        <strong>Transcribe the prompt or let ZipVoice auto-transcribe / 手動或自動生成轉寫</strong>
        </div>
        <div class='step-chip'>
        <span>Step 3 / 步驟三</span>
        <strong>Write the target text and generate / 輸入目標文本並開始合成</strong>
        </div>
        </div>
        """)

        with gr.Row(elem_classes="layout-grid"):
            # ---- Input card: prompt audio, prompt text, target text, settings ----
            with gr.Column(elem_classes="input-card"):
                gr.HTML("<div class='section-title'>🎤 Voice Prompt / 參考語音</div>")
                prompt_audio = gr.File(
                    label="Drop or select an audio file / 拖放或選擇音頻文件",
                    file_types=["audio"],
                    type="binary",
                    elem_classes="file-drop"
                )
                with gr.Row(elem_classes="button-row"):
                    transcribe_btn = gr.Button(
                        "🎧 Auto Transcribe / 自動轉寫",
                        variant="secondary",
                        size="sm",
                        elem_classes="btn-secondary"
                    )
                    clear_prompt = gr.Button(
                        "🧹 Reset / 重置",
                        size="sm",
                        elem_classes="btn-danger"
                    )
                gr.HTML("<p class='helper-text'>Tip: use a clear 1–3 second sample for best results. 提示:請使用 1–3 秒的清晰語音,以獲得最佳效果。</p>")
                gr.HTML("<div class='section-subtitle'>📝 Prompt transcription / 提示文本</div>")
                prompt_text = gr.Textbox(
                    placeholder="Type the exact words from the prompt audio or run auto-transcribe… / 輸入參考語音的原文或使用自動轉寫",
                    lines=3,
                    elem_classes="text-area"
                )
                gr.HTML("<div class='divider'></div>")
                gr.HTML("<div class='section-title'>✍️ Text to Synthesize / 合成文本</div>")
                text_input = gr.Textbox(
                    placeholder="Enter the text you want to speak (English, Chinese, etc.) / 輸入需要朗讀的文本(支援英文、中文等)",
                    lines=5,
                    value="Hello, this is a ZipVoice demo showing instant zero-shot voice cloning.",
                    elem_classes="text-area"
                )
                with gr.Row(elem_classes="button-row"):
                    generate_btn = gr.Button(
                        "🎵 Generate Voice / 開始合成",
                        variant="primary",
                        size="lg",
                        elem_classes="btn-primary"
                    )
                with gr.Accordion("Advanced settings / 高級設定", open=False, elem_classes="advanced-settings"):
                    model_dropdown = gr.Dropdown(
                        choices=["zipvoice", "zipvoice_distill"],
                        value="zipvoice",
                        label="Model / 模型",
                        info="zipvoice = highest fidelity · zipvoice_distill = faster generation / zipvoice = 最高音質 · zipvoice_distill = 更快生成"
                    )
                    speed_slider = gr.Slider(
                        minimum=0.5,
                        maximum=2.0,
                        value=1.0,
                        step=0.1,
                        label="Speaking speed / 語速",
                        info="0.5 = slower · 1.0 = natural · 2.0 = faster / 0.5 = 慢速 · 1.0 = 自然 · 2.0 = 快速"
                    )

            # ---- Output card: progress indicator, audio player, status ----
            with gr.Column(elem_classes="output-card"):
                gr.HTML("<div class='section-title'>🔊 Result & Status / 輸出與狀態</div>")
                # The wrapper must NOT carry the "progress-indicator" class: the
                # stylesheet sets `.progress-indicator { display: none; }`, which
                # would hide the wrapper and the spinner markup inside it.
                progress_bar = gr.HTML(value="", elem_classes="progress-slot")
                output_audio = gr.Audio(
                    label="Playback / 播放",
                    type="filepath",
                    elem_classes="audio-player",
                    show_download_button=True
                )
                status_text = gr.Markdown(
                    value="Ready to synthesize. Please upload a prompt and click generate! / 準備就緒:請上傳參考語音並開始合成。",
                    elem_classes="status-box"
                )

        # ---- One-click examples ----
        with gr.Column(elem_classes="examples-deck"):
            gr.HTML("<div class='section-title'>⚡ Quick Examples / 快速範例</div>")
            gr.Examples(
                examples=[
                    ["Hello everyone, welcome to ZipVoice.", "jfk.wav", "ask not what your country can do for you, ask what you can do for your country", "zipvoice", 1.0],
                    ["請在會議開始時靜音您的麥克風。", "jfk.wav", "ask not what your country can do for you, ask what you can do for your country", "zipvoice", 1.0],
                    ["Innovation starts with listening carefully to your users.", "jfk.wav", "ask not what your country can do for you, ask what you can do for your country", "zipvoice_distill", 1.2],
                ],
                inputs=[text_input, prompt_audio, prompt_text, model_dropdown, speed_slider],
                examples_per_page=3,
                label="Try a scenario in one click / 一鍵體驗範例"
            )

        # ---- Footer ----
        gr.HTML("""
        <div class='footer'>
        <p>Created with ❤️ by the ZipVoice team on Gradio / 由 ZipVoice 團隊基於 Gradio 構建</p>
        <div class='footer-links'>
        <a href='https://github.com/k2-fsa/ZipVoice' class='footer-link' target='_blank'>Source code / 原始碼</a>
        <a href='https://huggingface.co/k2-fsa' class='footer-link' target='_blank'>HuggingFace models / HuggingFace 模型</a>
        <a href='https://gradio.app' class='footer-link' target='_blank'>Gradio framework / Gradio 框架</a>
        </div>
        </div>
        """)

        def show_progress():
            """Return the spinner markup shown while audio is being generated."""
            return """
            <div class='progress-indicator active'>
            <div class='spinner'></div>
            <span>Generating audio… 音頻合成中…</span>
            </div>
            """

        def hide_progress():
            """Return empty markup, which removes the spinner."""
            return ""

        def transcribe_prompt(audio_file):
            """Transcribe the prompt and report what actually happened.

            Returns a ``(prompt_text, status)`` pair. When transcription fails,
            the prompt-text box is left untouched so the error message cannot be
            used as a transcription by mistake, and the error goes to the status
            area instead.
            """
            result = transcribe_audio_whisper(audio_file)
            # transcribe_audio_whisper signals failure through these two message
            # prefixes rather than by raising.
            if result.startswith(("Error:", "Error during transcription:")):
                return gr.update(), f"❌ {result}"
            return result, "✅ Transcription ready. Review it before synthesis. / 自動轉寫完成,請確認後繼續。"

        transcribe_btn.click(
            fn=transcribe_prompt,
            inputs=[prompt_audio],
            outputs=[prompt_text, status_text]
        )
        clear_prompt.click(
            fn=lambda: (None, "", "🔄 Prompt cleared. Please upload a new sample. / 提示已清空,請重新上傳樣本。"),
            inputs=None,
            outputs=[prompt_audio, prompt_text, status_text]
        ).then(
            fn=lambda: "",
            outputs=[progress_bar]
        )
        # Show spinner -> announce start -> synthesize -> hide spinner.
        generate_btn.click(
            fn=show_progress,
            outputs=[progress_bar]
        ).then(
            fn=lambda: "🎵 Generating now… this may take a few seconds. / 正在合成,請稍候。",
            outputs=[status_text]
        ).then(
            fn=synthesize_speech_gradio,
            inputs=[text_input, prompt_audio, prompt_text, model_dropdown, speed_slider],
            outputs=[output_audio, status_text]
        ).then(
            fn=hide_progress,
            outputs=[progress_bar]
        )
    return interface
if __name__ == "__main__":
    # Build the UI, then serve it on all network interfaces. The port is taken
    # from the PORT environment variable when set, otherwise 7860.
    interface = create_gradio_interface()
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": int(os.environ.get("PORT", 7860)),
        "show_error": True,
    }
    interface.launch(**launch_options)