File size: 4,256 Bytes
e53ffe1
 
1d6cb69
e53ffe1
efb70b3
e53ffe1
0b72639
 
40b7081
efb70b3
1d6cb69
e53ffe1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0b72639
e53ffe1
efb70b3
 
a3b204a
d13d365
0b72639
 
d13d365
0b72639
 
 
 
 
 
 
 
 
 
 
 
14a7b56
0b72639
 
 
efb70b3
0b72639
 
e53ffe1
 
efb70b3
e53ffe1
a3b204a
0b72639
e53ffe1
0b72639
e53ffe1
 
 
 
d286934
da4f915
e53ffe1
 
0b72639
e53ffe1
 
 
0b72639
e53ffe1
 
0b72639
e53ffe1
 
 
0b72639
e53ffe1
9b81421
 
e53ffe1
9b81421
0b72639
e53ffe1
9b81421
e53ffe1
 
 
 
 
0b72639
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
from huggingface_hub import login
import gradio as gr
import tempfile
import numpy as np
from vinorm import TTSnorm

# Reuse the model objects already loaded in infer_zipvoice.py
from infer_zipvoice import model, tokenizer, feature_extractor, device, generate_sentence, vocoder
from utils import preprocess_ref_audio_text, save_spectrogram, chunk_text

# Authenticate with the Hugging Face Hub when an API token was injected
# via the environment (e.g. a Space secret); otherwise stay anonymous.
if hf_token := os.getenv("HUGGINGFACEHUB_API_TOKEN"):
    login(token=hf_token)

def post_process(text):
    """Clean up normalized TTS text.

    Collapses doubled periods/commas left over by text normalization,
    strips double quotes, and squeezes all runs of whitespace down to
    single spaces.
    """
    # Each pass pads the text with spaces so the space-delimited patterns
    # also match at the string boundaries, then collapses one duplicate
    # punctuation pattern.
    for pattern, collapsed in (
        (" . . ", " . "),
        (" .. ", " . "),
        (" , , ", " , "),
        (" ,, ", " , "),
    ):
        text = " " + text + " "
        text = text.replace(pattern, collapsed)
    text = " " + text + " "
    text = text.replace('"', "")
    # Normalize all whitespace (including the padding added above).
    return " ".join(text.split())

def infer_tts(ref_audio_orig: str, gen_text: str, speed: float = 1.0, request: gr.Request = None):
    """Synthesize speech in the reference speaker's voice.

    Args:
        ref_audio_orig: Path to the uploaded reference audio file.
        gen_text: Text to synthesize (limited to 1000 words).
        speed: Speed factor forwarded to the model.
        request: Gradio request object (unused; kept for handler
            interface compatibility).

    Returns:
        ``((sample_rate, waveform), spectrogram_path)`` — a tuple for the
        ``gr.Audio(type="numpy")`` output plus the path of a rendered
        spectrogram PNG for the ``gr.Image`` output.

    Raises:
        gr.Error: On invalid input or any failure during generation.
    """
    # --- input validation, surfaced as user-facing Gradio errors ---
    if not ref_audio_orig:
        raise gr.Error("Please upload a sample audio file.")
    if not gen_text.strip():
        raise gr.Error("Please enter the text content to generate voice.")
    if len(gen_text.split()) > 1000:
        raise gr.Error("Please enter text content with less than 1000 words.")

    try:
        # Long paragraphs are split into chunks and synthesized sequentially.
        gen_texts = chunk_text(gen_text)
        final_wave_total = None
        final_sample_rate = 24000  # sample rate of the vocoder output
        ref_audio, ref_text = "", ""

        for i, piece in enumerate(gen_texts):
            # Prepare/transcribe the reference audio once, on the first chunk.
            if i == 0:
                ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, "")

            wav = generate_sentence(
                ref_text.lower(),
                ref_audio,
                # Normalize Vietnamese text, then clean duplicated punctuation.
                post_process(TTSnorm(piece)).lower(),
                model=model,
                vocoder=vocoder,
                tokenizer=tokenizer,
                feature_extractor=feature_extractor,
                device=device,
                speed=speed
            ).detach().numpy()[0]

            if final_wave_total is None:
                final_wave_total = wav
            else:
                # Insert 0.5 s of silence (12000 samples @ 24 kHz) BETWEEN
                # chunks. The silence matches the waveform's dtype so that
                # np.concatenate does not promote the audio (integer zeros
                # would upcast float32 audio to float64).
                silence = np.zeros(12000, dtype=final_wave_total.dtype)
                final_wave_total = np.concatenate((final_wave_total, silence, wav), axis=0)

        # delete=False keeps the PNG alive so Gradio can serve it after
        # this handler returns.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
            spectrogram_path = tmp_spectrogram.name
            save_spectrogram(final_wave_total, spectrogram_path)

        return (final_sample_rate, final_wave_total), spectrogram_path

    except Exception as e:
        # Re-raise with the original error message for easier debugging.
        raise gr.Error(f"Error generating voice: {e}")

# --- Gradio UI: layout, static notes, and event wiring ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎤 ZipVoice: Zero-shot Vietnamese Text-to-Speech Synthesis using Flow Matching with only 123M parameters.
    # The model was trained with approximately 2500 hours of data on a RTX 3090 GPU. 
    Enter text and upload a sample voice to generate natural speech.
    """)

    # Inputs: a reference voice sample (passed to infer_tts as a file path)
    # and the text to synthesize.
    with gr.Row():
        ref_audio = gr.Audio(label="🔊 Sample Voice", type="filepath")
        gen_text = gr.Textbox(label="📝 Text", placeholder="Enter the text to generate voice...", lines=3)

    speed = gr.Slider(0.3, 2.0, value=1.0, step=0.1, label="⚡ Speed")
    btn_synthesize = gr.Button("🔥 Generate Voice")

    # Outputs: the synthesized audio as a (sample_rate, ndarray) tuple and
    # the spectrogram image rendered by infer_tts.
    with gr.Row():
        output_audio = gr.Audio(label="🎧 Generated Audio", type="numpy")
        output_spectrogram = gr.Image(label="📊 Spectrogram")

    # Static, read-only disclosure of known model limitations.
    model_limitations = gr.Textbox(
        value="""1. This model may not perform well with numerical characters, dates, special characters, etc.
2. The rhythm of some generated audios may be inconsistent or choppy.
3. Default, reference audio text uses the pho-whisper-medium model, which may not always accurately recognize Vietnamese, resulting in poor voice synthesis quality.
4. Inference with overly long paragraphs may produce poor results.
5. This demo uses a for loop to generate audio for each sentence sequentially in long paragraphs, so the speed may be slow""",
        label="❗ Model Limitations",
        lines=5,
        interactive=False
    )

    btn_synthesize.click(infer_tts, inputs=[ref_audio, gen_text, speed], outputs=[output_audio, output_spectrogram])

# Queue requests so concurrent long-running synthesis jobs are serialized.
demo.queue().launch()