nguyensu27 commited on
Commit
0b72639
·
verified ·
1 Parent(s): faf39d1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -30
app.py CHANGED
@@ -1,18 +1,16 @@
1
- import spaces
2
  import os
3
  from huggingface_hub import login
4
  import gradio as gr
5
- from cached_path import cached_path
6
  import tempfile
7
  import numpy as np
8
  from vinorm import TTSnorm
 
 
9
  from infer_zipvoice import model, tokenizer, feature_extractor, device, generate_sentence, vocoder
10
  from utils import preprocess_ref_audio_text, save_spectrogram, chunk_text
11
 
12
  # Retrieve token from secrets
13
  hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
14
-
15
- # Log in to Hugging Face
16
  if hf_token:
17
  login(token=hf_token)
18
 
@@ -29,72 +27,75 @@ def post_process(text):
29
  text = text.replace('"', "")
30
  return " ".join(text.split())
31
 
32
- #@spaces.GPU
33
  def infer_tts(ref_audio_orig: str, gen_text: str, speed: float = 1.0, request: gr.Request = None):
34
-
35
  if not ref_audio_orig:
36
  raise gr.Error("Please upload a sample audio file.")
37
  if not gen_text.strip():
38
  raise gr.Error("Please enter the text content to generate voice.")
39
  if len(gen_text.split()) > 1000:
40
  raise gr.Error("Please enter text content with less than 1000 words.")
41
-
42
  try:
43
  gen_texts = chunk_text(gen_text)
44
  final_wave_total = None
45
  final_sample_rate = 24000
46
  ref_audio, ref_text = "", ""
47
- for i, gen_text in enumerate(gen_texts):
 
48
  if i == 0:
49
- ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, "")
50
- final_wave = generate_sentence(
51
- ref_text.lower(),
52
- ref_audio,
53
- post_process(TTSnorm(gen_text)).lower(),
54
- model=model,
55
- vocoder=vocoder,
56
- tokenizer=tokenizer,
57
- feature_extractor=feature_extractor,
58
- device=device,
59
- speed=speed
 
60
  ).detach().numpy()[0]
61
- if i == 0:
62
- final_wave_total = final_wave
 
63
  else:
64
- final_wave_total = np.concatenate((final_wave_total, final_wave, np.zeros(12000, dtype=int)), axis=0)
 
65
  with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
66
  spectrogram_path = tmp_spectrogram.name
67
  save_spectrogram(final_wave_total, spectrogram_path)
68
 
69
  return (final_sample_rate, final_wave_total), spectrogram_path
 
70
  except Exception as e:
 
71
  raise gr.Error(f"Error generating voice: {e}")
72
 
73
- # Gradio UI
74
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
75
  gr.Markdown("""
76
  # 🎤 ZipVoice: Zero-shot Vietnamese Text-to-Speech Synthesis using Flow Matching with only 123M parameters.
77
  # The model was trained with approximately 2500 hours of data on a RTX 3090 GPU.
78
  Enter text and upload a sample voice to generate natural speech.
79
  """)
80
-
81
  with gr.Row():
82
  ref_audio = gr.Audio(label="🔊 Sample Voice", type="filepath")
83
  gen_text = gr.Textbox(label="📝 Text", placeholder="Enter the text to generate voice...", lines=3)
84
-
85
  speed = gr.Slider(0.3, 2.0, value=1.0, step=0.1, label="⚡ Speed")
86
  btn_synthesize = gr.Button("🔥 Generate Voice")
87
-
88
  with gr.Row():
89
  output_audio = gr.Audio(label="🎧 Generated Audio", type="numpy")
90
  output_spectrogram = gr.Image(label="📊 Spectrogram")
91
-
92
  model_limitations = gr.Textbox(
93
  value="""1. This model may not perform well with numerical characters, dates, special characters, etc.
94
  2. The rhythm of some generated audios may be inconsistent or choppy.
95
  3. Default, reference audio text uses the pho-whisper-medium model, which may not always accurately recognize Vietnamese, resulting in poor voice synthesis quality.
96
  4. Inference with overly long paragraphs may produce poor results.
97
- 5. This demo uses a for loop to generate audio for each sentence sequentially in long paragraphs, so the speed may be slow""",
98
  label="❗ Model Limitations",
99
  lines=5,
100
  interactive=False
@@ -102,5 +103,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
102
 
103
  btn_synthesize.click(infer_tts, inputs=[ref_audio, gen_text, speed], outputs=[output_audio, output_spectrogram])
104
 
105
- # Run Gradio with share=True to get a gradio.live link
106
- demo.queue().launch()
 
 
1
import os
from huggingface_hub import login
import gradio as gr
import tempfile
import numpy as np
from vinorm import TTSnorm

# Reuse the model/tokenizer/vocoder objects that infer_zipvoice.py
# already instantiated at import time (loaded once per process).
from infer_zipvoice import model, tokenizer, feature_extractor, device, generate_sentence, vocoder
from utils import preprocess_ref_audio_text, save_spectrogram, chunk_text

# Retrieve token from secrets
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Log in to the Hugging Face Hub only when a token is configured;
# when it is absent the login step is simply skipped.
if hf_token:
    login(token=hf_token)
16
 
 
27
  text = text.replace('"', "")
28
  return " ".join(text.split())
29
 
 
30
def infer_tts(ref_audio_orig: str, gen_text: str, speed: float = 1.0, request: gr.Request = None):
    """Synthesize speech for `gen_text` in the voice of `ref_audio_orig`.

    Args:
        ref_audio_orig: Filepath of the uploaded reference (sample-voice) audio.
        gen_text: Text to synthesize; limited to 1000 whitespace-separated words.
        speed: Speed factor forwarded to `generate_sentence`.
        request: Gradio request object (accepted but unused here).

    Returns:
        ``((sample_rate, waveform), spectrogram_path)`` matching the Gradio
        outputs: a (rate, np.ndarray) audio tuple and a PNG path.

    Raises:
        gr.Error: On invalid input or any failure during generation.
    """
    if not ref_audio_orig:
        raise gr.Error("Please upload a sample audio file.")
    if not gen_text.strip():
        raise gr.Error("Please enter the text content to generate voice.")
    if len(gen_text.split()) > 1000:
        raise gr.Error("Please enter text content with less than 1000 words.")

    try:
        gen_texts = chunk_text(gen_text)
        final_wave_total = None
        final_sample_rate = 24000  # output sample rate used for the returned audio
        ref_audio, ref_text = "", ""

        for i, piece in enumerate(gen_texts):
            # Transcribe/prepare the reference audio once, on the first chunk only.
            if i == 0:
                ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, "")

            wav = generate_sentence(
                ref_text.lower(),
                ref_audio,
                post_process(TTSnorm(piece)).lower(),
                model=model,
                vocoder=vocoder,
                tokenizer=tokenizer,
                feature_extractor=feature_extractor,
                device=device,
                speed=speed
            ).detach().numpy()[0]

            if final_wave_total is None:
                final_wave_total = wav
            else:
                # FIX: place the 0.5 s pause (12000 samples @ 24 kHz) BETWEEN
                # chunks rather than after each appended chunk — the previous
                # ordering left no gap between the first two chunks and added a
                # spurious trailing silence. Matching the waveform dtype also
                # avoids NumPy promoting the whole array (int zeros would
                # upcast the result).
                pause = np.zeros(12000, dtype=wav.dtype)
                final_wave_total = np.concatenate((final_wave_total, pause, wav), axis=0)

        # Only the temp file *name* is needed; save_spectrogram writes to it
        # after the handle is closed (delete=False keeps the file around).
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
            spectrogram_path = tmp_spectrogram.name
        save_spectrogram(final_wave_total, spectrogram_path)

        return (final_sample_rate, final_wave_total), spectrogram_path

    except Exception as e:
        # Surface the original error message to the UI for easier debugging.
        raise gr.Error(f"Error generating voice: {e}")
74
 
 
75
# Gradio UI: layout order below defines the on-screen component order.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎤 ZipVoice: Zero-shot Vietnamese Text-to-Speech Synthesis using Flow Matching with only 123M parameters.
    # The model was trained with approximately 2500 hours of data on a RTX 3090 GPU.
    Enter text and upload a sample voice to generate natural speech.
    """)

    # Inputs: reference voice sample + text to synthesize.
    with gr.Row():
        ref_audio = gr.Audio(label="🔊 Sample Voice", type="filepath")
        gen_text = gr.Textbox(label="📝 Text", placeholder="Enter the text to generate voice...", lines=3)

    speed = gr.Slider(0.3, 2.0, value=1.0, step=0.1, label="⚡ Speed")
    btn_synthesize = gr.Button("🔥 Generate Voice")

    # Outputs: synthesized audio (numpy tuple) + its spectrogram image.
    with gr.Row():
        output_audio = gr.Audio(label="🎧 Generated Audio", type="numpy")
        output_spectrogram = gr.Image(label="📊 Spectrogram")

    # Static, read-only notice about known model limitations.
    model_limitations = gr.Textbox(
        value="""1. This model may not perform well with numerical characters, dates, special characters, etc.
2. The rhythm of some generated audios may be inconsistent or choppy.
3. Default, reference audio text uses the pho-whisper-medium model, which may not always accurately recognize Vietnamese, resulting in poor voice synthesis quality.
4. Inference with overly long paragraphs may produce poor results.
5. This demo uses a for loop to generate audio for each sentence sequentially in long paragraphs, so the speed may be slow""",
        label="❗ Model Limitations",
        lines=5,
        interactive=False
    )

    # Wire the button to the inference function.
    btn_synthesize.click(infer_tts, inputs=[ref_audio, gen_text, speed], outputs=[output_audio, output_spectrogram])

# queue() serializes concurrent requests before launching the app.
demo.queue().launch()