Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -15,6 +15,22 @@ import numpy as np
|
|
| 15 |
from typing import Tuple, Dict, Any, Optional
|
| 16 |
from taproot import Task
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
# Create pipelines, downloading required files as necessary
|
| 19 |
hybrid_task = Task.get("speech-synthesis", model="zonos-hybrid", available_only=False)
|
| 20 |
hybrid_task.download_required_files(text_callback=print)
|
|
@@ -26,40 +42,31 @@ transformer_task = Task.get(
|
|
| 26 |
)
|
| 27 |
transformer_task.download_required_files(text_callback=print)
|
| 28 |
transformer_pipe = transformer_task()
|
| 29 |
-
transformer_pipe.load() # Remove this line if you're running outside of HF spaces to save ~4GB of VRAM
|
| 30 |
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
pipelines = {
|
| 33 |
"Zonos Transformer v0.1": transformer_pipe,
|
| 34 |
"Zonos Hybrid v0.1": hybrid_pipe,
|
| 35 |
}
|
| 36 |
pipeline_names = list(pipelines.keys())
|
| 37 |
supported_language_codes = hybrid_pipe.supported_languages # Same for both pipes
|
| 38 |
-
max_characters = 4500
|
| 39 |
-
header_markdown = """
|
| 40 |
-
# Zonos v0.1
|
| 41 |
-
State of the art text-to-speech model [[model]](https://huggingface.co/collections/Zyphra/zonos-v01-67ac661c85e1898670823b4f). [[blog]](https://www.zyphra.com/post/beta-release-of-zonos-v0-1), [[Zyphra Audio (hosted service)]](https://maia.zyphra.com/sign-in?redirect_url=https%3A%2F%2Fmaia.zyphra.com%2Faudio)
|
| 42 |
-
## Unleashed
|
| 43 |
-
Use this space to generate long-form speech up to around ~4 minutes in length. To generate an unlimited length, clone this space and run it locally, modifying the `max_characters` parameter to your desired length (or None for unlimited).
|
| 44 |
-
### Tips
|
| 45 |
-
- If you are generating more than one chunk of audio, you should supply speaker conditioning. Otherwise, each chunk will have a slightly different voice.
|
| 46 |
-
- When providing prefix audio, include the text of the prefix audio in your speech text to ensure a smooth transition.
|
| 47 |
-
- The cleaner the speaker audio, the better the speaker conditioning will be - however, speaker audio is only sampled at 16kHz, so you do not need to provide high-bitrate speaker audio. Unlike this, however, prefix audio should be high-quality, as it is sampled at the full 44.1kHz.
|
| 48 |
-
- The appropriate range of Speaking Rate and Pitch STD are highly dependent on the speaker audio. Start with the defaults and adjust as needed.
|
| 49 |
-
- Emotion sliders do not completely function intuitively, and require some experimentation to get the desired effect.
|
| 50 |
-
""".strip()
|
| 51 |
-
|
| 52 |
|
| 53 |
# Model toggle
|
| 54 |
def update_ui(pipeline_choice: str) -> Tuple[Dict[str, Any], ...]:
|
| 55 |
"""
|
| 56 |
Dynamically show/hide UI elements based on the model's conditioners.
|
| 57 |
"""
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
| 63 |
|
| 64 |
pipe = pipelines[pipeline_choice]
|
| 65 |
cond_names = [c.name for c in pipe.pretrained.model.prefix_conditioner.conditioners]
|
|
@@ -202,16 +209,17 @@ with gr.Blocks() as demo:
|
|
| 202 |
)
|
| 203 |
|
| 204 |
with gr.Row():
|
| 205 |
-
if
|
| 206 |
limit_text = "Unlimited"
|
| 207 |
else:
|
| 208 |
limit_text = f"Up to {max_characters}"
|
|
|
|
| 209 |
text = gr.Textbox(
|
| 210 |
label=f"Speech Text ({limit_text} Characters)",
|
| 211 |
value="Zonos is a state-of-the-art text-to-speech model that generates expressive and natural-sounding audio with robust customization options.",
|
| 212 |
lines=4,
|
| 213 |
max_lines=20,
|
| 214 |
-
max_length=max_characters,
|
| 215 |
)
|
| 216 |
|
| 217 |
with gr.Row():
|
|
|
|
| 15 |
from typing import Tuple, Dict, Any, Optional
|
| 16 |
from taproot import Task
|
| 17 |
|
| 18 |
+
# Configuration
|
| 19 |
+
is_hf_spaces = True # Set to False when running locally
|
| 20 |
+
max_characters = 4500
|
| 21 |
+
header_markdown = """
|
| 22 |
+
# Zonos v0.1
|
| 23 |
+
State of the art text-to-speech model [[model]](https://huggingface.co/collections/Zyphra/zonos-v01-67ac661c85e1898670823b4f). [[blog]](https://www.zyphra.com/post/beta-release-of-zonos-v0-1), [[Zyphra Audio (hosted service)]](https://maia.zyphra.com/sign-in?redirect_url=https%3A%2F%2Fmaia.zyphra.com%2Faudio)
|
| 24 |
+
## Unleashed
|
| 25 |
+
Use this space to generate long-form speech up to around ~4 minutes in length. To generate an unlimited length, clone this space and run it locally, modifying the `max_characters` parameter to your desired length (or None for unlimited).
|
| 26 |
+
### Tips
|
| 27 |
+
- If you are generating more than one chunk of audio, you should supply speaker conditioning. Otherwise, each chunk will have a slightly different voice.
|
| 28 |
+
- When providing prefix audio, include the text of the prefix audio in your speech text to ensure a smooth transition.
|
| 29 |
+
- The cleaner the speaker audio, the better the speaker conditioning will be - however, speaker audio is only sampled at 16kHz, so you do not need to provide high-bitrate speaker audio. Unlike this, however, prefix audio should be high-quality, as it is sampled at the full 44.1kHz.
|
| 30 |
+
- The appropriate range of Speaking Rate and Pitch STD are highly dependent on the speaker audio. Start with the defaults and adjust as needed.
|
| 31 |
+
- Emotion sliders do not completely function intuitively, and require some experimentation to get the desired effect.
|
| 32 |
+
""".strip()
|
| 33 |
+
|
| 34 |
# Create pipelines, downloading required files as necessary
|
| 35 |
hybrid_task = Task.get("speech-synthesis", model="zonos-hybrid", available_only=False)
|
| 36 |
hybrid_task.download_required_files(text_callback=print)
|
|
|
|
| 42 |
)
|
| 43 |
transformer_task.download_required_files(text_callback=print)
|
| 44 |
transformer_pipe = transformer_task()
|
|
|
|
| 45 |
|
| 46 |
+
if is_hf_spaces:
|
| 47 |
+
# Must load all models onto the GPU up front when running on Hugging Face ZeroGPU
|
| 48 |
+
transformer_pipe.load()
|
| 49 |
+
|
| 50 |
+
# Global state
|
| 51 |
pipelines = {
|
| 52 |
"Zonos Transformer v0.1": transformer_pipe,
|
| 53 |
"Zonos Hybrid v0.1": hybrid_pipe,
|
| 54 |
}
|
| 55 |
pipeline_names = list(pipelines.keys())
|
| 56 |
supported_language_codes = hybrid_pipe.supported_languages # Same for both pipes
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
# Model toggle
|
| 59 |
def update_ui(pipeline_choice: str) -> Tuple[Dict[str, Any], ...]:
|
| 60 |
"""
|
| 61 |
Dynamically show/hide UI elements based on the model's conditioners.
|
| 62 |
"""
|
| 63 |
+
if not is_hf_spaces:
|
| 64 |
+
# When not using ZeroGPU, we can load the selected pipeline and unload the others on demand
|
| 65 |
+
for pipeline_name, pipeline in pipelines.items():
|
| 66 |
+
if pipeline_name == pipeline_choice:
|
| 67 |
+
pipeline.load()
|
| 68 |
+
else:
|
| 69 |
+
pipeline.unload()
|
| 70 |
|
| 71 |
pipe = pipelines[pipeline_choice]
|
| 72 |
cond_names = [c.name for c in pipe.pretrained.model.prefix_conditioner.conditioners]
|
|
|
|
| 209 |
)
|
| 210 |
|
| 211 |
with gr.Row():
|
| 212 |
+
if not is_hf_spaces:
|
| 213 |
limit_text = "Unlimited"
|
| 214 |
else:
|
| 215 |
limit_text = f"Up to {max_characters}"
|
| 216 |
+
|
| 217 |
text = gr.Textbox(
|
| 218 |
label=f"Speech Text ({limit_text} Characters)",
|
| 219 |
value="Zonos is a state-of-the-art text-to-speech model that generates expressive and natural-sounding audio with robust customization options.",
|
| 220 |
lines=4,
|
| 221 |
max_lines=20,
|
| 222 |
+
max_length=max_characters if is_hf_spaces else None,
|
| 223 |
)
|
| 224 |
|
| 225 |
with gr.Row():
|