Luigi committed on
Commit 1b9d615 · 1 Parent(s): 7300238

Add @spaces.GPU decorator and CUDA support for HF Spaces

Files changed (2)
  1. app.py +13 -7
  2. requirements.txt +1 -1
app.py CHANGED
@@ -9,6 +9,7 @@ import tempfile
 import gradio as gr
 import torch
 from pathlib import Path
+from spaces import GPU
 
 # Add current directory to Python path for local zipvoice package
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
@@ -33,8 +34,8 @@ def load_models_and_components(model_name: str):
     """Load and cache models, tokenizer, vocoder, and feature extractor."""
     global _models_cache, _tokenizer_cache, _vocoder_cache, _feature_extractor_cache
 
-    # Set device (CPU for Spaces, but could be adapted for GPU)
-    device = torch.device("cpu")
+    # Set device (GPU if available, otherwise CPU)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
     if model_name not in _models_cache:
         print(f"Loading {model_name} model...")
@@ -100,6 +101,7 @@ def load_models_and_components(model_name: str):
                             model_config["feature"]["sampling_rate"])
 
 
+@GPU
 def synthesize_speech_gradio(
     text: str,
     prompt_audio_file,
@@ -124,7 +126,7 @@ def synthesize_speech_gradio(
     # Load models and components
     model, tokenizer, vocoder, feature_extractor, sampling_rate = load_models_and_components(model_name)
 
-    device = torch.device("cpu")
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
     # Save uploaded audio to temporary file
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
@@ -224,7 +226,8 @@ def create_gradio_interface():
        model_dropdown = gr.Dropdown(
            choices=["zipvoice", "zipvoice_distill"],
            value="zipvoice",
-           label="Model"
+           label="Model",
+           info="zipvoice_distill is faster but slightly less accurate"
        )
 
        speed_slider = gr.Slider(
@@ -232,19 +235,22 @@ def create_gradio_interface():
            maximum=2.0,
            value=1.0,
            step=0.1,
-           label="Speed"
+           label="Speed",
+           info="1.0 = normal speed, >1.0 = faster, <1.0 = slower"
        )
 
        prompt_audio = gr.File(
            label="Prompt Audio",
            file_types=["audio"],
-           type="binary"
+           type="binary",
+           info="Upload a short audio clip (1-3 seconds recommended) to mimic the voice style"
        )
 
        prompt_text = gr.Textbox(
            label="Prompt Transcription",
            placeholder="Enter the exact transcription of the prompt audio...",
-           lines=2
+           lines=2,
+           info="This should match what is spoken in the audio file"
        )
 
        generate_btn = gr.Button(
requirements.txt CHANGED
@@ -9,7 +9,7 @@ safetensors
 tensorboard
 vocos
 pydub
-gradio
+gradio>=4.44.0
 
 # Normalization
 cn2an
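
Taken together, the app.py changes follow one small pattern: import the GPU decorator from the `spaces` package, decorate the function that performs inference, and select CUDA only when it is available. Below is a minimal sketch of that pattern under stated assumptions: `run_tts` and its toy body are illustrative placeholders rather than code from app.py, and the `spaces` decorator only actually allocates a GPU when the app runs on ZeroGPU hardware on Hugging Face Spaces.

# Minimal sketch of the pattern this commit applies (assumes torch, gradio,
# and the Hugging Face `spaces` package are installed; `run_tts` is a
# placeholder function, not one from app.py).
import torch
import gradio as gr
from spaces import GPU


@GPU  # on ZeroGPU Spaces this requests a GPU for the duration of the call
def run_tts(text: str) -> str:
    # Inside the decorated function CUDA is visible on ZeroGPU hardware,
    # so this picks the GPU there and falls back to CPU everywhere else.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return f"Would synthesize {text!r} on {device}"


demo = gr.Interface(fn=run_tts, inputs="text", outputs="text")

if __name__ == "__main__":
    demo.launch()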