KaniTTS_Voice_Cloning_dev

Running on Zero

App Files Files Community

Simonlob committed on Sep 18

Commit

46cf002

verified ·

1 Parent(s): 41423b2

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -57

app.py CHANGED Viewed

@@ -30,7 +30,7 @@ setup_dependencies()
 import spaces
 import gradio as gr
-from util import Config, NemoAudioPlayer, KaniModel
 import numpy as np
 import torch
@@ -52,6 +52,7 @@ models_configs = {
 # Global variables for models (loaded once)
 player = NemoAudioPlayer(Config())
 models = {}
 def initialize_models():
@@ -97,27 +98,26 @@ def generate_speech_gpu(text, model_choice):
         print(f"Generating speech with {model_choice}...")
         audio, _ = selected_model.run_model(text)
-        # Convert to Gradio format (sample_rate, audio_data)
-        sample_rate = 22050  # Standard sample rate for NeMo
         print("Speech generation completed!")
-        return (sample_rate, audio), f"✅ Audio generated successfully using {model_choice} on {device}"
     except Exception as e:
         print(f"Error during generation: {str(e)}")
-        return None, f"❌ Error during generation: {str(e)}"
-def validate_input(text, model_choice):
-    """Quick validation without GPU"""
-    if not text.strip():
-        return "⚠️ Please enter text for speech generation."
-    if not model_choice:
-        return "⚠️ Please select a model."
-    return f"✅ Ready to generate with {model_choice}"
 # Create Gradio interface
 with gr.Blocks(title="KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
-    gr.Markdown("# 🎤 KaniTTS - Text to Speech with Zero GPU")
     gr.Markdown("Select a model and enter text to generate high-quality speech")
     with gr.Row():
@@ -139,7 +139,7 @@ with gr.Blocks(title="KaniTTS - Text to Speech", theme=gr.themes.Default()) as d
             generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
             # Quick validation button (CPU only)
-            validate_btn = gr.Button("🔍 Validate Input", variant="secondary")
         with gr.Column(scale=1):
             audio_output = gr.Audio(
@@ -147,62 +147,79 @@ with gr.Blocks(title="KaniTTS - Text to Speech", theme=gr.themes.Default()) as d
                 type="numpy"
             )
-            status_text = gr.Textbox(
-                label="Status",
-                interactive=False,
-                value="Ready to generate speech"
-            )
     # GPU generation event
     generate_btn.click(
         fn=generate_speech_gpu,
         inputs=[text_input, model_dropdown],
-        outputs=[audio_output, status_text]
     )
-    # CPU validation event
-    validate_btn.click(
-        fn=validate_input,
-        inputs=[text_input, model_dropdown],
-        outputs=status_text
-    )
-    # Update status on input change
-    text_input.change(
-        fn=validate_input,
-        inputs=[text_input, model_dropdown],
-        outputs=status_text
-    )
     # Text examples
-    gr.Markdown("### 📝 Text Examples:")
-    examples = [
-        "Hello! How are you today?",
-        "Welcome to the world of artificial intelligence.",
-        "This is a demonstration of neural text-to-speech synthesis.",
-        "Zero GPU makes high-quality speech generation accessible to everyone!"
-    ]
-    gr.Examples(
-        examples=examples,
-        inputs=text_input,
-        label="Click on an example to use it"
-    )
-    # Information section
-    with gr.Accordion("ℹ️ Model Information", open=False):
-        gr.Markdown("""
-        **Available Models:**
-        - **Base Model**: Default pre-trained model for general use
-        - **Female Voice**: Optimized for female voice characteristics
-        - **Male Voice**: Optimized for male voice characteristics
-        **Features:**
-        - Powered by NVIDIA NeMo Toolkit
-        - High-quality 22kHz audio output
-        - Zero GPU acceleration for fast inference
-        - Support for long text sequences
-        """)
 if __name__ == "__main__":
     demo.launch(

 import spaces
 import gradio as gr
+from util import Config, NemoAudioPlayer, KaniModel, Demo
 import numpy as np
 import torch
 # Global variables for models (loaded once)
 player = NemoAudioPlayer(Config())
+demo_examples = Demo()()
 models = {}
 def initialize_models():
         print(f"Generating speech with {model_choice}...")
         audio, _ = selected_model.run_model(text)
+        sample_rate = 22050
         print("Speech generation completed!")
+        return (sample_rate, audio)   #, f"✅ Audio generated successfully using {model_choice} on {device}"
     except Exception as e:
         print(f"Error during generation: {str(e)}")
+        return None  #, f"❌ Error during generation: {str(e)}"
+# def validate_input(text, model_choice):
+#     """Quick validation without GPU"""
+#     if not text.strip():
+#         return "⚠️ Please enter text for speech generation."
+#     if not model_choice:
+#         return "⚠️ Please select a model."
+#     return f"✅ Ready to generate with {model_choice}"
 # Create Gradio interface
 with gr.Blocks(title="KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
+    gr.Markdown("# KaniTTS: Fast and Expressive Speech Generation Model")
     gr.Markdown("Select a model and enter text to generate high-quality speech")
     with gr.Row():
             generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
             # Quick validation button (CPU only)
+            # validate_btn = gr.Button("🔍 Validate Input", variant="secondary")
         with gr.Column(scale=1):
             audio_output = gr.Audio(
                 type="numpy"
             )
+            # status_text = gr.Textbox(
+            #     label="Status",
+            #     interactive=False,
+            #     value="Ready to generate speech"
+            # )
     # GPU generation event
     generate_btn.click(
         fn=generate_speech_gpu,
         inputs=[text_input, model_dropdown],
+        outputs=[audio_output]
     )
+    # Demo Examples
+    gr.Markdown("## 🎯 Demo Examples")
+    def play_demo(text):
+        return demo_examples[text], f"Playing: {text}"
+    with gr.Row():
+        for text in list(demo_examples.keys())[:4]:
+            gr.Button(text).click(lambda t=text: play_demo(t), outputs=[audio_output])
+    with gr.Row():
+        for text in list(demo_examples.keys())[4:8]:
+            gr.Button(text).click(lambda t=text: play_demo(t), outputs=[audio_output])
+    # # CPU validation event
+    # validate_btn.click(
+    #     fn=validate_input,
+    #     inputs=[text_input, model_dropdown],
+    #     outputs=status_text
+    # )
+    # # Update status on input change
+    # text_input.change(
+    #     fn=validate_input,
+    #     inputs=[text_input, model_dropdown],
+    #     outputs=status_text
+    # )
     # Text examples
+    # gr.Markdown("### 📝 Text Examples:")
+    # examples = [
+    #     "Hello! How are you today?",
+    #     "Welcome to the world of artificial intelligence.",
+    #     "This is a demonstration of neural text-to-speech synthesis.",
+    #     "Zero GPU makes high-quality speech generation accessible to everyone!"
+    # ]
+    # gr.Examples(
+    #     examples=examples,
+    #     inputs=text_input,
+    #     label="Click on an example to use it"
+    # )
+    # # Information section
+    # with gr.Accordion("ℹ️ Model Information", open=False):
+    #     gr.Markdown("""
+    #     **Available Models:**
+    #     - **Base Model**: Default pre-trained model for general use
+    #     - **Female Voice**: Optimized for female voice characteristics
+    #     - **Male Voice**: Optimized for male voice characteristics
+    #     **Features:**
+    #     - Powered by NVIDIA NeMo Toolkit
+    #     - High-quality 22kHz audio output
+    #     - Zero GPU acceleration for fast inference
+    #     - Support for long text sequences
+    #     """)
 if __name__ == "__main__":
     demo.launch(