# Hugging Face Space: nineninesix/KaniTTS (Running on Zero)
# Original file size: 7,507 bytes


import os
import subprocess
import sys

# Pin the OpenMP thread count up front. This has to happen before the
# third-party imports further down so that they see the corrected value.
_OMP_THREAD_COUNT = "4"
os.environ["OMP_NUM_THREADS"] = _OMP_THREAD_COUNT

# Install dependencies programmatically to avoid conflicts
def setup_dependencies():
    """Install the development build of transformers, at most once.

    A marker file at ``/tmp/deps_installed`` records a successful install;
    when it already exists the function does nothing. Otherwise pip is run
    through the current interpreter to force-reinstall transformers from its
    GitHub repository, and the marker is written afterwards.

    This is best-effort: any failure is printed and swallowed so that start-up
    continues. Returns ``None`` in every case.
    """
    marker_path = '/tmp/deps_installed'
    pip_command = [
        sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-cache-dir",
        "git+https://github.com/huggingface/transformers.git",
    ]

    try:
        if not os.path.exists(marker_path):
            print("Installing transformers dev version...")
            subprocess.check_call(pip_command)

            # Only reached when pip succeeded: remember it so later
            # start-ups skip the reinstall.
            with open(marker_path, 'w') as marker:
                marker.write('done')
    except Exception as e:
        print(f"Dependencies setup error: {e}")

# Run setup at import time. The call is deliberately placed ahead of the
# third-party imports below -- presumably so that a freshly installed
# transformers build is the one they load (confirm before moving this line).
setup_dependencies()

import spaces
import gradio as gr
from util import Config, NemoAudioPlayer, KaniModel, Demo
import numpy as np
import torch

# Get HuggingFace token (None when HF_TOKEN is not set in the environment)
token_ = os.getenv('HF_TOKEN')

# Model configurations: maps the label shown in the UI dropdown to the Config
# used to build that model. The first key is used as the dropdown's default.
models_configs = {
    'Base_pretrained_model': Config(),
    'Female_voice': Config(
        model_name='nineninesix/lfm-nano-codec-expresso-ex02-v.0.2',
        temperature=0.2
    ),
    'Male_voice': Config(
        model_name='nineninesix/lfm-nano-codec-expresso-ex01-v.0.1',
        temperature=0.2
    )
}

# Global variables for models (loaded once, at import time)
# One player instance is created here and passed to every KaniModel below.
player = NemoAudioPlayer(Config())
# Demo is instantiated and immediately called; the result is used by the demo
# buttons as a mapping keyed by example text.
# NOTE(review): values are presumably precomputed audio -- confirm in util.Demo.
demo_examples = Demo()()
# Loaded model per dropdown label; generate_speech_gpu looks models up by key.
models = {}
for model_name, config in models_configs.items():
    print(f"Loading {model_name}...")
    models[model_name] = KaniModel(config, player, token_)
    print(f"{model_name} loaded!")
print("All models loaded!")



# def initialize_models():
#     """Initialize models globally to avoid reloading"""
#     global models
    
#     # if player is None:
#     #     print("Initializing NeMo Audio Player...")
#     #     player = NemoAudioPlayer(Config())
#     #     print("NeMo Audio Player initialized!")
    
#     if not models:
#         print("Loading TTS models...")
#         for model_name, config in models_configs.items():
#             print(f"Loading {model_name}...")
#             models[model_name] = KaniModel(config, player, token_)
#             print(f"{model_name} loaded!")
#         print("All models loaded!")

@spaces.GPU
def generate_speech_gpu(text, model_choice):
    """
    Generate speech from text using the selected model on GPU.

    Args:
        text: Text to synthesize. ``None``, empty and whitespace-only values
            are rejected with a user-facing message instead of raising.
        model_choice: Key of the model to use in the module-level ``models``
            dict.

    Returns:
        A pair ``(audio, message)``. On success ``audio`` is
        ``(sample_rate, samples)`` and ``message`` is the time report
        returned by the model. On any failure ``audio`` is ``None`` and
        ``message`` describes the problem.
    """
    # `text` may be None (e.g. a null payload sent through the API), so test
    # truthiness before calling .strip() to avoid an AttributeError.
    if not text or not text.strip():
        return None, "Please enter text for speech generation."

    if not model_choice:
        return None, "Please select a model."

    try:
        # Check GPU availability (only used for the log line below)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        # Get selected model; an unknown key raises KeyError, reported below
        selected_model = models[model_choice]

        # Generate audio; the middle return value is not needed here
        print(f"Generating speech with {model_choice}...")
        audio, _, time_report = selected_model.run_model(text)

        # Sample rate handed to the audio widget together with the samples
        sample_rate = 22050
        print("Speech generation completed!")

        return (sample_rate, audio), time_report

    except Exception as e:
        # UI boundary: surface the failure in the status box rather than
        # letting the exception propagate out of the event handler.
        print(f"Error during generation: {str(e)}")
        return None, f"❌ Error during generation: {str(e)}"

# def validate_input(text, model_choice):
#     """Quick validation without GPU"""
#     if not text.strip():
#         return "⚠️ Please enter text for speech generation."
#     if not model_choice:
#         return "⚠️ Please select a model."
#     return f"✅ Ready to generate with {model_choice}"

# Build the Gradio UI
with gr.Blocks(title="KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
    gr.Markdown("# KaniTTS: Fast and Expressive Speech Generation Model")
    gr.Markdown("Select a model and enter text to generate high-quality speech")

    with gr.Row():
        # Left column: model selection, text entry and the generate button
        with gr.Column(scale=1):
            model_names = list(models_configs)
            model_dropdown = gr.Dropdown(
                choices=model_names,
                value=model_names[0],
                label="Select Model",
                info="Base - default model, Female - female voice, Male - male voice",
            )

            text_input = gr.Textbox(
                label="Enter Text",
                placeholder="Enter text for speech generation...",
                lines=3,
                max_lines=10,
            )

            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        # Right column: generated audio and the status / timing box
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy",
            )

            time_report_output = gr.Textbox(
                label="Time Report",
                interactive=False,
                value="Ready to generate speech",
                lines=3,
            )

    # Wire the button to the GPU-backed generation function
    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown],
        outputs=[audio_output, time_report_output],
    )

    # Demo examples: buttons that play stored audio without running a model
    gr.Markdown("## 🎯 Demo Examples")

    def play_demo(text):
        """Return the stored demo audio for ``text`` and a 'DEMO' status label."""
        demo_audio = (22050, demo_examples[text])
        return demo_audio, 'DEMO'

    # Two rows of up to four buttons each, covering the first eight examples.
    demo_texts = list(demo_examples.keys())
    for row_start in (0, 4):
        with gr.Row():
            for text in demo_texts[row_start:row_start + 4]:
                # t=text freezes the current label; a plain closure would see
                # only the last value of the loop variable.
                gr.Button(text).click(lambda t=text: play_demo(t), outputs=[audio_output, time_report_output])

            
    # # CPU validation event
    # validate_btn.click(
    #     fn=validate_input,
    #     inputs=[text_input, model_dropdown],
    #     outputs=status_text
    # )
    
    # # Update status on input change
    # text_input.change(
    #     fn=validate_input,
    #     inputs=[text_input, model_dropdown],
    #     outputs=status_text
    # )
    
    # Text examples
    # gr.Markdown("### 📝 Text Examples:")
    # examples = [
    #     "Hello! How are you today?",
    #     "Welcome to the world of artificial intelligence.",
    #     "This is a demonstration of neural text-to-speech synthesis.",
    #     "Zero GPU makes high-quality speech generation accessible to everyone!"
    # ]
    
    # gr.Examples(
    #     examples=examples,
    #     inputs=text_input,
    #     label="Click on an example to use it"
    # )
    
    # # Information section
    # with gr.Accordion("ℹ️ Model Information", open=False):
    #     gr.Markdown("""
    #     **Available Models:**
    #     - **Base Model**: Default pre-trained model for general use
    #     - **Female Voice**: Optimized for female voice characteristics
    #     - **Male Voice**: Optimized for male voice characteristics
        
    #     **Features:**
    #     - Powered by NVIDIA NeMo Toolkit
    #     - High-quality 22kHz audio output
    #     - Zero GPU acceleration for fast inference
    #     - Support for long text sequences
    #     """)

if __name__ == "__main__":
    # Listen on all interfaces at port 7860 and show handler errors in the UI.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo.launch(**launch_options)