File size: 7,507 Bytes
164603c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46cf002
164603c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26adaf1
46cf002
164603c
e9bcb5a
 
 
 
 
164603c
e9bcb5a
 
 
 
 
164603c
e9bcb5a
 
 
 
164603c
e9bcb5a
 
 
 
 
 
 
164603c
 
 
 
 
 
 
e9bcb5a
164603c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52c0d1f
164603c
46cf002
164603c
 
52c0d1f
164603c
 
 
52c0d1f
164603c
46cf002
 
 
 
 
 
 
164603c
 
26adaf1
46cf002
164603c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46cf002
164603c
 
 
 
 
 
 
52c0d1f
 
 
aa6abd6
 
52c0d1f
164603c
 
 
 
 
52c0d1f
164603c
 
46cf002
 
164603c
46cf002
d12d4e0
46cf002
 
 
d12d4e0
46cf002
 
 
d12d4e0
46cf002
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164603c
 
46cf002
 
 
 
 
 
 
164603c
46cf002
 
 
 
 
164603c
46cf002
 
 
 
 
 
 
164603c
46cf002
 
 
 
 
 
164603c
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import os
import subprocess
import sys

# Fix OMP_NUM_THREADS issue before any imports.
# The value is pinned unconditionally (any inherited setting is overwritten)
# and is set ahead of the third-party imports further down; presumably those
# libraries read the variable when they are first imported -- TODO confirm.
os.environ["OMP_NUM_THREADS"] = "4"

# Install dependencies programmatically to avoid conflicts
def setup_dependencies():
    """Install the development build of transformers, at most once.

    A marker file under /tmp records a successful install; when the marker
    is already present the function does nothing. Any failure (pip exiting
    non-zero, the marker not being writable, ...) is reported on stdout and
    otherwise ignored, so start-up continues on a best-effort basis.

    Returns:
        None in every case.
    """
    marker_path = '/tmp/deps_installed'
    pip_command = [
        sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-cache-dir",
        "git+https://github.com/huggingface/transformers.git",
    ]

    try:
        if not os.path.exists(marker_path):
            print("Installing transformers dev version...")
            subprocess.check_call(pip_command)

            # Only reached when pip succeeded: remember that for later runs.
            with open(marker_path, 'w') as marker:
                marker.write('done')
    except Exception as err:
        print(f"Dependencies setup error: {err}")

# Run setup at import time, ahead of the third-party imports below, so that
# an install (when one happens) has finished before those modules are loaded.
setup_dependencies()

import spaces
import gradio as gr
from util import Config, NemoAudioPlayer, KaniModel, Demo
import numpy as np
import torch

# Get HuggingFace token
# os.getenv returns None when HF_TOKEN is unset; the value is passed
# unchanged to every KaniModel constructed below.
token_ = os.getenv('HF_TOKEN')

# Model configurations
# The keys are reused as the dropdown choices in the UI and as the lookup
# keys of `models`, so renaming one changes what the user sees.
models_configs = {
    'Base_pretrained_model': Config(),
    'Female_voice': Config(
        model_name='nineninesix/lfm-nano-codec-expresso-ex02-v.0.2',
        temperature=0.2
    ),
    'Male_voice': Config(
        model_name='nineninesix/lfm-nano-codec-expresso-ex01-v.0.1',
        temperature=0.2
    )
}

# Global variables for models (loaded once)
# A single audio player instance is shared by all models.
player = NemoAudioPlayer(Config())
# Demo() is instantiated and immediately called; the result is used further
# down like a mapping from example text to audio (it is indexed by text and
# `.keys()` is called on it).
demo_examples = Demo()()
# Every configured model is built eagerly at import time, in the order of
# `models_configs`, with progress printed to stdout.
models = {}
for model_name, config in models_configs.items():
    print(f"Loading {model_name}...")
    models[model_name] = KaniModel(config, player, token_)
    print(f"{model_name} loaded!")
print("All models loaded!")



# def initialize_models():
#     """Initialize models globally to avoid reloading"""
#     global models
    
#     # if player is None:
#     #     print("Initializing NeMo Audio Player...")
#     #     player = NemoAudioPlayer(Config())
#     #     print("NeMo Audio Player initialized!")
    
#     if not models:
#         print("Loading TTS models...")
#         for model_name, config in models_configs.items():
#             print(f"Loading {model_name}...")
#             models[model_name] = KaniModel(config, player, token_)
#             print(f"{model_name} loaded!")
#         print("All models loaded!")

@spaces.GPU
def generate_speech_gpu(text, model_choice):
    """Generate speech from text using the selected model.

    Args:
        text: Text to synthesise. ``None``, empty and whitespace-only values
            are rejected with a message instead of raising.
        model_choice: Key of the model to use in the module-level ``models``
            dict.

    Returns:
        A 2-tuple ``(audio, message)`` matching the two outputs wired to this
        handler. On success ``audio`` is ``(sample_rate, samples)`` and
        ``message`` is the time report returned by the model's ``run_model``.
        On any failure ``audio`` is ``None`` and ``message`` describes the
        problem; exceptions raised during generation are caught, printed and
        reported through ``message`` rather than propagated.
    """
    # `text` may be None (for example from a programmatic caller), so test it
    # before calling .strip() to avoid an AttributeError.
    if not text or not text.strip():
        return None, "Please enter text for speech generation."

    if not model_choice:
        return None, "Please select a model."

    try:
        # Informational only: the device is logged, not passed to the model.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        # An unknown key raises KeyError, which is reported by the handler below.
        selected_model = models[model_choice]

        print(f"Generating speech with {model_choice}...")
        # run_model returns three values; the middle one is not used here.
        audio, _, time_report = selected_model.run_model(text)
    except Exception as e:
        print(f"Error during generation: {e}")
        return None, f"❌ Error during generation: {e}"

    # NOTE(review): the sample rate is hard-coded; presumably it matches the
    # rate produced by the audio player -- confirm against util.
    sample_rate = 22050
    print("Speech generation completed!")

    return (sample_rate, audio), time_report

# def validate_input(text, model_choice):
#     """Quick validation without GPU"""
#     if not text.strip():
#         return "⚠️ Please enter text for speech generation."
#     if not model_choice:
#         return "⚠️ Please select a model."
#     return f"✅ Ready to generate with {model_choice}"

# Create Gradio interface
# Layout: a left column with the model selector, the text box and the
# generate button; a right column with the audio player and the time report.
# Below that, up to eight demo buttons play stored examples.
with gr.Blocks(title="KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
    gr.Markdown("# KaniTTS: Fast and Expressive Speech Generation Model")
    gr.Markdown("Select a model and enter text to generate high-quality speech")

    with gr.Row():
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                choices=list(models_configs.keys()),
                # The first configured model is preselected.
                value=list(models_configs.keys())[0],
                label="Select Model",
                info="Base - default model, Female - female voice, Male - male voice"
            )

            text_input = gr.Textbox(
                label="Enter Text",
                placeholder="Enter text for speech generation...",
                lines=3,
                max_lines=10
            )

            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy"
            )

            time_report_output = gr.Textbox(
                label="Time Report",
                interactive=False,
                value="Ready to generate speech",
                lines=3
            )

    # GPU generation event: both outputs are filled from the handler's
    # (audio, message) return value.
    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown],
        outputs=[audio_output, time_report_output]
    )

    # Demo Examples
    gr.Markdown("## 🎯 Demo Examples")

    def play_demo(text):
        """Return the stored audio for a demo text plus a 'DEMO' report string."""
        return (22050, demo_examples[text]), 'DEMO'

    demo_texts = list(demo_examples.keys())
    # At most eight examples are shown, as two rows of up to four buttons.
    for row_start in (0, 4):
        with gr.Row():
            for text in demo_texts[row_start:row_start + 4]:
                # The default argument binds the current text to this button;
                # a plain closure would see only the last loop value.
                gr.Button(text).click(lambda t=text: play_demo(t), outputs=[audio_output, time_report_output])

            
    # # CPU validation event
    # validate_btn.click(
    #     fn=validate_input,
    #     inputs=[text_input, model_dropdown],
    #     outputs=status_text
    # )
    
    # # Update status on input change
    # text_input.change(
    #     fn=validate_input,
    #     inputs=[text_input, model_dropdown],
    #     outputs=status_text
    # )
    
    # Text examples
    # gr.Markdown("### 📝 Text Examples:")
    # examples = [
    #     "Hello! How are you today?",
    #     "Welcome to the world of artificial intelligence.",
    #     "This is a demonstration of neural text-to-speech synthesis.",
    #     "Zero GPU makes high-quality speech generation accessible to everyone!"
    # ]
    
    # gr.Examples(
    #     examples=examples,
    #     inputs=text_input,
    #     label="Click on an example to use it"
    # )
    
    # # Information section
    # with gr.Accordion("ℹ️ Model Information", open=False):
    #     gr.Markdown("""
    #     **Available Models:**
    #     - **Base Model**: Default pre-trained model for general use
    #     - **Female Voice**: Optimized for female voice characteristics
    #     - **Male Voice**: Optimized for male voice characteristics
        
    #     **Features:**
    #     - Powered by NVIDIA NeMo Toolkit
    #     - High-quality 22kHz audio output
    #     - Zero GPU acceleration for fast inference
    #     - Support for long text sequences
    #     """)

if __name__ == "__main__":
    # Listen on all interfaces on port 7860 and enable Gradio's error display.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo.launch(**launch_options)