Spaces:
Runtime error
Runtime error
# Copied from https://huggingface.co/spaces/KingNish/Kitten-TTS and modified to handle large text input.
| import gradio as gr | |
| import tempfile | |
| import uuid | |
| import os | |
| import re | |
| import numpy as np | |
| import soundfile as sf | |
| from kittentts import KittenTTS | |
| from tqdm.auto import tqdm | |
# Load the TTS model once at import time; the handlers below share this instance.
MODEL_REPO_ID = "KittenML/kitten-tts-nano-0.1"
model = KittenTTS(MODEL_REPO_ID)
def _split_oversized_sentence(sentence, limit):
    """
    Break a single sentence into pieces of at most `limit` characters.

    A sentence that already fits is returned unchanged. Otherwise it is split
    on whitespace and the words are greedily re-joined with single spaces; a
    single word longer than `limit` is hard-sliced into `limit`-sized parts.
    """
    if len(sentence) <= limit:
        return [sentence]

    pieces = []
    current = ""
    for word in sentence.split():
        # A word that can never fit on its own is cut into fixed-size slices.
        while len(word) > limit:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:limit])
            word = word[limit:]

        candidate = f"{current} {word}" if current else word
        if len(candidate) > limit:
            pieces.append(current)
            current = word
        else:
            current = candidate

    if current:
        pieces.append(current)
    return pieces


def split_text_into_chunks(text, chunk_size=400):
    """
    Split long text into smaller chunks of max length `chunk_size`.

    Text is first split into sentences (after '.', '!' or '?' followed by one
    or more spaces) and whole sentences are packed greedily into chunks.
    A sentence that is longer than `chunk_size` on its own is further split on
    word boundaries so that no returned chunk exceeds `chunk_size`.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of non-empty, stripped chunks in their original order. Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If `chunk_size` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Split by punctuation followed by space (preserves sentence boundaries)
    sentences = re.split(r'(?<=[.!?]) +', text)

    chunks = []
    current_chunk = ""
    for sentence in sentences:
        for piece in _split_oversized_sentence(sentence, chunk_size):
            # current_chunk carries a trailing space, which accounts for the
            # separator between the text so far and the next piece.
            if len(current_chunk) + len(piece) > chunk_size:
                if current_chunk.strip():
                    chunks.append(current_chunk.strip())
                current_chunk = ""
            current_chunk += piece + " "

    # Flush the remainder, but never emit an empty chunk.
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
def generate_speech(text, voice, speed):
    """
    Generate speech from long text in a memory-efficient way.

    The text is split into chunks; each chunk is synthesized on its own and
    written straight to one WAV file (24 kHz, mono, 16-bit PCM) instead of
    keeping all audio in memory.

    Args:
        text: The text to synthesize.
        voice: Voice identifier forwarded to `model.generate`.
        speed: Speed multiplier forwarded to `model.generate`.

    Returns:
        The path of the generated WAV file. Exactly one value is returned,
        matching the single audio output this handler is bound to.

    Raises:
        gr.Error: If `text` is empty or if synthesis/writing fails. Failures
            are raised rather than returned as an extra value, because the
            handler has only one output component.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to generate speech.")

    output_path = None
    try:
        # Break text into manageable chunks
        chunks = split_text_into_chunks(text, chunk_size=400)

        # Shared output directory (update this path to your shared disk)
        shared_dir = "./saved_audio"
        os.makedirs(shared_dir, exist_ok=True)
        unique_filename = f"kitten_tts_{uuid.uuid4()}.wav"
        output_path = os.path.join(shared_dir, unique_filename)

        # Open the WAV file for writing
        with sf.SoundFile(output_path, mode='w', samplerate=24000, channels=1, subtype='PCM_16') as f:
            for chunk in tqdm(chunks, desc="Streaming audio to disk", unit="chunk"):
                # NOTE(review): the " ...." suffix presumably pads the end of
                # each chunk so the last word is not cut off; confirm.
                audio = model.generate(chunk + " ....", voice=voice, speed=speed)
                f.write(audio)  # Write audio directly to disk
    except Exception as e:
        # Do not leave a truncated WAV behind when generation fails midway.
        if output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass
        raise gr.Error(f"Error during TTS generation: {str(e)}") from e

    return output_path
def get_available_voices():
    """
    Get list of available voices from the model.

    Returns:
        `model.available_voices` when it can be read and is non-empty,
        otherwise a single-entry fallback list with the default voice.
    """
    fallback = ["expr-voice-5-m"]
    try:
        voices = model.available_voices
        return voices if voices else fallback
    except Exception:
        # Best-effort lookup: any failure falls back to the default voice.
        # Unlike a bare `except:`, this lets KeyboardInterrupt and SystemExit
        # propagate.
        return fallback
# Get voices once on load
available_voices = get_available_voices()

# Create Gradio UI
with gr.Blocks(title="KittenTTS - Text to Speech", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🐱 KittenTTS - Text to Speech Generator")
    gr.Markdown("Convert your text to high-quality speech using the KittenTTS nano model!")

    with gr.Row():
        # Left column: text, voice and speed inputs plus the trigger button.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Convert",
                placeholder="Enter the text you want to convert to speech...",
                lines=4,
                max_lines=10,
            )
            with gr.Row():
                voice_dropdown = gr.Dropdown(
                    choices=available_voices,
                    value=available_voices[0],
                    label="Voice Selection",
                    info="Choose the voice for speech generation",
                )
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    step=0.01,
                    value=1,
                    label="Speech Speed",
                    info="Adjust the speed of speech (0.5x to 2.0x)",
                )
            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        # Right column: player for the generated WAV file.
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
                autoplay=True,
            )

    # The click and submit events and the examples all use the same handler
    # with the same inputs and outputs, so the wiring is defined once.
    speech_handler = dict(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output],
    )

    gr.Markdown("## 📝 Example Texts")
    gr.Examples(
        examples=[
            ["Hello! This is a test of the KittenTTS model.", available_voices[0], 1],
            ["The quick brown fox jumps over the lazy dog.", available_voices[0], 1.25],
            ["Welcome to the world of high-quality text-to-speech synthesis!", available_voices[0], 1.5],
        ],
        label="Click on an example to try it out",
        # Example caching is left disabled (previously tried: cache_examples="lazy").
        **speech_handler,
    )

    with gr.Accordion("ℹ️ Model Information", open=False):
        # Blank lines separate the paragraphs so each heading renders on its
        # own line instead of being merged into the preceding list.
        gr.Markdown("""
        **Model:** `KittenML/kitten-tts-nano-0.1`

        **Features:**
        - High-quality text-to-speech synthesis
        - Works without GPU acceleration
        - Multiple voice options
        - Adjustable speech speed
        - 24kHz audio output

        **Usage Instructions:**
        1. Enter your text
        2. Select a voice
        3. Adjust the speech speed if needed
        4. Click "Generate Speech"
        """)

    # Event Bindings
    generate_btn.click(**speech_handler)
    text_input.submit(**speech_handler)

# Run the app
if __name__ == "__main__":
    app.queue().launch()