KaniTTS_Voice_Cloning_dev

Running on Zero

App Files Files Community

Den Pavloff committed on Sep 17

Commit

164603c

1 Parent(s): 91eb188

first

Browse files

Files changed (3) hide show

app.py +212 -0
requirements.txt +5 -0
util.py +222 -0

app.py ADDED Viewed

	@@ -0,0 +1,212 @@

+import os
+import subprocess
+import sys
+# Set OMP_NUM_THREADS before the heavy imports further down (spaces, gradio,
+# util, numpy, torch) so the value is already in the environment when they load
+os.environ["OMP_NUM_THREADS"] = "4"
+# Install dependencies programmatically to avoid conflicts
+def setup_dependencies():
+    try:
+        # Check if already installed
+        if os.path.exists('/tmp/deps_installed'):
+            return
+        print("Installing transformers dev version...")
+        subprocess.check_call([
+            sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-cache-dir",
+            "git+https://github.com/huggingface/transformers.git"
+        ])
+        # Mark as installed
+        with open('/tmp/deps_installed', 'w') as f:
+            f.write('done')
+    except Exception as e:
+        print(f"Dependencies setup error: {e}")
+# Run setup
+setup_dependencies()
+import spaces
+import gradio as gr
+from util import Config, NemoAudioPlayer, KaniModel
+import numpy as np
+import torch
+# Get HuggingFace token
+token_ = os.getenv('HF_TOKEN')
+# Model configurations
+models_configs = {
+    'Base_pretrained_model': Config(),
+    'Female_voice': Config(
+        model_name='nineninesix/lfm-nano-codec-expresso-ex02-v.0.2',
+        temperature=0.2
+    ),
+    'Male_voice': Config(
+        model_name='nineninesix/lfm-nano-codec-expresso-ex01-v.0.1',
+        temperature=0.2
+    )
+}
+# Global variables for models (loaded once)
+player = None
+models = {}
+def initialize_models():
+    """Initialize models globally to avoid reloading"""
+    global player, models
+    if player is None:
+        print("Initializing NeMo Audio Player...")
+        player = NemoAudioPlayer(Config())
+        print("NeMo Audio Player initialized!")
+    if not models:
+        print("Loading TTS models...")
+        for model_name, config in models_configs.items():
+            print(f"Loading {model_name}...")
+            models[model_name] = KaniModel(config, player, token_)
+            print(f"{model_name} loaded!")
+        print("All models loaded!")
+@spaces.GPU
+def generate_speech_gpu(text, model_choice):
+    """
+    Generate speech from text using the selected model on GPU
+    """
+    # Initialize models if not already done
+    initialize_models()
+    if not text.strip():
+        return None, "Please enter text for speech generation."
+    if not model_choice:
+        return None, "Please select a model."
+    try:
+        # Check GPU availability
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"Using device: {device}")
+        # Get selected model
+        selected_model = models[model_choice]
+        # Generate audio
+        print(f"Generating speech with {model_choice}...")
+        audio, _ = selected_model.run_model(text)
+        # Convert to Gradio format (sample_rate, audio_data)
+        sample_rate = 22050  # Standard sample rate for NeMo
+        print("Speech generation completed!")
+        return (sample_rate, audio), f"✅ Audio generated successfully using {model_choice} on {device}"
+    except Exception as e:
+        print(f"Error during generation: {str(e)}")
+        return None, f"❌ Error during generation: {str(e)}"
+def validate_input(text, model_choice):
+    """Quick validation without GPU"""
+    if not text.strip():
+        return "⚠️ Please enter text for speech generation."
+    if not model_choice:
+        return "⚠️ Please select a model."
+    return f"✅ Ready to generate with {model_choice}"
+# Create Gradio interface
+# Layout: one row with two equal columns (inputs and buttons on the left,
+# audio and status on the right), followed by clickable text examples and a
+# collapsed information accordion.
+with gr.Blocks(title="KaniTTS - Text to Speech", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🎤 KaniTTS - Text to Speech with Zero GPU")
+    gr.Markdown("Select a model and enter text to generate high-quality speech")
+    with gr.Row():
+        # Left column: model choice, text input and the two action buttons
+        with gr.Column(scale=1):
+            # Choices are the models_configs keys; the first key is preselected
+            model_dropdown = gr.Dropdown(
+                choices=list(models_configs.keys()),
+                value=list(models_configs.keys())[0],
+                label="Select Model",
+                info="Base - default model, Female - female voice, Male - male voice"
+            )
+            text_input = gr.Textbox(
+                label="Enter Text",
+                placeholder="Enter text for speech generation...",
+                lines=3,
+                max_lines=10
+            )
+            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
+            # Quick validation button (CPU only)
+            validate_btn = gr.Button("🔍 Validate Input", variant="secondary")
+        # Right column: generated audio and a read-only status line
+        with gr.Column(scale=1):
+            audio_output = gr.Audio(
+                label="Generated Speech",
+                type="numpy"
+            )
+            status_text = gr.Textbox(
+                label="Status",
+                interactive=False,
+                value="Ready to generate speech"
+            )
+    # GPU generation event: fills both the audio player and the status line
+    generate_btn.click(
+        fn=generate_speech_gpu,
+        inputs=[text_input, model_dropdown],
+        outputs=[audio_output, status_text]
+    )
+    # CPU validation event: only updates the status line
+    validate_btn.click(
+        fn=validate_input,
+        inputs=[text_input, model_dropdown],
+        outputs=status_text
+    )
+    # Update status on input change (same validation as the button above)
+    text_input.change(
+        fn=validate_input,
+        inputs=[text_input, model_dropdown],
+        outputs=status_text
+    )
+    # Text examples
+    gr.Markdown("### 📝 Text Examples:")
+    examples = [
+        "Hello! How are you today?",
+        "Welcome to the world of artificial intelligence.",
+        "This is a demonstration of neural text-to-speech synthesis.",
+        "Zero GPU makes high-quality speech generation accessible to everyone!"
+    ]
+    # Selecting an example copies it into the text box
+    gr.Examples(
+        examples=examples,
+        inputs=text_input,
+        label="Click on an example to use it"
+    )
+    # Information section (collapsed by default)
+    with gr.Accordion("ℹ️ Model Information", open=False):
+        gr.Markdown("""
+        **Available Models:**
+        - **Base Model**: Default pre-trained model for general use
+        - **Female Voice**: Optimized for female voice characteristics
+        - **Male Voice**: Optimized for male voice characteristics
+        **Features:**
+        - Powered by NVIDIA NeMo Toolkit
+        - High-quality 22kHz audio output
+        - Zero GPU acceleration for fast inference
+        - Support for long text sequences
+        """)
+if __name__ == "__main__":
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True
+    )

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+torch==2.8.0
+librosa==0.11.0
+nemo_toolkit[all]==2.4.0
+numpy==1.26.4
+gradio>=4.0.0

util.py ADDED Viewed

	@@ -0,0 +1,222 @@

+import torch
+from nemo.collections.tts.models import AudioCodecModel
+from dataclasses import dataclass
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import os
+@dataclass
+class Config:
+    model_name: str = "nineninesix/lfm-nano-codec-tts-exp-4-large-61468-st"
+    audiocodec_name: str = "nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps"
+    device_map: str = "auto"
+    tokeniser_length: int = 64400
+    start_of_text: int = 1
+    end_of_text: int = 2
+    max_new_tokens: int = 2000
+    temperature: float = .6
+    top_p: float = .95
+    repetition_penalty: float = 1.1
+class NemoAudioPlayer:
+    def __init__(self, config, text_tokenizer_name: str = None) -> None:
+        self.conf = config
+        print(f"Loading NeMo codec model: {self.conf.audiocodec_name}")
+        # Load NeMo codec model
+        self.nemo_codec_model = AudioCodecModel.from_pretrained(
+            self.conf.audiocodec_name
+        ).eval()
+        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        print(f"Moving NeMo codec to device: {self.device}")
+        self.nemo_codec_model.to(self.device)
+        self.text_tokenizer_name = text_tokenizer_name
+        if self.text_tokenizer_name:
+            self.tokenizer = AutoTokenizer.from_pretrained(self.text_tokenizer_name)
+        # Token configuration
+        self.tokeniser_length = self.conf.tokeniser_length
+        self.start_of_text = self.conf.start_of_text
+        self.end_of_text = self.conf.end_of_text
+        self.start_of_speech = self.tokeniser_length + 1
+        self.end_of_speech = self.tokeniser_length + 2
+        self.start_of_human = self.tokeniser_length + 3
+        self.end_of_human = self.tokeniser_length + 4
+        self.start_of_ai = self.tokeniser_length + 5
+        self.end_of_ai = self.tokeniser_length + 6
+        self.pad_token = self.tokeniser_length + 7
+        self.audio_tokens_start = self.tokeniser_length + 10
+        self.codebook_size = 4032
+    def output_validation(self, out_ids):
+        """Validate that output contains required speech tokens"""
+        start_of_speech_flag = self.start_of_speech in out_ids
+        end_of_speech_flag = self.end_of_speech in out_ids
+        if not (start_of_speech_flag and end_of_speech_flag):
+            raise ValueError('Special speech tokens not found in output!')
+        print("Output validation passed - speech tokens found")
+    def get_nano_codes(self, out_ids):
+        """Extract nano codec tokens from model output"""
+        try:
+            start_a_idx = (out_ids == self.start_of_speech).nonzero(as_tuple=True)[0].item()
+            end_a_idx = (out_ids == self.end_of_speech).nonzero(as_tuple=True)[0].item()
+        except IndexError:
+            raise ValueError('Speech start/end tokens not found!')
+        if start_a_idx >= end_a_idx:
+            raise ValueError('Invalid audio codes sequence!')
+        audio_codes = out_ids[start_a_idx + 1: end_a_idx]
+        if len(audio_codes) % 4:
+            raise ValueError('Audio codes length must be multiple of 4!')
+        audio_codes = audio_codes.reshape(-1, 4)
+        # Decode audio codes
+        audio_codes = audio_codes - torch.tensor([self.codebook_size * i for i in range(4)])
+        audio_codes = audio_codes - self.audio_tokens_start
+        if (audio_codes < 0).sum().item() > 0:
+            raise ValueError('Invalid audio tokens detected!')
+        audio_codes = audio_codes.T.unsqueeze(0)
+        len_ = torch.tensor([audio_codes.shape[-1]])
+        print(f"Extracted audio codes shape: {audio_codes.shape}")
+        return audio_codes, len_
+    def get_text(self, out_ids):
+        """Extract text from model output"""
+        try:
+            start_t_idx = (out_ids == self.start_of_text).nonzero(as_tuple=True)[0].item()
+            end_t_idx = (out_ids == self.end_of_text).nonzero(as_tuple=True)[0].item()
+        except IndexError:
+            raise ValueError('Text start/end tokens not found!')
+        txt_tokens = out_ids[start_t_idx: end_t_idx + 1]
+        text = self.tokenizer.decode(txt_tokens, skip_special_tokens=True)
+        return text
+    def get_waveform(self, out_ids):
+        """Convert model output to audio waveform"""
+        out_ids = out_ids.flatten()
+        print("Starting waveform generation...")
+        # Validate output
+        self.output_validation(out_ids)
+        # Extract audio codes
+        audio_codes, len_ = self.get_nano_codes(out_ids)
+        audio_codes, len_ = audio_codes.to(self.device), len_.to(self.device)
+        print("Decoding audio with NeMo codec...")
+        with torch.inference_mode():
+            reconstructed_audio, _ = self.nemo_codec_model.decode(
+                tokens=audio_codes,
+                tokens_len=len_
+            )
+            output_audio = reconstructed_audio.cpu().detach().numpy().squeeze()
+        print(f"Generated audio shape: {output_audio.shape}")
+        if self.text_tokenizer_name:
+            text = self.get_text(out_ids)
+            return output_audio, text
+        else:
+            return output_audio, None
+class KaniModel:
+    def __init__(self, config, player: NemoAudioPlayer, token: str) -> None:
+        self.conf = config
+        self.player = player
+        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        print(f"Loading model: {self.conf.model_name}")
+        print(f"Target device: {self.device}")
+        # Load model with proper configuration
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.conf.model_name,
+            torch_dtype=torch.bfloat16,
+            device_map=self.conf.device_map,
+            token=token,
+            trust_remote_code=True  # May be needed for some models
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.conf.model_name,
+            token=token,
+            trust_remote_code=True
+        )
+        print(f"Model loaded successfully on device: {next(self.model.parameters()).device}")
+    def get_input_ids(self, text_prompt: str) -> tuple[torch.tensor]:
+        """Prepare input tokens for the model"""
+        START_OF_HUMAN = self.player.start_of_human
+        END_OF_TEXT = self.player.end_of_text
+        END_OF_HUMAN = self.player.end_of_human
+        # Tokenize input text
+        input_ids = self.tokenizer(text_prompt, return_tensors="pt").input_ids
+        # Add special tokens
+        start_token = torch.tensor([[START_OF_HUMAN]], dtype=torch.int64)
+        end_tokens = torch.tensor([[END_OF_TEXT, END_OF_HUMAN]], dtype=torch.int64)
+        # Concatenate tokens
+        modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)
+        attention_mask = torch.ones(1, modified_input_ids.shape[1], dtype=torch.int64)
+        print(f"Input sequence length: {modified_input_ids.shape[1]}")
+        return modified_input_ids, attention_mask
+    def model_request(self, input_ids: torch.tensor, attention_mask: torch.tensor) -> torch.tensor:
+        """Generate tokens using the model"""
+        input_ids = input_ids.to(self.device)
+        attention_mask = attention_mask.to(self.device)
+        print("Starting model generation...")
+        print(f"Generation parameters: max_tokens={self.conf.max_new_tokens}, "
+              f"temp={self.conf.temperature}, top_p={self.conf.top_p}")
+        with torch.no_grad():
+            generated_ids = self.model.generate(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                max_new_tokens=self.conf.max_new_tokens,
+                do_sample=True,
+                temperature=self.conf.temperature,
+                top_p=self.conf.top_p,
+                repetition_penalty=self.conf.repetition_penalty,
+                num_return_sequences=1,
+                eos_token_id=self.player.end_of_speech,
+                pad_token_id=self.tokenizer.pad_token_id if self.tokenizer.pad_token_id else self.tokenizer.eos_token_id
+            )
+        print(f"Generated sequence length: {generated_ids.shape[1]}")
+        return generated_ids.to('cpu')
+    def run_model(self, text: str):
+        """Complete pipeline: text -> tokens -> generation -> audio"""
+        print(f"Processing text: '{text[:50]}{'...' if len(text) > 50 else ''}'")
+        # Prepare input
+        input_ids, attention_mask = self.get_input_ids(text)
+        # Generate tokens
+        model_output = self.model_request(input_ids, attention_mask)
+        # Convert to audio
+        audio, _ = self.player.get_waveform(model_output)
+        print("Text-to-speech generation completed successfully!")
+        return audio, text