diff --git a/app.py b/app.py deleted file mode 100644 index 35af657fe47469befad734c8194e4b64bf6f3b2a..0000000000000000000000000000000000000000 --- a/app.py +++ /dev/null @@ -1,805 +0,0 @@ -import os -import time -import numpy as np -import gradio as gr -import librosa -import soundfile as sf -import torch -import traceback -import threading -from spaces import GPU -from datetime import datetime -from contextlib import contextmanager - -from modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference -from processor.vibevoice_processor import VibeVoiceProcessor -from modular.streamer import AudioStreamer -from transformers.utils import logging -from transformers import set_seed - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - - -class VibeVoiceDemo: - def __init__(self, model_paths: dict, device: str = "cuda", inference_steps: int = 5): - """ - model_paths: dict like {"VibeVoice-1.5B": "microsoft/VibeVoice-1.5B", - "VibeVoice-7B": "microsoft/VibeVoice-7B"} - """ - self.model_paths = model_paths - self.device = device - self.inference_steps = inference_steps - - self.is_generating = False - - # Multi-model holders - self.models = {} # name -> model - self.processors = {} # name -> processor - self.current_model_name = None - - self.available_voices = {} - - # Set compiler flags for better performance - if torch.cuda.is_available() and hasattr(torch, '_inductor'): - if hasattr(torch._inductor, 'config'): - torch._inductor.config.conv_1x1_as_mm = True - torch._inductor.config.coordinate_descent_tuning = True - torch._inductor.config.epilogue_fusion = False - torch._inductor.config.coordinate_descent_check_all_directions = True - - self.load_models() # load all on CPU - self.setup_voice_presets() - self.load_example_scripts() - - def load_models(self): - print("Loading processors and models on CPU...") - - # Debug: Show cache location - import os - cache_dir = os.path.expanduser("~/.cache/huggingface/hub") - print(f"HuggingFace cache directory: {cache_dir}") - if os.path.exists(cache_dir): - print(f"Cache exists. Size: {sum(os.path.getsize(os.path.join(dirpath, filename)) for dirpath, _, filenames in os.walk(cache_dir) for filename in filenames) / (1024**3):.2f} GB") - print("Cached models:") - for item in os.listdir(cache_dir): - if item.startswith("models--"): - print(f" - {item}") - - for name, path in self.model_paths.items(): - print(f" - {name} from {path}") - proc = VibeVoiceProcessor.from_pretrained(path) - # Use SDPA (Scaled Dot Product Attention) for better memory efficiency - # Flash Attention 2 disabled to reduce memory usage on L4 GPUs - mdl = VibeVoiceForConditionalGenerationInference.from_pretrained( - path, - torch_dtype=torch.bfloat16, - attn_implementation="sdpa" # More memory efficient than flash_attention_2 - ) - print(f" SDPA (memory-efficient) attention enabled for {name}") - # Keep on CPU initially - self.processors[name] = proc - self.models[name] = mdl - # choose default - self.current_model_name = next(iter(self.models)) - print(f"Default model is {self.current_model_name}") - - def _place_model(self, target_name: str): - """ - Move the selected model to CUDA and push all others back to CPU. 
- """ - # Clear GPU cache before moving models - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - for name, mdl in self.models.items(): - if name == target_name: - self.models[name] = mdl.to(self.device) - else: - self.models[name] = mdl.to("cpu") - - # Clear cache again after model placement - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - self.current_model_name = target_name - print(f"Model {target_name} is now on {self.device}. Others moved to CPU.") - - def setup_voice_presets(self): - voices_dir = os.path.join(os.path.dirname(__file__), "voices") - if not os.path.exists(voices_dir): - print(f"Warning: Voices directory not found at {voices_dir}") - return - wav_files = [f for f in os.listdir(voices_dir) - if f.lower().endswith(('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac'))] - for wav_file in wav_files: - name = os.path.splitext(wav_file)[0] - self.available_voices[name] = os.path.join(voices_dir, wav_file) - print(f"Voices loaded: {list(self.available_voices.keys())}") - - # Organize voices by gender - self.male_voices = [ - "en-Carter_man", - "en-Frank_man", - "en-Yasser_man", - "in-Samuel_man", - "zh-Anchen_man_bgm", - "zh-Bowen_man" - ] - self.female_voices = [ - "en-Alice_woman_bgm", - "en-Alice_woman", - "en-Maya_woman", - "zh-Xinran_woman" - ] - - def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray: - try: - wav, sr = sf.read(audio_path) - if len(wav.shape) > 1: - wav = np.mean(wav, axis=1) - if sr != target_sr: - wav = librosa.resample(wav, orig_sr=sr, target_sr=target_sr) - return wav - except Exception as e: - print(f"Error reading audio {audio_path}: {e}") - return np.array([]) - - @GPU(duration=120) - def generate_podcast(self, - num_speakers: int, - script: str, - speaker_1: str = None, - speaker_2: str = None, - speaker_3: str = None, - speaker_4: str = None, - cfg_scale: float = 1.3, - model_name: str = None): - """ - Generates a conference as a single audio file from a script and saves it. - Non-streaming. 
- """ - try: - # pick model - model_name = model_name or self.current_model_name - if model_name not in self.models: - raise gr.Error(f"Unknown model: {model_name}") - - # place models on devices - self._place_model(model_name) - model = self.models[model_name] - processor = self.processors[model_name] - - print(f"Using model {model_name} on {self.device}") - - # Additional cache clear before generation - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - model.eval() - model.set_ddpm_inference_steps(num_steps=self.inference_steps) - - self.is_generating = True - - if not script.strip(): - raise gr.Error("Error: Please provide a script.") - - script = script.replace("’", "'") - - if not 1 <= num_speakers <= 4: - raise gr.Error("Error: Number of speakers must be between 1 and 4.") - - selected_speakers = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers] - for i, speaker_name in enumerate(selected_speakers): - if not speaker_name or speaker_name not in self.available_voices: - raise gr.Error(f"Error: Please select a valid speaker for Speaker {i+1}.") - - log = f"Generating conference with {num_speakers} speakers\n" - log += f"Model: {model_name}\n" - log += f"Parameters: CFG Scale={cfg_scale}\n" - log += f"Speakers: {', '.join(selected_speakers)}\n" - - voice_samples = [] - for speaker_name in selected_speakers: - audio_path = self.available_voices[speaker_name] - audio_data = self.read_audio(audio_path) - if len(audio_data) == 0: - raise gr.Error(f"Error: Failed to load audio for {speaker_name}") - voice_samples.append(audio_data) - - log += f"Loaded {len(voice_samples)} voice samples\n" - - lines = script.strip().split('\n') - formatted_script_lines = [] - for line in lines: - line = line.strip() - if not line: - continue - if line.startswith('Speaker ') and ':' in line: - formatted_script_lines.append(line) - else: - speaker_id = len(formatted_script_lines) % num_speakers - formatted_script_lines.append(f"Speaker {speaker_id}: {line}") - - formatted_script = '\n'.join(formatted_script_lines) - log += f"Formatted script with {len(formatted_script_lines)} turns\n" - log += "Processing with VibeVoice...\n" - - inputs = processor( - text=[formatted_script], - voice_samples=[voice_samples], - padding=True, - return_tensors="pt", - return_attention_mask=True, - ) - - start_time = time.time() - - # Use efficient attention backend - if torch.cuda.is_available() and hasattr(torch.nn.attention, 'SDPBackend'): - from torch.nn.attention import SDPBackend, sdpa_kernel - with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION): - outputs = model.generate( - **inputs, - max_new_tokens=None, - cfg_scale=cfg_scale, - tokenizer=processor.tokenizer, - generation_config={'do_sample': False}, - verbose=False, - ) - else: - outputs = model.generate( - **inputs, - max_new_tokens=None, - cfg_scale=cfg_scale, - tokenizer=processor.tokenizer, - generation_config={'do_sample': False}, - verbose=False, - ) - generation_time = time.time() - start_time - - if hasattr(outputs, 'speech_outputs') and outputs.speech_outputs[0] is not None: - audio_tensor = outputs.speech_outputs[0] - audio = audio_tensor.cpu().float().numpy() - else: - raise gr.Error("Error: No audio was generated by the model. 
Please try again.") - - if audio.ndim > 1: - audio = audio.squeeze() - - sample_rate = 24000 - - output_dir = "outputs" - os.makedirs(output_dir, exist_ok=True) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - file_path = os.path.join(output_dir, f"conference_{timestamp}.wav") - sf.write(file_path, audio, sample_rate) - print(f"Conference saved to {file_path}") - - total_duration = len(audio) / sample_rate - log += f"Generation completed in {generation_time:.2f} seconds\n" - log += f"Final audio duration: {total_duration:.2f} seconds\n" - log += f"Successfully saved conference to: {file_path}\n" - - self.is_generating = False - return (sample_rate, audio), log - - except gr.Error as e: - self.is_generating = False - error_msg = f"Input Error: {str(e)}" - print(error_msg) - return None, error_msg - - except Exception as e: - self.is_generating = False - error_msg = f"An unexpected error occurred: {str(e)}" - print(error_msg) - traceback.print_exc() - return None, error_msg - - - @staticmethod - def _infer_num_speakers_from_script(script: str) -> int: - """ - Infer number of speakers by counting distinct 'Speaker X:' tags in the script. - Robust to 0- or 1-indexed labels and repeated turns. - Falls back to 1 if none found. - """ - import re - ids = re.findall(r'(?mi)^\s*Speaker\s+(\d+)\s*:', script) - return len({int(x) for x in ids}) if ids else 1 - - def load_example_scripts(self): - examples_dir = os.path.join(os.path.dirname(__file__), "text_examples") - self.example_scripts = [] - self.example_scripts_natural = [] - if not os.path.exists(examples_dir): - return - - original_files = [ - "1p_ai_tedtalk.txt", - "1p_politcal_speech.txt", - "2p_financeipo_meeting.txt", - "2p_telehealth_meeting.txt", - "3p_military_meeting.txt", - "3p_oil_meeting.txt", - "4p_gamecreation_meeting.txt", - "4p_product_meeting.txt" - ] - - # Gender mapping for each script's speakers - self.script_speaker_genders = [ - ["female"], # AI TED Talk - Rachel - ["neutral"], # Political Speech - generic speaker - ["male", "female"], # Finance IPO - James, Patricia - ["female", "male"], # Telehealth - Jennifer, Tom - ["female", "male", "female"], # Military - Sarah, David, Lisa - ["male", "female", "male"], # Oil - Robert, Lisa, Michael - ["male", "female", "male", "male"], # Game Creation - Alex, Sarah, Marcus, Emma - ["female", "male", "female", "male"] # Product Meeting - Sarah, Marcus, Jennifer, David - ] - - for txt_file in original_files: - try: - with open(os.path.join(examples_dir, txt_file), 'r', encoding='utf-8') as f: - script_content = f.read().strip() - if script_content: - num_speakers = self._infer_num_speakers_from_script(script_content) - self.example_scripts.append([num_speakers, script_content]) - - natural_file = txt_file.replace('.txt', '_natural.txt') - natural_path = os.path.join(examples_dir, natural_file) - if os.path.exists(natural_path): - with open(natural_path, 'r', encoding='utf-8') as f: - natural_content = f.read().strip() - if natural_content: - num_speakers = self._infer_num_speakers_from_script(natural_content) - self.example_scripts_natural.append([num_speakers, natural_content]) - else: - self.example_scripts_natural.append([num_speakers, script_content]) - except Exception as e: - print(f"Error loading {txt_file}: {e}") - - -def convert_to_16_bit_wav(data): - if torch.is_tensor(data): - data = data.detach().cpu().numpy() - data = np.array(data) - if np.max(np.abs(data)) > 1.0: - data = data / np.max(np.abs(data)) - return (data * 32767).astype(np.int16) - -# Set synthwave 
theme -theme = gr.themes.Ocean( - primary_hue="indigo", - secondary_hue="fuchsia", - neutral_hue="slate", -).set( - button_large_radius='*radius_sm' -) - -def set_working_state(*components, transcript_box=None): - """ - Disable all interactive components and show progress in transcript/log box. - Usage: set_working_state(generate_btn, random_example_btn, transcript_box=log_output) - """ - updates = [gr.update(interactive=False) for _ in components] - if transcript_box is not None: - updates.append(gr.update(value="Generating... please wait", interactive=False)) - return tuple(updates) - -def set_idle_state(*components, transcript_box=None): - """ - Re-enable all interactive components and transcript/log box. - Usage: set_idle_state(generate_btn, random_example_btn, transcript_box=log_output) - """ - updates = [gr.update(interactive=True) for _ in components] - if transcript_box is not None: - updates.append(gr.update(interactive=True)) - return tuple(updates) - - -def create_demo_interface(demo_instance: VibeVoiceDemo): - custom_css = """ """ - - with gr.Blocks( - title="VibeVoice - Conference Generator", - css=custom_css, - theme=theme, - ) as interface: - - # Simple image - gr.HTML(""" -
-                    [image: Canary-Qwen Transcriber Banner]
- """) - gr.Markdown("## NOTE: The Large model takes significant generation time with limited increase in quality. I recommend trying 1.5 first.") - - with gr.Tabs(): - with gr.Tab("Generate"): - gr.Markdown("### Generated Conference") - complete_audio_output = gr.Audio( - label="Complete Conference (Download)", - type="numpy", - elem_classes="audio-output complete-audio-section", - autoplay=False, - show_download_button=True, - visible=True - ) - - with gr.Row(): - with gr.Column(scale=1, elem_classes="settings-card"): - gr.Markdown("### Conference Settings") - - # Model dropdown - model_dropdown = gr.Dropdown( - choices=list(demo_instance.models.keys()), - value=demo_instance.current_model_name, - label="Model", - ) - - num_speakers = gr.Slider( - minimum=1, maximum=4, value=2, step=1, - label="Number of Speakers", - elem_classes="slider-container" - ) - - gr.Markdown("### Speaker Selection") - available_speaker_names = list(demo_instance.available_voices.keys()) - default_speakers = ['en-Alice_woman', 'en-Carter_man', 'en-Frank_man', 'en-Maya_woman'] - - speaker_selections = [] - for i in range(4): - default_value = default_speakers[i] if i < len(default_speakers) else None - speaker = gr.Dropdown( - choices=available_speaker_names, - value=default_value, - label=f"Speaker {i+1}", - visible=(i < 2), - elem_classes="speaker-item" - ) - speaker_selections.append(speaker) - - gr.Markdown("### Advanced Settings") - with gr.Accordion("Generation Parameters", open=False): - cfg_scale = gr.Slider( - minimum=1.0, maximum=2.0, value=1.3, step=0.05, - label="CFG Scale (Guidance Strength)", - elem_classes="slider-container" - ) - - with gr.Column(scale=2, elem_classes="generation-card"): - gr.Markdown("### Script Input") - script_input = gr.Textbox( - label="Conversation Script", - placeholder="Enter your conference script here...", - lines=12, - max_lines=20, - elem_classes="script-input" - ) - - with gr.Row(): - random_example_btn = gr.Button( - "Random Example", size="lg", - variant="secondary", elem_classes="random-btn", scale=1 - ) - generate_btn = gr.Button( - "šŸš€ Generate Conference", size="lg", - variant="primary", elem_classes="generate-btn", scale=2 - ) - - with gr.Row(): - with gr.Column(scale=1): - gr.Markdown("### Example Scripts") - with gr.Row(): - use_natural = gr.Checkbox( - value=True, - label="Natural talking sounds", - scale=1 - ) - duration_display = gr.Textbox( - value="", - label="Est. 
Duration", - interactive=False, - scale=1 - ) - - example_names = [ - "AI TED Talk", - "Political Speech", - "Finance IPO Meeting", - "Telehealth Meeting", - "Military Meeting", - "Oil Meeting", - "Game Creation Meeting", - "Product Meeting" - ] - - example_buttons = [] - with gr.Row(): - for i in range(min(4, len(example_names))): - btn = gr.Button(example_names[i], size="sm", variant="secondary") - example_buttons.append(btn) - - with gr.Row(): - for i in range(4, min(8, len(example_names))): - btn = gr.Button(example_names[i], size="sm", variant="secondary") - example_buttons.append(btn) - - log_output = gr.Textbox( - label="Generation Log", - lines=8, max_lines=15, - interactive=False, - elem_classes="log-output" - ) - - def update_speaker_visibility(num_speakers): - return [gr.update(visible=(i < num_speakers)) for i in range(4)] - - num_speakers.change( - fn=update_speaker_visibility, - inputs=[num_speakers], - outputs=speaker_selections - ) - - def update_duration_display(script_text): - if not script_text or script_text.strip() == "": - return "" - - words = script_text.split() - word_count = len(words) - wpm = 150 - estimated_minutes = word_count / wpm - - if estimated_minutes < 1: - duration_str = f"{int(estimated_minutes * 60)} sec" - else: - minutes = int(estimated_minutes) - seconds = int((estimated_minutes - minutes) * 60) - if seconds > 0: - duration_str = f"{minutes}m {seconds}s" - else: - duration_str = f"{minutes} min" - - return f"{word_count} words • ~{duration_str}" - - script_input.change( - fn=update_duration_display, - inputs=[script_input], - outputs=[duration_display] - ) - - def generate_podcast_wrapper(model_choice, num_speakers, script, *speakers_and_params): - try: - speakers = speakers_and_params[:4] - cfg_scale_val = speakers_and_params[4] - audio, log = demo_instance.generate_podcast( - num_speakers=int(num_speakers), - script=script, - speaker_1=speakers[0], - speaker_2=speakers[1], - speaker_3=speakers[2], - speaker_4=speakers[3], - cfg_scale=cfg_scale_val, - model_name=model_choice - ) - return audio, log - except Exception as e: - traceback.print_exc() - return None, f"Error: {str(e)}" - - def on_generate_start(): - return gr.update(interactive=False), gr.update(interactive=False), gr.update(value="šŸ”„ Initializing generation...\nā³ This may take up to 2 minutes depending on script length...") - - def on_generate_complete(audio, log): - return gr.update(interactive=True), gr.update(interactive=True), audio, log - - generate_click = generate_btn.click( - fn=on_generate_start, - inputs=[], - outputs=[generate_btn, random_example_btn, log_output], - queue=False - ).then( - fn=generate_podcast_wrapper, - inputs=[model_dropdown, num_speakers, script_input] + speaker_selections + [cfg_scale], - outputs=[complete_audio_output, log_output], - queue=True - ).then( - fn=lambda: (gr.update(interactive=True), gr.update(interactive=True)), - inputs=[], - outputs=[generate_btn, random_example_btn], - queue=False - ) - - def load_random_example(use_natural_checkbox): - import random - scripts_list = demo_instance.example_scripts_natural if use_natural_checkbox else demo_instance.example_scripts - if scripts_list: - idx = random.randint(0, len(scripts_list) - 1) - num_speakers_value, script_value = scripts_list[idx] - - # Get gender preferences for this script - genders = demo_instance.script_speaker_genders[idx] if idx < len(demo_instance.script_speaker_genders) else [] - - # Select appropriate voices based on gender - voice_selections = [] - for i in range(4): - if 
i < len(genders): - gender = genders[i] - if gender == "male" and demo_instance.male_voices: - voice = random.choice(demo_instance.male_voices) - elif gender == "female" and demo_instance.female_voices: - voice = random.choice(demo_instance.female_voices) - else: - # neutral or fallback - all_voices = list(demo_instance.available_voices.keys()) - voice = random.choice(all_voices) if all_voices else None - else: - voice = None - voice_selections.append(voice) - - return [num_speakers_value, script_value] + voice_selections - return [2, "Speaker 0: Welcome to our AI conference demo!\nSpeaker 1: Thanks, excited to be here!"] + [None, None, None, None] - - random_example_btn.click( - fn=load_random_example, - inputs=[use_natural], - outputs=[num_speakers, script_input] + speaker_selections, - queue=False - ) - - def load_specific_example(idx, use_natural_checkbox): - import random - scripts_list = demo_instance.example_scripts_natural if use_natural_checkbox else demo_instance.example_scripts - if idx < len(scripts_list): - num_speakers_value, script_value = scripts_list[idx] - # Get gender preferences for this script - genders = demo_instance.script_speaker_genders[idx] if idx < len(demo_instance.script_speaker_genders) else [] - - # Select appropriate voices based on gender - voice_selections = [] - for i in range(4): - if i < len(genders): - gender = genders[i] - if gender == "male" and demo_instance.male_voices: - voice = random.choice(demo_instance.male_voices) - elif gender == "female" and demo_instance.female_voices: - voice = random.choice(demo_instance.female_voices) - else: - # neutral or fallback - all_voices = list(demo_instance.available_voices.keys()) - voice = random.choice(all_voices) if all_voices else None - else: - voice = None - voice_selections.append(voice) - - # Return values for all outputs - return [num_speakers_value, script_value] + voice_selections - return [2, ""] + [None, None, None, None] - - for idx, btn in enumerate(example_buttons): - btn.click( - fn=lambda nat, i=idx: load_specific_example(i, nat), - inputs=[use_natural], - outputs=[num_speakers, script_input] + speaker_selections, - queue=False - ) - - with gr.Tab("Architecture"): - with gr.Row(): - gr.Markdown('''VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio, " - "such as conferences, from text. It addresses significant challenges in traditional Text-to-Speech (TTS) systems, particularly " - "in scalability, speaker consistency, and natural turn-taking. A core innovation of VibeVoice is its use of continuous " - "speech tokenizers (Acoustic and Semantic) operating at an ultra-low frame rate of 7.5 Hz. These tokenizers efficiently " - "preserve audio fidelity while significantly boosting computational efficiency for processing long sequences. VibeVoice " - "employs a next-token diffusion framework, leveraging a Large Language Model (LLM) to understand textual context and " - "dialogue flow, and a diffusion head to generate high-fidelity acoustic details. The model can synthesize speech up to " - "90 minutes long with up to 4 distinct speakers, surpassing the typical 1-2 speaker limits of many prior models.''') - with gr.Row(): - with gr.Column(): - gr.Markdown("## VibeVoice: A Frontier Open-Source Text-to-Speech Model") - - gr.Markdown(""" - ### Overview - - VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio, - such as conferences, from text. 
It addresses significant challenges in traditional Text-to-Speech (TTS) systems, - particularly in scalability, speaker consistency, and natural turn-taking. - - ### Training Architecture - - **Transformer-based Large Language Model** integrated with specialized acoustic and semantic tokenizers and a diffusion-based decoding head. - - **Core Components:** - - **LLM**: Qwen2.5-1.5B for this release - - **Acoustic Tokenizer**: Based on a σ-VAE variant with mirror-symmetric encoder-decoder structure (~340M parameters each) - - 7 stages of modified Transformer blocks - - Achieves 3200x downsampling from 24kHz input - - **Semantic Tokenizer**: Encoder mirrors the Acoustic Tokenizer's architecture - - Trained with an ASR proxy task - - **Diffusion Head**: Lightweight module (4 layers, ~123M parameters) - - Conditioned on LLM hidden states - - Uses DDPM process with Classifier-Free Guidance - - ### Training Details - - **Context Length**: Trained with curriculum up to 65,536 tokens - - **Training Stages:** - 1. **Tokenizer Pre-training**: Acoustic and Semantic tokenizers trained separately - 2. **VibeVoice Training**: Frozen tokenizers, only LLM and diffusion head trained - - Curriculum learning: 4k → 16K → 32K → 64K tokens - - ### Model Variants - - | Model | Context Length | Generation Length | Parameters | - |-------|---------------|-------------------|------------| - | VibeVoice-0.5B-Streaming | - | - | Coming Soon | - | **VibeVoice-1.5B** | 64K | ~90 min | 2.7B | - | VibeVoice-Large | 32K | ~45 min | Redacted | - - ### Technical Specifications - - **Frame Rate**: Ultra-low 7.5 Hz for efficiency - - **Sample Rate**: 24kHz audio output - - **Max Duration**: Up to 90 minutes - - **Speaker Capacity**: 1-4 distinct speakers - - **Languages**: English and Chinese - - ### Key Innovations - - Continuous speech tokenizers at ultra-low frame rate - - Next-token diffusion framework - - Curriculum learning for long-form generation - - Multi-speaker consistency without explicit modeling - """) - - with gr.Column(scale=2): - gr.HTML(""" -
-                        [image: VibeVoice Architecture Diagram]
-                        [image: VibeVoice Performance Chart]
- """) - - return interface - -def run_demo( - model_paths: dict = None, - device: str = "cuda", - inference_steps: int = 5, - share: bool = True, -): - """ - model_paths default includes two entries. Replace paths as needed. - """ - if model_paths is None: - model_paths = { - "VibeVoice-1.5B": "microsoft/VibeVoice-1.5B", - "VibeVoice-7B": "vibevoice/VibeVoice-7B", - } - - set_seed(42) - demo_instance = VibeVoiceDemo(model_paths, device, inference_steps) - interface = create_demo_interface(demo_instance) - interface.queue().launch( - share=share, - server_name="0.0.0.0" if share else "127.0.0.1", - show_error=True, - show_api=False - ) - - - -if __name__ == "__main__": - run_demo() diff --git a/configs/qwen2.5_1.5b_64k.json b/backend_modal/configs/qwen2.5_1.5b_64k.json similarity index 100% rename from configs/qwen2.5_1.5b_64k.json rename to backend_modal/configs/qwen2.5_1.5b_64k.json diff --git a/configs/qwen2.5_7b_32k.json b/backend_modal/configs/qwen2.5_7b_32k.json similarity index 100% rename from configs/qwen2.5_7b_32k.json rename to backend_modal/configs/qwen2.5_7b_32k.json diff --git a/example/1p_EN2CH.mp4 b/backend_modal/example/1p_EN2CH.mp4 similarity index 100% rename from example/1p_EN2CH.mp4 rename to backend_modal/example/1p_EN2CH.mp4 diff --git a/example/2p_see_u_again.mp4 b/backend_modal/example/2p_see_u_again.mp4 similarity index 100% rename from example/2p_see_u_again.mp4 rename to backend_modal/example/2p_see_u_again.mp4 diff --git a/example/4p_climate_45min.mp4 b/backend_modal/example/4p_climate_45min.mp4 similarity index 100% rename from example/4p_climate_45min.mp4 rename to backend_modal/example/4p_climate_45min.mp4 diff --git a/backend_modal/modal_runner.py b/backend_modal/modal_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..5a6e7e2e9e50700c282944c0d6b6ce9d2fd1052e --- /dev/null +++ b/backend_modal/modal_runner.py @@ -0,0 +1,230 @@ +import os +import time +import numpy as np +import librosa +import soundfile as sf +import torch +from datetime import datetime + +# Modal-specific imports +import modal + +# Define the Modal Stub +image = ( + modal.Image.debian_slim(python_version="3.10") + .pip_install( + "torch", + "accelerate==1.6.0", + "transformers==4.51.3", + "diffusers", + "tqdm", + "numpy", + "scipy", + "ml-collections", + "absl-py", + "soundfile", + "librosa", + "pydub", + ) + .add_local_dir("./modular", remote_path="/root/modular") + .add_local_dir("./processor", remote_path="/root/processor") + .add_local_dir("./voices", remote_path="/root/voices") + .add_local_dir("./text_examples", remote_path="/root/text_examples") + .add_local_dir("./schedule", remote_path="/root/schedule") +) + +app = modal.App( + name="vibevoice-generator", + image=image, +) + + +@app.cls(gpu="T4", scaledown_window=300, secrets=[modal.Secret.from_name("hf-secret")]) +class VibeVoiceModel: + def __init__(self, model_paths: dict = None): + if model_paths is None: + self.model_paths = { + "VibeVoice-1.5B": "microsoft/VibeVoice-1.5B", + "VibeVoice-7B": "vibevoice/VibeVoice-7B", + } + else: + self.model_paths = model_paths + + self.device = "cuda" + self.inference_steps = 5 + + @modal.enter() + def load_models(self): + """ + This method is run once when the container starts. + It downloads and loads all models onto the GPU. 
+ """ + # Project-specific imports are moved here to run inside the container + from modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference + from processor.vibevoice_processor import VibeVoiceProcessor + + print("Entering container and loading models to GPU...") + + # Set compiler flags for better performance + if torch.cuda.is_available() and hasattr(torch, '_inductor'): + if hasattr(torch._inductor, 'config'): + torch._inductor.config.conv_1x1_as_mm = True + torch._inductor.config.coordinate_descent_tuning = True + torch._inductor.config.epilogue_fusion = False + torch._inductor.config.coordinate_descent_check_all_directions = True + + self.models = {} + self.processors = {} + + for name, path in self.model_paths.items(): + print(f" - Loading {name} from {path}") + proc = VibeVoiceProcessor.from_pretrained(path) + mdl = VibeVoiceForConditionalGenerationInference.from_pretrained( + path, + torch_dtype=torch.bfloat16, + attn_implementation="sdpa" + ).to(self.device) + mdl.eval() + print(f" {name} loaded to {self.device}") + self.processors[name] = proc + self.models[name] = mdl + + self.setup_voice_presets() + print("Model loading complete.") + + def setup_voice_presets(self): + self.available_voices = {} + voices_dir = "/root/voices" # Using remote path from Mount + if not os.path.exists(voices_dir): + print(f"Warning: Voices directory not found at {voices_dir}") + return + wav_files = [f for f in os.listdir(voices_dir) + if f.lower().endswith(('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac'))] + for wav_file in wav_files: + name = os.path.splitext(wav_file)[0] + self.available_voices[name] = os.path.join(voices_dir, wav_file) + print(f"Voices loaded: {list(self.available_voices.keys())}") + + def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray: + try: + wav, sr = sf.read(audio_path) + if len(wav.shape) > 1: + wav = np.mean(wav, axis=1) + if sr != target_sr: + wav = librosa.resample(wav, orig_sr=sr, target_sr=target_sr) + return wav + except Exception as e: + print(f"Error reading audio {audio_path}: {e}") + return np.array([]) + + @modal.method() + def generate_podcast(self, + num_speakers: int, + script: str, + model_name: str, + cfg_scale: float, + speaker_1: str = None, + speaker_2: str = None, + speaker_3: str = None, + speaker_4: str = None): + """ + This is the main inference function that will be called from the Gradio app. 
+ """ + try: + if model_name not in self.models: + raise ValueError(f"Unknown model: {model_name}") + + model = self.models[model_name] + processor = self.processors[model_name] + model.set_ddpm_inference_steps(num_steps=self.inference_steps) + + print(f"Generating with model {model_name} on {self.device}") + + if not script.strip(): + raise ValueError("Error: Please provide a script.") + + script = script.replace("’", "'") + + if not 1 <= num_speakers <= 4: + raise ValueError("Error: Number of speakers must be between 1 and 4.") + + selected_speakers = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers] + for i, speaker_name in enumerate(selected_speakers): + if not speaker_name or speaker_name not in self.available_voices: + raise ValueError(f"Error: Please select a valid speaker for Speaker {i+1}.") + + log = f"Generating conference with {num_speakers} speakers\n" + log += f"Model: {model_name}\n" + log += f"Parameters: CFG Scale={cfg_scale}\n" + log += f"Speakers: {', '.join(selected_speakers)}\n" + + voice_samples = [] + for speaker_name in selected_speakers: + audio_path = self.available_voices[speaker_name] + audio_data = self.read_audio(audio_path) + if len(audio_data) == 0: + raise ValueError(f"Error: Failed to load audio for {speaker_name}") + voice_samples.append(audio_data) + + log += f"Loaded {len(voice_samples)} voice samples\n" + + lines = script.strip().split('\n') + formatted_script_lines = [] + for line in lines: + line = line.strip() + if not line: continue + if line.startswith('Speaker ') and ':' in line: + formatted_script_lines.append(line) + else: + speaker_id = len(formatted_script_lines) % num_speakers + formatted_script_lines.append(f"Speaker {speaker_id}: {line}") + + formatted_script = '\n'.join(formatted_script_lines) + log += f"Formatted script with {len(formatted_script_lines)} turns\n" + log += "Processing with VibeVoice...\n" + + inputs = processor( + text=[formatted_script], + voice_samples=[voice_samples], + padding=True, + return_tensors="pt", + return_attention_mask=True, + ).to(self.device) + + start_time = time.time() + + with torch.inference_mode(): + outputs = model.generate( + **inputs, + max_new_tokens=None, + cfg_scale=cfg_scale, + tokenizer=processor.tokenizer, + generation_config={'do_sample': False}, + verbose=False, + ) + generation_time = time.time() - start_time + + if hasattr(outputs, 'speech_outputs') and outputs.speech_outputs[0] is not None: + audio_tensor = outputs.speech_outputs[0] + audio = audio_tensor.cpu().float().numpy() + else: + raise RuntimeError("Error: No audio was generated by the model.") + + if audio.ndim > 1: + audio = audio.squeeze() + + sample_rate = 24000 + total_duration = len(audio) / sample_rate + log += f"Generation completed in {generation_time:.2f} seconds\n" + log += f"Final audio duration: {total_duration:.2f} seconds\n" + + # Return the raw audio data and sample rate, Gradio will handle the rest + return (sample_rate, audio), log + + except Exception as e: + import traceback + error_msg = f"An unexpected error occurred on Modal: {str(e)}\n{traceback.format_exc()}" + print(error_msg) + # Return a special value or raise an exception that the client can handle + # For Gradio, returning a log message is often best. 
+ return None, error_msg \ No newline at end of file diff --git a/modular/__init__.py b/backend_modal/modular/__init__.py similarity index 100% rename from modular/__init__.py rename to backend_modal/modular/__init__.py diff --git a/modular/configuration_vibevoice.py b/backend_modal/modular/configuration_vibevoice.py similarity index 100% rename from modular/configuration_vibevoice.py rename to backend_modal/modular/configuration_vibevoice.py diff --git a/modular/modeling_vibevoice.py b/backend_modal/modular/modeling_vibevoice.py similarity index 100% rename from modular/modeling_vibevoice.py rename to backend_modal/modular/modeling_vibevoice.py diff --git a/modular/modeling_vibevoice_inference.py b/backend_modal/modular/modeling_vibevoice_inference.py similarity index 100% rename from modular/modeling_vibevoice_inference.py rename to backend_modal/modular/modeling_vibevoice_inference.py diff --git a/modular/modular_vibevoice_diffusion_head.py b/backend_modal/modular/modular_vibevoice_diffusion_head.py similarity index 100% rename from modular/modular_vibevoice_diffusion_head.py rename to backend_modal/modular/modular_vibevoice_diffusion_head.py diff --git a/modular/modular_vibevoice_text_tokenizer.py b/backend_modal/modular/modular_vibevoice_text_tokenizer.py similarity index 100% rename from modular/modular_vibevoice_text_tokenizer.py rename to backend_modal/modular/modular_vibevoice_text_tokenizer.py diff --git a/modular/modular_vibevoice_tokenizer.py b/backend_modal/modular/modular_vibevoice_tokenizer.py similarity index 100% rename from modular/modular_vibevoice_tokenizer.py rename to backend_modal/modular/modular_vibevoice_tokenizer.py diff --git a/modular/streamer.py b/backend_modal/modular/streamer.py similarity index 100% rename from modular/streamer.py rename to backend_modal/modular/streamer.py diff --git a/packages.txt b/backend_modal/packages.txt similarity index 100% rename from packages.txt rename to backend_modal/packages.txt diff --git a/processor/__init__.py b/backend_modal/processor/__init__.py similarity index 100% rename from processor/__init__.py rename to backend_modal/processor/__init__.py diff --git a/processor/vibevoice_processor.py b/backend_modal/processor/vibevoice_processor.py similarity index 100% rename from processor/vibevoice_processor.py rename to backend_modal/processor/vibevoice_processor.py diff --git a/processor/vibevoice_tokenizer_processor.py b/backend_modal/processor/vibevoice_tokenizer_processor.py similarity index 100% rename from processor/vibevoice_tokenizer_processor.py rename to backend_modal/processor/vibevoice_tokenizer_processor.py diff --git a/schedule/__init__.py b/backend_modal/schedule/__init__.py similarity index 100% rename from schedule/__init__.py rename to backend_modal/schedule/__init__.py diff --git a/schedule/dpm_solver.py b/backend_modal/schedule/dpm_solver.py similarity index 100% rename from schedule/dpm_solver.py rename to backend_modal/schedule/dpm_solver.py diff --git a/schedule/timestep_sampler.py b/backend_modal/schedule/timestep_sampler.py similarity index 100% rename from schedule/timestep_sampler.py rename to backend_modal/schedule/timestep_sampler.py diff --git a/scripts/__init__.py b/backend_modal/scripts/__init__.py similarity index 100% rename from scripts/__init__.py rename to backend_modal/scripts/__init__.py diff --git a/scripts/convert_nnscaler_checkpoint_to_transformers.py b/backend_modal/scripts/convert_nnscaler_checkpoint_to_transformers.py similarity index 100% rename from 
scripts/convert_nnscaler_checkpoint_to_transformers.py rename to backend_modal/scripts/convert_nnscaler_checkpoint_to_transformers.py diff --git a/setup_voices.sh b/backend_modal/setup_voices.sh similarity index 100% rename from setup_voices.sh rename to backend_modal/setup_voices.sh diff --git a/text_examples/1p_ai_tedtalk.txt b/backend_modal/text_examples/1p_ai_tedtalk.txt similarity index 100% rename from text_examples/1p_ai_tedtalk.txt rename to backend_modal/text_examples/1p_ai_tedtalk.txt diff --git a/text_examples/1p_ai_tedtalk_natural.txt b/backend_modal/text_examples/1p_ai_tedtalk_natural.txt similarity index 100% rename from text_examples/1p_ai_tedtalk_natural.txt rename to backend_modal/text_examples/1p_ai_tedtalk_natural.txt diff --git a/text_examples/1p_politcal_speech.txt b/backend_modal/text_examples/1p_politcal_speech.txt similarity index 100% rename from text_examples/1p_politcal_speech.txt rename to backend_modal/text_examples/1p_politcal_speech.txt diff --git a/text_examples/1p_politcal_speech_natural.txt b/backend_modal/text_examples/1p_politcal_speech_natural.txt similarity index 100% rename from text_examples/1p_politcal_speech_natural.txt rename to backend_modal/text_examples/1p_politcal_speech_natural.txt diff --git a/text_examples/2p_financeipo_meeting.txt b/backend_modal/text_examples/2p_financeipo_meeting.txt similarity index 100% rename from text_examples/2p_financeipo_meeting.txt rename to backend_modal/text_examples/2p_financeipo_meeting.txt diff --git a/text_examples/2p_financeipo_meeting_natural.txt b/backend_modal/text_examples/2p_financeipo_meeting_natural.txt similarity index 100% rename from text_examples/2p_financeipo_meeting_natural.txt rename to backend_modal/text_examples/2p_financeipo_meeting_natural.txt diff --git a/text_examples/2p_telehealth_meeting.txt b/backend_modal/text_examples/2p_telehealth_meeting.txt similarity index 100% rename from text_examples/2p_telehealth_meeting.txt rename to backend_modal/text_examples/2p_telehealth_meeting.txt diff --git a/text_examples/2p_telehealth_meeting_natural.txt b/backend_modal/text_examples/2p_telehealth_meeting_natural.txt similarity index 100% rename from text_examples/2p_telehealth_meeting_natural.txt rename to backend_modal/text_examples/2p_telehealth_meeting_natural.txt diff --git a/text_examples/3p_military_meeting.txt b/backend_modal/text_examples/3p_military_meeting.txt similarity index 100% rename from text_examples/3p_military_meeting.txt rename to backend_modal/text_examples/3p_military_meeting.txt diff --git a/text_examples/3p_military_meeting_natural.txt b/backend_modal/text_examples/3p_military_meeting_natural.txt similarity index 100% rename from text_examples/3p_military_meeting_natural.txt rename to backend_modal/text_examples/3p_military_meeting_natural.txt diff --git a/text_examples/3p_oil_meeting.txt b/backend_modal/text_examples/3p_oil_meeting.txt similarity index 100% rename from text_examples/3p_oil_meeting.txt rename to backend_modal/text_examples/3p_oil_meeting.txt diff --git a/text_examples/3p_oil_meeting_natural.txt b/backend_modal/text_examples/3p_oil_meeting_natural.txt similarity index 100% rename from text_examples/3p_oil_meeting_natural.txt rename to backend_modal/text_examples/3p_oil_meeting_natural.txt diff --git a/text_examples/4p_gamecreation_meeting.txt b/backend_modal/text_examples/4p_gamecreation_meeting.txt similarity index 100% rename from text_examples/4p_gamecreation_meeting.txt rename to backend_modal/text_examples/4p_gamecreation_meeting.txt diff --git 
a/text_examples/4p_gamecreation_meeting_natural.txt b/backend_modal/text_examples/4p_gamecreation_meeting_natural.txt similarity index 100% rename from text_examples/4p_gamecreation_meeting_natural.txt rename to backend_modal/text_examples/4p_gamecreation_meeting_natural.txt diff --git a/text_examples/4p_product_meeting.txt b/backend_modal/text_examples/4p_product_meeting.txt similarity index 100% rename from text_examples/4p_product_meeting.txt rename to backend_modal/text_examples/4p_product_meeting.txt diff --git a/text_examples/4p_product_meeting_natural.txt b/backend_modal/text_examples/4p_product_meeting_natural.txt similarity index 100% rename from text_examples/4p_product_meeting_natural.txt rename to backend_modal/text_examples/4p_product_meeting_natural.txt diff --git a/voices/en-Alice_woman.wav b/backend_modal/voices/en-Alice_woman.wav similarity index 100% rename from voices/en-Alice_woman.wav rename to backend_modal/voices/en-Alice_woman.wav diff --git a/voices/en-Alice_woman_bgm.wav b/backend_modal/voices/en-Alice_woman_bgm.wav similarity index 100% rename from voices/en-Alice_woman_bgm.wav rename to backend_modal/voices/en-Alice_woman_bgm.wav diff --git a/voices/en-Carter_man.wav b/backend_modal/voices/en-Carter_man.wav similarity index 100% rename from voices/en-Carter_man.wav rename to backend_modal/voices/en-Carter_man.wav diff --git a/voices/en-Frank_man.wav b/backend_modal/voices/en-Frank_man.wav similarity index 100% rename from voices/en-Frank_man.wav rename to backend_modal/voices/en-Frank_man.wav diff --git a/voices/en-Maya_woman.wav b/backend_modal/voices/en-Maya_woman.wav similarity index 100% rename from voices/en-Maya_woman.wav rename to backend_modal/voices/en-Maya_woman.wav diff --git a/voices/en-Yasser_man.wav b/backend_modal/voices/en-Yasser_man.wav similarity index 100% rename from voices/en-Yasser_man.wav rename to backend_modal/voices/en-Yasser_man.wav diff --git a/voices/in-Samuel_man.wav b/backend_modal/voices/in-Samuel_man.wav similarity index 100% rename from voices/in-Samuel_man.wav rename to backend_modal/voices/in-Samuel_man.wav diff --git a/voices/zh-Anchen_man_bgm.wav b/backend_modal/voices/zh-Anchen_man_bgm.wav similarity index 100% rename from voices/zh-Anchen_man_bgm.wav rename to backend_modal/voices/zh-Anchen_man_bgm.wav diff --git a/voices/zh-Bowen_man.wav b/backend_modal/voices/zh-Bowen_man.wav similarity index 100% rename from voices/zh-Bowen_man.wav rename to backend_modal/voices/zh-Bowen_man.wav diff --git a/voices/zh-Xinran_woman.wav b/backend_modal/voices/zh-Xinran_woman.wav similarity index 100% rename from voices/zh-Xinran_woman.wav rename to backend_modal/voices/zh-Xinran_woman.wav diff --git a/README.md b/frontend_app/README.md similarity index 100% rename from README.md rename to frontend_app/README.md diff --git a/frontend_app/app.py b/frontend_app/app.py new file mode 100644 index 0000000000000000000000000000000000000000..fc02d1d73705684f0e54947d7b5d9f31393fc00d --- /dev/null +++ b/frontend_app/app.py @@ -0,0 +1,174 @@ +import os +import gradio as gr +import modal +import traceback + +# --- Configuration --- +# This is the name of your Modal stub. +MODAL_STUB_NAME = "vibevoice-generator" +# This is the name of the remote class and method to call. +MODAL_FUNCTION_NAME = "VibeVoiceModel.generate_podcast" + +# These lists are now hardcoded because the data lives on the Modal container. +# For a more dynamic app, you could create a small Modal function to fetch these lists. 
+AVAILABLE_MODELS = ["VibeVoice-1.5B", "VibeVoice-7B"] +AVAILABLE_VOICES = [ + "en-Alice_woman_bgm", "en-Alice_woman", "en-Carter_man", "en-Frank_man", + "en-Maya_woman", "en-Yasser_man", "in-Samuel_man", "zh-Anchen_man_bgm", + "zh-Bowen_man", "zh-Xinran_woman" +] +DEFAULT_SPEAKERS = ['en-Alice_woman', 'en-Carter_man', 'en-Frank_man', 'en-Maya_woman'] + +# --- Modal Connection --- +try: + # This looks up the remote function on Modal + # It will raise an error if the app isn't deployed (`modal deploy modal_runner.py`) + remote_generate_function = modal.Function.lookup(MODAL_STUB_NAME, MODAL_FUNCTION_NAME) + print("Successfully connected to Modal function.") +except modal.exception.NotFoundError: + print("ERROR: Modal function not found.") + print(f"Please deploy the Modal app first by running: modal deploy modal_runner.py") + remote_generate_function = None + +# --- Gradio UI Definition --- +theme = gr.themes.Ocean( + primary_hue="indigo", + secondary_hue="fuchsia", + neutral_hue="slate", +).set( + button_large_radius='*radius_sm' +) + +def create_demo_interface(): + with gr.Blocks( + title="VibeVoice - Conference Generator", + theme=theme, + ) as interface: + gr.HTML(""" +
+                [image: VibeVoice Banner]
+ """) + gr.Markdown("## GPU processing is now offloaded to a Modal.com backend!") + + with gr.Tabs(): + with gr.Tab("Generate"): + gr.Markdown("### Generated Conference") + complete_audio_output = gr.Audio( + label="Complete Conference (Download)", + type="numpy", + autoplay=False, + show_download_button=True, + ) + + with gr.Row(): + with gr.Column(scale=1): + gr.Markdown("### Conference Settings") + model_dropdown = gr.Dropdown( + choices=AVAILABLE_MODELS, + value=AVAILABLE_MODELS[0], + label="Model", + ) + num_speakers = gr.Slider( + minimum=1, maximum=4, value=2, step=1, + label="Number of Speakers", + ) + + gr.Markdown("### Speaker Selection") + speaker_selections = [] + for i in range(4): + speaker = gr.Dropdown( + choices=AVAILABLE_VOICES, + value=DEFAULT_SPEAKERS[i] if i < len(DEFAULT_SPEAKERS) else None, + label=f"Speaker {i+1}", + visible=(i < 2), + ) + speaker_selections.append(speaker) + + with gr.Accordion("Advanced Settings", open=False): + cfg_scale = gr.Slider( + minimum=1.0, maximum=2.0, value=1.3, step=0.05, + label="CFG Scale (Guidance Strength)", + ) + + with gr.Column(scale=2): + gr.Markdown("### Script Input") + script_input = gr.Textbox( + label="Conversation Script", + placeholder="Enter your conference script here...", + lines=12, + max_lines=20, + ) + generate_btn = gr.Button( + "šŸš€ Generate Conference (on Modal)", size="lg", + variant="primary", + ) + log_output = gr.Textbox( + label="Generation Log", + lines=8, max_lines=15, + interactive=False, + ) + + def update_speaker_visibility(num_speakers): + return [gr.update(visible=(i < num_speakers)) for i in range(4)] + + num_speakers.change( + fn=update_speaker_visibility, + inputs=[num_speakers], + outputs=speaker_selections + ) + + def generate_podcast_wrapper(model_choice, num_speakers_val, script, *speakers_and_params): + if remote_generate_function is None: + return None, "ERROR: Modal function not deployed. Please contact the space owner." + + # Show a message that we are calling the remote function + yield None, "šŸ”„ Calling remote GPU on Modal.com... this may take a moment to start." + + try: + speakers = speakers_and_params[:4] + cfg_scale_val = speakers_and_params[4] + + # This is the call to the remote Modal function + result, log = remote_generate_function.remote( + num_speakers=int(num_speakers_val), + script=script, + speaker_1=speakers[0], + speaker_2=speakers[1], + speaker_3=speakers[2], + speaker_4=speakers[3], + cfg_scale=cfg_scale_val, + model_name=model_choice + ) + yield result, log + except Exception as e: + tb = traceback.format_exc() + print(f"Error calling Modal: {e}") + yield None, f"An error occurred in the Gradio wrapper: {e}\n\n{tb}" + + generate_btn.click( + fn=generate_podcast_wrapper, + inputs=[model_dropdown, num_speakers, script_input] + speaker_selections + [cfg_scale], + outputs=[complete_audio_output, log_output] + ) + return interface + +# --- Main Execution --- +if __name__ == "__main__": + if remote_generate_function is None: + # If Modal isn't set up, we can't launch the full app. + # We'll show a simplified UI with an error message. + with gr.Blocks(theme=theme) as interface: + gr.Markdown("# āŒ Configuration Error") + gr.Markdown( + "The Gradio application cannot connect to the Modal backend. " + "The Modal app has not been deployed yet. " + "Please run `modal deploy modal_runner.py` in your terminal and then refresh this page." 
+ ) + interface.launch() + else: + # Launch the full Gradio interface + interface = create_demo_interface() + interface.queue().launch(show_error=True) \ No newline at end of file diff --git a/public/images/banner.png b/frontend_app/public/images/banner.png similarity index 100% rename from public/images/banner.png rename to frontend_app/public/images/banner.png diff --git a/public/images/chart.png b/frontend_app/public/images/chart.png similarity index 100% rename from public/images/chart.png rename to frontend_app/public/images/chart.png diff --git a/public/images/diagram.jpg b/frontend_app/public/images/diagram.jpg similarity index 100% rename from public/images/diagram.jpg rename to frontend_app/public/images/diagram.jpg diff --git a/public/voices/Cherry.mp3 b/frontend_app/public/voices/Cherry.mp3 similarity index 100% rename from public/voices/Cherry.mp3 rename to frontend_app/public/voices/Cherry.mp3 diff --git a/public/voices/Chicago.mp3 b/frontend_app/public/voices/Chicago.mp3 similarity index 100% rename from public/voices/Chicago.mp3 rename to frontend_app/public/voices/Chicago.mp3 diff --git a/public/voices/Janus.mp3 b/frontend_app/public/voices/Janus.mp3 similarity index 100% rename from public/voices/Janus.mp3 rename to frontend_app/public/voices/Janus.mp3 diff --git a/public/voices/Mantis.mp3 b/frontend_app/public/voices/Mantis.mp3 similarity index 100% rename from public/voices/Mantis.mp3 rename to frontend_app/public/voices/Mantis.mp3 diff --git a/public/voices/Sponge.mp3 b/frontend_app/public/voices/Sponge.mp3 similarity index 100% rename from public/voices/Sponge.mp3 rename to frontend_app/public/voices/Sponge.mp3 diff --git a/public/voices/Starchild.mp3 b/frontend_app/public/voices/Starchild.mp3 similarity index 100% rename from public/voices/Starchild.mp3 rename to frontend_app/public/voices/Starchild.mp3 diff --git a/frontend_app/requirements.txt b/frontend_app/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..10238e633a766b89d5f3e297576a41ec36f5f2fd --- /dev/null +++ b/frontend_app/requirements.txt @@ -0,0 +1,2 @@ +gradio +modal \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 738b88c793a76b870f7d2e2b077713d430f21f65..0000000000000000000000000000000000000000 --- a/requirements.txt +++ /dev/null @@ -1,19 +0,0 @@ -spaces -torch -accelerate==1.6.0 -transformers==4.51.3 -diffusers -tqdm -numpy -scipy -ml-collections -absl-py -gradio -av -aiortc -soundfile -librosa -pydub -requests -python-dotenv -