diff --git a/app.py b/app.py
deleted file mode 100644
index 35af657fe47469befad734c8194e4b64bf6f3b2a..0000000000000000000000000000000000000000
--- a/app.py
+++ /dev/null
@@ -1,805 +0,0 @@
-import os
-import time
-import numpy as np
-import gradio as gr
-import librosa
-import soundfile as sf
-import torch
-import traceback
-import threading
-from spaces import GPU
-from datetime import datetime
-from contextlib import contextmanager
-
-from modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
-from processor.vibevoice_processor import VibeVoiceProcessor
-from modular.streamer import AudioStreamer
-from transformers.utils import logging
-from transformers import set_seed
-
-logging.set_verbosity_info()
-logger = logging.get_logger(__name__)
-
-
-
-class VibeVoiceDemo:
- def __init__(self, model_paths: dict, device: str = "cuda", inference_steps: int = 5):
- """
- model_paths: dict like {"VibeVoice-1.5B": "microsoft/VibeVoice-1.5B",
- "VibeVoice-7B": "microsoft/VibeVoice-7B"}
- """
- self.model_paths = model_paths
- self.device = device
- self.inference_steps = inference_steps
-
- self.is_generating = False
-
- # Multi-model holders
- self.models = {} # name -> model
- self.processors = {} # name -> processor
- self.current_model_name = None
-
- self.available_voices = {}
-
- # Set compiler flags for better performance
- if torch.cuda.is_available() and hasattr(torch, '_inductor'):
- if hasattr(torch._inductor, 'config'):
- torch._inductor.config.conv_1x1_as_mm = True
- torch._inductor.config.coordinate_descent_tuning = True
- torch._inductor.config.epilogue_fusion = False
- torch._inductor.config.coordinate_descent_check_all_directions = True
-
- self.load_models() # load all on CPU
- self.setup_voice_presets()
- self.load_example_scripts()
-
- def load_models(self):
- print("Loading processors and models on CPU...")
-
- # Debug: Show cache location
- import os
- cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
- print(f"HuggingFace cache directory: {cache_dir}")
- if os.path.exists(cache_dir):
- print(f"Cache exists. Size: {sum(os.path.getsize(os.path.join(dirpath, filename)) for dirpath, _, filenames in os.walk(cache_dir) for filename in filenames) / (1024**3):.2f} GB")
- print("Cached models:")
- for item in os.listdir(cache_dir):
- if item.startswith("models--"):
- print(f" - {item}")
-
- for name, path in self.model_paths.items():
- print(f" - {name} from {path}")
- proc = VibeVoiceProcessor.from_pretrained(path)
- # Use SDPA (Scaled Dot Product Attention) for better memory efficiency
- # Flash Attention 2 disabled to reduce memory usage on L4 GPUs
- mdl = VibeVoiceForConditionalGenerationInference.from_pretrained(
- path,
- torch_dtype=torch.bfloat16,
- attn_implementation="sdpa" # More memory efficient than flash_attention_2
- )
- print(f" SDPA (memory-efficient) attention enabled for {name}")
- # Keep on CPU initially
- self.processors[name] = proc
- self.models[name] = mdl
- # choose default
- self.current_model_name = next(iter(self.models))
- print(f"Default model is {self.current_model_name}")
-
- def _place_model(self, target_name: str):
- """
- Move the selected model to CUDA and push all others back to CPU.
- """
- # Clear GPU cache before moving models
- if torch.cuda.is_available():
- torch.cuda.empty_cache()
-
- for name, mdl in self.models.items():
- if name == target_name:
- self.models[name] = mdl.to(self.device)
- else:
- self.models[name] = mdl.to("cpu")
-
- # Clear cache again after model placement
- if torch.cuda.is_available():
- torch.cuda.empty_cache()
-
- self.current_model_name = target_name
- print(f"Model {target_name} is now on {self.device}. Others moved to CPU.")
-
- def setup_voice_presets(self):
- voices_dir = os.path.join(os.path.dirname(__file__), "voices")
- if not os.path.exists(voices_dir):
- print(f"Warning: Voices directory not found at {voices_dir}")
- return
- wav_files = [f for f in os.listdir(voices_dir)
- if f.lower().endswith(('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac'))]
- for wav_file in wav_files:
- name = os.path.splitext(wav_file)[0]
- self.available_voices[name] = os.path.join(voices_dir, wav_file)
- print(f"Voices loaded: {list(self.available_voices.keys())}")
-
- # Organize voices by gender
- self.male_voices = [
- "en-Carter_man",
- "en-Frank_man",
- "en-Yasser_man",
- "in-Samuel_man",
- "zh-Anchen_man_bgm",
- "zh-Bowen_man"
- ]
- self.female_voices = [
- "en-Alice_woman_bgm",
- "en-Alice_woman",
- "en-Maya_woman",
- "zh-Xinran_woman"
- ]
-
- def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
- try:
- wav, sr = sf.read(audio_path)
- if len(wav.shape) > 1:
- wav = np.mean(wav, axis=1)
- if sr != target_sr:
- wav = librosa.resample(wav, orig_sr=sr, target_sr=target_sr)
- return wav
- except Exception as e:
- print(f"Error reading audio {audio_path}: {e}")
- return np.array([])
-
- @GPU(duration=120)
- def generate_podcast(self,
- num_speakers: int,
- script: str,
- speaker_1: str = None,
- speaker_2: str = None,
- speaker_3: str = None,
- speaker_4: str = None,
- cfg_scale: float = 1.3,
- model_name: str = None):
- """
- Generates a conference as a single audio file from a script and saves it.
- Non-streaming.
- """
- try:
- # pick model
- model_name = model_name or self.current_model_name
- if model_name not in self.models:
- raise gr.Error(f"Unknown model: {model_name}")
-
- # place models on devices
- self._place_model(model_name)
- model = self.models[model_name]
- processor = self.processors[model_name]
-
- print(f"Using model {model_name} on {self.device}")
-
- # Additional cache clear before generation
- if torch.cuda.is_available():
- torch.cuda.empty_cache()
-
- model.eval()
- model.set_ddpm_inference_steps(num_steps=self.inference_steps)
-
- self.is_generating = True
-
- if not script.strip():
- raise gr.Error("Error: Please provide a script.")
-
-            script = script.replace("'", "'")
-
- if not 1 <= num_speakers <= 4:
- raise gr.Error("Error: Number of speakers must be between 1 and 4.")
-
- selected_speakers = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
- for i, speaker_name in enumerate(selected_speakers):
- if not speaker_name or speaker_name not in self.available_voices:
- raise gr.Error(f"Error: Please select a valid speaker for Speaker {i+1}.")
-
- log = f"Generating conference with {num_speakers} speakers\n"
- log += f"Model: {model_name}\n"
- log += f"Parameters: CFG Scale={cfg_scale}\n"
- log += f"Speakers: {', '.join(selected_speakers)}\n"
-
- voice_samples = []
- for speaker_name in selected_speakers:
- audio_path = self.available_voices[speaker_name]
- audio_data = self.read_audio(audio_path)
- if len(audio_data) == 0:
- raise gr.Error(f"Error: Failed to load audio for {speaker_name}")
- voice_samples.append(audio_data)
-
- log += f"Loaded {len(voice_samples)} voice samples\n"
-
- lines = script.strip().split('\n')
- formatted_script_lines = []
- for line in lines:
- line = line.strip()
- if not line:
- continue
- if line.startswith('Speaker ') and ':' in line:
- formatted_script_lines.append(line)
- else:
- speaker_id = len(formatted_script_lines) % num_speakers
- formatted_script_lines.append(f"Speaker {speaker_id}: {line}")
-
- formatted_script = '\n'.join(formatted_script_lines)
- log += f"Formatted script with {len(formatted_script_lines)} turns\n"
- log += "Processing with VibeVoice...\n"
-
- inputs = processor(
- text=[formatted_script],
- voice_samples=[voice_samples],
- padding=True,
- return_tensors="pt",
- return_attention_mask=True,
- )
-
- start_time = time.time()
-
- # Use efficient attention backend
- if torch.cuda.is_available() and hasattr(torch.nn.attention, 'SDPBackend'):
- from torch.nn.attention import SDPBackend, sdpa_kernel
- with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
- outputs = model.generate(
- **inputs,
- max_new_tokens=None,
- cfg_scale=cfg_scale,
- tokenizer=processor.tokenizer,
- generation_config={'do_sample': False},
- verbose=False,
- )
- else:
- outputs = model.generate(
- **inputs,
- max_new_tokens=None,
- cfg_scale=cfg_scale,
- tokenizer=processor.tokenizer,
- generation_config={'do_sample': False},
- verbose=False,
- )
- generation_time = time.time() - start_time
-
- if hasattr(outputs, 'speech_outputs') and outputs.speech_outputs[0] is not None:
- audio_tensor = outputs.speech_outputs[0]
- audio = audio_tensor.cpu().float().numpy()
- else:
- raise gr.Error("Error: No audio was generated by the model. Please try again.")
-
- if audio.ndim > 1:
- audio = audio.squeeze()
-
- sample_rate = 24000
-
- output_dir = "outputs"
- os.makedirs(output_dir, exist_ok=True)
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
- file_path = os.path.join(output_dir, f"conference_{timestamp}.wav")
- sf.write(file_path, audio, sample_rate)
- print(f"Conference saved to {file_path}")
-
- total_duration = len(audio) / sample_rate
- log += f"Generation completed in {generation_time:.2f} seconds\n"
- log += f"Final audio duration: {total_duration:.2f} seconds\n"
- log += f"Successfully saved conference to: {file_path}\n"
-
- self.is_generating = False
- return (sample_rate, audio), log
-
- except gr.Error as e:
- self.is_generating = False
- error_msg = f"Input Error: {str(e)}"
- print(error_msg)
- return None, error_msg
-
- except Exception as e:
- self.is_generating = False
- error_msg = f"An unexpected error occurred: {str(e)}"
- print(error_msg)
- traceback.print_exc()
- return None, error_msg
-
-
- @staticmethod
- def _infer_num_speakers_from_script(script: str) -> int:
- """
- Infer number of speakers by counting distinct 'Speaker X:' tags in the script.
- Robust to 0- or 1-indexed labels and repeated turns.
- Falls back to 1 if none found.
- """
- import re
- ids = re.findall(r'(?mi)^\s*Speaker\s+(\d+)\s*:', script)
- return len({int(x) for x in ids}) if ids else 1
-
- def load_example_scripts(self):
- examples_dir = os.path.join(os.path.dirname(__file__), "text_examples")
- self.example_scripts = []
- self.example_scripts_natural = []
- if not os.path.exists(examples_dir):
- return
-
- original_files = [
- "1p_ai_tedtalk.txt",
- "1p_politcal_speech.txt",
- "2p_financeipo_meeting.txt",
- "2p_telehealth_meeting.txt",
- "3p_military_meeting.txt",
- "3p_oil_meeting.txt",
- "4p_gamecreation_meeting.txt",
- "4p_product_meeting.txt"
- ]
-
- # Gender mapping for each script's speakers
- self.script_speaker_genders = [
- ["female"], # AI TED Talk - Rachel
- ["neutral"], # Political Speech - generic speaker
- ["male", "female"], # Finance IPO - James, Patricia
- ["female", "male"], # Telehealth - Jennifer, Tom
- ["female", "male", "female"], # Military - Sarah, David, Lisa
- ["male", "female", "male"], # Oil - Robert, Lisa, Michael
- ["male", "female", "male", "male"], # Game Creation - Alex, Sarah, Marcus, Emma
- ["female", "male", "female", "male"] # Product Meeting - Sarah, Marcus, Jennifer, David
- ]
-
- for txt_file in original_files:
- try:
- with open(os.path.join(examples_dir, txt_file), 'r', encoding='utf-8') as f:
- script_content = f.read().strip()
- if script_content:
- num_speakers = self._infer_num_speakers_from_script(script_content)
- self.example_scripts.append([num_speakers, script_content])
-
- natural_file = txt_file.replace('.txt', '_natural.txt')
- natural_path = os.path.join(examples_dir, natural_file)
- if os.path.exists(natural_path):
- with open(natural_path, 'r', encoding='utf-8') as f:
- natural_content = f.read().strip()
- if natural_content:
- num_speakers = self._infer_num_speakers_from_script(natural_content)
- self.example_scripts_natural.append([num_speakers, natural_content])
- else:
- self.example_scripts_natural.append([num_speakers, script_content])
- except Exception as e:
- print(f"Error loading {txt_file}: {e}")
-
-
-def convert_to_16_bit_wav(data):
- if torch.is_tensor(data):
- data = data.detach().cpu().numpy()
- data = np.array(data)
- if np.max(np.abs(data)) > 1.0:
- data = data / np.max(np.abs(data))
- return (data * 32767).astype(np.int16)
-
-# Set synthwave theme
-theme = gr.themes.Ocean(
- primary_hue="indigo",
- secondary_hue="fuchsia",
- neutral_hue="slate",
-).set(
- button_large_radius='*radius_sm'
-)
-
-def set_working_state(*components, transcript_box=None):
- """
- Disable all interactive components and show progress in transcript/log box.
- Usage: set_working_state(generate_btn, random_example_btn, transcript_box=log_output)
- """
- updates = [gr.update(interactive=False) for _ in components]
- if transcript_box is not None:
- updates.append(gr.update(value="Generating... please wait", interactive=False))
- return tuple(updates)
-
-def set_idle_state(*components, transcript_box=None):
- """
- Re-enable all interactive components and transcript/log box.
- Usage: set_idle_state(generate_btn, random_example_btn, transcript_box=log_output)
- """
- updates = [gr.update(interactive=True) for _ in components]
- if transcript_box is not None:
- updates.append(gr.update(interactive=True))
- return tuple(updates)
-
-
-def create_demo_interface(demo_instance: VibeVoiceDemo):
- custom_css = """ """
-
- with gr.Blocks(
- title="VibeVoice - Conference Generator",
- css=custom_css,
- theme=theme,
- ) as interface:
-
- # Simple image
-        gr.HTML("""
-            <!-- header banner image -->
-        """)
- gr.Markdown("## NOTE: The Large model takes significant generation time with limited increase in quality. I recommend trying 1.5 first.")
-
- with gr.Tabs():
- with gr.Tab("Generate"):
- gr.Markdown("### Generated Conference")
- complete_audio_output = gr.Audio(
- label="Complete Conference (Download)",
- type="numpy",
- elem_classes="audio-output complete-audio-section",
- autoplay=False,
- show_download_button=True,
- visible=True
- )
-
- with gr.Row():
- with gr.Column(scale=1, elem_classes="settings-card"):
- gr.Markdown("### Conference Settings")
-
- # Model dropdown
- model_dropdown = gr.Dropdown(
- choices=list(demo_instance.models.keys()),
- value=demo_instance.current_model_name,
- label="Model",
- )
-
- num_speakers = gr.Slider(
- minimum=1, maximum=4, value=2, step=1,
- label="Number of Speakers",
- elem_classes="slider-container"
- )
-
- gr.Markdown("### Speaker Selection")
- available_speaker_names = list(demo_instance.available_voices.keys())
- default_speakers = ['en-Alice_woman', 'en-Carter_man', 'en-Frank_man', 'en-Maya_woman']
-
- speaker_selections = []
- for i in range(4):
- default_value = default_speakers[i] if i < len(default_speakers) else None
- speaker = gr.Dropdown(
- choices=available_speaker_names,
- value=default_value,
- label=f"Speaker {i+1}",
- visible=(i < 2),
- elem_classes="speaker-item"
- )
- speaker_selections.append(speaker)
-
- gr.Markdown("### Advanced Settings")
- with gr.Accordion("Generation Parameters", open=False):
- cfg_scale = gr.Slider(
- minimum=1.0, maximum=2.0, value=1.3, step=0.05,
- label="CFG Scale (Guidance Strength)",
- elem_classes="slider-container"
- )
-
- with gr.Column(scale=2, elem_classes="generation-card"):
- gr.Markdown("### Script Input")
- script_input = gr.Textbox(
- label="Conversation Script",
- placeholder="Enter your conference script here...",
- lines=12,
- max_lines=20,
- elem_classes="script-input"
- )
-
- with gr.Row():
- random_example_btn = gr.Button(
- "Random Example", size="lg",
- variant="secondary", elem_classes="random-btn", scale=1
- )
- generate_btn = gr.Button(
-                            "🚀 Generate Conference", size="lg",
- variant="primary", elem_classes="generate-btn", scale=2
- )
-
- with gr.Row():
- with gr.Column(scale=1):
- gr.Markdown("### Example Scripts")
- with gr.Row():
- use_natural = gr.Checkbox(
- value=True,
- label="Natural talking sounds",
- scale=1
- )
- duration_display = gr.Textbox(
- value="",
- label="Est. Duration",
- interactive=False,
- scale=1
- )
-
- example_names = [
- "AI TED Talk",
- "Political Speech",
- "Finance IPO Meeting",
- "Telehealth Meeting",
- "Military Meeting",
- "Oil Meeting",
- "Game Creation Meeting",
- "Product Meeting"
- ]
-
- example_buttons = []
- with gr.Row():
- for i in range(min(4, len(example_names))):
- btn = gr.Button(example_names[i], size="sm", variant="secondary")
- example_buttons.append(btn)
-
- with gr.Row():
- for i in range(4, min(8, len(example_names))):
- btn = gr.Button(example_names[i], size="sm", variant="secondary")
- example_buttons.append(btn)
-
- log_output = gr.Textbox(
- label="Generation Log",
- lines=8, max_lines=15,
- interactive=False,
- elem_classes="log-output"
- )
-
- def update_speaker_visibility(num_speakers):
- return [gr.update(visible=(i < num_speakers)) for i in range(4)]
-
- num_speakers.change(
- fn=update_speaker_visibility,
- inputs=[num_speakers],
- outputs=speaker_selections
- )
-
- def update_duration_display(script_text):
- if not script_text or script_text.strip() == "":
- return ""
-
- words = script_text.split()
- word_count = len(words)
- wpm = 150
- estimated_minutes = word_count / wpm
-
- if estimated_minutes < 1:
- duration_str = f"{int(estimated_minutes * 60)} sec"
- else:
- minutes = int(estimated_minutes)
- seconds = int((estimated_minutes - minutes) * 60)
- if seconds > 0:
- duration_str = f"{minutes}m {seconds}s"
- else:
- duration_str = f"{minutes} min"
-
-            return f"{word_count} words • ~{duration_str}"
-
- script_input.change(
- fn=update_duration_display,
- inputs=[script_input],
- outputs=[duration_display]
- )
-
- def generate_podcast_wrapper(model_choice, num_speakers, script, *speakers_and_params):
- try:
- speakers = speakers_and_params[:4]
- cfg_scale_val = speakers_and_params[4]
- audio, log = demo_instance.generate_podcast(
- num_speakers=int(num_speakers),
- script=script,
- speaker_1=speakers[0],
- speaker_2=speakers[1],
- speaker_3=speakers[2],
- speaker_4=speakers[3],
- cfg_scale=cfg_scale_val,
- model_name=model_choice
- )
- return audio, log
- except Exception as e:
- traceback.print_exc()
- return None, f"Error: {str(e)}"
-
- def on_generate_start():
-            return gr.update(interactive=False), gr.update(interactive=False), gr.update(value="🚀 Initializing generation...\n⏳ This may take up to 2 minutes depending on script length...")
-
- def on_generate_complete(audio, log):
- return gr.update(interactive=True), gr.update(interactive=True), audio, log
-
- generate_click = generate_btn.click(
- fn=on_generate_start,
- inputs=[],
- outputs=[generate_btn, random_example_btn, log_output],
- queue=False
- ).then(
- fn=generate_podcast_wrapper,
- inputs=[model_dropdown, num_speakers, script_input] + speaker_selections + [cfg_scale],
- outputs=[complete_audio_output, log_output],
- queue=True
- ).then(
- fn=lambda: (gr.update(interactive=True), gr.update(interactive=True)),
- inputs=[],
- outputs=[generate_btn, random_example_btn],
- queue=False
- )
-
- def load_random_example(use_natural_checkbox):
- import random
- scripts_list = demo_instance.example_scripts_natural if use_natural_checkbox else demo_instance.example_scripts
- if scripts_list:
- idx = random.randint(0, len(scripts_list) - 1)
- num_speakers_value, script_value = scripts_list[idx]
-
- # Get gender preferences for this script
- genders = demo_instance.script_speaker_genders[idx] if idx < len(demo_instance.script_speaker_genders) else []
-
- # Select appropriate voices based on gender
- voice_selections = []
- for i in range(4):
- if i < len(genders):
- gender = genders[i]
- if gender == "male" and demo_instance.male_voices:
- voice = random.choice(demo_instance.male_voices)
- elif gender == "female" and demo_instance.female_voices:
- voice = random.choice(demo_instance.female_voices)
- else:
- # neutral or fallback
- all_voices = list(demo_instance.available_voices.keys())
- voice = random.choice(all_voices) if all_voices else None
- else:
- voice = None
- voice_selections.append(voice)
-
- return [num_speakers_value, script_value] + voice_selections
- return [2, "Speaker 0: Welcome to our AI conference demo!\nSpeaker 1: Thanks, excited to be here!"] + [None, None, None, None]
-
- random_example_btn.click(
- fn=load_random_example,
- inputs=[use_natural],
- outputs=[num_speakers, script_input] + speaker_selections,
- queue=False
- )
-
- def load_specific_example(idx, use_natural_checkbox):
- import random
- scripts_list = demo_instance.example_scripts_natural if use_natural_checkbox else demo_instance.example_scripts
- if idx < len(scripts_list):
- num_speakers_value, script_value = scripts_list[idx]
- # Get gender preferences for this script
- genders = demo_instance.script_speaker_genders[idx] if idx < len(demo_instance.script_speaker_genders) else []
-
- # Select appropriate voices based on gender
- voice_selections = []
- for i in range(4):
- if i < len(genders):
- gender = genders[i]
- if gender == "male" and demo_instance.male_voices:
- voice = random.choice(demo_instance.male_voices)
- elif gender == "female" and demo_instance.female_voices:
- voice = random.choice(demo_instance.female_voices)
- else:
- # neutral or fallback
- all_voices = list(demo_instance.available_voices.keys())
- voice = random.choice(all_voices) if all_voices else None
- else:
- voice = None
- voice_selections.append(voice)
-
- # Return values for all outputs
- return [num_speakers_value, script_value] + voice_selections
- return [2, ""] + [None, None, None, None]
-
- for idx, btn in enumerate(example_buttons):
- btn.click(
- fn=lambda nat, i=idx: load_specific_example(i, nat),
- inputs=[use_natural],
- outputs=[num_speakers, script_input] + speaker_selections,
- queue=False
- )
-
- with gr.Tab("Architecture"):
- with gr.Row():
-                gr.Markdown('''VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio,
-                    such as conferences, from text. It addresses significant challenges in traditional Text-to-Speech (TTS) systems, particularly
-                    in scalability, speaker consistency, and natural turn-taking. A core innovation of VibeVoice is its use of continuous
-                    speech tokenizers (Acoustic and Semantic) operating at an ultra-low frame rate of 7.5 Hz. These tokenizers efficiently
-                    preserve audio fidelity while significantly boosting computational efficiency for processing long sequences. VibeVoice
-                    employs a next-token diffusion framework, leveraging a Large Language Model (LLM) to understand textual context and
-                    dialogue flow, and a diffusion head to generate high-fidelity acoustic details. The model can synthesize speech up to
-                    90 minutes long with up to 4 distinct speakers, surpassing the typical 1-2 speaker limits of many prior models.''')
- with gr.Row():
- with gr.Column():
- gr.Markdown("## VibeVoice: A Frontier Open-Source Text-to-Speech Model")
-
- gr.Markdown("""
- ### Overview
-
- VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio,
- such as conferences, from text. It addresses significant challenges in traditional Text-to-Speech (TTS) systems,
- particularly in scalability, speaker consistency, and natural turn-taking.
-
- ### Training Architecture
-
- **Transformer-based Large Language Model** integrated with specialized acoustic and semantic tokenizers and a diffusion-based decoding head.
-
- **Core Components:**
- - **LLM**: Qwen2.5-1.5B for this release
-            - **Acoustic Tokenizer**: Based on a σ-VAE variant with mirror-symmetric encoder-decoder structure (~340M parameters each)
- - 7 stages of modified Transformer blocks
- - Achieves 3200x downsampling from 24kHz input
- - **Semantic Tokenizer**: Encoder mirrors the Acoustic Tokenizer's architecture
- - Trained with an ASR proxy task
- - **Diffusion Head**: Lightweight module (4 layers, ~123M parameters)
- - Conditioned on LLM hidden states
- - Uses DDPM process with Classifier-Free Guidance
-
- ### Training Details
-
- **Context Length**: Trained with curriculum up to 65,536 tokens
-
- **Training Stages:**
- 1. **Tokenizer Pre-training**: Acoustic and Semantic tokenizers trained separately
- 2. **VibeVoice Training**: Frozen tokenizers, only LLM and diffusion head trained
-               - Curriculum learning: 4K → 16K → 32K → 64K tokens
-
- ### Model Variants
-
- | Model | Context Length | Generation Length | Parameters |
- |-------|---------------|-------------------|------------|
- | VibeVoice-0.5B-Streaming | - | - | Coming Soon |
- | **VibeVoice-1.5B** | 64K | ~90 min | 2.7B |
- | VibeVoice-Large | 32K | ~45 min | Redacted |
-
- ### Technical Specifications
- - **Frame Rate**: Ultra-low 7.5 Hz for efficiency
- - **Sample Rate**: 24kHz audio output
- - **Max Duration**: Up to 90 minutes
- - **Speaker Capacity**: 1-4 distinct speakers
- - **Languages**: English and Chinese
-
- ### Key Innovations
- - Continuous speech tokenizers at ultra-low frame rate
- - Next-token diffusion framework
- - Curriculum learning for long-form generation
- - Multi-speaker consistency without explicit modeling
- """)
-
- with gr.Column(scale=2):
-                gr.HTML("""
-                    <!-- architecture diagram and chart images -->
-                """)
-
- return interface
-
-def run_demo(
- model_paths: dict = None,
- device: str = "cuda",
- inference_steps: int = 5,
- share: bool = True,
-):
- """
- model_paths default includes two entries. Replace paths as needed.
- """
- if model_paths is None:
- model_paths = {
- "VibeVoice-1.5B": "microsoft/VibeVoice-1.5B",
- "VibeVoice-7B": "vibevoice/VibeVoice-7B",
- }
-
- set_seed(42)
- demo_instance = VibeVoiceDemo(model_paths, device, inference_steps)
- interface = create_demo_interface(demo_instance)
- interface.queue().launch(
- share=share,
- server_name="0.0.0.0" if share else "127.0.0.1",
- show_error=True,
- show_api=False
- )
-
-
-
-if __name__ == "__main__":
- run_demo()
diff --git a/configs/qwen2.5_1.5b_64k.json b/backend_modal/configs/qwen2.5_1.5b_64k.json
similarity index 100%
rename from configs/qwen2.5_1.5b_64k.json
rename to backend_modal/configs/qwen2.5_1.5b_64k.json
diff --git a/configs/qwen2.5_7b_32k.json b/backend_modal/configs/qwen2.5_7b_32k.json
similarity index 100%
rename from configs/qwen2.5_7b_32k.json
rename to backend_modal/configs/qwen2.5_7b_32k.json
diff --git a/example/1p_EN2CH.mp4 b/backend_modal/example/1p_EN2CH.mp4
similarity index 100%
rename from example/1p_EN2CH.mp4
rename to backend_modal/example/1p_EN2CH.mp4
diff --git a/example/2p_see_u_again.mp4 b/backend_modal/example/2p_see_u_again.mp4
similarity index 100%
rename from example/2p_see_u_again.mp4
rename to backend_modal/example/2p_see_u_again.mp4
diff --git a/example/4p_climate_45min.mp4 b/backend_modal/example/4p_climate_45min.mp4
similarity index 100%
rename from example/4p_climate_45min.mp4
rename to backend_modal/example/4p_climate_45min.mp4
diff --git a/backend_modal/modal_runner.py b/backend_modal/modal_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a6e7e2e9e50700c282944c0d6b6ce9d2fd1052e
--- /dev/null
+++ b/backend_modal/modal_runner.py
@@ -0,0 +1,230 @@
+import os
+import time
+import numpy as np
+import librosa
+import soundfile as sf
+import torch
+from datetime import datetime
+
+# Modal-specific imports
+import modal
+
+# Define the Modal app and its container image
+image = (
+ modal.Image.debian_slim(python_version="3.10")
+ .pip_install(
+ "torch",
+ "accelerate==1.6.0",
+ "transformers==4.51.3",
+ "diffusers",
+ "tqdm",
+ "numpy",
+ "scipy",
+ "ml-collections",
+ "absl-py",
+ "soundfile",
+ "librosa",
+ "pydub",
+ )
+ .add_local_dir("./modular", remote_path="/root/modular")
+ .add_local_dir("./processor", remote_path="/root/processor")
+ .add_local_dir("./voices", remote_path="/root/voices")
+ .add_local_dir("./text_examples", remote_path="/root/text_examples")
+ .add_local_dir("./schedule", remote_path="/root/schedule")
+)
+
+app = modal.App(
+ name="vibevoice-generator",
+ image=image,
+)
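+
+# Deployment sketch (assumption: the Modal CLI is installed and authenticated):
+#   modal deploy modal_runner.py        # run from the backend_modal directory
+# Once deployed, a client can look up the remote method by app and function name,
+# e.g. modal.Function.lookup("vibevoice-generator", "VibeVoiceModel.generate_podcast"),
+# which is what frontend_app/app.py does before calling .remote(...).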
+
+
+@app.cls(gpu="T4", scaledown_window=300, secrets=[modal.Secret.from_name("hf-secret")])
+class VibeVoiceModel:
+ def __init__(self, model_paths: dict = None):
+ if model_paths is None:
+ self.model_paths = {
+ "VibeVoice-1.5B": "microsoft/VibeVoice-1.5B",
+ "VibeVoice-7B": "vibevoice/VibeVoice-7B",
+ }
+ else:
+ self.model_paths = model_paths
+
+ self.device = "cuda"
+ self.inference_steps = 5
+
+ @modal.enter()
+ def load_models(self):
+ """
+ This method is run once when the container starts.
+ It downloads and loads all models onto the GPU.
+ """
+ # Project-specific imports are moved here to run inside the container
+ from modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
+ from processor.vibevoice_processor import VibeVoiceProcessor
+
+ print("Entering container and loading models to GPU...")
+
+ # Set compiler flags for better performance
+ if torch.cuda.is_available() and hasattr(torch, '_inductor'):
+ if hasattr(torch._inductor, 'config'):
+ torch._inductor.config.conv_1x1_as_mm = True
+ torch._inductor.config.coordinate_descent_tuning = True
+ torch._inductor.config.epilogue_fusion = False
+ torch._inductor.config.coordinate_descent_check_all_directions = True
+
+ self.models = {}
+ self.processors = {}
+
+ for name, path in self.model_paths.items():
+ print(f" - Loading {name} from {path}")
+ proc = VibeVoiceProcessor.from_pretrained(path)
+ mdl = VibeVoiceForConditionalGenerationInference.from_pretrained(
+ path,
+ torch_dtype=torch.bfloat16,
+ attn_implementation="sdpa"
+ ).to(self.device)
+ mdl.eval()
+ print(f" {name} loaded to {self.device}")
+ self.processors[name] = proc
+ self.models[name] = mdl
+
+ self.setup_voice_presets()
+ print("Model loading complete.")
+
+ def setup_voice_presets(self):
+ self.available_voices = {}
+        voices_dir = "/root/voices"  # Remote path baked into the image via add_local_dir
+ if not os.path.exists(voices_dir):
+ print(f"Warning: Voices directory not found at {voices_dir}")
+ return
+ wav_files = [f for f in os.listdir(voices_dir)
+ if f.lower().endswith(('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac'))]
+ for wav_file in wav_files:
+ name = os.path.splitext(wav_file)[0]
+ self.available_voices[name] = os.path.join(voices_dir, wav_file)
+ print(f"Voices loaded: {list(self.available_voices.keys())}")
+
+ def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
+ try:
+ wav, sr = sf.read(audio_path)
+ if len(wav.shape) > 1:
+ wav = np.mean(wav, axis=1)
+ if sr != target_sr:
+ wav = librosa.resample(wav, orig_sr=sr, target_sr=target_sr)
+ return wav
+ except Exception as e:
+ print(f"Error reading audio {audio_path}: {e}")
+ return np.array([])
+
+ @modal.method()
+ def generate_podcast(self,
+ num_speakers: int,
+ script: str,
+ model_name: str,
+ cfg_scale: float,
+ speaker_1: str = None,
+ speaker_2: str = None,
+ speaker_3: str = None,
+ speaker_4: str = None):
+ """
+ This is the main inference function that will be called from the Gradio app.
+ """
+ try:
+ if model_name not in self.models:
+ raise ValueError(f"Unknown model: {model_name}")
+
+ model = self.models[model_name]
+ processor = self.processors[model_name]
+ model.set_ddpm_inference_steps(num_steps=self.inference_steps)
+
+ print(f"Generating with model {model_name} on {self.device}")
+
+ if not script.strip():
+ raise ValueError("Error: Please provide a script.")
+
+            script = script.replace("'", "'")
+
+ if not 1 <= num_speakers <= 4:
+ raise ValueError("Error: Number of speakers must be between 1 and 4.")
+
+ selected_speakers = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
+ for i, speaker_name in enumerate(selected_speakers):
+ if not speaker_name or speaker_name not in self.available_voices:
+ raise ValueError(f"Error: Please select a valid speaker for Speaker {i+1}.")
+
+ log = f"Generating conference with {num_speakers} speakers\n"
+ log += f"Model: {model_name}\n"
+ log += f"Parameters: CFG Scale={cfg_scale}\n"
+ log += f"Speakers: {', '.join(selected_speakers)}\n"
+
+ voice_samples = []
+ for speaker_name in selected_speakers:
+ audio_path = self.available_voices[speaker_name]
+ audio_data = self.read_audio(audio_path)
+ if len(audio_data) == 0:
+ raise ValueError(f"Error: Failed to load audio for {speaker_name}")
+ voice_samples.append(audio_data)
+
+ log += f"Loaded {len(voice_samples)} voice samples\n"
+
+ lines = script.strip().split('\n')
+ formatted_script_lines = []
+ for line in lines:
+ line = line.strip()
+ if not line: continue
+ if line.startswith('Speaker ') and ':' in line:
+ formatted_script_lines.append(line)
+ else:
+ speaker_id = len(formatted_script_lines) % num_speakers
+ formatted_script_lines.append(f"Speaker {speaker_id}: {line}")
+
+ formatted_script = '\n'.join(formatted_script_lines)
+ log += f"Formatted script with {len(formatted_script_lines)} turns\n"
+ log += "Processing with VibeVoice...\n"
+
+ inputs = processor(
+ text=[formatted_script],
+ voice_samples=[voice_samples],
+ padding=True,
+ return_tensors="pt",
+ return_attention_mask=True,
+ ).to(self.device)
+
+ start_time = time.time()
+
+ with torch.inference_mode():
+ outputs = model.generate(
+ **inputs,
+ max_new_tokens=None,
+ cfg_scale=cfg_scale,
+ tokenizer=processor.tokenizer,
+ generation_config={'do_sample': False},
+ verbose=False,
+ )
+ generation_time = time.time() - start_time
+
+ if hasattr(outputs, 'speech_outputs') and outputs.speech_outputs[0] is not None:
+ audio_tensor = outputs.speech_outputs[0]
+ audio = audio_tensor.cpu().float().numpy()
+ else:
+ raise RuntimeError("Error: No audio was generated by the model.")
+
+ if audio.ndim > 1:
+ audio = audio.squeeze()
+
+ sample_rate = 24000
+ total_duration = len(audio) / sample_rate
+ log += f"Generation completed in {generation_time:.2f} seconds\n"
+ log += f"Final audio duration: {total_duration:.2f} seconds\n"
+
+ # Return the raw audio data and sample rate, Gradio will handle the rest
+ return (sample_rate, audio), log
+
+ except Exception as e:
+ import traceback
+ error_msg = f"An unexpected error occurred on Modal: {str(e)}\n{traceback.format_exc()}"
+ print(error_msg)
+ # Return a special value or raise an exception that the client can handle
+ # For Gradio, returning a log message is often best.
+ return None, error_msg
\ No newline at end of file
diff --git a/modular/__init__.py b/backend_modal/modular/__init__.py
similarity index 100%
rename from modular/__init__.py
rename to backend_modal/modular/__init__.py
diff --git a/modular/configuration_vibevoice.py b/backend_modal/modular/configuration_vibevoice.py
similarity index 100%
rename from modular/configuration_vibevoice.py
rename to backend_modal/modular/configuration_vibevoice.py
diff --git a/modular/modeling_vibevoice.py b/backend_modal/modular/modeling_vibevoice.py
similarity index 100%
rename from modular/modeling_vibevoice.py
rename to backend_modal/modular/modeling_vibevoice.py
diff --git a/modular/modeling_vibevoice_inference.py b/backend_modal/modular/modeling_vibevoice_inference.py
similarity index 100%
rename from modular/modeling_vibevoice_inference.py
rename to backend_modal/modular/modeling_vibevoice_inference.py
diff --git a/modular/modular_vibevoice_diffusion_head.py b/backend_modal/modular/modular_vibevoice_diffusion_head.py
similarity index 100%
rename from modular/modular_vibevoice_diffusion_head.py
rename to backend_modal/modular/modular_vibevoice_diffusion_head.py
diff --git a/modular/modular_vibevoice_text_tokenizer.py b/backend_modal/modular/modular_vibevoice_text_tokenizer.py
similarity index 100%
rename from modular/modular_vibevoice_text_tokenizer.py
rename to backend_modal/modular/modular_vibevoice_text_tokenizer.py
diff --git a/modular/modular_vibevoice_tokenizer.py b/backend_modal/modular/modular_vibevoice_tokenizer.py
similarity index 100%
rename from modular/modular_vibevoice_tokenizer.py
rename to backend_modal/modular/modular_vibevoice_tokenizer.py
diff --git a/modular/streamer.py b/backend_modal/modular/streamer.py
similarity index 100%
rename from modular/streamer.py
rename to backend_modal/modular/streamer.py
diff --git a/packages.txt b/backend_modal/packages.txt
similarity index 100%
rename from packages.txt
rename to backend_modal/packages.txt
diff --git a/processor/__init__.py b/backend_modal/processor/__init__.py
similarity index 100%
rename from processor/__init__.py
rename to backend_modal/processor/__init__.py
diff --git a/processor/vibevoice_processor.py b/backend_modal/processor/vibevoice_processor.py
similarity index 100%
rename from processor/vibevoice_processor.py
rename to backend_modal/processor/vibevoice_processor.py
diff --git a/processor/vibevoice_tokenizer_processor.py b/backend_modal/processor/vibevoice_tokenizer_processor.py
similarity index 100%
rename from processor/vibevoice_tokenizer_processor.py
rename to backend_modal/processor/vibevoice_tokenizer_processor.py
diff --git a/schedule/__init__.py b/backend_modal/schedule/__init__.py
similarity index 100%
rename from schedule/__init__.py
rename to backend_modal/schedule/__init__.py
diff --git a/schedule/dpm_solver.py b/backend_modal/schedule/dpm_solver.py
similarity index 100%
rename from schedule/dpm_solver.py
rename to backend_modal/schedule/dpm_solver.py
diff --git a/schedule/timestep_sampler.py b/backend_modal/schedule/timestep_sampler.py
similarity index 100%
rename from schedule/timestep_sampler.py
rename to backend_modal/schedule/timestep_sampler.py
diff --git a/scripts/__init__.py b/backend_modal/scripts/__init__.py
similarity index 100%
rename from scripts/__init__.py
rename to backend_modal/scripts/__init__.py
diff --git a/scripts/convert_nnscaler_checkpoint_to_transformers.py b/backend_modal/scripts/convert_nnscaler_checkpoint_to_transformers.py
similarity index 100%
rename from scripts/convert_nnscaler_checkpoint_to_transformers.py
rename to backend_modal/scripts/convert_nnscaler_checkpoint_to_transformers.py
diff --git a/setup_voices.sh b/backend_modal/setup_voices.sh
similarity index 100%
rename from setup_voices.sh
rename to backend_modal/setup_voices.sh
diff --git a/text_examples/1p_ai_tedtalk.txt b/backend_modal/text_examples/1p_ai_tedtalk.txt
similarity index 100%
rename from text_examples/1p_ai_tedtalk.txt
rename to backend_modal/text_examples/1p_ai_tedtalk.txt
diff --git a/text_examples/1p_ai_tedtalk_natural.txt b/backend_modal/text_examples/1p_ai_tedtalk_natural.txt
similarity index 100%
rename from text_examples/1p_ai_tedtalk_natural.txt
rename to backend_modal/text_examples/1p_ai_tedtalk_natural.txt
diff --git a/text_examples/1p_politcal_speech.txt b/backend_modal/text_examples/1p_politcal_speech.txt
similarity index 100%
rename from text_examples/1p_politcal_speech.txt
rename to backend_modal/text_examples/1p_politcal_speech.txt
diff --git a/text_examples/1p_politcal_speech_natural.txt b/backend_modal/text_examples/1p_politcal_speech_natural.txt
similarity index 100%
rename from text_examples/1p_politcal_speech_natural.txt
rename to backend_modal/text_examples/1p_politcal_speech_natural.txt
diff --git a/text_examples/2p_financeipo_meeting.txt b/backend_modal/text_examples/2p_financeipo_meeting.txt
similarity index 100%
rename from text_examples/2p_financeipo_meeting.txt
rename to backend_modal/text_examples/2p_financeipo_meeting.txt
diff --git a/text_examples/2p_financeipo_meeting_natural.txt b/backend_modal/text_examples/2p_financeipo_meeting_natural.txt
similarity index 100%
rename from text_examples/2p_financeipo_meeting_natural.txt
rename to backend_modal/text_examples/2p_financeipo_meeting_natural.txt
diff --git a/text_examples/2p_telehealth_meeting.txt b/backend_modal/text_examples/2p_telehealth_meeting.txt
similarity index 100%
rename from text_examples/2p_telehealth_meeting.txt
rename to backend_modal/text_examples/2p_telehealth_meeting.txt
diff --git a/text_examples/2p_telehealth_meeting_natural.txt b/backend_modal/text_examples/2p_telehealth_meeting_natural.txt
similarity index 100%
rename from text_examples/2p_telehealth_meeting_natural.txt
rename to backend_modal/text_examples/2p_telehealth_meeting_natural.txt
diff --git a/text_examples/3p_military_meeting.txt b/backend_modal/text_examples/3p_military_meeting.txt
similarity index 100%
rename from text_examples/3p_military_meeting.txt
rename to backend_modal/text_examples/3p_military_meeting.txt
diff --git a/text_examples/3p_military_meeting_natural.txt b/backend_modal/text_examples/3p_military_meeting_natural.txt
similarity index 100%
rename from text_examples/3p_military_meeting_natural.txt
rename to backend_modal/text_examples/3p_military_meeting_natural.txt
diff --git a/text_examples/3p_oil_meeting.txt b/backend_modal/text_examples/3p_oil_meeting.txt
similarity index 100%
rename from text_examples/3p_oil_meeting.txt
rename to backend_modal/text_examples/3p_oil_meeting.txt
diff --git a/text_examples/3p_oil_meeting_natural.txt b/backend_modal/text_examples/3p_oil_meeting_natural.txt
similarity index 100%
rename from text_examples/3p_oil_meeting_natural.txt
rename to backend_modal/text_examples/3p_oil_meeting_natural.txt
diff --git a/text_examples/4p_gamecreation_meeting.txt b/backend_modal/text_examples/4p_gamecreation_meeting.txt
similarity index 100%
rename from text_examples/4p_gamecreation_meeting.txt
rename to backend_modal/text_examples/4p_gamecreation_meeting.txt
diff --git a/text_examples/4p_gamecreation_meeting_natural.txt b/backend_modal/text_examples/4p_gamecreation_meeting_natural.txt
similarity index 100%
rename from text_examples/4p_gamecreation_meeting_natural.txt
rename to backend_modal/text_examples/4p_gamecreation_meeting_natural.txt
diff --git a/text_examples/4p_product_meeting.txt b/backend_modal/text_examples/4p_product_meeting.txt
similarity index 100%
rename from text_examples/4p_product_meeting.txt
rename to backend_modal/text_examples/4p_product_meeting.txt
diff --git a/text_examples/4p_product_meeting_natural.txt b/backend_modal/text_examples/4p_product_meeting_natural.txt
similarity index 100%
rename from text_examples/4p_product_meeting_natural.txt
rename to backend_modal/text_examples/4p_product_meeting_natural.txt
diff --git a/voices/en-Alice_woman.wav b/backend_modal/voices/en-Alice_woman.wav
similarity index 100%
rename from voices/en-Alice_woman.wav
rename to backend_modal/voices/en-Alice_woman.wav
diff --git a/voices/en-Alice_woman_bgm.wav b/backend_modal/voices/en-Alice_woman_bgm.wav
similarity index 100%
rename from voices/en-Alice_woman_bgm.wav
rename to backend_modal/voices/en-Alice_woman_bgm.wav
diff --git a/voices/en-Carter_man.wav b/backend_modal/voices/en-Carter_man.wav
similarity index 100%
rename from voices/en-Carter_man.wav
rename to backend_modal/voices/en-Carter_man.wav
diff --git a/voices/en-Frank_man.wav b/backend_modal/voices/en-Frank_man.wav
similarity index 100%
rename from voices/en-Frank_man.wav
rename to backend_modal/voices/en-Frank_man.wav
diff --git a/voices/en-Maya_woman.wav b/backend_modal/voices/en-Maya_woman.wav
similarity index 100%
rename from voices/en-Maya_woman.wav
rename to backend_modal/voices/en-Maya_woman.wav
diff --git a/voices/en-Yasser_man.wav b/backend_modal/voices/en-Yasser_man.wav
similarity index 100%
rename from voices/en-Yasser_man.wav
rename to backend_modal/voices/en-Yasser_man.wav
diff --git a/voices/in-Samuel_man.wav b/backend_modal/voices/in-Samuel_man.wav
similarity index 100%
rename from voices/in-Samuel_man.wav
rename to backend_modal/voices/in-Samuel_man.wav
diff --git a/voices/zh-Anchen_man_bgm.wav b/backend_modal/voices/zh-Anchen_man_bgm.wav
similarity index 100%
rename from voices/zh-Anchen_man_bgm.wav
rename to backend_modal/voices/zh-Anchen_man_bgm.wav
diff --git a/voices/zh-Bowen_man.wav b/backend_modal/voices/zh-Bowen_man.wav
similarity index 100%
rename from voices/zh-Bowen_man.wav
rename to backend_modal/voices/zh-Bowen_man.wav
diff --git a/voices/zh-Xinran_woman.wav b/backend_modal/voices/zh-Xinran_woman.wav
similarity index 100%
rename from voices/zh-Xinran_woman.wav
rename to backend_modal/voices/zh-Xinran_woman.wav
diff --git a/README.md b/frontend_app/README.md
similarity index 100%
rename from README.md
rename to frontend_app/README.md
diff --git a/frontend_app/app.py b/frontend_app/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc02d1d73705684f0e54947d7b5d9f31393fc00d
--- /dev/null
+++ b/frontend_app/app.py
@@ -0,0 +1,174 @@
+import os
+import gradio as gr
+import modal
+import traceback
+
+# --- Configuration ---
+# This is the name of your Modal app (defined in backend_modal/modal_runner.py).
+MODAL_STUB_NAME = "vibevoice-generator"
+# This is the name of the remote class and method to call.
+MODAL_FUNCTION_NAME = "VibeVoiceModel.generate_podcast"
+
+# These lists are now hardcoded because the data lives on the Modal container.
+# For a more dynamic app, you could create a small Modal function to fetch these lists (see the sketch below).
+AVAILABLE_MODELS = ["VibeVoice-1.5B", "VibeVoice-7B"]
+AVAILABLE_VOICES = [
+ "en-Alice_woman_bgm", "en-Alice_woman", "en-Carter_man", "en-Frank_man",
+ "en-Maya_woman", "en-Yasser_man", "in-Samuel_man", "zh-Anchen_man_bgm",
+ "zh-Bowen_man", "zh-Xinran_woman"
+]
+DEFAULT_SPEAKERS = ['en-Alice_woman', 'en-Carter_man', 'en-Frank_man', 'en-Maya_woman']
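+
+# Sketch of the dynamic alternative mentioned above (hypothetical helper, not part of
+# this change): add a small remote method on VibeVoiceModel in modal_runner.py, e.g.
+#
+#     @modal.method()
+#     def list_voices(self) -> list[str]:
+#         return sorted(self.available_voices.keys())
+#
+# then look it up here the same way as generate_podcast and use its result to
+# populate AVAILABLE_VOICES instead of the hardcoded list above.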
+
+# --- Modal Connection ---
+try:
+ # This looks up the remote function on Modal
+ # It will raise an error if the app isn't deployed (`modal deploy modal_runner.py`)
+ remote_generate_function = modal.Function.lookup(MODAL_STUB_NAME, MODAL_FUNCTION_NAME)
+ print("Successfully connected to Modal function.")
+except modal.exception.NotFoundError:
+ print("ERROR: Modal function not found.")
+ print(f"Please deploy the Modal app first by running: modal deploy modal_runner.py")
+ remote_generate_function = None
+
+# --- Gradio UI Definition ---
+theme = gr.themes.Ocean(
+ primary_hue="indigo",
+ secondary_hue="fuchsia",
+ neutral_hue="slate",
+).set(
+ button_large_radius='*radius_sm'
+)
+
+def create_demo_interface():
+ with gr.Blocks(
+ title="VibeVoice - Conference Generator",
+ theme=theme,
+ ) as interface:
+        gr.HTML("""
+            <!-- header banner image -->
+        """)
+ gr.Markdown("## GPU processing is now offloaded to a Modal.com backend!")
+
+ with gr.Tabs():
+ with gr.Tab("Generate"):
+ gr.Markdown("### Generated Conference")
+ complete_audio_output = gr.Audio(
+ label="Complete Conference (Download)",
+ type="numpy",
+ autoplay=False,
+ show_download_button=True,
+ )
+
+ with gr.Row():
+ with gr.Column(scale=1):
+ gr.Markdown("### Conference Settings")
+ model_dropdown = gr.Dropdown(
+ choices=AVAILABLE_MODELS,
+ value=AVAILABLE_MODELS[0],
+ label="Model",
+ )
+ num_speakers = gr.Slider(
+ minimum=1, maximum=4, value=2, step=1,
+ label="Number of Speakers",
+ )
+
+ gr.Markdown("### Speaker Selection")
+ speaker_selections = []
+ for i in range(4):
+ speaker = gr.Dropdown(
+ choices=AVAILABLE_VOICES,
+ value=DEFAULT_SPEAKERS[i] if i < len(DEFAULT_SPEAKERS) else None,
+ label=f"Speaker {i+1}",
+ visible=(i < 2),
+ )
+ speaker_selections.append(speaker)
+
+ with gr.Accordion("Advanced Settings", open=False):
+ cfg_scale = gr.Slider(
+ minimum=1.0, maximum=2.0, value=1.3, step=0.05,
+ label="CFG Scale (Guidance Strength)",
+ )
+
+ with gr.Column(scale=2):
+ gr.Markdown("### Script Input")
+ script_input = gr.Textbox(
+ label="Conversation Script",
+ placeholder="Enter your conference script here...",
+ lines=12,
+ max_lines=20,
+ )
+ generate_btn = gr.Button(
+                            "🚀 Generate Conference (on Modal)", size="lg",
+ variant="primary",
+ )
+ log_output = gr.Textbox(
+ label="Generation Log",
+ lines=8, max_lines=15,
+ interactive=False,
+ )
+
+ def update_speaker_visibility(num_speakers):
+ return [gr.update(visible=(i < num_speakers)) for i in range(4)]
+
+ num_speakers.change(
+ fn=update_speaker_visibility,
+ inputs=[num_speakers],
+ outputs=speaker_selections
+ )
+
+ def generate_podcast_wrapper(model_choice, num_speakers_val, script, *speakers_and_params):
+ if remote_generate_function is None:
+ return None, "ERROR: Modal function not deployed. Please contact the space owner."
+
+ # Show a message that we are calling the remote function
+                yield None, "Calling remote GPU on Modal.com... this may take a moment to start."
+
+ try:
+ speakers = speakers_and_params[:4]
+ cfg_scale_val = speakers_and_params[4]
+
+ # This is the call to the remote Modal function
+ result, log = remote_generate_function.remote(
+ num_speakers=int(num_speakers_val),
+ script=script,
+ speaker_1=speakers[0],
+ speaker_2=speakers[1],
+ speaker_3=speakers[2],
+ speaker_4=speakers[3],
+ cfg_scale=cfg_scale_val,
+ model_name=model_choice
+ )
+ yield result, log
+ except Exception as e:
+ tb = traceback.format_exc()
+ print(f"Error calling Modal: {e}")
+ yield None, f"An error occurred in the Gradio wrapper: {e}\n\n{tb}"
+
+ generate_btn.click(
+ fn=generate_podcast_wrapper,
+ inputs=[model_dropdown, num_speakers, script_input] + speaker_selections + [cfg_scale],
+ outputs=[complete_audio_output, log_output]
+ )
+ return interface
+
+# --- Main Execution ---
+if __name__ == "__main__":
+ if remote_generate_function is None:
+ # If Modal isn't set up, we can't launch the full app.
+ # We'll show a simplified UI with an error message.
+ with gr.Blocks(theme=theme) as interface:
+            gr.Markdown("# Configuration Error")
+ gr.Markdown(
+ "The Gradio application cannot connect to the Modal backend. "
+ "The Modal app has not been deployed yet. "
+ "Please run `modal deploy modal_runner.py` in your terminal and then refresh this page."
+ )
+ interface.launch()
+ else:
+ # Launch the full Gradio interface
+ interface = create_demo_interface()
+ interface.queue().launch(show_error=True)
\ No newline at end of file
diff --git a/public/images/banner.png b/frontend_app/public/images/banner.png
similarity index 100%
rename from public/images/banner.png
rename to frontend_app/public/images/banner.png
diff --git a/public/images/chart.png b/frontend_app/public/images/chart.png
similarity index 100%
rename from public/images/chart.png
rename to frontend_app/public/images/chart.png
diff --git a/public/images/diagram.jpg b/frontend_app/public/images/diagram.jpg
similarity index 100%
rename from public/images/diagram.jpg
rename to frontend_app/public/images/diagram.jpg
diff --git a/public/voices/Cherry.mp3 b/frontend_app/public/voices/Cherry.mp3
similarity index 100%
rename from public/voices/Cherry.mp3
rename to frontend_app/public/voices/Cherry.mp3
diff --git a/public/voices/Chicago.mp3 b/frontend_app/public/voices/Chicago.mp3
similarity index 100%
rename from public/voices/Chicago.mp3
rename to frontend_app/public/voices/Chicago.mp3
diff --git a/public/voices/Janus.mp3 b/frontend_app/public/voices/Janus.mp3
similarity index 100%
rename from public/voices/Janus.mp3
rename to frontend_app/public/voices/Janus.mp3
diff --git a/public/voices/Mantis.mp3 b/frontend_app/public/voices/Mantis.mp3
similarity index 100%
rename from public/voices/Mantis.mp3
rename to frontend_app/public/voices/Mantis.mp3
diff --git a/public/voices/Sponge.mp3 b/frontend_app/public/voices/Sponge.mp3
similarity index 100%
rename from public/voices/Sponge.mp3
rename to frontend_app/public/voices/Sponge.mp3
diff --git a/public/voices/Starchild.mp3 b/frontend_app/public/voices/Starchild.mp3
similarity index 100%
rename from public/voices/Starchild.mp3
rename to frontend_app/public/voices/Starchild.mp3
diff --git a/frontend_app/requirements.txt b/frontend_app/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..10238e633a766b89d5f3e297576a41ec36f5f2fd
--- /dev/null
+++ b/frontend_app/requirements.txt
@@ -0,0 +1,2 @@
+gradio
+modal
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 738b88c793a76b870f7d2e2b077713d430f21f65..0000000000000000000000000000000000000000
--- a/requirements.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-spaces
-torch
-accelerate==1.6.0
-transformers==4.51.3
-diffusers
-tqdm
-numpy
-scipy
-ml-collections
-absl-py
-gradio
-av
-aiortc
-soundfile
-librosa
-pydub
-requests
-python-dotenv
-