Spaces:

ACloudCenter
/

Conference-Generator-VibeVoice

Running on CPU Upgrade

ACloudCenter committed on Sep 12

Commit

749c08c

1 Parent(s): 1d188b4

Refactor: Separate frontend and backend code

Moved Gradio app and related files to 'frontend_app/' and Modal backend
code to 'backend_modal/' for clearer separation and easier deployment.

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

app.py +0 -805
{configs → backend_modal/configs}/qwen2.5_1.5b_64k.json +0 -0
{configs → backend_modal/configs}/qwen2.5_7b_32k.json +0 -0
{example → backend_modal/example}/1p_EN2CH.mp4 +0 -0
{example → backend_modal/example}/2p_see_u_again.mp4 +0 -0
{example → backend_modal/example}/4p_climate_45min.mp4 +0 -0
backend_modal/modal_runner.py +230 -0
{modular → backend_modal/modular}/__init__.py +0 -0
{modular → backend_modal/modular}/configuration_vibevoice.py +0 -0
{modular → backend_modal/modular}/modeling_vibevoice.py +0 -0
{modular → backend_modal/modular}/modeling_vibevoice_inference.py +0 -0
{modular → backend_modal/modular}/modular_vibevoice_diffusion_head.py +0 -0
{modular → backend_modal/modular}/modular_vibevoice_text_tokenizer.py +0 -0
{modular → backend_modal/modular}/modular_vibevoice_tokenizer.py +0 -0
{modular → backend_modal/modular}/streamer.py +0 -0
packages.txt → backend_modal/packages.txt +0 -0
{processor → backend_modal/processor}/__init__.py +0 -0
{processor → backend_modal/processor}/vibevoice_processor.py +0 -0
{processor → backend_modal/processor}/vibevoice_tokenizer_processor.py +0 -0
{schedule → backend_modal/schedule}/__init__.py +0 -0
{schedule → backend_modal/schedule}/dpm_solver.py +0 -0
{schedule → backend_modal/schedule}/timestep_sampler.py +0 -0
{scripts → backend_modal/scripts}/__init__.py +0 -0
{scripts → backend_modal/scripts}/convert_nnscaler_checkpoint_to_transformers.py +0 -0
setup_voices.sh → backend_modal/setup_voices.sh +0 -0
{text_examples → backend_modal/text_examples}/1p_ai_tedtalk.txt +0 -0
{text_examples → backend_modal/text_examples}/1p_ai_tedtalk_natural.txt +0 -0
{text_examples → backend_modal/text_examples}/1p_politcal_speech.txt +0 -0
{text_examples → backend_modal/text_examples}/1p_politcal_speech_natural.txt +0 -0
{text_examples → backend_modal/text_examples}/2p_financeipo_meeting.txt +0 -0
{text_examples → backend_modal/text_examples}/2p_financeipo_meeting_natural.txt +0 -0
{text_examples → backend_modal/text_examples}/2p_telehealth_meeting.txt +0 -0
{text_examples → backend_modal/text_examples}/2p_telehealth_meeting_natural.txt +0 -0
{text_examples → backend_modal/text_examples}/3p_military_meeting.txt +0 -0
{text_examples → backend_modal/text_examples}/3p_military_meeting_natural.txt +0 -0
{text_examples → backend_modal/text_examples}/3p_oil_meeting.txt +0 -0
{text_examples → backend_modal/text_examples}/3p_oil_meeting_natural.txt +0 -0
{text_examples → backend_modal/text_examples}/4p_gamecreation_meeting.txt +0 -0
{text_examples → backend_modal/text_examples}/4p_gamecreation_meeting_natural.txt +0 -0
{text_examples → backend_modal/text_examples}/4p_product_meeting.txt +0 -0
{text_examples → backend_modal/text_examples}/4p_product_meeting_natural.txt +0 -0
{voices → backend_modal/voices}/en-Alice_woman.wav +0 -0
{voices → backend_modal/voices}/en-Alice_woman_bgm.wav +0 -0
{voices → backend_modal/voices}/en-Carter_man.wav +0 -0
{voices → backend_modal/voices}/en-Frank_man.wav +0 -0
{voices → backend_modal/voices}/en-Maya_woman.wav +0 -0
{voices → backend_modal/voices}/en-Yasser_man.wav +0 -0
{voices → backend_modal/voices}/in-Samuel_man.wav +0 -0
{voices → backend_modal/voices}/zh-Anchen_man_bgm.wav +0 -0
{voices → backend_modal/voices}/zh-Bowen_man.wav +0 -0

app.py DELETED Viewed

@@ -1,805 +0,0 @@
-import os
-import time
-import numpy as np
-import gradio as gr
-import librosa
-import soundfile as sf
-import torch
-import traceback
-import threading
-from spaces import GPU
-from datetime import datetime
-from contextlib import contextmanager
-from modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
-from processor.vibevoice_processor import VibeVoiceProcessor
-from modular.streamer import AudioStreamer
-from transformers.utils import logging
-from transformers import set_seed
-logging.set_verbosity_info()
-logger = logging.get_logger(__name__)
-class VibeVoiceDemo:
-    def __init__(self, model_paths: dict, device: str = "cuda", inference_steps: int = 5):
-        """
-        model_paths: dict like {"VibeVoice-1.5B": "microsoft/VibeVoice-1.5B",
-                                "VibeVoice-7B": "microsoft/VibeVoice-7B"}
-        """
-        self.model_paths = model_paths
-        self.device = device
-        self.inference_steps = inference_steps
-        self.is_generating = False
-        # Multi-model holders
-        self.models = {}        # name -> model
-        self.processors = {}    # name -> processor
-        self.current_model_name = None
-        self.available_voices = {}
-        # Set compiler flags for better performance
-        if torch.cuda.is_available() and hasattr(torch, '_inductor'):
-            if hasattr(torch._inductor, 'config'):
-                torch._inductor.config.conv_1x1_as_mm = True
-                torch._inductor.config.coordinate_descent_tuning = True
-                torch._inductor.config.epilogue_fusion = False
-                torch._inductor.config.coordinate_descent_check_all_directions = True
-        self.load_models()          # load all on CPU
-        self.setup_voice_presets()
-        self.load_example_scripts()
-    def load_models(self):
-        print("Loading processors and models on CPU...")
-        # Debug: Show cache location
-        import os
-        cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
-        print(f"HuggingFace cache directory: {cache_dir}")
-        if os.path.exists(cache_dir):
-            print(f"Cache exists. Size: {sum(os.path.getsize(os.path.join(dirpath, filename)) for dirpath, _, filenames in os.walk(cache_dir) for filename in filenames) / (1024**3):.2f} GB")
-            print("Cached models:")
-            for item in os.listdir(cache_dir):
-                if item.startswith("models--"):
-                    print(f"  - {item}")
-        for name, path in self.model_paths.items():
-            print(f" - {name} from {path}")
-            proc = VibeVoiceProcessor.from_pretrained(path)
-            # Use SDPA (Scaled Dot Product Attention) for better memory efficiency
-            # Flash Attention 2 disabled to reduce memory usage on L4 GPUs
-            mdl = VibeVoiceForConditionalGenerationInference.from_pretrained(
-                path,
-                torch_dtype=torch.bfloat16,
-                attn_implementation="sdpa"  # More memory efficient than flash_attention_2
-            )
-            print(f"  SDPA (memory-efficient) attention enabled for {name}")
-            # Keep on CPU initially
-            self.processors[name] = proc
-            self.models[name] = mdl
-        # choose default
-        self.current_model_name = next(iter(self.models))
-        print(f"Default model is {self.current_model_name}")
-    def _place_model(self, target_name: str):
-        """
-        Move the selected model to CUDA and push all others back to CPU.
-        """
-        # Clear GPU cache before moving models
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        for name, mdl in self.models.items():
-            if name == target_name:
-                self.models[name] = mdl.to(self.device)
-            else:
-                self.models[name] = mdl.to("cpu")
-        # Clear cache again after model placement
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        self.current_model_name = target_name
-        print(f"Model {target_name} is now on {self.device}. Others moved to CPU.")
-    def setup_voice_presets(self):
-        voices_dir = os.path.join(os.path.dirname(__file__), "voices")
-        if not os.path.exists(voices_dir):
-            print(f"Warning: Voices directory not found at {voices_dir}")
-            return
-        wav_files = [f for f in os.listdir(voices_dir)
-                     if f.lower().endswith(('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac'))]
-        for wav_file in wav_files:
-            name = os.path.splitext(wav_file)[0]
-            self.available_voices[name] = os.path.join(voices_dir, wav_file)
-        print(f"Voices loaded: {list(self.available_voices.keys())}")
-        # Organize voices by gender
-        self.male_voices = [
-            "en-Carter_man",
-            "en-Frank_man",
-            "en-Yasser_man",
-            "in-Samuel_man",
-            "zh-Anchen_man_bgm",
-            "zh-Bowen_man"
-        ]
-        self.female_voices = [
-            "en-Alice_woman_bgm",
-            "en-Alice_woman",
-            "en-Maya_woman",
-            "zh-Xinran_woman"
-        ]
-    def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
-        try:
-            wav, sr = sf.read(audio_path)
-            if len(wav.shape) > 1:
-                wav = np.mean(wav, axis=1)
-            if sr != target_sr:
-                wav = librosa.resample(wav, orig_sr=sr, target_sr=target_sr)
-            return wav
-        except Exception as e:
-            print(f"Error reading audio {audio_path}: {e}")
-            return np.array([])
-    @GPU(duration=120)
-    def generate_podcast(self,
-                         num_speakers: int,
-                         script: str,
-                         speaker_1: str = None,
-                         speaker_2: str = None,
-                         speaker_3: str = None,
-                         speaker_4: str = None,
-                         cfg_scale: float = 1.3,
-                         model_name: str = None):
-        """
-        Generates a conference as a single audio file from a script and saves it.
-        Non-streaming.
-        """
-        try:
-            # pick model
-            model_name = model_name or self.current_model_name
-            if model_name not in self.models:
-                raise gr.Error(f"Unknown model: {model_name}")
-            # place models on devices
-            self._place_model(model_name)
-            model = self.models[model_name]
-            processor = self.processors[model_name]
-            print(f"Using model {model_name} on {self.device}")
-            # Additional cache clear before generation
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-            model.eval()
-            model.set_ddpm_inference_steps(num_steps=self.inference_steps)
-            self.is_generating = True
-            if not script.strip():
-                raise gr.Error("Error: Please provide a script.")
-            script = script.replace("’", "'")
-            if not 1 <= num_speakers <= 4:
-                raise gr.Error("Error: Number of speakers must be between 1 and 4.")
-            selected_speakers = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
-            for i, speaker_name in enumerate(selected_speakers):
-                if not speaker_name or speaker_name not in self.available_voices:
-                    raise gr.Error(f"Error: Please select a valid speaker for Speaker {i+1}.")
-            log = f"Generating conference with {num_speakers} speakers\n"
-            log += f"Model: {model_name}\n"
-            log += f"Parameters: CFG Scale={cfg_scale}\n"
-            log += f"Speakers: {', '.join(selected_speakers)}\n"
-            voice_samples = []
-            for speaker_name in selected_speakers:
-                audio_path = self.available_voices[speaker_name]
-                audio_data = self.read_audio(audio_path)
-                if len(audio_data) == 0:
-                    raise gr.Error(f"Error: Failed to load audio for {speaker_name}")
-                voice_samples.append(audio_data)
-            log += f"Loaded {len(voice_samples)} voice samples\n"
-            lines = script.strip().split('\n')
-            formatted_script_lines = []
-            for line in lines:
-                line = line.strip()
-                if not line:
-                    continue
-                if line.startswith('Speaker ') and ':' in line:
-                    formatted_script_lines.append(line)
-                else:
-                    speaker_id = len(formatted_script_lines) % num_speakers
-                    formatted_script_lines.append(f"Speaker {speaker_id}: {line}")
-            formatted_script = '\n'.join(formatted_script_lines)
-            log += f"Formatted script with {len(formatted_script_lines)} turns\n"
-            log += "Processing with VibeVoice...\n"
-            inputs = processor(
-                text=[formatted_script],
-                voice_samples=[voice_samples],
-                padding=True,
-                return_tensors="pt",
-                return_attention_mask=True,
-            )
-            start_time = time.time()
-            # Use efficient attention backend
-            if torch.cuda.is_available() and hasattr(torch.nn.attention, 'SDPBackend'):
-                from torch.nn.attention import SDPBackend, sdpa_kernel
-                with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
-                    outputs = model.generate(
-                        **inputs,
-                        max_new_tokens=None,
-                        cfg_scale=cfg_scale,
-                        tokenizer=processor.tokenizer,
-                        generation_config={'do_sample': False},
-                        verbose=False,
-                    )
-            else:
-                outputs = model.generate(
-                    **inputs,
-                    max_new_tokens=None,
-                    cfg_scale=cfg_scale,
-                    tokenizer=processor.tokenizer,
-                    generation_config={'do_sample': False},
-                    verbose=False,
-                )
-            generation_time = time.time() - start_time
-            if hasattr(outputs, 'speech_outputs') and outputs.speech_outputs[0] is not None:
-                audio_tensor = outputs.speech_outputs[0]
-                audio = audio_tensor.cpu().float().numpy()
-            else:
-                raise gr.Error("Error: No audio was generated by the model. Please try again.")
-            if audio.ndim > 1:
-                audio = audio.squeeze()
-            sample_rate = 24000
-            output_dir = "outputs"
-            os.makedirs(output_dir, exist_ok=True)
-            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-            file_path = os.path.join(output_dir, f"conference_{timestamp}.wav")
-            sf.write(file_path, audio, sample_rate)
-            print(f"Conference saved to {file_path}")
-            total_duration = len(audio) / sample_rate
-            log += f"Generation completed in {generation_time:.2f} seconds\n"
-            log += f"Final audio duration: {total_duration:.2f} seconds\n"
-            log += f"Successfully saved conference to: {file_path}\n"
-            self.is_generating = False
-            return (sample_rate, audio), log
-        except gr.Error as e:
-            self.is_generating = False
-            error_msg = f"Input Error: {str(e)}"
-            print(error_msg)
-            return None, error_msg
-        except Exception as e:
-            self.is_generating = False
-            error_msg = f"An unexpected error occurred: {str(e)}"
-            print(error_msg)
-            traceback.print_exc()
-            return None, error_msg
-    @staticmethod
-    def _infer_num_speakers_from_script(script: str) -> int:
-        """
-        Infer number of speakers by counting distinct 'Speaker X:' tags in the script.
-        Robust to 0- or 1-indexed labels and repeated turns.
-        Falls back to 1 if none found.
-        """
-        import re
-        ids = re.findall(r'(?mi)^\s*Speaker\s+(\d+)\s*:', script)
-        return len({int(x) for x in ids}) if ids else 1
-    def load_example_scripts(self):
-        examples_dir = os.path.join(os.path.dirname(__file__), "text_examples")
-        self.example_scripts = []
-        self.example_scripts_natural = []
-        if not os.path.exists(examples_dir):
-            return
-        original_files = [
-            "1p_ai_tedtalk.txt",
-            "1p_politcal_speech.txt",
-            "2p_financeipo_meeting.txt",
-            "2p_telehealth_meeting.txt",
-            "3p_military_meeting.txt",
-            "3p_oil_meeting.txt",
-            "4p_gamecreation_meeting.txt",
-            "4p_product_meeting.txt"
-        ]
-        # Gender mapping for each script's speakers
-        self.script_speaker_genders = [
-            ["female"],  # AI TED Talk - Rachel
-            ["neutral"],  # Political Speech - generic speaker
-            ["male", "female"],  # Finance IPO - James, Patricia
-            ["female", "male"],  # Telehealth - Jennifer, Tom
-            ["female", "male", "female"],  # Military - Sarah, David, Lisa
-            ["male", "female", "male"],  # Oil - Robert, Lisa, Michael
-            ["male", "female", "male", "male"],  # Game Creation - Alex, Sarah, Marcus, Emma
-            ["female", "male", "female", "male"]  # Product Meeting - Sarah, Marcus, Jennifer, David
-        ]
-        for txt_file in original_files:
-            try:
-                with open(os.path.join(examples_dir, txt_file), 'r', encoding='utf-8') as f:
-                    script_content = f.read().strip()
-                if script_content:
-                    num_speakers = self._infer_num_speakers_from_script(script_content)
-                    self.example_scripts.append([num_speakers, script_content])
-                natural_file = txt_file.replace('.txt', '_natural.txt')
-                natural_path = os.path.join(examples_dir, natural_file)
-                if os.path.exists(natural_path):
-                    with open(natural_path, 'r', encoding='utf-8') as f:
-                        natural_content = f.read().strip()
-                    if natural_content:
-                        num_speakers = self._infer_num_speakers_from_script(natural_content)
-                        self.example_scripts_natural.append([num_speakers, natural_content])
-                else:
-                    self.example_scripts_natural.append([num_speakers, script_content])
-            except Exception as e:
-                print(f"Error loading {txt_file}: {e}")
-def convert_to_16_bit_wav(data):
-    if torch.is_tensor(data):
-        data = data.detach().cpu().numpy()
-    data = np.array(data)
-    if np.max(np.abs(data)) > 1.0:
-        data = data / np.max(np.abs(data))
-    return (data * 32767).astype(np.int16)
-# Set synthwave theme
-theme = gr.themes.Ocean(
-    primary_hue="indigo",
-    secondary_hue="fuchsia",
-    neutral_hue="slate",
-).set(
-    button_large_radius='*radius_sm'
-)
-def set_working_state(*components, transcript_box=None):
-    """
-    Disable all interactive components and show progress in transcript/log box.
-    Usage: set_working_state(generate_btn, random_example_btn, transcript_box=log_output)
-    """
-    updates = [gr.update(interactive=False) for _ in components]
-    if transcript_box is not None:
-        updates.append(gr.update(value="Generating... please wait", interactive=False))
-    return tuple(updates)
-def set_idle_state(*components, transcript_box=None):
-    """
-    Re-enable all interactive components and transcript/log box.
-    Usage: set_idle_state(generate_btn, random_example_btn, transcript_box=log_output)
-    """
-    updates = [gr.update(interactive=True) for _ in components]
-    if transcript_box is not None:
-        updates.append(gr.update(interactive=True))
-    return tuple(updates)
-def create_demo_interface(demo_instance: VibeVoiceDemo):
-    custom_css = """ """
-    with gr.Blocks(
-        title="VibeVoice - Conference Generator",
-        css=custom_css,
-        theme=theme,
-    ) as interface:
-        # Simple image
-        gr.HTML("""
-        <div style="width: 100%; margin-bottom: 20px;">
-            <img src="https://huggingface.co/spaces/ACloudCenter/Conference-Generator-VibeVoice/resolve/main/public/images/banner.png"
-                style="width: 100%; height: auto; border-radius: 15px; box-shadow: 0 10px 40px rgba(0,0,0,0.2);"
-                alt="Canary-Qwen Transcriber Banner">
-        </div>
-        """)
-        gr.Markdown("## NOTE: The Large model takes significant generation time with limited increase in quality. I recommend trying 1.5 first.")
-        with gr.Tabs():
-            with gr.Tab("Generate"):
-                gr.Markdown("### Generated Conference")
-                complete_audio_output = gr.Audio(
-                    label="Complete Conference (Download)",
-                    type="numpy",
-                    elem_classes="audio-output complete-audio-section",
-                    autoplay=False,
-                    show_download_button=True,
-                    visible=True
-                )
-                with gr.Row():
-                    with gr.Column(scale=1, elem_classes="settings-card"):
-                        gr.Markdown("### Conference Settings")
-                        # Model dropdown
-                        model_dropdown = gr.Dropdown(
-                            choices=list(demo_instance.models.keys()),
-                            value=demo_instance.current_model_name,
-                            label="Model",
-                        )
-                        num_speakers = gr.Slider(
-                            minimum=1, maximum=4, value=2, step=1,
-                            label="Number of Speakers",
-                            elem_classes="slider-container"
-                        )
-                        gr.Markdown("### Speaker Selection")
-                        available_speaker_names = list(demo_instance.available_voices.keys())
-                        default_speakers = ['en-Alice_woman', 'en-Carter_man', 'en-Frank_man', 'en-Maya_woman']
-                        speaker_selections = []
-                        for i in range(4):
-                            default_value = default_speakers[i] if i < len(default_speakers) else None
-                            speaker = gr.Dropdown(
-                                choices=available_speaker_names,
-                                value=default_value,
-                                label=f"Speaker {i+1}",
-                                visible=(i < 2),
-                                elem_classes="speaker-item"
-                            )
-                            speaker_selections.append(speaker)
-                        gr.Markdown("### Advanced Settings")
-                        with gr.Accordion("Generation Parameters", open=False):
-                            cfg_scale = gr.Slider(
-                                minimum=1.0, maximum=2.0, value=1.3, step=0.05,
-                                label="CFG Scale (Guidance Strength)",
-                                elem_classes="slider-container"
-                            )
-                    with gr.Column(scale=2, elem_classes="generation-card"):
-                        gr.Markdown("### Script Input")
-                        script_input = gr.Textbox(
-                            label="Conversation Script",
-                            placeholder="Enter your conference script here...",
-                            lines=12,
-                            max_lines=20,
-                            elem_classes="script-input"
-                        )
-                        with gr.Row():
-                            random_example_btn = gr.Button(
-                                "Random Example", size="lg",
-                                variant="secondary", elem_classes="random-btn", scale=1
-                            )
-                            generate_btn = gr.Button(
-                                "🚀 Generate Conference", size="lg",
-                                variant="primary", elem_classes="generate-btn", scale=2
-                            )
-                        with gr.Row():
-                            with gr.Column(scale=1):
-                                gr.Markdown("### Example Scripts")
-                                with gr.Row():
-                                    use_natural = gr.Checkbox(
-                                        value=True,
-                                        label="Natural talking sounds",
-                                        scale=1
-                                    )
-                                    duration_display = gr.Textbox(
-                                        value="",
-                                        label="Est. Duration",
-                                        interactive=False,
-                                        scale=1
-                                    )
-                        example_names = [
-                            "AI TED Talk",
-                            "Political Speech",
-                            "Finance IPO Meeting",
-                            "Telehealth Meeting",
-                            "Military Meeting",
-                            "Oil Meeting",
-                            "Game Creation Meeting",
-                            "Product Meeting"
-                        ]
-                        example_buttons = []
-                        with gr.Row():
-                            for i in range(min(4, len(example_names))):
-                                btn = gr.Button(example_names[i], size="sm", variant="secondary")
-                                example_buttons.append(btn)
-                        with gr.Row():
-                            for i in range(4, min(8, len(example_names))):
-                                btn = gr.Button(example_names[i], size="sm", variant="secondary")
-                                example_buttons.append(btn)
-                        log_output = gr.Textbox(
-                            label="Generation Log",
-                            lines=8, max_lines=15,
-                            interactive=False,
-                            elem_classes="log-output"
-                        )
-                def update_speaker_visibility(num_speakers):
-                    return [gr.update(visible=(i < num_speakers)) for i in range(4)]
-                num_speakers.change(
-                    fn=update_speaker_visibility,
-                    inputs=[num_speakers],
-                    outputs=speaker_selections
-                )
-                def update_duration_display(script_text):
-                    if not script_text or script_text.strip() == "":
-                        return ""
-                    words = script_text.split()
-                    word_count = len(words)
-                    wpm = 150
-                    estimated_minutes = word_count / wpm
-                    if estimated_minutes < 1:
-                        duration_str = f"{int(estimated_minutes * 60)} sec"
-                    else:
-                        minutes = int(estimated_minutes)
-                        seconds = int((estimated_minutes - minutes) * 60)
-                        if seconds > 0:
-                            duration_str = f"{minutes}m {seconds}s"
-                        else:
-                            duration_str = f"{minutes} min"
-                    return f"{word_count} words • ~{duration_str}"
-                script_input.change(
-                    fn=update_duration_display,
-                    inputs=[script_input],
-                    outputs=[duration_display]
-                )
-                def generate_podcast_wrapper(model_choice, num_speakers, script, *speakers_and_params):
-                    try:
-                        speakers = speakers_and_params[:4]
-                        cfg_scale_val = speakers_and_params[4]
-                        audio, log = demo_instance.generate_podcast(
-                            num_speakers=int(num_speakers),
-                            script=script,
-                            speaker_1=speakers[0],
-                            speaker_2=speakers[1],
-                            speaker_3=speakers[2],
-                            speaker_4=speakers[3],
-                            cfg_scale=cfg_scale_val,
-                            model_name=model_choice
-                        )
-                        return audio, log
-                    except Exception as e:
-                        traceback.print_exc()
-                        return None, f"Error: {str(e)}"
-                def on_generate_start():
-                    return gr.update(interactive=False), gr.update(interactive=False), gr.update(value="🔄 Initializing generation...\n⏳ This may take up to 2 minutes depending on script length...")
-                def on_generate_complete(audio, log):
-                    return gr.update(interactive=True), gr.update(interactive=True), audio, log
-                generate_click = generate_btn.click(
-                    fn=on_generate_start,
-                    inputs=[],
-                    outputs=[generate_btn, random_example_btn, log_output],
-                    queue=False
-                ).then(
-                    fn=generate_podcast_wrapper,
-                    inputs=[model_dropdown, num_speakers, script_input] + speaker_selections + [cfg_scale],
-                    outputs=[complete_audio_output, log_output],
-                    queue=True
-                ).then(
-                    fn=lambda: (gr.update(interactive=True), gr.update(interactive=True)),
-                    inputs=[],
-                    outputs=[generate_btn, random_example_btn],
-                    queue=False
-                )
-                def load_random_example(use_natural_checkbox):
-                    import random
-                    scripts_list = demo_instance.example_scripts_natural if use_natural_checkbox else demo_instance.example_scripts
-                    if scripts_list:
-                        idx = random.randint(0, len(scripts_list) - 1)
-                        num_speakers_value, script_value = scripts_list[idx]
-                        # Get gender preferences for this script
-                        genders = demo_instance.script_speaker_genders[idx] if idx < len(demo_instance.script_speaker_genders) else []
-                        # Select appropriate voices based on gender
-                        voice_selections = []
-                        for i in range(4):
-                            if i < len(genders):
-                                gender = genders[i]
-                                if gender == "male" and demo_instance.male_voices:
-                                    voice = random.choice(demo_instance.male_voices)
-                                elif gender == "female" and demo_instance.female_voices:
-                                    voice = random.choice(demo_instance.female_voices)
-                                else:
-                                    # neutral or fallback
-                                    all_voices = list(demo_instance.available_voices.keys())
-                                    voice = random.choice(all_voices) if all_voices else None
-                            else:
-                                voice = None
-                            voice_selections.append(voice)
-                        return [num_speakers_value, script_value] + voice_selections
-                    return [2, "Speaker 0: Welcome to our AI conference demo!\nSpeaker 1: Thanks, excited to be here!"] + [None, None, None, None]
-                random_example_btn.click(
-                    fn=load_random_example,
-                    inputs=[use_natural],
-                    outputs=[num_speakers, script_input] + speaker_selections,
-                    queue=False
-                )
-                def load_specific_example(idx, use_natural_checkbox):
-                    import random
-                    scripts_list = demo_instance.example_scripts_natural if use_natural_checkbox else demo_instance.example_scripts
-                    if idx < len(scripts_list):
-                        num_speakers_value, script_value = scripts_list[idx]
-                        # Get gender preferences for this script
-                        genders = demo_instance.script_speaker_genders[idx] if idx < len(demo_instance.script_speaker_genders) else []
-                        # Select appropriate voices based on gender
-                        voice_selections = []
-                        for i in range(4):
-                            if i < len(genders):
-                                gender = genders[i]
-                                if gender == "male" and demo_instance.male_voices:
-                                    voice = random.choice(demo_instance.male_voices)
-                                elif gender == "female" and demo_instance.female_voices:
-                                    voice = random.choice(demo_instance.female_voices)
-                                else:
-                                    # neutral or fallback
-                                    all_voices = list(demo_instance.available_voices.keys())
-                                    voice = random.choice(all_voices) if all_voices else None
-                            else:
-                                voice = None
-                            voice_selections.append(voice)
-                        # Return values for all outputs
-                        return [num_speakers_value, script_value] + voice_selections
-                    return [2, ""] + [None, None, None, None]
-                for idx, btn in enumerate(example_buttons):
-                    btn.click(
-                        fn=lambda nat, i=idx: load_specific_example(i, nat),
-                        inputs=[use_natural],
-                        outputs=[num_speakers, script_input] + speaker_selections,
-                        queue=False
-                    )
-            with gr.Tab("Architecture"):
-                with gr.Row():
-                    gr.Markdown('''VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio, "
-                    "such as conferences, from text. It addresses significant challenges in traditional Text-to-Speech (TTS) systems, particularly "
-                    "in scalability, speaker consistency, and natural turn-taking. A core innovation of VibeVoice is its use of continuous "
-                    "speech tokenizers (Acoustic and Semantic) operating at an ultra-low frame rate of 7.5 Hz. These tokenizers efficiently "
-                    "preserve audio fidelity while significantly boosting computational efficiency for processing long sequences. VibeVoice "
-                    "employs a next-token diffusion framework, leveraging a Large Language Model (LLM) to understand textual context and "
-                    "dialogue flow, and a diffusion head to generate high-fidelity acoustic details. The model can synthesize speech up to "
-                    "90 minutes long with up to 4 distinct speakers, surpassing the typical 1-2 speaker limits of many prior models.''')
-                with gr.Row():
-                    with gr.Column():
-                        gr.Markdown("## VibeVoice: A Frontier Open-Source Text-to-Speech Model")
-                        gr.Markdown("""
-                        ### Overview
-                        VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio,
-                        such as conferences, from text. It addresses significant challenges in traditional Text-to-Speech (TTS) systems,
-                        particularly in scalability, speaker consistency, and natural turn-taking.
-                        ### Training Architecture
-                        **Transformer-based Large Language Model** integrated with specialized acoustic and semantic tokenizers and a diffusion-based decoding head.
-                        **Core Components:**
-                        - **LLM**: Qwen2.5-1.5B for this release
-                        - **Acoustic Tokenizer**: Based on a σ-VAE variant with mirror-symmetric encoder-decoder structure (~340M parameters each)
-                          - 7 stages of modified Transformer blocks
-                          - Achieves 3200x downsampling from 24kHz input
-                        - **Semantic Tokenizer**: Encoder mirrors the Acoustic Tokenizer's architecture
-                          - Trained with an ASR proxy task
-                        - **Diffusion Head**: Lightweight module (4 layers, ~123M parameters)
-                          - Conditioned on LLM hidden states
-                          - Uses DDPM process with Classifier-Free Guidance
-                        ### Training Details
-                        **Context Length**: Trained with curriculum up to 65,536 tokens
-                        **Training Stages:**
-                        1. **Tokenizer Pre-training**: Acoustic and Semantic tokenizers trained separately
-                        2. **VibeVoice Training**: Frozen tokenizers, only LLM and diffusion head trained
-                           - Curriculum learning: 4k → 16K → 32K → 64K tokens
-                        ### Model Variants
-                        | Model | Context Length | Generation Length | Parameters |
-                        |-------|---------------|-------------------|------------|
-                        | VibeVoice-0.5B-Streaming | - | - | Coming Soon |
-                        | **VibeVoice-1.5B** | 64K | ~90 min | 2.7B |
-                        | VibeVoice-Large | 32K | ~45 min | Redacted |
-                        ### Technical Specifications
-                        - **Frame Rate**: Ultra-low 7.5 Hz for efficiency
-                        - **Sample Rate**: 24kHz audio output
-                        - **Max Duration**: Up to 90 minutes
-                        - **Speaker Capacity**: 1-4 distinct speakers
-                        - **Languages**: English and Chinese
-                        ### Key Innovations
-                        - Continuous speech tokenizers at ultra-low frame rate
-                        - Next-token diffusion framework
-                        - Curriculum learning for long-form generation
-                        - Multi-speaker consistency without explicit modeling
-                        """)
-                    with gr.Column(scale=2):
-                        gr.HTML("""
-                        <div style="text-align: center;">
-                            <div style="margin: 20px 0;">
-                                <img src="https://huggingface.co/spaces/ACloudCenter/Conference-Generator-VibeVoice/resolve/main/public/images/diagram.jpg"
-                                    style="max-width: 100%; height: auto; border-radius: 10px; box-shadow: 0 4px 6px rgba(0,0,0,0.1);"
-                                    alt="VibeVoice Architecture Diagram">
-                            </div>
-                            <div style="margin: 20px 0;">
-                                <img src="https://huggingface.co/spaces/ACloudCenter/Conference-Generator-VibeVoice/resolve/main/public/images/chart.png"
-                                    style="max-width: 100%; height: auto; border-radius: 10px; box-shadow: 0 4px 6px rgba(0,0,0,0.1);"
-                                    alt="VibeVoice Performance Chart">
-                            </div>
-                        </div>
-                        """)
-    return interface
-def run_demo(
-    model_paths: dict = None,
-    device: str = "cuda",
-    inference_steps: int = 5,
-    share: bool = True,
-):
-    """
-    model_paths default includes two entries. Replace paths as needed.
-    """
-    if model_paths is None:
-        model_paths = {
-            "VibeVoice-1.5B": "microsoft/VibeVoice-1.5B",
-            "VibeVoice-7B": "vibevoice/VibeVoice-7B",
-        }
-    set_seed(42)
-    demo_instance = VibeVoiceDemo(model_paths, device, inference_steps)
-    interface = create_demo_interface(demo_instance)
-    interface.queue().launch(
-        share=share,
-        server_name="0.0.0.0" if share else "127.0.0.1",
-        show_error=True,
-        show_api=False
-    )
-if __name__ == "__main__":
-    run_demo()

{configs → backend_modal/configs}/qwen2.5_1.5b_64k.json RENAMED Viewed

File without changes

{configs → backend_modal/configs}/qwen2.5_7b_32k.json RENAMED Viewed

File without changes

{example → backend_modal/example}/1p_EN2CH.mp4 RENAMED Viewed

File without changes

{example → backend_modal/example}/2p_see_u_again.mp4 RENAMED Viewed

File without changes

{example → backend_modal/example}/4p_climate_45min.mp4 RENAMED Viewed

File without changes

backend_modal/modal_runner.py ADDED Viewed

	@@ -0,0 +1,230 @@

+import os
+import time
+import numpy as np
+import librosa
+import soundfile as sf
+import torch
+from datetime import datetime
+# Modal-specific imports
+import modal
+# Container image for the Modal app: Debian slim with Python 3.10, the Python
+# packages listed below, and the project's local code/asset directories.
+image = (
+    modal.Image.debian_slim(python_version="3.10")
+    .pip_install(
+        "torch",
+        "accelerate==1.6.0",
+        "transformers==4.51.3",
+        "diffusers",
+        "tqdm",
+        "numpy",
+        "scipy",
+        "ml-collections",
+        "absl-py",
+        "soundfile",
+        "librosa",
+        "pydub",
+    )
+    # Local directories are placed under /root in the container. The "./"
+    # paths are relative; presumably this is meant to be run from the
+    # backend_modal/ directory (TODO confirm).
+    .add_local_dir("./modular", remote_path="/root/modular")
+    .add_local_dir("./processor", remote_path="/root/processor")
+    .add_local_dir("./voices", remote_path="/root/voices")
+    .add_local_dir("./text_examples", remote_path="/root/text_examples")
+    .add_local_dir("./schedule", remote_path="/root/schedule")
+)
+# The Modal application, created with the image defined above.
+app = modal.App(
+    name="vibevoice-generator",
+    image=image,
+)
+@app.cls(gpu="T4", scaledown_window=300, secrets=[modal.Secret.from_name("hf-secret")])
+class VibeVoiceModel:
+    def __init__(self, model_paths: dict = None):
+        if model_paths is None:
+            self.model_paths = {
+                "VibeVoice-1.5B": "microsoft/VibeVoice-1.5B",
+                "VibeVoice-7B": "vibevoice/VibeVoice-7B",
+            }
+        else:
+            self.model_paths = model_paths
+        self.device = "cuda"
+        self.inference_steps = 5
+    @modal.enter()
+    def load_models(self):
+        """
+        Load every configured model and processor, then index the voice presets.
+
+        Registered with ``modal.enter()``, so it is run once when the container starts.
+        For each entry in ``self.model_paths`` it loads a processor and a model with
+        ``from_pretrained``, moves the model to ``self.device`` and stores both in
+        ``self.processors`` / ``self.models`` keyed by the model name.
+        """
+        # Project-specific imports are moved here to run inside the container,
+        # where the project directories added to the image are available.
+        from modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
+        from processor.vibevoice_processor import VibeVoiceProcessor
+        print("Entering container and loading models to GPU...")
+        # Set compiler flags for better performance.
+        # The hasattr checks guard against torch builds without torch._inductor.config.
+        if torch.cuda.is_available() and hasattr(torch, '_inductor'):
+            if hasattr(torch._inductor, 'config'):
+                torch._inductor.config.conv_1x1_as_mm = True
+                torch._inductor.config.coordinate_descent_tuning = True
+                torch._inductor.config.epilogue_fusion = False
+                torch._inductor.config.coordinate_descent_check_all_directions = True
+        self.models = {}
+        self.processors = {}
+        for name, path in self.model_paths.items():
+            print(f" - Loading {name} from {path}")
+            proc = VibeVoiceProcessor.from_pretrained(path)
+            # Weights are loaded in bfloat16 with the "sdpa" attention implementation.
+            mdl = VibeVoiceForConditionalGenerationInference.from_pretrained(
+                path,
+                torch_dtype=torch.bfloat16,
+                attn_implementation="sdpa"
+            ).to(self.device)
+            mdl.eval()
+            print(f"  {name} loaded to {self.device}")
+            self.processors[name] = proc
+            self.models[name] = mdl
+        # Populate self.available_voices so generate_podcast can validate speaker names.
+        self.setup_voice_presets()
+        print("Model loading complete.")
+    def setup_voice_presets(self):
+        self.available_voices = {}
+        voices_dir = "/root/voices" # Using remote path from Mount
+        if not os.path.exists(voices_dir):
+            print(f"Warning: Voices directory not found at {voices_dir}")
+            return
+        wav_files = [f for f in os.listdir(voices_dir)
+                     if f.lower().endswith(('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac'))]
+        for wav_file in wav_files:
+            name = os.path.splitext(wav_file)[0]
+            self.available_voices[name] = os.path.join(voices_dir, wav_file)
+        print(f"Voices loaded: {list(self.available_voices.keys())}")
+    def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
+        try:
+            wav, sr = sf.read(audio_path)
+            if len(wav.shape) > 1:
+                wav = np.mean(wav, axis=1)
+            if sr != target_sr:
+                wav = librosa.resample(wav, orig_sr=sr, target_sr=target_sr)
+            return wav
+        except Exception as e:
+            print(f"Error reading audio {audio_path}: {e}")
+            return np.array([])
+    @modal.method()
+    def generate_podcast(self,
+                         num_speakers: int,
+                         script: str,
+                         model_name: str,
+                         cfg_scale: float,
+                         speaker_1: str = None,
+                         speaker_2: str = None,
+                         speaker_3: str = None,
+                         speaker_4: str = None):
+        """
+        This is the main inference function that will be called from the Gradio app.
+
+        Args:
+            num_speakers: Number of speakers in the script; must be between 1 and 4.
+            script: Dialogue text, one turn per line. Lines that do not start with
+                "Speaker " and contain ":" are assigned a speaker automatically.
+            model_name: Key of a model loaded in ``self.models``.
+            cfg_scale: Value passed to ``model.generate`` as ``cfg_scale``.
+            speaker_1..speaker_4: Voice names (keys of ``self.available_voices``);
+                only the first ``num_speakers`` of them are used.
+
+        Returns:
+            ``((sample_rate, audio), log)`` on success, where ``audio`` is a NumPy
+            array and ``log`` is a human-readable progress text. On any failure,
+            including invalid input, ``(None, error_msg)`` where ``error_msg``
+            contains the exception text and traceback; no exception propagates
+            to the caller.
+        """
+        try:
+            if model_name not in self.models:
+                raise ValueError(f"Unknown model: {model_name}")
+            model = self.models[model_name]
+            processor = self.processors[model_name]
+            model.set_ddpm_inference_steps(num_steps=self.inference_steps)
+            print(f"Generating with model {model_name} on {self.device}")
+            if not script.strip():
+                raise ValueError("Error: Please provide a script.")
+            # Replace typographic apostrophes with the plain ASCII one.
+            script = script.replace("’", "'")
+            if not 1 <= num_speakers <= 4:
+                raise ValueError("Error: Number of speakers must be between 1 and 4.")
+            # Only the first num_speakers voice choices matter; each must be a known voice.
+            selected_speakers = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
+            for i, speaker_name in enumerate(selected_speakers):
+                if not speaker_name or speaker_name not in self.available_voices:
+                    raise ValueError(f"Error: Please select a valid speaker for Speaker {i+1}.")
+            log = f"Generating conference with {num_speakers} speakers\n"
+            log += f"Model: {model_name}\n"
+            log += f"Parameters: CFG Scale={cfg_scale}\n"
+            log += f"Speakers: {', '.join(selected_speakers)}\n"
+            # Load one reference recording per selected speaker, in speaker order.
+            voice_samples = []
+            for speaker_name in selected_speakers:
+                audio_path = self.available_voices[speaker_name]
+                audio_data = self.read_audio(audio_path)
+                # read_audio signals failure by returning an empty array.
+                if len(audio_data) == 0:
+                    raise ValueError(f"Error: Failed to load audio for {speaker_name}")
+                voice_samples.append(audio_data)
+            log += f"Loaded {len(voice_samples)} voice samples\n"
+            # Normalise the script so that every non-empty line is "Speaker N: text".
+            lines = script.strip().split('\n')
+            formatted_script_lines = []
+            for line in lines:
+                line = line.strip()
+                if not line: continue
+                if line.startswith('Speaker ') and ':' in line:
+                    formatted_script_lines.append(line)
+                else:
+                    # Unlabelled line: assign speakers round-robin based on the
+                    # number of turns collected so far.
+                    speaker_id = len(formatted_script_lines) % num_speakers
+                    formatted_script_lines.append(f"Speaker {speaker_id}: {line}")
+            formatted_script = '\n'.join(formatted_script_lines)
+            log += f"Formatted script with {len(formatted_script_lines)} turns\n"
+            log += "Processing with VibeVoice...\n"
+            # A batch of one: a single script with its list of voice samples.
+            inputs = processor(
+                text=[formatted_script],
+                voice_samples=[voice_samples],
+                padding=True,
+                return_tensors="pt",
+                return_attention_mask=True,
+            ).to(self.device)
+            start_time = time.time()
+            with torch.inference_mode():
+                outputs = model.generate(
+                    **inputs,
+                    max_new_tokens=None,
+                    cfg_scale=cfg_scale,
+                    tokenizer=processor.tokenizer,
+                    generation_config={'do_sample': False},
+                    verbose=False,
+                )
+            generation_time = time.time() - start_time
+            if hasattr(outputs, 'speech_outputs') and outputs.speech_outputs[0] is not None:
+                audio_tensor = outputs.speech_outputs[0]
+                # Move to CPU and cast to float before converting to a NumPy array.
+                audio = audio_tensor.cpu().float().numpy()
+            else:
+                raise RuntimeError("Error: No audio was generated by the model.")
+            if audio.ndim > 1:
+                audio = audio.squeeze()
+            # NOTE(review): assumes the model emits 24 kHz audio, matching the
+            # default target rate of read_audio; confirm against the model.
+            sample_rate = 24000
+            total_duration = len(audio) / sample_rate
+            log += f"Generation completed in {generation_time:.2f} seconds\n"
+            log += f"Final audio duration: {total_duration:.2f} seconds\n"
+            # Return the raw audio data and sample rate, Gradio will handle the rest
+            return (sample_rate, audio), log
+        except Exception as e:
+            import traceback
+            error_msg = f"An unexpected error occurred on Modal: {str(e)}\n{traceback.format_exc()}"
+            print(error_msg)
+            # Return a special value or raise an exception that the client can handle
+            # For Gradio, returning a log message is often best.
+            return None, error_msg

{modular → backend_modal/modular}/__init__.py RENAMED Viewed

File without changes

{modular → backend_modal/modular}/configuration_vibevoice.py RENAMED Viewed

File without changes

{modular → backend_modal/modular}/modeling_vibevoice.py RENAMED Viewed

File without changes

{modular → backend_modal/modular}/modeling_vibevoice_inference.py RENAMED Viewed

File without changes

{modular → backend_modal/modular}/modular_vibevoice_diffusion_head.py RENAMED Viewed

File without changes

{modular → backend_modal/modular}/modular_vibevoice_text_tokenizer.py RENAMED Viewed

File without changes

{modular → backend_modal/modular}/modular_vibevoice_tokenizer.py RENAMED Viewed

File without changes

{modular → backend_modal/modular}/streamer.py RENAMED Viewed

File without changes

packages.txt → backend_modal/packages.txt RENAMED Viewed

File without changes

{processor → backend_modal/processor}/__init__.py RENAMED Viewed

File without changes

{processor → backend_modal/processor}/vibevoice_processor.py RENAMED Viewed

File without changes

{processor → backend_modal/processor}/vibevoice_tokenizer_processor.py RENAMED Viewed

File without changes

{schedule → backend_modal/schedule}/__init__.py RENAMED Viewed

File without changes

{schedule → backend_modal/schedule}/dpm_solver.py RENAMED Viewed

File without changes

{schedule → backend_modal/schedule}/timestep_sampler.py RENAMED Viewed

File without changes

{scripts → backend_modal/scripts}/__init__.py RENAMED Viewed

File without changes

{scripts → backend_modal/scripts}/convert_nnscaler_checkpoint_to_transformers.py RENAMED Viewed

File without changes

setup_voices.sh → backend_modal/setup_voices.sh RENAMED Viewed

File without changes

{text_examples → backend_modal/text_examples}/1p_ai_tedtalk.txt RENAMED Viewed

File without changes

{text_examples → backend_modal/text_examples}/1p_ai_tedtalk_natural.txt RENAMED Viewed

File without changes

{text_examples → backend_modal/text_examples}/1p_politcal_speech.txt RENAMED Viewed

File without changes

{text_examples → backend_modal/text_examples}/1p_politcal_speech_natural.txt RENAMED Viewed

File without changes

{text_examples → backend_modal/text_examples}/2p_financeipo_meeting.txt RENAMED Viewed

File without changes

{text_examples → backend_modal/text_examples}/2p_financeipo_meeting_natural.txt RENAMED Viewed

File without changes

{text_examples → backend_modal/text_examples}/2p_telehealth_meeting.txt RENAMED Viewed

File without changes

{text_examples → backend_modal/text_examples}/2p_telehealth_meeting_natural.txt RENAMED Viewed

File without changes

{text_examples → backend_modal/text_examples}/3p_military_meeting.txt RENAMED Viewed

File without changes

{text_examples → backend_modal/text_examples}/3p_military_meeting_natural.txt RENAMED Viewed

File without changes

{text_examples → backend_modal/text_examples}/3p_oil_meeting.txt RENAMED Viewed

File without changes

{text_examples → backend_modal/text_examples}/3p_oil_meeting_natural.txt RENAMED Viewed

File without changes

{text_examples → backend_modal/text_examples}/4p_gamecreation_meeting.txt RENAMED Viewed

File without changes

{text_examples → backend_modal/text_examples}/4p_gamecreation_meeting_natural.txt RENAMED Viewed

File without changes

{text_examples → backend_modal/text_examples}/4p_product_meeting.txt RENAMED Viewed

File without changes

{text_examples → backend_modal/text_examples}/4p_product_meeting_natural.txt RENAMED Viewed

File without changes

{voices → backend_modal/voices}/en-Alice_woman.wav RENAMED Viewed

File without changes

{voices → backend_modal/voices}/en-Alice_woman_bgm.wav RENAMED Viewed

File without changes

{voices → backend_modal/voices}/en-Carter_man.wav RENAMED Viewed

File without changes

{voices → backend_modal/voices}/en-Frank_man.wav RENAMED Viewed

File without changes

{voices → backend_modal/voices}/en-Maya_woman.wav RENAMED Viewed

File without changes

{voices → backend_modal/voices}/en-Yasser_man.wav RENAMED Viewed

File without changes

{voices → backend_modal/voices}/in-Samuel_man.wav RENAMED Viewed

File without changes

{voices → backend_modal/voices}/zh-Anchen_man_bgm.wav RENAMED Viewed

File without changes

{voices → backend_modal/voices}/zh-Bowen_man.wav RENAMED Viewed

File without changes