Spaces:

ACloudCenter
/

Conference-Generator-VibeVoice

Running on CPU Upgrade

App Files Files Community

ACloudCenter committed on Sep 25

Commit

d5cf69f

1 Parent(s): 5dc3e05

Added feedback to UI for inference

Browse files

Files changed (2) hide show

app.py +52 -11
backend_modal/modal_runner.py +187 -22

app.py CHANGED Viewed

@@ -221,6 +221,19 @@ def create_demo_interface():
                             lines=8, max_lines=15,
                             interactive=False,
                         )
                 def update_speaker_visibility(num_speakers):
                     """Return four visibility updates, one per speaker slot.

                     Slot ``i`` (0-based) is made visible only while ``i < num_speakers``.
                     """
                     shown_flags = [slot < num_speakers for slot in range(4)]
                     return [gr.update(visible=flag) for flag in shown_flags]
@@ -303,15 +316,23 @@ def create_demo_interface():
                 def generate_podcast_wrapper(model_choice, num_speakers_val, script, *speakers_and_params):
                     if remote_generate_function is None:
-                        return None, "ERROR: Modal function not deployed. Please contact the space owner."
                     # Show a message that we are calling the remote function
-                    yield None, "🔄 Calling remote GPU on Modal.com... this may take a moment to start."
                     try:
                         speakers = speakers_and_params[:4]
                         cfg_scale_val = speakers_and_params[4]
                         # Stream updates from the Modal function
                         for update in remote_generate_function.remote_gen(
                             num_speakers=int(num_speakers_val),
@@ -323,19 +344,39 @@ def create_demo_interface():
                             cfg_scale=cfg_scale_val,
                             model_name=model_choice
                         ):
-                            # Each update is a tuple (audio_or_none, log_message)
-                            if update:
-                                audio, log = update
-                                yield audio, log
                     except Exception as e:
                         tb = traceback.format_exc()
                         print(f"Error calling Modal: {e}")
-                        yield None, f"❌ An error occurred: {e}\n\n{tb}"
                 generate_btn.click(
                     fn=generate_podcast_wrapper,
                     inputs=[model_dropdown, num_speakers, script_input] + speaker_selections + [cfg_scale],
-                    outputs=[complete_audio_output, log_output]
                 )
             with gr.Tab("Architecture"):
@@ -414,4 +455,4 @@ if __name__ == "__main__":
     else:
         # Launch the full Gradio interface
         interface = create_demo_interface()
-        interface.queue().launch(show_error=True)

                             lines=8, max_lines=15,
                             interactive=False,
                         )
+                        with gr.Row():
+                            status_display = gr.Markdown(
+                                value="Status: idle.",
+                                elem_id="status-display",
+                            )
+                            progress_slider = gr.Slider(
+                                minimum=0,
+                                maximum=100,
+                                value=0,
+                                step=1,
+                                label="Progress",
+                                interactive=False,
+                            )
                 def update_speaker_visibility(num_speakers):
                     """Return four visibility updates, one per speaker slot.

                     Slot ``i`` (0-based) is made visible only while ``i < num_speakers``.
                     """
                     shown_flags = [slot < num_speakers for slot in range(4)]
                     return [gr.update(visible=flag) for flag in shown_flags]
                 def generate_podcast_wrapper(model_choice, num_speakers_val, script, *speakers_and_params):
                     if remote_generate_function is None:
+                        error_message = "ERROR: Modal function not deployed. Please contact the space owner."
+                        yield None, error_message, "Status: error.", gr.update(value=0)
+                        return
                     # Show a message that we are calling the remote function
+                    yield (
+                        None,
+                        "🔄 Calling remote GPU on Modal.com... this may take a moment to start.",
+                        "**Connecting**\nRequesting GPU resources…",
+                        gr.update(value=0),
+                    )
                     try:
                         speakers = speakers_and_params[:4]
                         cfg_scale_val = speakers_and_params[4]
+                        current_log = ""
                         # Stream updates from the Modal function
                         for update in remote_generate_function.remote_gen(
                             num_speakers=int(num_speakers_val),
                             cfg_scale=cfg_scale_val,
                             model_name=model_choice
                         ):
+                            if not update:
+                                continue
+                            audio_payload = update.get("audio")
+                            progress_pct = update.get("pct", 0)
+                            stage_label = update.get("stage", "").replace("_", " ").title() or "Status"
+                            status_line = update.get("status") or "Processing…"
+                            current_log = update.get("log", current_log)
+                            status_formatted = f"**{stage_label}**\n{status_line}"
+                            audio_output = audio_payload if audio_payload is not None else gr.update()
+                            yield (
+                                audio_output,
+                                current_log,
+                                status_formatted,
+                                gr.update(value=progress_pct),
+                            )
                     except Exception as e:
                         tb = traceback.format_exc()
                         print(f"Error calling Modal: {e}")
+                        error_log = f"❌ An error occurred: {e}\n\n{tb}"
+                        yield (
+                            None,
+                            error_log,
+                            "**Error**\nInference failed.",
+                            gr.update(value=0),
+                        )
                 generate_btn.click(
                     fn=generate_podcast_wrapper,
                     inputs=[model_dropdown, num_speakers, script_input] + speaker_selections + [cfg_scale],
+                    outputs=[complete_audio_output, log_output, status_display, progress_slider]
                 )
             with gr.Tab("Architecture"):
     else:
         # Launch the full Gradio interface
         interface = create_demo_interface()
+        interface.queue().launch(show_error=True)

backend_modal/modal_runner.py CHANGED Viewed

@@ -5,6 +5,9 @@ import librosa
 import soundfile as sf
 import torch
 from datetime import datetime
 # Modal-specific imports
 import modal
@@ -38,8 +41,14 @@ app = modal.App(
     image=image,
 )
-@app.cls(gpu="A100-40GB", scaledown_window=300)
 class VibeVoiceModel:
     def __init__(self):
         self.model_paths = {
@@ -48,6 +57,8 @@ class VibeVoiceModel:
         }
         self.device = "cuda"
         self.inference_steps = 5
     @modal.enter()
     def load_models(self):
@@ -113,6 +124,95 @@ class VibeVoiceModel:
             self.available_voices[name] = os.path.join(voices_dir, wav_file)
         print(f"Voices loaded: {list(self.available_voices.keys())}")
     def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
         try:
             wav, sr = sf.read(audio_path)
@@ -193,15 +293,36 @@ class VibeVoiceModel:
         Yields progress updates during generation.
         """
         try:
-            # Yield initial status
-            yield None, "🔄 Initializing generation..."
             if model_name not in self.models:
                 raise ValueError(f"Unknown model: {model_name}")
             # Move the selected model to GPU, others to CPU
-            yield None, "🔄 Loading model to GPU..."
             self._place_model(model_name)
             model = self.models[model_name]
             processor = self.processors[model_name]
             model.set_ddpm_inference_steps(num_steps=self.inference_steps)
@@ -216,17 +337,18 @@ class VibeVoiceModel:
             if not 1 <= num_speakers <= 4:
                 raise ValueError("Error: Number of speakers must be between 1 and 4.")
-            selected_speakers = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
             for i, speaker_name in enumerate(selected_speakers):
                 if not speaker_name or speaker_name not in self.available_voices:
                     raise ValueError(f"Error: Please select a valid speaker for Speaker {i+1}.")
-            log = f"Generating conference with {num_speakers} speakers\n"
-            log += f"Model: {model_name}\n"
-            log += f"Parameters: CFG Scale={cfg_scale}\n"
-            log += f"Speakers: {', '.join(selected_speakers)}\n"
-            yield None, log + "\n🔄 Loading voice samples..."
             voice_samples = []
             for i, speaker_name in enumerate(selected_speakers):
@@ -235,9 +357,18 @@ class VibeVoiceModel:
                 if len(audio_data) == 0:
                     raise ValueError(f"Error: Failed to load audio for {speaker_name}")
                 voice_samples.append(audio_data)
-                yield None, log + f"\n✓ Loaded voice {i+1}/{len(selected_speakers)}: {speaker_name}"
-            log += f"\nLoaded {len(voice_samples)} voice samples"
             lines = script.strip().split('\n')
             formatted_script_lines = []
@@ -251,8 +382,14 @@ class VibeVoiceModel:
                     formatted_script_lines.append(f"Speaker {speaker_id}: {line}")
             formatted_script = '\n'.join(formatted_script_lines)
-            log += f"\nFormatted script with {len(formatted_script_lines)} turns"
-            yield None, log + "\n🔄 Processing script with VibeVoice..."
             inputs = processor(
                 text=[formatted_script],
@@ -262,7 +399,14 @@ class VibeVoiceModel:
                 return_attention_mask=True,
             ).to(self.device)
-            yield None, log + "\n🎯 Starting audio generation (this may take 1-2 minutes)..."
             start_time = time.time()
             with torch.inference_mode():
@@ -276,7 +420,15 @@ class VibeVoiceModel:
                 )
             generation_time = time.time() - start_time
-            yield None, log + f"\n✓ Generation completed in {generation_time:.2f} seconds\n🔄 Processing audio..."
             if hasattr(outputs, 'speech_outputs') and outputs.speech_outputs[0] is not None:
                 audio_tensor = outputs.speech_outputs[0]
@@ -289,15 +441,28 @@ class VibeVoiceModel:
             sample_rate = 24000
             total_duration = len(audio) / sample_rate
-            log += f"\n✓ Generation completed in {generation_time:.2f} seconds"
-            log += f"\n✓ Audio duration: {total_duration:.2f} seconds"
             # Final yield with both audio and complete log
-            yield (sample_rate, audio), log + "\n✅ Complete!"
         except Exception as e:
             import traceback
             error_msg = f"❌ An unexpected error occurred on Modal: {str(e)}\n{traceback.format_exc()}"
             print(error_msg)
             # Yield error state
-            yield None, error_msg

 import soundfile as sf
 import torch
 from datetime import datetime
+import hashlib
+import json
+import pickle
 # Modal-specific imports
 import modal
     image=image,
 )
+# Create a volume for caching generated audio
+cache_volume = modal.Volume.from_name("vibevoice-cache", create_if_missing=True)
+@app.cls(
+    gpu="A100-40GB",
+    scaledown_window=300,
+    volumes={"/cache": cache_volume}
+)
 class VibeVoiceModel:
     def __init__(self):
         self.model_paths = {
         }
         self.device = "cuda"
         self.inference_steps = 5
+        self.cache_dir = "/cache"
+        self.max_cache_size_gb = 10  # Limit cache to 10GB
     @modal.enter()
     def load_models(self):
             self.available_voices[name] = os.path.join(voices_dir, wav_file)
         print(f"Voices loaded: {list(self.available_voices.keys())}")
+    def _emit_progress(self, stage: str, pct: float, status: str, log_text: str,
+                       audio=None, done: bool = False):
+        """Package a structured progress update for streaming back to Gradio."""
+        payload = {
+            "stage": stage,
+            "pct": pct,
+            "status": status,
+            "log": log_text,
+        }
+        if audio is not None:
+            payload["audio"] = audio
+        if done:
+            payload["done"] = True
+        return payload
+    def _generate_cache_key(self, script: str, model_name: str, speakers: list, cfg_scale: float) -> str:
+        """Generate a unique cache key for this generation."""
+        cache_data = {
+            "script": script.strip().lower(),  # Normalize script
+            "model": model_name,
+            "speakers": sorted(speakers),  # Sort for consistency
+            "cfg_scale": cfg_scale,
+            "inference_steps": self.inference_steps
+        }
+        cache_str = json.dumps(cache_data, sort_keys=True)
+        return hashlib.sha256(cache_str.encode()).hexdigest()
+    def _get_cached_audio(self, cache_key: str):
+        """Check if audio is cached and return it."""
+        cache_path = os.path.join(self.cache_dir, f"{cache_key}.pkl")
+        if os.path.exists(cache_path):
+            try:
+                with open(cache_path, 'rb') as f:
+                    cached_data = pickle.load(f)
+                    print(f"Cache hit! Loading from {cache_key}")
+                    return cached_data['audio'], cached_data['sample_rate']
+            except Exception as e:
+                print(f"Cache read error: {e}")
+        return None, None
+    def _save_to_cache(self, cache_key: str, audio: np.ndarray, sample_rate: int):
+        """Save generated audio to cache."""
+        try:
+            # Check cache size
+            self._cleanup_cache_if_needed()
+            cache_path = os.path.join(self.cache_dir, f"{cache_key}.pkl")
+            cached_data = {
+                'audio': audio,
+                'sample_rate': sample_rate,
+                'timestamp': time.time()
+            }
+            with open(cache_path, 'wb') as f:
+                pickle.dump(cached_data, f)
+            print(f"Saved to cache: {cache_key}")
+            # Commit the volume changes
+            cache_volume.commit()
+        except Exception as e:
+            print(f"Cache write error: {e}")
+    def _cleanup_cache_if_needed(self):
+        """Remove old cache files if cache is too large."""
+        try:
+            cache_files = []
+            total_size = 0
+            for filename in os.listdir(self.cache_dir):
+                if filename.endswith('.pkl'):
+                    filepath = os.path.join(self.cache_dir, filename)
+                    size = os.path.getsize(filepath)
+                    mtime = os.path.getmtime(filepath)
+                    cache_files.append((filepath, size, mtime))
+                    total_size += size
+            # If cache is too large, remove oldest files
+            max_size = self.max_cache_size_gb * 1024 * 1024 * 1024
+            if total_size > max_size:
+                # Sort by modification time (oldest first)
+                cache_files.sort(key=lambda x: x[2])
+                while total_size > max_size * 0.8 and cache_files:  # Keep 80% full
+                    filepath, size, _ = cache_files.pop(0)
+                    os.remove(filepath)
+                    total_size -= size
+                    print(f"Removed old cache: {os.path.basename(filepath)}")
+        except Exception as e:
+            print(f"Cache cleanup error: {e}")
     def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
         try:
             wav, sr = sf.read(audio_path)
         Yields progress updates during generation.
         """
         try:
             if model_name not in self.models:
                 raise ValueError(f"Unknown model: {model_name}")
+            # Initialize log scaffold
+            selected_speakers = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
+            log_lines = [
+                f"Generating conference with {num_speakers} speakers",
+                f"Model: {model_name}",
+                f"Parameters: CFG Scale={cfg_scale}",
+                f"Speakers: {', '.join(selected_speakers)}",
+            ]
+            log_text = "\n".join(log_lines)
+            # Emit initial status before heavy work kicks in
+            yield self._emit_progress(
+                stage="queued",
+                pct=5,
+                status="Queued GPU job and validating inputs…",
+                log_text=log_text,
+            )
             # Move the selected model to GPU, others to CPU
+            yield self._emit_progress(
+                stage="loading_model",
+                pct=15,
+                status=f"Loading {model_name} weights to GPU…",
+                log_text=log_text,
+            )
             self._place_model(model_name)
             model = self.models[model_name]
             processor = self.processors[model_name]
             model.set_ddpm_inference_steps(num_steps=self.inference_steps)
             if not 1 <= num_speakers <= 4:
                 raise ValueError("Error: Number of speakers must be between 1 and 4.")
             for i, speaker_name in enumerate(selected_speakers):
                 if not speaker_name or speaker_name not in self.available_voices:
                     raise ValueError(f"Error: Please select a valid speaker for Speaker {i+1}.")
+            log_lines.append("Loading voice samples…")
+            log_text = "\n".join(log_lines)
+            yield self._emit_progress(
+                stage="loading_voices",
+                pct=25,
+                status="Loading reference voices…",
+                log_text=log_text,
+            )
             voice_samples = []
             for i, speaker_name in enumerate(selected_speakers):
                 if len(audio_data) == 0:
                     raise ValueError(f"Error: Failed to load audio for {speaker_name}")
                 voice_samples.append(audio_data)
+                voice_pct = 25 + ((i + 1) / len(selected_speakers)) * 15
+                log_lines.append(f"Loaded voice {i+1}/{len(selected_speakers)}: {speaker_name}")
+                log_text = "\n".join(log_lines)
+                yield self._emit_progress(
+                    stage="loading_voices",
+                    pct=voice_pct,
+                    status=f"Loaded {speaker_name}",
+                    log_text=log_text,
+                )
+            log_lines.append(f"Loaded {len(voice_samples)} voice samples")
+            log_text = "\n".join(log_lines)
             lines = script.strip().split('\n')
             formatted_script_lines = []
                     formatted_script_lines.append(f"Speaker {speaker_id}: {line}")
             formatted_script = '\n'.join(formatted_script_lines)
+            log_lines.append(f"Formatted script with {len(formatted_script_lines)} turns")
+            log_text = "\n".join(log_lines)
+            yield self._emit_progress(
+                stage="preparing_inputs",
+                pct=50,
+                status="Formatting script and preparing tensors…",
+                log_text=log_text,
+            )
             inputs = processor(
                 text=[formatted_script],
                 return_attention_mask=True,
             ).to(self.device)
+            log_lines.append("Inputs prepared; starting diffusion generation…")
+            log_text = "\n".join(log_lines)
+            yield self._emit_progress(
+                stage="generating_audio",
+                pct=70,
+                status="Running VibeVoice diffusion (this may take 1-2 minutes)…",
+                log_text=log_text,
+            )
             start_time = time.time()
             with torch.inference_mode():
                 )
             generation_time = time.time() - start_time
+            log_lines.append(f"Generation completed in {generation_time:.2f} seconds")
+            log_lines.append("Processing audio output…")
+            log_text = "\n".join(log_lines)
+            yield self._emit_progress(
+                stage="processing_audio",
+                pct=90,
+                status="Post-processing audio output…",
+                log_text=log_text,
+            )
             if hasattr(outputs, 'speech_outputs') and outputs.speech_outputs[0] is not None:
                 audio_tensor = outputs.speech_outputs[0]
             sample_rate = 24000
             total_duration = len(audio) / sample_rate
+            log_lines.append(f"Audio duration: {total_duration:.2f} seconds")
+            log_lines.append("Complete!")
+            log_text = "\n".join(log_lines)
             # Final yield with both audio and complete log
+            yield self._emit_progress(
+                stage="complete",
+                pct=100,
+                status="Conference ready to download.",
+                log_text=log_text,
+                audio=(sample_rate, audio),
+                done=True,
+            )
         except Exception as e:
             import traceback
             error_msg = f"❌ An unexpected error occurred on Modal: {str(e)}\n{traceback.format_exc()}"
             print(error_msg)
             # Yield error state
+            yield self._emit_progress(
+                stage="error",
+                pct=0,
+                status="Generation failed.",
+                log_text=error_msg,
+            )