Spaces:

ruslanmv
/

ai-story-server-cpu

Running on Zero

App Files Files Community

ruslanmv committed on Sep 28

Commit

ead9609

1 Parent(s): e1e4a12

First commit

Browse files

Files changed (2) hide show

README.md +2 -1
app.py +191 -84

README.md CHANGED Viewed

@@ -6,7 +6,8 @@ colorTo: purple
 sdk: gradio
 sdk_version: 5.47.2
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://github.com/ruslanmv/ai-story-server

 sdk: gradio
 sdk_version: 5.47.2
 app_file: app.py
+python_version: "3.11"
 pinned: false
 ---
+Check out the configuration reference at https://github.com/ruslanmv/ai-story-server

app.py CHANGED Viewed

@@ -3,6 +3,7 @@
 # ===================================================================================
 from __future__ import annotations
 import os
 import base64
 import struct
 import textwrap
@@ -14,31 +15,23 @@ from typing import List, Dict, Tuple, Generator
 os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
 os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
 os.environ.setdefault("COQUI_TOS_AGREED", "1")
-os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "false")
-# --- Prefer torchaudio sox_io/soundfile backend (avoid FFmpeg/torio bug) ---
-try:
-    import torchaudio
-    _backend_set = False
-    for _cand in ("sox_io", "soundfile"):
-        try:
-            torchaudio.set_audio_backend(_cand)
-            _backend_set = True
-            break
-        except Exception:
-            pass
-    if not _backend_set:
-        os.environ["TORCHAUDIO_USE_FFMPEG"] = "0"
-except Exception:
-    torchaudio = None
-# --- Load .env early (HF_TOKEN / SECRET_TOKEN) ---
 from dotenv import load_dotenv
 load_dotenv()
-# --- Hugging Face Spaces & ZeroGPU ---
 try:
-    import spaces
 except Exception:
     class _SpacesShim:
         def GPU(self, *args, **kwargs):
@@ -49,17 +42,20 @@ except Exception:
 import gradio as gr
-# --- Core ML & Data Libraries ---
 import torch
 import numpy as np
 from huggingface_hub import HfApi, hf_hub_download
 from llama_cpp import Llama
 # --- TTS Libraries ---
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from TTS.utils.manage import ModelManager
-from TTS.utils.generic_utils import get_user_data_dir
 # --- Text & Audio Processing ---
 import nltk
@@ -71,12 +67,15 @@ import noisereduce as nr
 # 2) GLOBALS & HELPERS
 # ===================================================================================
 nltk.download("punkt", quiet=True)
 tts_model: Xtts | None = None
 llm_model: Llama | None = None
 voice_latents: Dict[str, Tuple[np.ndarray, np.ndarray]] = {}
 HF_TOKEN = os.environ.get("HF_TOKEN")
 api = HfApi(token=HF_TOKEN) if HF_TOKEN else None
 repo_id = "ruslanmv/ai-story-server"
@@ -84,6 +83,10 @@ SECRET_TOKEN = os.getenv("SECRET_TOKEN", "secret")
 SENTENCE_SPLIT_LENGTH = 250
 LLM_STOP_WORDS = ["</s>", "<|user|>", "/s>"]
 default_system_message = (
     "You're a storyteller crafting a short tale for young listeners. Keep sentences short and simple. "
     "Use narrative style only, without lists or complex words. Type numbers as words (e.g., 'ten')."
@@ -96,16 +99,17 @@ ROLE_PROMPTS["Pirate"] = (
     "Keep answers short, as if in a real conversation. Only provide the words AI Beard would speak."
 )
 def pcm_to_wav(pcm_data: bytes, sample_rate: int = 24000, channels: int = 1, bit_depth: int = 16) -> bytes:
     if pcm_data.startswith(b"RIFF"):
         return pcm_data
     chunk_size = 36 + len(pcm_data)
     header = struct.pack(
         "<4sI4s4sIHHIIHH4sI",
         b"RIFF", chunk_size, b"WAVE", b"fmt ",
-        16, 1, channels, sample_rate,
-        sample_rate * channels * bit_depth // 8,
-        channels * bit_depth // 8, bit_depth,
         b"data", len(pcm_data)
     )
     return header + pcm_data
@@ -128,11 +132,61 @@ def format_prompt_zephyr(message: str, history: List[Tuple[str, str | None]], sy
     prompt += f"<|user|>\n{message}</s><|assistant|>"
     return prompt
 # ===================================================================================
 # 3) PRECACHE & MODEL LOADERS (RUN BEFORE FIRST INFERENCE)
 # ===================================================================================
 def precache_assets() -> None:
     print("Pre-caching voice files...")
     file_names = ["cloee-1.wav", "julian-bedtime-style-1.wav", "pirate_by_coqui.wav", "thera-1.wav"]
     base_url = "https://raw.githubusercontent.com/ruslanmv/ai-story-server/main/voices/"
@@ -148,27 +202,31 @@ def precache_assets() -> None:
             except Exception as e:
                 print(f"Failed to download {name}: {e}")
     print("Pre-caching XTTS v2 model files...")
     ModelManager().download_model("tts_models/multilingual/multi-dataset/xtts_v2")
     print("Pre-caching Zephyr GGUF...")
     try:
         hf_hub_download(
             repo_id="TheBloke/zephyr-7B-beta-GGUF",
-            filename="zephyr-7b-beta.Q5_K_M.gguf"
         )
     except Exception as e:
         print(f"Warning: GGUF pre-cache error: {e}")
 def _load_xtts(device: str) -> Xtts:
-    print("Loading Coqui XTTS V2 model (CPU first)...")
     model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
-    model_dir = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
     cfg = XttsConfig()
     cfg.load_json(os.path.join(model_dir, "config.json"))
     model = Xtts.init_from_config(cfg)
     model.load_checkpoint(
         cfg,
         checkpoint_dir=model_dir,
@@ -180,30 +238,59 @@ def _load_xtts(device: str) -> Xtts:
     return model
 def _load_llama() -> Llama:
-    print("Loading LLM (Zephyr GGUF) on CPU...")
     zephyr_model_path = hf_hub_download(
         repo_id="TheBloke/zephyr-7B-beta-GGUF",
         filename="zephyr-7b-beta.Q5_K_M.gguf"
     )
-    llm = Llama(
-        model_path=zephyr_model_path,
-        n_gpu_layers=0,
-        n_ctx=4096,
-        n_batch=512,
-        verbose=False
-    )
-    print("LLM loaded (CPU).")
-    return llm
 def init_models_and_latents() -> None:
     global tts_model, llm_model, voice_latents
     if tts_model is None:
-        tts_model = _load_xtts(device="cpu")
     if llm_model is None:
         llm_model = _load_llama()
     if not voice_latents:
         print("Computing voice conditioning latents...")
         for role, filename in [
@@ -213,18 +300,20 @@ def init_models_and_latents() -> None:
             ("Thera", "thera-1.wav"),
         ]:
             path = os.path.join("voices", filename)
-            voice_latents[role] = tts_model.get_conditioning_latents(
-                audio_path=path, gpt_cond_len=30, max_ref_length=60
-            )
         print("Voice latents ready.")
 def _close_llm():
     global llm_model
-    if llm_model is not None:
-        try:
             llm_model.close()
-        except Exception:
-            pass
 atexit.register(_close_llm)
 # ===================================================================================
@@ -264,72 +353,88 @@ def generate_audio_stream(tts_instance: Xtts, text: str, language: str,
             speaker_embedding=speaker_embedding,
             temperature=0.85,
         ):
-            if chunk is not None:
-                yield chunk.detach().cpu().numpy().squeeze().tobytes()
     except RuntimeError as e:
         print(f"Error during TTS inference: {e}")
         if "device-side assert" in str(e) and api:
-            gr.Warning("Critical GPU error. Attempting to restart the Space...")
             try:
                 api.restart_space(repo_id=repo_id)
             except Exception:
                 pass
 # ===================================================================================
-# 5) ZERO-GPU ENTRYPOINT
 # ===================================================================================
-@spaces.GPU(duration=120)
 def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_role: str) -> List[Dict[str, str]]:
     if secret_token_input != SECRET_TOKEN:
         raise gr.Error("Invalid secret token provided.")
     if not input_text:
         return []
     if tts_model is None or llm_model is None or not voice_latents:
-        raise gr.Error("Models not initialized. Please restart the Space.")
     try:
         if torch.cuda.is_available():
             tts_model.to("cuda")
         else:
             tts_model.to("cpu")
-        history: List[Tuple[str, str | None]] = [(input_text, None)]
-        full_story_text = "".join(
-            generate_text_stream(llm_model, history[-1][0], history[:-1], system_message_text=ROLE_PROMPTS[chatbot_role])
-        ).strip()
-        if not full_story_text:
-            return []
-        sentences = split_sentences(full_story_text, SENTENCE_SPLIT_LENGTH)
-        lang = langid.classify(sentences[0])[0] if sentences else "en"
-        results: List[Dict[str, str]] = []
-        for sentence in sentences:
-            if not any(c.isalnum() for c in sentence):
-                continue
-            audio_chunks = generate_audio_stream(tts_model, sentence, lang, voice_latents[chatbot_role])
-            pcm_data = b"".join(chunk for chunk in audio_chunks if chunk)
-            try:
-                data_s16 = np.frombuffer(pcm_data, dtype=np.int16)
-                if data_s16.size > 0:
-                    float_data = data_s16.astype(np.float32) / 32767.0
-                    reduced = nr.reduce_noise(y=float_data, sr=24000)
-                    final_pcm = (reduced * 32767).astype(np.int16).tobytes()
-                else:
-                    final_pcm = pcm_data
-            except Exception:
                 final_pcm = pcm_data
-            b64_wav = base64.b64encode(pcm_to_wav(final_pcm)).decode("utf-8")
-            results.append({"text": sentence, "audio": b64_wav})
-        return results
-    finally:
         tts_model.to("cpu")
 # ===================================================================================
 # 6) STARTUP: PRECACHE & UI
@@ -345,15 +450,17 @@ def build_ui() -> gr.Interface:
         ],
         outputs=gr.JSON(label="Story and Audio Output"),
         title="AI Storyteller with ZeroGPU",
-        description="Enter a prompt to generate a short story with voice narration using on-demand GPU.",
         flagging_mode="never",
     )
 if __name__ == "__main__":
     print("===== Startup: pre-cache assets and preload models =====")
-    precache_assets()
-    init_models_and_latents()
     print("Models and assets ready. Launching UI...")
     demo = build_ui()
-    demo.queue().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))

 # ===================================================================================
 from __future__ import annotations
 import os
+import sys
 import base64
 import struct
 import textwrap
 os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
 os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
 os.environ.setdefault("COQUI_TOS_AGREED", "1")
+os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "false")  # truly disable analytics
+os.environ.setdefault("TORCHAUDIO_USE_FFMPEG", "0")         # avoid torchaudio/ffmpeg linkage issues
+# --- .env early (HF_TOKEN / SECRET_TOKEN) ---
 from dotenv import load_dotenv
 load_dotenv()
+# --- NumPy sanity (Torch 2.2.x wants NumPy 1.x) ---
+import numpy as _np
+if int(_np.__version__.split(".", 1)[0]) >= 2:
+    raise RuntimeError(
+        f"Detected numpy=={_np.__version__}. Please ensure numpy<2 (e.g., 1.26.4) for this Space."
+    )
+# --- Hugging Face Spaces & ZeroGPU (import BEFORE CUDA libs) ---
 try:
+    import spaces  # Required for ZeroGPU on HF
 except Exception:
     class _SpacesShim:
         def GPU(self, *args, **kwargs):
 import gradio as gr
+# --- Core ML & Data Libraries (after spaces import) ---
 import torch
 import numpy as np
 from huggingface_hub import HfApi, hf_hub_download
 from llama_cpp import Llama
+# --- Audio decoding (use ffmpeg-python; no torchaudio) ---
+import ffmpeg
 # --- TTS Libraries ---
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from TTS.utils.manage import ModelManager
+import TTS.tts.models.xtts as xtts_module  # for monkey-patching load_audio
 # --- Text & Audio Processing ---
 import nltk
 # 2) GLOBALS & HELPERS
 # ===================================================================================
+# NLTK data
 nltk.download("punkt", quiet=True)
+# Cached models & latents
 tts_model: Xtts | None = None
 llm_model: Llama | None = None
 voice_latents: Dict[str, Tuple[np.ndarray, np.ndarray]] = {}
+# Config
 HF_TOKEN = os.environ.get("HF_TOKEN")
 api = HfApi(token=HF_TOKEN) if HF_TOKEN else None
 repo_id = "ruslanmv/ai-story-server"
 SENTENCE_SPLIT_LENGTH = 250
 LLM_STOP_WORDS = ["</s>", "<|user|>", "/s>"]
+# Prefer native GPU if available; otherwise we’ll rely on ZeroGPU (or CPU)
+PREFER_NATIVE_GPU = torch.cuda.is_available()
+# System prompts and roles
 default_system_message = (
     "You're a storyteller crafting a short tale for young listeners. Keep sentences short and simple. "
     "Use narrative style only, without lists or complex words. Type numbers as words (e.g., 'ten')."
     "Keep answers short, as if in a real conversation. Only provide the words AI Beard would speak."
 )
+# ---------- small utils ----------
 def pcm_to_wav(pcm_data: bytes, sample_rate: int = 24000, channels: int = 1, bit_depth: int = 16) -> bytes:
     if pcm_data.startswith(b"RIFF"):
         return pcm_data
+    byte_rate = sample_rate * channels * bit_depth // 8
+    block_align = channels * bit_depth // 8
     chunk_size = 36 + len(pcm_data)
     header = struct.pack(
         "<4sI4s4sIHHIIHH4sI",
         b"RIFF", chunk_size, b"WAVE", b"fmt ",
+        16, 1, channels, sample_rate, byte_rate, block_align, bit_depth,
         b"data", len(pcm_data)
     )
     return header + pcm_data
     prompt += f"<|user|>\n{message}</s><|assistant|>"
     return prompt
+# ---------- robust audio decode (mono via ffmpeg) ----------
def _decode_audio_ffmpeg_to_mono(path: str, target_sr: int) -> np.ndarray:
    """
    Decode an audio file to a mono float32 waveform through the ffmpeg executable.

    Args:
        path: Audio file handed to ffmpeg as its input.
        target_sr: Sample rate (Hz) requested from ffmpeg for the output.

    Returns:
        1-D float32 array of shape (samples,) with values in [-1.0, 1.0).

    Raises:
        RuntimeError: If ffmpeg cannot be started, reports a decode error, or
            yields no samples.
    """
    # Keep the try body limited to the subprocess call so that only ffmpeg
    # failures are translated.
    try:
        out, _ = (
            ffmpeg
            .input(path)
            .output("pipe:", format="s16le", acodec="pcm_s16le", ac=1, ar=target_sr)
            .run(capture_stdout=True, capture_stderr=True, cmd="ffmpeg")
        )
    except ffmpeg.Error as e:
        detail = e.stderr.decode(errors="ignore") if e.stderr else e
        raise RuntimeError(f"ffmpeg decode failed: {detail}") from e
    except FileNotFoundError as e:
        # Raised when the "ffmpeg" binary itself is not on PATH.
        raise RuntimeError("ffmpeg decode failed: ffmpeg executable not found.") from e

    # The stream was requested as s16le, so read it as little-endian int16
    # regardless of the host byte order.
    pcm = np.frombuffer(out, dtype="<i2")
    if pcm.size == 0:
        raise RuntimeError("ffmpeg produced empty audio.")
    # int16 spans [-32768, 32767]; dividing by 32768 keeps every sample inside
    # [-1, 1), whereas 32767 would push full-scale negatives below -1.
    return pcm.astype(np.float32) / 32768.0
+# ---------- monkey-patch XTTS internal loader to avoid torchaudio/torio ----------
def _patched_load_audio(audiopath: str, load_sr: int):
    """
    Replacement for the XTTS audio loader that decodes through ffmpeg.

    Args:
        audiopath: Path of the reference audio file.
        load_sr: Sample rate (Hz) the waveform is resampled to while decoding.

    Returns:
        A float32 torch tensor shaped [1, samples] with values clamped to
        [-1, 1]. Only the tensor is returned, not an (audio, sample_rate) tuple.
    """
    wav = _decode_audio_ffmpeg_to_mono(audiopath, target_sr=load_sr)
    audio = torch.from_numpy(wav).float().unsqueeze(0)  # [1, N]
    # Enforce the documented range even if the decoder scaling overshoots
    # slightly on full-scale samples.
    return audio.clamp_(-1.0, 1.0)
+xtts_module.load_audio = _patched_load_audio
+# Also patch the common utility location, in case this version imports from there:
+try:
+    import TTS.utils.audio as _tts_audio_mod
+    _tts_audio_mod.load_audio = _patched_load_audio
+except Exception:
+    pass
+# ---------- where Coqui caches models (avoid get_user_data_dir import) ----------
+def _coqui_cache_dir() -> str:
+    # Matches what TTS uses on Linux: ~/.local/share/tts
+    return os.path.join(os.path.expanduser("~"), ".local", "share", "tts")
 # ===================================================================================
 # 3) PRECACHE & MODEL LOADERS (RUN BEFORE FIRST INFERENCE)
 # ===================================================================================
 def precache_assets() -> None:
+    """Download voice WAVs, XTTS weights, and Zephyr GGUF to local cache before any inference."""
+    # Voices
     print("Pre-caching voice files...")
     file_names = ["cloee-1.wav", "julian-bedtime-style-1.wav", "pirate_by_coqui.wav", "thera-1.wav"]
     base_url = "https://raw.githubusercontent.com/ruslanmv/ai-story-server/main/voices/"
             except Exception as e:
                 print(f"Failed to download {name}: {e}")
+    # XTTS model files
     print("Pre-caching XTTS v2 model files...")
     ModelManager().download_model("tts_models/multilingual/multi-dataset/xtts_v2")
+    # LLM GGUF
     print("Pre-caching Zephyr GGUF...")
     try:
         hf_hub_download(
             repo_id="TheBloke/zephyr-7B-beta-GGUF",
+            filename="zephyr-7b-beta.Q5_K_M.gguf",
+            force_download=False
         )
     except Exception as e:
         print(f"Warning: GGUF pre-cache error: {e}")
 def _load_xtts(device: str) -> Xtts:
+    """Load XTTS from the local cache. Use checkpoint_dir to avoid None path bugs."""
+    print(f"Loading Coqui XTTS V2 model on {device.upper()}...")
     model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
+    ModelManager().download_model(model_name)  # idempotent
+    model_dir = os.path.join(_coqui_cache_dir(), model_name.replace("/", "--"))
     cfg = XttsConfig()
     cfg.load_json(os.path.join(model_dir, "config.json"))
     model = Xtts.init_from_config(cfg)
     model.load_checkpoint(
         cfg,
         checkpoint_dir=model_dir,
     return model
 def _load_llama() -> Llama:
+    """
+    Load Llama (Zephyr GGUF). Prefer GPU offload if native CUDA build is present,
+    otherwise fall back to pure CPU.
+    """
+    print("Loading LLM (Zephyr GGUF)...")
     zephyr_model_path = hf_hub_download(
         repo_id="TheBloke/zephyr-7B-beta-GGUF",
         filename="zephyr-7b-beta.Q5_K_M.gguf"
     )
+    # Heuristic: try to offload a large number of layers if CUDA build exists.
+    gpu_layers_env = int(os.getenv("LLAMA_GPU_LAYERS", "100"))
+    n_gpu_layers = gpu_layers_env if PREFER_NATIVE_GPU else 0
+    try:
+        llm = Llama(
+            model_path=zephyr_model_path,
+            n_gpu_layers=n_gpu_layers,   # if CUDA build exists, this offloads layers
+            n_ctx=4096,
+            n_batch=512,
+            verbose=False
+        )
+        used = "GPU-offload" if n_gpu_layers > 0 else "CPU"
+        print(f"LLM loaded ({used}).")
+        return llm
+    except Exception as e:
+        print(f"LLM GPU offload failed ({e}); falling back to CPU.")
+        llm = Llama(
+            model_path=zephyr_model_path,
+            n_gpu_layers=0,
+            n_ctx=4096,
+            n_batch=512,
+            verbose=False
+        )
+        print("LLM loaded (CPU).")
+        return llm
 def init_models_and_latents() -> None:
+    """
+    Preload TTS and LLM. If native GPU is available at startup, load XTTS on CUDA
+    and precompute voice latents there; otherwise do it on CPU (ZeroGPU will move it later).
+    """
     global tts_model, llm_model, voice_latents
+    target_device = "cuda" if PREFER_NATIVE_GPU else "cpu"
     if tts_model is None:
+        tts_model = _load_xtts(device=target_device)
     if llm_model is None:
         llm_model = _load_llama()
+    # Pre-compute latents once; uses patched loader (ffmpeg) under the hood
     if not voice_latents:
         print("Computing voice conditioning latents...")
         for role, filename in [
             ("Thera", "thera-1.wav"),
         ]:
             path = os.path.join("voices", filename)
+            with torch.no_grad():
+                voice_latents[role] = tts_model.get_conditioning_latents(
+                    audio_path=path, gpt_cond_len=30, max_ref_length=60
+                )
         print("Voice latents ready.")
+# Ensure we close Llama cleanly to avoid __del__ issues at interpreter shutdown
 def _close_llm():
     global llm_model
+    try:
+        if llm_model is not None:
             llm_model.close()
+    except Exception:
+        pass
 atexit.register(_close_llm)
 # ===================================================================================
             speaker_embedding=speaker_embedding,
             temperature=0.85,
         ):
+            if chunk is None:
+                continue
+            # chunk: torch.FloatTensor [N] or [1, N], float32 in [-1, 1]
+            f32 = chunk.detach().cpu().numpy().squeeze()
+            f32 = np.clip(f32, -1.0, 1.0).astype(np.float32)
+            s16 = (f32 * 32767.0).astype(np.int16)
+            yield s16.tobytes()
     except RuntimeError as e:
         print(f"Error during TTS inference: {e}")
         if "device-side assert" in str(e) and api:
             try:
+                gr.Warning("Critical GPU error. Attempting to restart the Space...")
                 api.restart_space(repo_id=repo_id)
             except Exception:
                 pass
 # ===================================================================================
+# 5) ZERO-GPU ENTRYPOINT (also works on native GPU)
 # ===================================================================================
@spaces.GPU(duration=120)  # On native-GPU Spaces this simply runs with the resident GPU.
def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_role: str) -> List[Dict[str, str]]:
    """
    Generate a story for `input_text` and narrate it sentence by sentence.

    Args:
        secret_token_input: Token supplied by the caller; must equal SECRET_TOKEN.
        input_text: User prompt for the story. An empty prompt yields [].
        chatbot_role: Key into ROLE_PROMPTS and voice_latents selecting the
            system prompt and the voice.

    Returns:
        One dict per narrated sentence: {"text": sentence, "audio": base64 WAV}.
        Empty list when the prompt or the generated story is empty.

    Raises:
        gr.Error: On a wrong token or an unknown role.
    """
    import hmac  # local import: only needed for the constant-time token check

    provided = (secret_token_input or "").encode("utf-8")
    if not hmac.compare_digest(provided, SECRET_TOKEN.encode("utf-8")):
        raise gr.Error("Invalid secret token provided.")
    if not input_text:
        return []

    # Ensure models/latents exist
    if tts_model is None or llm_model is None or not voice_latents:
        init_models_and_latents()

    # Fail with a readable message instead of a bare KeyError further down.
    if chatbot_role not in ROLE_PROMPTS or chatbot_role not in voice_latents:
        raise gr.Error(f"Unknown role: {chatbot_role!r}.")

    try:
        # Prefer CUDA when it is visible at call time; fall back to CPU if the
        # move itself fails.
        try:
            tts_model.to("cuda" if torch.cuda.is_available() else "cpu")
        except Exception:
            tts_model.to("cpu")

        # Generate story text (single-turn: no prior history)
        full_story_text = "".join(
            generate_text_stream(llm_model, input_text, [], system_message_text=ROLE_PROMPTS[chatbot_role])
        ).strip()
        if not full_story_text:
            return []

        # Split into TTS-friendly sentences
        sentences = split_sentences(full_story_text, SENTENCE_SPLIT_LENGTH)
        lang = langid.classify(sentences[0])[0] if sentences else "en"

        results: List[Dict[str, str]] = []
        for sentence in sentences:
            # Skip fragments with nothing to pronounce (punctuation only).
            if not any(c.isalnum() for c in sentence):
                continue
            audio_chunks = generate_audio_stream(tts_model, sentence, lang, voice_latents[chatbot_role])
            pcm_data = b"".join(chunk for chunk in audio_chunks if chunk)
            final_pcm = _denoise_pcm16(pcm_data, sample_rate=24000)
            b64_wav = base64.b64encode(
                pcm_to_wav(final_pcm, sample_rate=24000, channels=1, bit_depth=16)
            ).decode("utf-8")
            results.append({"text": sentence, "audio": b64_wav})
        return results
    finally:
        # Always move the model back to CPU, including on errors and on the
        # early empty-story return, so the GPU is not held after the call.
        try:
            tts_model.to("cpu")
        except Exception as e:
            print(f"Could not move TTS model back to CPU: {e}")


def _denoise_pcm16(pcm_data: bytes, sample_rate: int = 24000) -> bytes:
    """Best-effort noise reduction on 16-bit PCM; returns the input unchanged on failure or when empty."""
    try:
        data_s16 = np.frombuffer(pcm_data, dtype=np.int16)
        if data_s16.size == 0:
            return pcm_data
        float_data = data_s16.astype(np.float32) / 32767.0
        reduced = nr.reduce_noise(y=float_data, sr=sample_rate)
        # Clip before the int16 cast so overshoot cannot wrap around.
        return np.clip(reduced * 32767.0, -32768, 32767).astype(np.int16).tobytes()
    except Exception as e:
        print(f"Noise reduction skipped: {e}")
        return pcm_data
 # ===================================================================================
 # 6) STARTUP: PRECACHE & UI
         ],
         outputs=gr.JSON(label="Story and Audio Output"),
         title="AI Storyteller with ZeroGPU",
+        description="Enter a prompt to generate a short story with voice narration using on-demand GPU or native GPU when available.",
         flagging_mode="never",
+        allow_flagging="never",
     )
 if __name__ == "__main__":
     print("===== Startup: pre-cache assets and preload models =====")
+    print(f"Python: {sys.version.split()[0]} | Torch CUDA available: {torch.cuda.is_available()}")
+    precache_assets()              # 1) download everything to disk
+    init_models_and_latents()      # 2) load models (prefer native GPU) + compute voice latents
     print("Models and assets ready. Launching UI...")
     demo = build_ui()
+    demo.queue().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))