Spaces:

Bils
/

ShortiFoley

Running on Zero

App Files Files Community

Bils committed on Sep 1

Commit

aa644be

verified ·

1 Parent(s): 6e08050

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -66

app.py CHANGED Viewed

@@ -1,15 +1,10 @@
 # Created by bilsimaging.com
 import os
 os.environ.setdefault("HF_PREFER_SAFETENSORS", "1")
 import sys
 import json
-import uuid
-import time
-import shutil
 import base64
 import random
 import tempfile
@@ -32,15 +27,17 @@ ROOT = Path(__file__).parent.resolve()
 REPO_DIR = ROOT / "HunyuanVideo-Foley"
 WEIGHTS_DIR = Path(os.environ.get("HIFI_FOLEY_MODEL_PATH", str(ROOT / "weights")))
 CONFIG_PATH = Path(os.environ.get("HIFI_FOLEY_CONFIG", str(REPO_DIR / "configs" / "hunyuanvideo-foley-xxl.yaml")))
 OUTPUTS_DIR = Path(os.environ.get("OUTPUTS_DIR", str(ROOT / "outputs" / "autosaved")))
 OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
-SPACE_TITLE = "🎵 Shorti Foley Sound— HunyuanVideo-Foley"
-SPACE_TAGLINE = "Bring your videos to life with AI-powered Foley Sound"
 WATERMARK_NOTE = "Made with ❤️ by bilsimaging.com"
-# ZeroGPU limit
-GPU_DURATION = int(os.environ.get("GPU_DURATION_SECS", "120"))
 # Globals
 _model_dict = None
@@ -53,9 +50,9 @@ _device: Optional[torch.device] = None
 # ------------
 def _setup_device(pref: str = "cpu", gpu_id: int = 0) -> torch.device:
     """
-    Pick device safely.
-    IMPORTANT: Do NOT query torch.cuda.is_available() in main/non-GPU processes
-    on Stateless GPU Spaces. Only set CUDA when called from a @spaces.GPU context.
     """
     if pref.startswith("cuda"):
         d = torch.device(f"cuda:{gpu_id}")
@@ -105,10 +102,30 @@ def prepare_once() -> None:
 # -----------------------
 # Model load & inference
 # -----------------------
 def auto_load_models(device_str: str = "cpu") -> str:
     """
     Load HunyuanVideo-Foley + encoders on the chosen device.
-    Use device_str="cuda" ONLY inside @spaces.GPU function to avoid CUDA init in main process.
     """
     global _model_dict, _cfg, _device
@@ -117,6 +134,7 @@ def auto_load_models(device_str: str = "cpu") -> str:
     # Make absolutely sure safetensors is preferred
     os.environ["HF_PREFER_SAFETENSORS"] = "1"
     sys.path.append(str(REPO_DIR))
     from hunyuanvideo_foley.utils.model_utils import load_model
@@ -128,6 +146,8 @@ def auto_load_models(device_str: str = "cpu") -> str:
     try:
         _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
         return "✅ Model loaded."
     except OSError as e:
         logger.error(str(e))
@@ -135,6 +155,7 @@ def auto_load_models(device_str: str = "cpu") -> str:
         os.environ["HF_PREFER_SAFETENSORS"] = "1"
         try:
             _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
             return "✅ Model loaded (after safetensors retry)."
         except Exception as e2:
             logger.error(str(e2))
@@ -145,7 +166,7 @@ def auto_load_models(device_str: str = "cpu") -> str:
 def _merge_audio_video(audio_path: str, video_path: str, out_path: str) -> None:
-    """Preferred: project’s util; fallback to ffmpeg."""
     sys.path.append(str(REPO_DIR))
     try:
         from hunyuanvideo_foley.utils.media_utils import merge_audio_video
@@ -167,7 +188,7 @@ def _merge_audio_video(audio_path: str, video_path: str, out_path: str) -> None:
 def _save_outputs(video_src: str, audio_tensor: torch.Tensor, sr: int, idx: int,
                   prompt: str) -> str:
-    """Save WAV + MP4 in autosaved/, add metadata with a soft watermark note."""
     # torchaudio expects [C, N]
     if audio_tensor.ndim == 1:
         audio_tensor = audio_tensor.unsqueeze(0)
@@ -222,7 +243,7 @@ def infer_single_video(
     Generate Foley audio for an uploaded video (1–6 variants).
     Returns: (list of output video paths, status message)
     """
-    # Lazy-load on GPU
     if _model_dict is None or _cfg is None:
         msg = auto_load_models(device_str="cuda")
         if not str(msg).startswith("✅"):
@@ -235,23 +256,25 @@ def infer_single_video(
     from hunyuanvideo_foley.utils.feature_utils import feature_process
     from hunyuanvideo_foley.utils.model_utils import denoise_process
-    # preprocess
-    visual_feats, text_feats, audio_len_s = feature_process(
-        video_file, (text_prompt or "").strip(), _model_dict, _cfg
-    )
-    # generate batch
-    n = int(max(1, min(6, sample_nums)))
-    audio, sr = denoise_process(
-        visual_feats,
-        text_feats,
-        audio_len_s,
-        _model_dict,
-        _cfg,
-        guidance_scale=float(guidance_scale),
-        num_inference_steps=int(num_inference_steps),
-        batch_size=n,
-    )
     # save results
     outs = []
@@ -262,7 +285,7 @@ def infer_single_video(
 # -------------
-# Gradio UI
 # -------------
 def _about_html() -> str:
     return f"""
@@ -292,8 +315,6 @@ def _about_html() -> str:
       <h3>MCP & API</h3>
       <p>This Space exposes an <b>MCP server</b> and simple REST endpoints (see “API & MCP” tab).
       Perfect for media-automation pipelines and tools like <b><a href="https://n8n.partnerlinks.io/bilsimaging" target="_blank" rel="noopener">n8n</a></b>.</p>
     </div>
     """
@@ -349,7 +370,7 @@ def create_ui() -> gr.Blocks:
                         v6 = gr.Video(label="Sample 6", height=160, visible=False)
                         gr.Markdown("<span class='muted'>Autosaved to the Gallery tab.</span>")
-                # Generate handler
                 def _process_and_update(video_file, text_prompt, cfg, nsteps, nsamples):
                     outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
                     vis = []
@@ -358,37 +379,16 @@ def create_ui() -> gr.Blocks:
                             vis.append(gr.update(visible=True, value=outs[i]))
                         else:
                             vis.append(gr.update(visible=(i == 0), value=None if i > 0 else None))
-                    # Also refresh the gallery in this same event
-                    new_gallery = _list_gallery()
-                    return (*vis, msg, new_gallery)
                 generate.click(
                     fn=_process_and_update,
                     inputs=[video_input, text_input, guidance_scale, steps, samples],
-                    outputs=[v1, v2, v3, v4, v5, v6, status],  # updated below to include gallery via .then-like merge
                     api_name="/infer",
                     api_description="Generate Foley audio for an uploaded video. Returns up to 6 video+audio files."
                 )
-                # Workaround: extend outputs to include gallery refresh using a wrapper
-                def _process_and_update_with_gallery(video_file, text_prompt, cfg, nsteps, nsamples):
-                    outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
-                    vis = []
-                    for i in range(6):
-                        if outs and i < len(outs):
-                            vis.append(gr.update(visible=True, value=outs[i]))
-                        else:
-                            vis.append(gr.update(visible=(i == 0), value=None if i > 0 else None))
-                    new_gallery = _list_gallery()
-                    return (*vis, msg, new_gallery)
-                # Re-bind with gallery as extra output
-                generate.click(
-                    fn=_process_and_update_with_gallery,
-                    inputs=[video_input, text_input, guidance_scale, steps, samples],
-                    outputs=[v1, v2, v3, v4, v5, v6, status,],  # gallery will be refreshed on Gallery tab itself
-                )
                 load_btn.click(
                     fn=lambda: auto_load_models(device_str="cpu"),
                     inputs=[],
@@ -411,7 +411,7 @@ def create_ui() -> gr.Blocks:
                 samples.change(_toggle_vis, inputs=[samples], outputs=[v1, v2, v3, v4, v5, v6])
             with gr.Tab("📁 Gallery"):
-                gr.Markdown("Latest generated videos (autosaved to `outputs/autosaved/`).")
                 gallery = gr.Gallery(
                     value=_list_gallery(),
                     columns=3,
@@ -443,7 +443,7 @@ Loads the model proactively (useful before batch runs).
 - `shortifoley://status` → quick health info
 - `foley_prompt` → reusable guidance for describing the sound
-Works great with media-automation in tools like **n8n**: call `load_model_tool` once, then `api_generate_from_url` for each clip.
 """)
             with gr.Tab("ℹ️ About"):
@@ -459,7 +459,7 @@ Works great with media-automation in tools like **n8n**: call `load_model_tool`
             """
         )
-        # ---- REST + MCP endpoints ----
         def _download_to_tmp(url: str) -> str:
             try:
                 import requests
@@ -499,7 +499,7 @@ Works great with media-automation in tools like **n8n**: call `load_model_tool`
             sample_nums: int = 1,
         ) -> Dict[str, List[str]]:
             if _model_dict is None or _cfg is None:
-                msg = auto_load_models(device_str="cpu")
                 if not str(msg).startswith("✅"):
                     raise RuntimeError(msg)
             local = _normalize_video_input(video_url_or_b64)
@@ -546,7 +546,7 @@ if __name__ == "__main__":
     logger.info("===== Application Startup =====\n")
     prepare_once()
-    # Probe imports
     sys.path.append(str(REPO_DIR))
     try:
         from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process  # noqa: F401
@@ -557,10 +557,10 @@ if __name__ == "__main__":
     ui = create_ui()
-    # Enable MCP server
     ui.launch(
         server_name="0.0.0.0",
         share=False,
         show_error=True,
-        mcp_server=True,   # MCP
     )

 # Created by bilsimaging.com
 import os
 os.environ.setdefault("HF_PREFER_SAFETENSORS", "1")
 import sys
 import json
 import base64
 import random
 import tempfile
 REPO_DIR = ROOT / "HunyuanVideo-Foley"
 WEIGHTS_DIR = Path(os.environ.get("HIFI_FOLEY_MODEL_PATH", str(ROOT / "weights")))
 CONFIG_PATH = Path(os.environ.get("HIFI_FOLEY_CONFIG", str(REPO_DIR / "configs" / "hunyuanvideo-foley-xxl.yaml")))
+# Always save into outputs/autosaved/
 OUTPUTS_DIR = Path(os.environ.get("OUTPUTS_DIR", str(ROOT / "outputs" / "autosaved")))
 OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
+SPACE_TITLE = "🎵 ShortiFoley — HunyuanVideo-Foley"
+SPACE_TAGLINE = "Text/Video → Audio Foley · Created by bilsimaging.com"
 WATERMARK_NOTE = "Made with ❤️ by bilsimaging.com"
+# ZeroGPU limit (<=120s recommended)
+GPU_DURATION = int(os.environ.get("GPU_DURATION_SECS", "110"))
 # Globals
 _model_dict = None
 # ------------
 def _setup_device(pref: str = "cpu", gpu_id: int = 0) -> torch.device:
     """
+    Safe device picker.
+    IMPORTANT: Do NOT probe torch.cuda.is_available() here on Stateless GPU Spaces.
+    Only request CUDA inside a @spaces.GPU function.
     """
     if pref.startswith("cuda"):
         d = torch.device(f"cuda:{gpu_id}")
 # -----------------------
 # Model load & inference
 # -----------------------
+def _force_fp32_on_modules(obj):
+    """Ensure every torch.nn.Module inside obj is float32 to avoid half/float mismatches."""
+    try:
+        import torch.nn as nn
+        for name in dir(obj):
+            try:
+                m = getattr(obj, name)
+            except Exception:
+                continue
+            if isinstance(m, nn.Module):
+                m.float()
+        if hasattr(obj, "foley_model"): obj.foley_model.float()
+        if hasattr(obj, "dac_model"): obj.dac_model.float()
+        if hasattr(obj, "siglip2_model"): obj.siglip2_model.float()
+        if hasattr(obj, "clap_model"): obj.clap_model.float()
+        if hasattr(obj, "syncformer_model"): obj.syncformer_model.float()
+    except Exception as e:
+        logger.warning(f"FP32 cast warning: {e}")
 def auto_load_models(device_str: str = "cpu") -> str:
     """
     Load HunyuanVideo-Foley + encoders on the chosen device.
+    Use device_str='cuda' ONLY inside @spaces.GPU to avoid CUDA init in main process.
     """
     global _model_dict, _cfg, _device
     # Make absolutely sure safetensors is preferred
     os.environ["HF_PREFER_SAFETENSORS"] = "1"
+    torch.set_float32_matmul_precision("high")  # allow TF32 where possible
     sys.path.append(str(REPO_DIR))
     from hunyuanvideo_foley.utils.model_utils import load_model
     try:
         _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
+        # Force fp32 to fix: RuntimeError: Input type (Half) and bias (float) must match
+        _force_fp32_on_modules(_model_dict)
         return "✅ Model loaded."
     except OSError as e:
         logger.error(str(e))
         os.environ["HF_PREFER_SAFETENSORS"] = "1"
         try:
             _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
+            _force_fp32_on_modules(_model_dict)
             return "✅ Model loaded (after safetensors retry)."
         except Exception as e2:
             logger.error(str(e2))
 def _merge_audio_video(audio_path: str, video_path: str, out_path: str) -> None:
+    """Preferred: project's util; fallback to ffmpeg."""
     sys.path.append(str(REPO_DIR))
     try:
         from hunyuanvideo_foley.utils.media_utils import merge_audio_video
 def _save_outputs(video_src: str, audio_tensor: torch.Tensor, sr: int, idx: int,
                   prompt: str) -> str:
+    """Save WAV + MP4 in outputs/autosaved/, add metadata with a soft watermark note."""
     # torchaudio expects [C, N]
     if audio_tensor.ndim == 1:
         audio_tensor = audio_tensor.unsqueeze(0)
     Generate Foley audio for an uploaded video (1–6 variants).
     Returns: (list of output video paths, status message)
     """
+    # Lazy-load on GPU ONLY here (prevents CUDA init in main process)
     if _model_dict is None or _cfg is None:
         msg = auto_load_models(device_str="cuda")
         if not str(msg).startswith("✅"):
     from hunyuanvideo_foley.utils.feature_utils import feature_process
     from hunyuanvideo_foley.utils.model_utils import denoise_process
+    # Avoid autocast to float16 to fix Half/Float mismatch inside Synchformer conv3d
+    with torch.autocast(device_type="cuda", enabled=False):
+        # preprocess
+        visual_feats, text_feats, audio_len_s = feature_process(
+            video_file, (text_prompt or "").strip(), _model_dict, _cfg
+        )
+        # generate batch
+        n = int(max(1, min(6, sample_nums)))
+        audio, sr = denoise_process(
+            visual_feats,
+            text_feats,
+            audio_len_s,
+            _model_dict,
+            _cfg,
+            guidance_scale=float(guidance_scale),
+            num_inference_steps=int(num_inference_steps),
+            batch_size=n,
+        )
     # save results
     outs = []
 # -------------
+# Gradio UI (with MCP+API inside the same app)
 # -------------
 def _about_html() -> str:
     return f"""
       <h3>MCP & API</h3>
       <p>This Space exposes an <b>MCP server</b> and simple REST endpoints (see “API & MCP” tab).
       Perfect for media-automation pipelines and tools like <b><a href="https://n8n.partnerlinks.io/bilsimaging" target="_blank" rel="noopener">n8n</a></b>.</p>
     </div>
     """
                         v6 = gr.Video(label="Sample 6", height=160, visible=False)
                         gr.Markdown("<span class='muted'>Autosaved to the Gallery tab.</span>")
+                # Generate handler (single binding, exact outputs)
                 def _process_and_update(video_file, text_prompt, cfg, nsteps, nsamples):
                     outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
                     vis = []
                             vis.append(gr.update(visible=True, value=outs[i]))
                         else:
                             vis.append(gr.update(visible=(i == 0), value=None if i > 0 else None))
+                    return (*vis, msg)
                 generate.click(
                     fn=_process_and_update,
                     inputs=[video_input, text_input, guidance_scale, steps, samples],
+                    outputs=[v1, v2, v3, v4, v5, v6, status],
                     api_name="/infer",
                     api_description="Generate Foley audio for an uploaded video. Returns up to 6 video+audio files."
                 )
                 load_btn.click(
                     fn=lambda: auto_load_models(device_str="cpu"),
                     inputs=[],
                 samples.change(_toggle_vis, inputs=[samples], outputs=[v1, v2, v3, v4, v5, v6])
             with gr.Tab("📁 Gallery"):
+                gr.Markdown("Latest generated videos (autosaved to <code>outputs/autosaved/</code>).")
                 gallery = gr.Gallery(
                     value=_list_gallery(),
                     columns=3,
 - `shortifoley://status` → quick health info
 - `foley_prompt` → reusable guidance for describing the sound
+Works great for media-automation in tools like **n8n**: call `load_model_tool` once, then `api_generate_from_url` for each clip.
 """)
             with gr.Tab("ℹ️ About"):
             """
         )
+        # ---- REST + MCP endpoints (inside Blocks) ----
         def _download_to_tmp(url: str) -> str:
             try:
                 import requests
             sample_nums: int = 1,
         ) -> Dict[str, List[str]]:
             if _model_dict is None or _cfg is None:
+                msg = auto_load_models(device_str="cpu")  # safe in HTTP context
                 if not str(msg).startswith("✅"):
                     raise RuntimeError(msg)
             local = _normalize_video_input(video_url_or_b64)
     logger.info("===== Application Startup =====\n")
     prepare_once()
+    # Probe imports (early surfacing)
     sys.path.append(str(REPO_DIR))
     try:
         from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process  # noqa: F401
     ui = create_ui()
+    # Enable MCP server so tools/resources/prompts are discoverable
     ui.launch(
         server_name="0.0.0.0",
         share=False,
         show_error=True,
+        mcp_server=True,   # Enable MCP server
     )