Update app.py

app.py CHANGED
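This update prefers safetensors for all Hugging Face model loads (working around a CLAP pytorch_model.bin crash on ZeroGPU), logs the chosen device, retries model loading once on OSError, fixes a truncated outputs= list in the /infer wiring, and moves the MCP/API endpoints from a standalone module-level gr.Blocks into create_ui(), dropping the ui.blocks.append() mount.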
|
@@ -2,6 +2,9 @@
 # Created by bilsimaging.com
 
 import os
+# ---- Prefer safetensors for all HF model loads (fixes CLAP .bin crash on ZeroGPU) ----
+os.environ.setdefault("HF_PREFER_SAFETENSORS", "1")
+
 import sys
 import io
 import json
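A note on the env var: the diff relies on HF_PREFER_SAFETENSORS being honored by the downstream loaders. Where a particular transformers checkpoint still resolves to pytorch_model.bin, the explicit per-call equivalent is the use_safetensors flag of from_pretrained. A minimal sketch, with an illustrative CLAP model id that is not taken from this file:

from transformers import ClapModel

# use_safetensors=True refuses the .bin fallback and fails fast
# if the checkpoint ships no safetensors weights.
model = ClapModel.from_pretrained("laion/clap-htsat-unfused", use_safetensors=True)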
@@ -60,7 +63,10 @@ def _setup_device(pref: str = "auto", gpu_id: int = 0) -> torch.device:
         d = torch.device("cpu")
     else:
         d = torch.device(pref)
-
+    if d.type == "cuda":
+        logger.info(f"Using CUDA {d}")
+    else:
+        logger.info(f"Using {d}")
     return d
 
 
@@ -89,7 +95,7 @@ def _download_weights_if_needed() -> None:
             "synchformer_state_dict.pth",
             "vae_128d_48k.pth",
             "assets/*",
-            "config.yaml",
+            "config.yaml",
         ],
     )
 
@@ -105,12 +111,16 @@ def prepare_once() -> None:
 def auto_load_models() -> str:
     """
     Load HunyuanVideo-Foley + encoders on the chosen device.
+    Ensures safetensors is preferred to avoid ZeroGPU issues with .bin checkpoints.
     """
     global _model_dict, _cfg, _device
 
     if _model_dict is not None and _cfg is not None:
         return "Model already loaded."
 
+    # Ensure Transformers prefers safetensors for everything:
+    os.environ["HF_PREFER_SAFETENSORS"] = "1"
+
     sys.path.append(str(REPO_DIR))
     from hunyuanvideo_foley.utils.model_utils import load_model
 
@@ -122,8 +132,19 @@ def auto_load_models() -> str:
     try:
         _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
         return "✅ Model loaded."
+    except OSError as e:
+        # If any OSError (often from trying to read pytorch_model.bin), retry after enforcing safetensors.
+        logger.error(str(e))
+        logger.info("Retrying load after enforcing safetensors preference...")
+        os.environ["HF_PREFER_SAFETENSORS"] = "1"
+        try:
+            _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
+            return "✅ Model loaded (after safetensors retry)."
+        except Exception as e2:
+            logger.error(str(e2))
+            return f"❌ Failed to load model: {e2}"
     except Exception as e:
-        logger.error(e)
+        logger.error(str(e))
         return f"❌ Failed to load model: {e}"
 
 
@@ -134,7 +155,6 @@ def _merge_audio_video(audio_path: str, video_path: str, out_path: str) -> None:
         from hunyuanvideo_foley.utils.media_utils import merge_audio_video
         merge_audio_video(audio_path, video_path, out_path)
     except Exception as e:
-        # Fallback: plain ffmpeg merge (assumes same duration or lets ffmpeg handle)
         logger.warning(f"merge_audio_video failed, falling back to ffmpeg: {e}")
         import subprocess
         cmd = [
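The hunk is cut off at cmd = [. For orientation, a plain ffmpeg merge of the kind the removed comment described would look roughly like the sketch below; the exact flags app.py passes are outside this diff, so treat -c:a aac and -shortest as assumptions:

cmd = [
    "ffmpeg", "-y",
    "-i", video_path,   # original video
    "-i", audio_path,   # generated Foley track
    "-c:v", "copy",     # keep the video stream untouched
    "-c:a", "aac",      # re-encode audio for MP4 compatibility (assumed)
    "-shortest",        # stop at the shorter of the two streams (assumed)
    out_path,
]
subprocess.run(cmd, check=True)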
@@ -242,89 +262,8 @@ def infer_single_video(
     return outs, f"✅ Generated {len(outs)} result(s). Saved to {OUTPUTS_DIR}/"
 
 
-# ---------------
-# MCP-only APIs
-# ---------------
-def _download_to_tmp(url: str) -> str:
-    """Download a remote file to temp."""
-    try:
-        import requests
-    except Exception:
-        raise RuntimeError("Missing dependency 'requests'. Add it to requirements.txt to use URL inputs.")
-
-    r = requests.get(url, timeout=30)
-    r.raise_for_status()
-    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
-    tmp.write(r.content)
-    tmp.flush()
-    tmp.close()
-    return tmp.name
-
-
-def _maybe_from_base64(data_url_or_b64: str) -> str:
-    """Accept data: URLs or raw base64; returns temp file path."""
-    b64 = data_url_or_b64
-    if data_url_or_b64.startswith("data:"):
-        b64 = data_url_or_b64.split(",", 1)[-1]
-    raw = base64.b64decode(b64)
-    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
-    tmp.write(raw)
-    tmp.flush()
-    tmp.close()
-    return tmp.name
-
-
-def _normalize_video_input(video_url_or_b64: str) -> str:
-    v = (video_url_or_b64 or "").strip()
-    if v.startswith("http://") or v.startswith("https://"):
-        return _download_to_tmp(v)
-    return _maybe_from_base64(v)
-
-
-with gr.Blocks() as mcp_only_endpoints:
-    gr.Markdown("These endpoints are MCP/API only and have no visible UI.", show_label=False)
-
-    @gr.api
-    def api_generate_from_url(
-        video_url_or_b64: str,
-        text_prompt: str = "",
-        guidance_scale: float = 4.5,
-        num_inference_steps: int = 50,
-        sample_nums: int = 1,
-    ) -> Dict[str, List[str]]:
-        """
-        Generate Foley from a remote video URL or base64-encoded video.
-        Returns: {"videos": [paths], "message": str}
-        """
-        if _model_dict is None or _cfg is None:
-            raise RuntimeError("Model not loaded. Open the UI once or call /load_model tool.")
-        local = _normalize_video_input(video_url_or_b64)
-        outs, msg = infer_single_video(local, text_prompt, guidance_scale, num_inference_steps, sample_nums)
-        return {"videos": outs, "message": msg}
-
-    @gr.api
-    def load_model_tool() -> str:
-        """Ensure model is loaded on server (MCP convenience)."""
-        return auto_load_models()
-
-    @gr.mcp.resource("shortifoley://status")
-    def shortifoley_status() -> str:
-        """Return a simple readiness string for MCP clients."""
-        ready = _model_dict is not None and _cfg is not None
-        dev = "cuda" if (_device and _device.type == "cuda") else ("mps" if (_device and _device.type == "mps") else "cpu")
-        return f"ShortiFoley status: {'ready' if ready else 'loading'} | device={dev} | outputs={OUTPUTS_DIR}"
-
-    @gr.mcp.prompt()
-    def foley_prompt(name: str = "default") -> str:
-        """Reusable guidance for describing sound ambience."""
-        return (
-            "Describe the expected environmental sound precisely. Mention material, rhythm, intensity, and ambience.\n"
-            "Example: 'Soft leather footfalls on wet pavement with distant traffic hiss; occasional splashes.'"
-        )
-
-
 # -------------
-# Gradio UI
+# Gradio UI (with MCP+API inside the same app)
 # -------------
 def _about_html() -> str:
     return f"""
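Nothing in the deleted block is lost: the hunk at +383 below re-creates the same helpers and endpoints inside create_ui(), so they register on the Blocks instance that is actually launched.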
@@ -407,13 +346,12 @@ def create_ui() -> gr.Blocks:
                    vis_updates.append(gr.update(visible=True, value=outs[i]))
                else:
                    vis_updates.append(gr.update(visible=False, value=None))
-
-            return (*vis_updates, msg, gr.update(value=gal_items))
+            return (*vis_updates, msg)
 
         generate.click(
             fn=_process_and_update,
             inputs=[video_input, text_input, guidance_scale, steps, samples],
-            outputs=[v1, v2, v3, v4, v5, v6, status
+            outputs=[v1, v2, v3, v4, v5, v6, status],
             api_name="/infer",
             api_description="Generate Foley audio for an uploaded video. Returns up to 6 video+audio files."
         )
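With this fix the return tuple matches the outputs list: six video slots plus status. The gallery is refreshed by the separate generate.click(..., outputs=[gallery]) handler shown in the next hunk.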
@@ -445,9 +383,76 @@ def create_ui() -> gr.Blocks:
         with gr.Tab("ℹ️ About"):
             gr.HTML(_about_html())
 
-        #
+        # Keep gallery in sync after generate
         generate.click(lambda: gr.update(value=_list_gallery()), outputs=[gallery])
 
+        # -----------------------
+        # MCP + REST API endpoints
+        # -----------------------
+        def _download_to_tmp(url: str) -> str:
+            try:
+                import requests
+            except Exception:
+                raise RuntimeError("Missing dependency 'requests'. Add it to requirements.txt to use URL inputs.")
+            r = requests.get(url, timeout=30)
+            r.raise_for_status()
+            tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+            tmp.write(r.content)
+            tmp.flush()
+            tmp.close()
+            return tmp.name
+
+        def _maybe_from_base64(data_url_or_b64: str) -> str:
+            b64 = data_url_or_b64
+            if data_url_or_b64.startswith("data:"):
+                b64 = data_url_or_b64.split(",", 1)[-1]
+            raw = base64.b64decode(b64)
+            tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+            tmp.write(raw)
+            tmp.flush()
+            tmp.close()
+            return tmp.name
+
+        def _normalize_video_input(video_url_or_b64: str) -> str:
+            v = (video_url_or_b64 or "").strip()
+            if v.startswith("http://") or v.startswith("https://"):
+                return _download_to_tmp(v)
+            return _maybe_from_base64(v)
+
+        @gr.api
+        def api_generate_from_url(
+            video_url_or_b64: str,
+            text_prompt: str = "",
+            guidance_scale: float = 4.5,
+            num_inference_steps: int = 50,
+            sample_nums: int = 1,
+        ) -> Dict[str, List[str]]:
+            if _model_dict is None or _cfg is None:
+                raise RuntimeError("Model not loaded. Open the UI once or call /load_model tool.")
+            local = _normalize_video_input(video_url_or_b64)
+            outs, msg = infer_single_video(local, text_prompt, guidance_scale, num_inference_steps, sample_nums)
+            return {"videos": outs, "message": msg}
+
+        @gr.api
+        def load_model_tool() -> str:
+            """Ensure model is loaded on server (MCP convenience)."""
+            return auto_load_models()
+
+        @gr.mcp.resource("shortifoley://status")
+        def shortifoley_status() -> str:
+            """Return a simple readiness string for MCP clients."""
+            ready = _model_dict is not None and _cfg is not None
+            dev = "cuda" if (_device and _device.type == "cuda") else ("mps" if (_device and _device.type == "mps") else "cpu")
+            return f"ShortiFoley status: {'ready' if ready else 'loading'} | device={dev} | outputs={OUTPUTS_DIR}"
+
+        @gr.mcp.prompt()
+        def foley_prompt(name: str = "default") -> str:
+            """Reusable guidance for describing sound ambience."""
+            return (
+                "Describe the expected environmental sound precisely. Mention material, rhythm, intensity, and ambience.\n"
+                "Example: 'Soft leather footfalls on wet pavement with distant traffic hiss; occasional splashes.'"
+            )
+
     return demo
 
 
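Because /infer is published with api_name, the endpoint can be exercised from Python with gradio_client. A sketch; the Space id and file name are hypothetical, and the five positional values mirror the inputs list above:

from gradio_client import Client, handle_file

client = Client("your-username/ShortiFoley")  # hypothetical Space id
result = client.predict(
    handle_file("clip.mp4"),  # video_input
    "rain on a tin roof",     # text_prompt
    4.5,                      # guidance_scale
    50,                       # num_inference_steps (steps)
    1,                        # sample_nums (samples)
    api_name="/infer",
)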
@@ -485,8 +490,6 @@ if __name__ == "__main__":
     logger.info(msg)
 
     ui = create_ui()
-    # Mount MCP-only endpoints alongside the UI
-    ui.blocks.append(mcp_only_endpoints)
 
     # Enable MCP server so tools/resources/prompts are discoverable
     ui.launch(
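The final hunk ends as ui.launch( opens. In recent Gradio releases the comment's intent maps to the mcp_server launch flag; how the call actually continues is outside this diff, so the sketch below is an assumption:

ui.launch(
    mcp_server=True,  # expose @gr.api functions and @gr.mcp resources/prompts
    show_error=True,
)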