	update requirements.
Files changed:
- .gitattributes +15 -0
- LICENSE +233 -0
- README.md +4 -4
- app.py +1066 -0
- model/__init__.py +0 -0
- model/dera.py +195 -0
- model/dit.py +1090 -0
- model/image_encoder.py +903 -0
- model/prompter.py +107 -0
- model/text_encoder.py +269 -0
- model/vae.py +809 -0
- pipeline/__init__.py +0 -0
- pipeline/i2v_pipeline.py +511 -0
- requirements.txt +148 -0
- samples/1_image1.png +3 -0
- samples/1_out.mp4 +3 -0
- samples/1_prompt.txt +1 -0
- samples/1_sketch1.jpg +3 -0
- samples/1_sketch2.jpg +3 -0
- samples/1_sketch3.jpg +3 -0
- samples/2_image1.jpg +3 -0
- samples/2_out.mp4 +3 -0
- samples/2_prompt.txt +1 -0
- samples/2_sketch1.jpg +3 -0
- samples/2_sketch2.jpg +3 -0
- samples/3_image1.png +3 -0
- samples/3_out.mp4 +3 -0
- samples/3_prompt.txt +1 -0
- samples/3_sketch1.jpg +3 -0
- samples/ToonComposer-Icon.png +3 -0
- samples/ToonComposer-Method.jpg +3 -0
- samples/ToonComposer-TLDR.jpg +3 -0
- scheduler/__init__.py +0 -0
- scheduler/flow_match.py +78 -0
- tooncomposer.py +234 -0
- util/model_util.py +241 -0
- util/optical_flow.py +140 -0
- util/stylesheets.py +0 -0
- util/training_util.py +317 -0
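
The samples/ entries above follow a simple numbering convention: each sample N ships a reference image (`N_image1.*`), one or more keyframe sketches (`N_sketchK.jpg`), a text prompt (`N_prompt.txt`), and a rendered output video (`N_out.mp4`). Below is a minimal sketch (not part of this commit; `collect_sample` is a hypothetical helper) of how those files could be grouped per sample, assuming only the file names listed above.

```python
# Hypothetical helper, not from the commit: groups the files of one numbered
# sample under the naming convention visible in the samples/ listing above.
import glob
import os


def collect_sample(sample_dir: str, index: int) -> dict:
    """Return the paths belonging to sample `index` in `sample_dir`."""
    return {
        "image": glob.glob(os.path.join(sample_dir, f"{index}_image1.*")),
        "sketches": sorted(glob.glob(os.path.join(sample_dir, f"{index}_sketch*.jpg"))),
        "prompt": os.path.join(sample_dir, f"{index}_prompt.txt"),
        "output": os.path.join(sample_dir, f"{index}_out.mp4"),
    }


if __name__ == "__main__":
    print(collect_sample("samples", 1))
```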
    	
.gitattributes CHANGED

@@ -33,3 +33,18 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+samples/1_out.mp4 filter=lfs diff=lfs merge=lfs -text
+samples/2_out.mp4 filter=lfs diff=lfs merge=lfs -text
+samples/3_out.mp4 filter=lfs diff=lfs merge=lfs -text
+samples/1_image1.png filter=lfs diff=lfs merge=lfs -text
+samples/3_image1.png filter=lfs diff=lfs merge=lfs -text
+samples/ToonComposer-Icon.png filter=lfs diff=lfs merge=lfs -text
+samples/1_sketch2.jpg filter=lfs diff=lfs merge=lfs -text
+samples/1_sketch3.jpg filter=lfs diff=lfs merge=lfs -text
+samples/2_image1.jpg filter=lfs diff=lfs merge=lfs -text
+samples/1_sketch1.jpg filter=lfs diff=lfs merge=lfs -text
+samples/2_sketch1.jpg filter=lfs diff=lfs merge=lfs -text
+samples/2_sketch2.jpg filter=lfs diff=lfs merge=lfs -text
+samples/3_sketch1.jpg filter=lfs diff=lfs merge=lfs -text
+samples/ToonComposer-Method.jpg filter=lfs diff=lfs merge=lfs -text
+samples/ToonComposer-TLDR.jpg filter=lfs diff=lfs merge=lfs -text
    	
LICENSE ADDED

@@ -0,0 +1,233 @@
Tencent is pleased to support the open source community by making ToonComposer available.

Copyright (C) 2025 Tencent. All rights reserved.

ToonComposer is licensed under the MIT License except for the third-party components listed below, which is licensed under different terms. ToonComposer does not impose any additional limitations beyond what is outlined in the respective licenses of these third-party components. Users must comply with all terms and conditions of original licenses of these third-party components and must ensure that the usage of the third party components adheres to all relevant laws and regulations.

For avoidance of doubts, ToonComposer refers to the inference code, parameters and weights made publicly available by Tencent in accordance with the MIT License in this repository.

Terms of the MIT License:
--------------------------------------------------------------------
Copyright (C) 2025 Tencent. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the " Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice (including the next paragraph) shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


The ToonComposer model was developed by Tencent based on the following Open Models.
The ToonComposer inference code was developed by Tencent based on the code of the following Open Models. The below software in this distribution may have been modified by Tencent ("Tencent Modifications"). All Tencent Modifications are Copyright (C) Tencent.

Open Models Licensed under the Apache-2.0 License:

--------------------------------------------------------------------
1. Wan2.1
Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
The code of this model was modified by Tencent.

--------------------------------------------------------------------
Terms of the Apache-2.0 License:
--------------------------------------------------------------------
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
    	
README.md CHANGED

@@ -1,12 +1,12 @@
 ---
 title: ToonComposer
-emoji:
+emoji: 🎨
 colorFrom: gray
-colorTo:
+colorTo: yellow
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.25.2
 app_file: app.py
 pinned: false
 ---

-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
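
This front matter is the Hugging Face Spaces configuration: it selects the Gradio SDK, pins `sdk_version: 5.25.2`, and points the Space at `app_file: app.py`. For orientation only, here is a minimal sketch of what an `app_file` entry point looks like under that convention; the `greet`/`demo` names are hypothetical and have nothing to do with the actual ToonComposer app added below.

```python
# Illustrative only (not from this commit): the smallest app.py a Gradio
# Space could run. The real entry point added in this commit builds the
# full ToonComposer interface instead.
import gradio as gr


def greet(name: str) -> str:
    return f"Hello, {name}!"


demo = gr.Interface(fn=greet, inputs="text", outputs="text")

if __name__ == "__main__":
    # Spaces execute the file named in `app_file` and serve the launched app.
    demo.launch()
```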
    	
app.py ADDED

@@ -0,0 +1,1066 @@
import torch
import numpy as np
from PIL import Image
from tooncomposer import ToonComposer, get_base_model_paths
import argparse
import json
from util.training_util import extract_img_to_sketch
import os
import tempfile
import cv2
import gradio as gr
from einops import rearrange
from datetime import datetime
from typing import Optional, List, Dict
from huggingface_hub import snapshot_download

os.environ["GRADIO_TEMP_DIR"] = os.path.abspath(os.path.join(os.path.dirname(__file__), "gradio_cache"))

# -----------------------------------------------------------------------------
# Weights resolution and download helpers
# -----------------------------------------------------------------------------

WAN_REPO_ID = "Wan-AI/Wan2.1-I2V-14B-480P"
TOONCOMPOSER_REPO_ID = "TencentARC/ToonComposer"

def _path_is_dir_with_files(dir_path: str, required_files: List[str]) -> bool:
    if not dir_path or not os.path.isdir(dir_path):
        return False
    for f in required_files:
        if not os.path.exists(os.path.join(dir_path, f)):
            return False
    return True

def resolve_wan_model_root(preferred_dir: Optional[str] = None, hf_token: Optional[str] = None) -> str:
    """Return a directory containing Wan2.1-I2V-14B-480P weights.

    Resolution order:
    1) preferred_dir arg (if valid)
    2) WAN21_I2V_DIR env var (if valid)
    3) HF local cache (no download) via snapshot_download(local_files_only=True)
    4) HF download to cache via snapshot_download()
    """
    # Required filenames relative to the model root
    expected = get_base_model_paths("Wan2.1-I2V-14B-480P", format='dict', model_root=".")
    required_files = []
    required_files.extend([os.path.basename(p) for p in expected["dit"]])
    required_files.append(os.path.basename(expected["image_encoder"]))
    required_files.append(os.path.basename(expected["text_encoder"]))
    required_files.append(os.path.basename(expected["vae"]))

    # 1) preferred_dir arg
    if _path_is_dir_with_files(preferred_dir or "", required_files):
        return os.path.abspath(preferred_dir)

    # 2) environment variable
    env_dir = os.environ.get("WAN21_I2V_DIR")
    if _path_is_dir_with_files(env_dir or "", required_files):
        return os.path.abspath(env_dir)

    # 3) try local cache without network
    try:
        cached_dir = snapshot_download(repo_id=WAN_REPO_ID, local_files_only=True)
        return cached_dir
    except Exception:
        pass

    # 4) download (may be large)
    cached_dir = snapshot_download(repo_id=WAN_REPO_ID, token=hf_token)
    return cached_dir

def resolve_tooncomposer_repo_dir(preferred_dir: Optional[str] = None, hf_token: Optional[str] = None) -> str:
    """Return a directory containing ToonComposer repo with 480p/608p subdirs."""
    # Quick validity check: ensure either a subdir 480p or 608p exists with required files
    def has_resolution_dirs(base_dir: str) -> bool:
        if not base_dir or not os.path.isdir(base_dir):
            return False
        ok = False
        for res in ["480p", "608p"]:
            d = os.path.join(base_dir, res)
            if os.path.isdir(d):
                ckpt = os.path.join(d, "tooncomposer.ckpt")
                cfg = os.path.join(d, "config.json")
                if os.path.exists(ckpt) and os.path.exists(cfg):
                    ok = True
        return ok

    # 1) preferred_dir arg
    if has_resolution_dirs(preferred_dir or ""):
        return os.path.abspath(preferred_dir)

    # 2) environment variable
    env_dir = os.environ.get("TOONCOMPOSER_DIR")
    if has_resolution_dirs(env_dir or ""):
        return os.path.abspath(env_dir)

    # 3) try local cache first
    try:
        cached_dir = snapshot_download(repo_id=TOONCOMPOSER_REPO_ID, local_files_only=True)
        return cached_dir
    except Exception:
        pass

    # 4) download repo to cache
    cached_dir = snapshot_download(repo_id=TOONCOMPOSER_REPO_ID, token=hf_token)
    return cached_dir

def build_checkpoints_by_resolution(tooncomposer_base_dir: str) -> Dict[str, Dict[str, object]]:
    """Construct resolution mapping from a base repo dir that contains 480p/608p.

    The ToonComposer HF repo stores, inside each resolution dir:
      - tooncomposer.ckpt
      - config.json (model configuration)
    """
    mapping = {}
    # Known target sizes
    res_to_hw = {
        "480p": (480, 832),
        "608p": (608, 1088),
    }
    for res, (h, w) in res_to_hw.items():
        res_dir = os.path.join(tooncomposer_base_dir, res)
        mapping[res] = {
            "target_height": h,
            "target_width": w,
            "snapshot_args_path": os.path.join(res_dir, "config.json"),
            "checkpoint_path": os.path.join(res_dir, "tooncomposer.ckpt"),
        }
    return mapping

# Will be populated in main() after resolving ToonComposer repo directory
checkpoints_by_resolution = {}

def tensor2video(frames):
    frames = rearrange(frames, "C T H W -> T H W C")
    frames = ((frames.float() + 1) * 127.5).clip(0, 255).cpu().numpy().astype(np.uint8)
    frames = [Image.fromarray(frame) for frame in frames]
    return frames

def _load_model_config(config_path: str) -> Dict[str, object]:
    with open(config_path, "r") as f:
        data = json.load(f)
    return data

def _merge_with_defaults(cfg: Dict[str, object]) -> Dict[str, object]:
    # Provide safe defaults for optional fields used at inference-time
    defaults = {
        "base_model_name": "Wan2.1-I2V-14B-480P",
        "learning_rate": 1e-5,
        "train_architecture": "lora",
        "lora_rank": 4,
        "lora_alpha": 4,
        "lora_target_modules": "q,k,v,o,ffn.0,ffn.2",
        "init_lora_weights": "kaiming",
        "use_gradient_checkpointing": True,
        "tiled": False,
        "tile_size_height": 34,
        "tile_size_width": 34,
        "tile_stride_height": 18,
        "tile_stride_width": 16,
        "output_path": "./",
        "use_local_lora": False,
        "use_dera": False,
        "dera_rank": None,
        "use_dera_spatial": True,
        "use_dera_temporal": True,
        "use_sequence_cond": True,
        "sequence_cond_mode": "sparse",
        "use_channel_cond": False,
        "use_sequence_cond_position_aware_residual": True,
        "use_sequence_cond_loss": False,
        "fast_dev": False,
        "max_num_cond_images": 1,
        "max_num_cond_sketches": 2,
        "visualize_attention": False,
        "random_spaced_cond_frames": False,
        "use_sketch_mask": True,
        "sketch_mask_ratio": 0.2,
        "no_first_sketch": False,
    }
    merged = defaults.copy()
    merged.update(cfg)
    return merged

def initialize_model(resolution="480p", fast_dev=False, device="cuda:0", dtype=torch.bfloat16,
                     wan_model_dir: Optional[str] = None, tooncomposer_dir: Optional[str] = None,
                     hf_token: Optional[str] = None):
    # Initialize model components
    if resolution not in checkpoints_by_resolution:
        raise ValueError(f"Resolution '{resolution}' is not available. Found: {list(checkpoints_by_resolution.keys())}")

    # 1) resolve config and checkpoint from ToonComposer repo (local or HF)
    snapshot_args_path = checkpoints_by_resolution[resolution]["snapshot_args_path"]
    checkpoint_path = checkpoints_by_resolution[resolution]["checkpoint_path"]

    # 2) load model config
    snapshot_args_raw = _load_model_config(snapshot_args_path)
    snapshot_args = _merge_with_defaults(snapshot_args_raw)
    snapshot_args["checkpoint_path"] = checkpoint_path

    # 3) resolve Wan2.1 model root
    snapshot_args["model_root"] = resolve_wan_model_root(preferred_dir=wan_model_dir, hf_token=hf_token)

    # Backward-compat fields
    if "training_max_frame_stride" not in snapshot_args:
        snapshot_args["training_max_frame_stride"] = 4
    snapshot_args["random_spaced_cond_frames"] = False
    args = argparse.Namespace(**snapshot_args)
    if not fast_dev:
        model = ToonComposer(
            base_model_name=args.base_model_name,
            model_root=args.model_root,
            learning_rate=args.learning_rate,
            train_architecture=args.train_architecture,
            lora_rank=args.lora_rank,
            lora_alpha=args.lora_alpha,
            lora_target_modules=args.lora_target_modules,
            init_lora_weights=args.init_lora_weights,
            use_gradient_checkpointing=args.use_gradient_checkpointing,
            checkpoint_path=args.checkpoint_path,
            tiled=args.tiled,
            tile_size=(args.tile_size_height, args.tile_size_width),
            tile_stride=(args.tile_stride_height, args.tile_stride_width),
            output_path=args.output_path,
            use_local_lora=args.use_local_lora,
            use_dera=args.use_dera,
            dera_rank=args.dera_rank,
            use_dera_spatial=args.use_dera_spatial,
            use_dera_temporal=args.use_dera_temporal,
            use_sequence_cond=args.use_sequence_cond,
            sequence_cond_mode=args.sequence_cond_mode,
            use_channel_cond=args.use_channel_cond,
            use_sequence_cond_position_aware_residual=args.use_sequence_cond_position_aware_residual,
            use_sequence_cond_loss=args.use_sequence_cond_loss,
            fast_dev=args.fast_dev,
            max_num_cond_images=args.max_num_cond_images,
            max_num_cond_sketches=args.max_num_cond_sketches,
            visualize_attention=args.visualize_attention,
            random_spaced_cond_frames=args.random_spaced_cond_frames,
            use_sketch_mask=args.use_sketch_mask,
            sketch_mask_ratio=args.sketch_mask_ratio,
            no_first_sketch=args.no_first_sketch,
        )
        model = model.to(device, dtype=dtype).eval()
    else:
        print("Fast dev mode. Models will not be loaded.")
        model = None
    print("Models initialized.")
    return model, device, dtype

# -----------------------------------------------------------------------------
# CLI args and global initialization
# -----------------------------------------------------------------------------
         | 
| 253 | 
            +
             | 
| 254 | 
            +
            def _parse_args():
         | 
| 255 | 
            +
                parser = argparse.ArgumentParser()
         | 
| 256 | 
            +
                parser.add_argument("--resolution", type=str, default=os.environ.get("TOONCOMPOSER_RESOLUTION", "480p"), choices=["480p", "608p"], help="Target resolution to load by default.")
         | 
| 257 | 
            +
                parser.add_argument("--device", type=str, default=os.environ.get("DEVICE", "cuda"))
         | 
| 258 | 
            +
                parser.add_argument("--dtype", type=str, default=os.environ.get("DTYPE", "bfloat16"), choices=["bfloat16", "float32"])
         | 
| 259 | 
            +
                parser.add_argument("--wan_model_dir", type=str, default=os.environ.get("WAN21_I2V_DIR"), help="Local directory containing Wan2.1 model files. If not provided, will try HF cache and download if needed.")
         | 
| 260 | 
            +
                parser.add_argument("--tooncomposer_dir", type=str, default=os.environ.get("TOONCOMPOSER_DIR"), help="Local directory containing ToonComposer weights with 480p/608p subdirectories. If not provided, will try HF cache and download if needed.")
         | 
| 261 | 
            +
                parser.add_argument("--hf_token", type=str, default=os.environ.get("HF_TOKEN"), help="Hugging Face token (if needed for gated models).")
         | 
| 262 | 
            +
                parser.add_argument("--fast_dev", action="store_true", help="Run in fast dev mode without loading heavy models.")
         | 
| 263 | 
            +
                return parser.parse_args()
         | 
| 264 | 
            +
             | 
| 265 | 
            +
            _cli_args = _parse_args()
         | 
| 266 | 
            +
             | 
| 267 | 
            +
            # Resolve ToonComposer repo dir and build resolution mapping
         | 
| 268 | 
            +
            _toon_dir = resolve_tooncomposer_repo_dir(preferred_dir=_cli_args.tooncomposer_dir, hf_token=_cli_args.hf_token)
         | 
| 269 | 
            +
            checkpoints_by_resolution = build_checkpoints_by_resolution(_toon_dir)
         | 
| 270 | 
            +
             | 
| 271 | 
            +
            _dtype_map = {
         | 
| 272 | 
            +
                "bfloat16": torch.bfloat16,
         | 
| 273 | 
            +
                "float32": torch.float32,
         | 
| 274 | 
            +
            }
         | 
| 275 | 
            +
            fast_dev = bool(_cli_args.fast_dev)
         | 
| 276 | 
            +
            model, device, dtype = initialize_model(
         | 
| 277 | 
            +
                resolution=_cli_args.resolution,
         | 
| 278 | 
            +
                fast_dev=fast_dev,
         | 
| 279 | 
            +
                device=_cli_args.device,
         | 
| 280 | 
            +
                dtype=_dtype_map[_cli_args.dtype],
         | 
| 281 | 
            +
                wan_model_dir=_cli_args.wan_model_dir,
         | 
| 282 | 
            +
                tooncomposer_dir=_cli_args.tooncomposer_dir,
         | 
| 283 | 
            +
                hf_token=_cli_args.hf_token,
         | 
| 284 | 
            +
            )
         | 
| 285 | 
            +
             | 
| 286 | 
            +
            def process_conditions(num_items, item_inputs, num_frames, is_sketch=False, target_height=480, target_width=832):
         | 
| 287 | 
            +
                """Process condition images/sketches into masked video tensor and mask"""
         | 
| 288 | 
            +
                # Create empty (zero-filled) video tensor and per-frame condition mask
         | 
| 289 | 
            +
                video = torch.zeros((1, 3, num_frames, target_height, target_width), device=device)
         | 
| 290 | 
            +
                mask = torch.zeros((1, num_frames), device=device)
         | 
| 291 | 
            +
                
         | 
| 292 | 
            +
                for i in range(num_items):
         | 
| 293 | 
            +
                    img, frame_idx = item_inputs[i]
         | 
| 294 | 
            +
                    if img is None or frame_idx is None:
         | 
| 295 | 
            +
                        continue
         | 
| 296 | 
            +
                        
         | 
| 297 | 
            +
                    # Convert PIL image to tensor
         | 
| 298 | 
            +
                    img_tensor = torch.from_numpy(np.array(img)).permute(2,0,1).float() / 127.5 - 1.0
         | 
| 299 | 
            +
                    if is_sketch:
         | 
| 300 | 
            +
                        img_tensor = -img_tensor
         | 
| 301 | 
            +
                    img_tensor = img_tensor.unsqueeze(0).to(device)
         | 
| 302 | 
            +
                    
         | 
| 303 | 
            +
                    # Resize to model's expected resolution while preserving aspect ratio
         | 
| 304 | 
            +
                    # Get original dimensions
         | 
| 305 | 
            +
                    _, _, h, w = img_tensor.shape
         | 
| 306 | 
            +
                    
         | 
| 307 | 
            +
                    # Resize based on short edge while maintaining aspect ratio
         | 
| 308 | 
            +
                    if h/w < target_height/target_width:  # Height is the short edge relative to the target aspect ratio
         | 
| 309 | 
            +
                        new_h = target_height
         | 
| 310 | 
            +
                        new_w = int(w * (new_h / h))
         | 
| 311 | 
            +
                    else:  # Width is the short edge
         | 
| 312 | 
            +
                        new_w = target_width
         | 
| 313 | 
            +
                        new_h = int(h * (new_w / w))
         | 
| 314 | 
            +
                        
         | 
| 315 | 
            +
                    # Resize with the calculated dimensions
         | 
| 316 | 
            +
                    img_tensor = torch.nn.functional.interpolate(img_tensor, size=(new_h, new_w), mode="bilinear")
         | 
| 317 | 
            +
                    
         | 
| 318 | 
            +
                    # Center crop to target resolution if needed
         | 
| 319 | 
            +
                    if new_h > target_height or new_w > target_width:
         | 
| 320 | 
            +
                        # Calculate starting positions for crop
         | 
| 321 | 
            +
                        start_h = max(0, (new_h - target_height) // 2)
         | 
| 322 | 
            +
                        start_w = max(0, (new_w - target_width) // 2)
         | 
| 323 | 
            +
                        # Crop
         | 
| 324 | 
            +
                        img_tensor = img_tensor[:, :, start_h:start_h+target_height, start_w:start_w+target_width]
         | 
| 325 | 
            +
                    
         | 
| 326 | 
            +
                    # Place in video tensor
         | 
| 327 | 
            +
                    frame_idx = min(max(int(frame_idx), 0), num_frames-1)
         | 
| 328 | 
            +
                    if is_sketch:
         | 
| 329 | 
            +
                        video[:, :, frame_idx] = img_tensor[:, :3]  # Handle RGBA sketches
         | 
| 330 | 
            +
                    else:
         | 
| 331 | 
            +
                        video[:, :, frame_idx] = img_tensor
         | 
| 332 | 
            +
                    mask[:, frame_idx] = 1.0
         | 
| 333 | 
            +
                return video, mask
         | 
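A hedged usage sketch for process_conditions (editorial note, not part of app.py): it takes (image, frame_index) pairs, resizes and center-crops each image to the target resolution, and returns a [1, 3, T, H, W] video tensor plus a [1, T] mask marking which frames carry conditions. It allocates on the module-level `device`, so a CUDA device is assumed here.

    from PIL import Image

    first_frame = Image.open("samples/1_image1.png").convert("RGB")   # sample asset shipped in this repo
    video, mask = process_conditions(
        num_items=1, item_inputs=[(first_frame, 0)], num_frames=61,
        target_height=480, target_width=832,
    )
    # video: [1, 3, 61, 480, 832] with frame 0 filled; mask: [1, 61] with mask[0, 0] == 1.0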
| 334 | 
            +
             | 
| 335 | 
            +
            def process_sketch_masks(num_sketch_masks, sketch_mask_inputs, num_frames, target_height=480, target_width=832):
         | 
| 336 | 
            +
                """Process sketch masks into a single tensor"""
         | 
| 337 | 
            +
                # Create empty tensor filled with 1s (1 means no mask, keep original)
         | 
| 338 | 
            +
                sketch_local_mask = torch.ones((1, 1, num_frames, target_height, target_width), device=device)
         | 
| 339 | 
            +
                
         | 
| 340 | 
            +
                for i in range(num_sketch_masks):
         | 
| 341 | 
            +
                    editor_value, frame_idx = sketch_mask_inputs[i]
         | 
| 342 | 
            +
                    if editor_value is None or frame_idx is None:
         | 
| 343 | 
            +
                        continue
         | 
| 344 | 
            +
                        
         | 
| 345 | 
            +
                    # For ImageMask, we need to extract the mask from the editor_value dictionary
         | 
| 346 | 
            +
                    # editor_value is a dict with 'background', 'layers', and 'composite' keys from ImageEditor
         | 
| 347 | 
            +
                    if isinstance(editor_value, dict):
         | 
| 348 | 
            +
                        if "composite" in editor_value and editor_value["composite"] is not None:
         | 
| 349 | 
            +
                            # The 'composite' is the image with mask drawn on it
         | 
| 350 | 
            +
                            # Since we're using ImageMask with fixed black brush, the black areas are the mask
         | 
| 351 | 
            +
                            # Use the first drawn layer as a binary mask (0=masked, 1=not masked)
         | 
| 352 | 
            +
                            # sketch = editor_value["background"]  # This is the sketch
         | 
| 353 | 
            +
                            mask = editor_value["layers"][0] if editor_value["layers"] else None  # This is the mask layer
         | 
| 354 | 
            +
                            if mask is not None:
         | 
| 355 | 
            +
                                # Convert mask to tensor and normalize
         | 
| 356 | 
            +
                                mask_array = np.array(mask)
         | 
| 357 | 
            +
                                mask_array = np.max(mask_array, axis=2)
         | 
| 358 | 
            +
                                
         | 
| 359 | 
            +
                                # Convert to tensor, normalize to [0, 1]
         | 
| 360 | 
            +
                                mask_tensor = torch.from_numpy(mask_array).float()
         | 
| 361 | 
            +
                                if mask_tensor.max() > 1.0:
         | 
| 362 | 
            +
                                    mask_tensor = mask_tensor / 255.0
         | 
| 363 | 
            +
                                
         | 
| 364 | 
            +
                                # Resize to model's expected resolution
         | 
| 365 | 
            +
                                mask_tensor = mask_tensor.unsqueeze(0).unsqueeze(0)  # [1, 1, h, w]
         | 
| 366 | 
            +
                                mask_tensor = torch.nn.functional.interpolate(mask_tensor, size=(target_height, target_width), mode="nearest")
         | 
| 367 | 
            +
                                
         | 
| 368 | 
            +
                                # Invert the mask: black (0) = masked area, white (1) = keep original
         | 
| 369 | 
            +
                                # We need to invert because in the UI black means "masked"
         | 
| 370 | 
            +
                                mask_tensor = 1.0 - mask_tensor
         | 
| 371 | 
            +
                                
         | 
| 372 | 
            +
                                # Place in sketch_local_mask tensor
         | 
| 373 | 
            +
                                frame_idx = min(max(int(frame_idx), 0), num_frames-1)
         | 
| 374 | 
            +
                                sketch_local_mask[:, :, frame_idx] = mask_tensor
         | 
| 375 | 
            +
                                
         | 
| 376 | 
            +
                sketch_mask_vis = torch.ones((1, 3, num_frames, target_height, target_width), device=device)
         | 
| 377 | 
            +
                for t in range(sketch_local_mask.shape[2]):
         | 
| 378 | 
            +
                    for c in range(3):
         | 
| 379 | 
            +
                        sketch_mask_vis[0, c, t, :, :] = torch.where(
         | 
| 380 | 
            +
                            sketch_local_mask[0, 0, t] > 0.5,
         | 
| 381 | 
            +
                            1.0,  # White for unmasked areas
         | 
| 382 | 
            +
                            -1.0  # Black for masked areas
         | 
| 383 | 
            +
                        )
         | 
| 384 | 
            +
                return sketch_local_mask
         | 
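For reference, a sketch of the editor_value dict this function expects from Gradio's ImageMask/ImageEditor component (editorial note; shapes and the brushed region are illustrative, and the module-level `device` is assumed):

    import numpy as np

    h, w = 480, 832
    layer = np.zeros((h, w, 4), dtype=np.uint8)        # RGBA layer; non-zero (brushed) pixels mark the mask
    layer[100:200, 300:500] = 255                      # hypothetical brushed region
    editor_value = {
        "background": np.full((h, w, 3), 255, np.uint8),   # the uploaded sketch
        "layers": [layer],
        "composite": np.full((h, w, 3), 255, np.uint8),
    }
    sketch_local_mask = process_sketch_masks(1, [(editor_value, 30)], num_frames=61)
    # Frame 30 is 0.0 inside the brushed region (masked) and 1.0 elsewhere; all other frames stay 1.0.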
| 385 | 
            +
             | 
| 386 | 
            +
             | 
| 387 | 
            +
            def invert_sketch(image):
         | 
| 388 | 
            +
                """Invert the colors of an image (black to white, white to black)"""
         | 
| 389 | 
            +
                if image is None:
         | 
| 390 | 
            +
                    return None
         | 
| 391 | 
            +
                
         | 
| 392 | 
            +
                # Handle input from ImageMask component (EditorValue dictionary)
         | 
| 393 | 
            +
                if isinstance(image, dict) and "background" in image:
         | 
| 394 | 
            +
                    # Extract the background image
         | 
| 395 | 
            +
                    bg_image = image["background"]
         | 
| 396 | 
            +
                    
         | 
| 397 | 
            +
                    # Invert the background
         | 
| 398 | 
            +
                    inverted_bg = invert_sketch_internal(bg_image)
         | 
| 399 | 
            +
                    
         | 
| 400 | 
            +
                    # Return updated editor value
         | 
| 401 | 
            +
                    return gr.update(value=inverted_bg)
         | 
| 402 | 
            +
                
         | 
| 403 | 
            +
                # Original function for regular images
         | 
| 404 | 
            +
                return invert_sketch_internal(image)
         | 
| 405 | 
            +
             | 
| 406 | 
            +
            def invert_sketch_internal(image):
         | 
| 407 | 
            +
                """Internal function to invert an image"""
         | 
| 408 | 
            +
                if image is None:
         | 
| 409 | 
            +
                    return None
         | 
| 410 | 
            +
                
         | 
| 411 | 
            +
                # Convert to PIL image if needed
         | 
| 412 | 
            +
                if isinstance(image, str):  # If it's a filepath
         | 
| 413 | 
            +
                    image = Image.open(image)
         | 
| 414 | 
            +
                elif isinstance(image, np.ndarray):
         | 
| 415 | 
            +
                    image = Image.fromarray(image)
         | 
| 416 | 
            +
                
         | 
| 417 | 
            +
                # Ensure it's a PIL image now
         | 
| 418 | 
            +
                if not isinstance(image, Image.Image):
         | 
| 419 | 
            +
                    try:
         | 
| 420 | 
            +
                        image = Image.fromarray(np.array(image))
         | 
| 421 | 
            +
                    except Exception:
         | 
| 422 | 
            +
                        print(f"Warning: Could not convert image of type {type(image)} to PIL Image")
         | 
| 423 | 
            +
                        return image
         | 
| 424 | 
            +
                
         | 
| 425 | 
            +
                # Invert the image
         | 
| 426 | 
            +
                inverted = Image.fromarray(255 - np.array(image))
         | 
| 427 | 
            +
                return inverted
         | 
| 428 | 
            +
             | 
| 429 | 
            +
            def create_blank_mask(canvas_width=832, canvas_height=480):
         | 
| 430 | 
            +
                """Create a blank white mask image"""
         | 
| 431 | 
            +
                return Image.new('RGB', (canvas_width, canvas_height), color='white')
         | 
| 432 | 
            +
             | 
| 433 | 
            +
            def create_mask_with_sketch(sketch, canvas_width=832, canvas_height=480):
         | 
| 434 | 
            +
                """Create a mask image with sketch as background"""
         | 
| 435 | 
            +
                if sketch is None:
         | 
| 436 | 
            +
                    return create_blank_mask(canvas_width, canvas_height)
         | 
| 437 | 
            +
                    
         | 
| 438 | 
            +
                # Convert sketch to PIL if needed
         | 
| 439 | 
            +
                if not isinstance(sketch, Image.Image):
         | 
| 440 | 
            +
                    sketch = Image.fromarray(np.array(sketch))
         | 
| 441 | 
            +
                
         | 
| 442 | 
            +
                # Resize sketch to fit the canvas
         | 
| 443 | 
            +
                sketch = sketch.resize((canvas_width, canvas_height))
         | 
| 444 | 
            +
                
         | 
| 445 | 
            +
                # Create a semi-transparent white layer over the sketch
         | 
| 446 | 
            +
                overlay = Image.new('RGBA', (canvas_width, canvas_height), (255, 255, 255, 128))
         | 
| 447 | 
            +
                
         | 
| 448 | 
            +
                # Ensure sketch has alpha channel
         | 
| 449 | 
            +
                if sketch.mode != 'RGBA':
         | 
| 450 | 
            +
                    sketch = sketch.convert('RGBA')
         | 
| 451 | 
            +
                
         | 
| 452 | 
            +
                # Overlay the semi-transparent white layer on the sketch
         | 
| 453 | 
            +
                result = Image.alpha_composite(sketch, overlay)
         | 
| 454 | 
            +
                
         | 
| 455 | 
            +
                # Convert back to RGB for Gradio
         | 
| 456 | 
            +
                return result.convert('RGB')
         | 
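A small usage sketch (editorial note; the file is one of this repo's sample assets, and using the result as a dimmed drawing background is an assumption based on the overlay logic above):

    from PIL import Image

    canvas_bg = create_mask_with_sketch(Image.open("samples/1_sketch1.jpg"))
    # canvas_bg.size == (832, 480): the sketch resized to the canvas under a 50%-opaque white overlay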
| 457 | 
            +
             | 
| 458 | 
            +
            def validate_inputs(num_frames, num_cond_images, num_cond_sketches, text_prompt, *args):
         | 
| 459 | 
            +
                """Validate user inputs and return error messages if any"""
         | 
| 460 | 
            +
                errors = []
         | 
| 461 | 
            +
                
         | 
| 462 | 
            +
                # Check text prompt
         | 
| 463 | 
            +
                if not text_prompt or text_prompt.strip() == "":
         | 
| 464 | 
            +
                    errors.append("❌ Text prompt is required. Please enter a description for your video.")
         | 
| 465 | 
            +
                
         | 
| 466 | 
            +
                # Check condition images
         | 
| 467 | 
            +
                cond_images_count = 0
         | 
| 468 | 
            +
                for i in range(int(num_cond_images)):
         | 
| 469 | 
            +
                    img = args[i*2]
         | 
| 470 | 
            +
                    frame_idx = args[i*2+1]
         | 
| 471 | 
            +
                    
         | 
| 472 | 
            +
                    if img is None:
         | 
| 473 | 
            +
                        errors.append(f"❌ Image #{i+1} is missing. Please upload an image or reduce the number of keyframe images.")
         | 
| 474 | 
            +
                    else:
         | 
| 475 | 
            +
                        cond_images_count += 1
         | 
| 476 | 
            +
                        
         | 
| 477 | 
            +
                    if frame_idx is not None and (frame_idx < 0 or frame_idx >= num_frames):
         | 
| 478 | 
            +
                        errors.append(f"❌ Frame index for Image #{i+1} is {frame_idx}, which is out of range. Must be between 0 and {num_frames-1}.")
         | 
| 479 | 
            +
                
         | 
| 480 | 
            +
                # Check condition sketches
         | 
| 481 | 
            +
                num_cond_sketches_index = 8  # Starting index for sketch inputs
         | 
| 482 | 
            +
                cond_sketches_count = 0
         | 
| 483 | 
            +
                sketch_frame_indices = []
         | 
| 484 | 
            +
                
         | 
| 485 | 
            +
                for i in range(int(num_cond_sketches)):
         | 
| 486 | 
            +
                    sketch_idx = num_cond_sketches_index + i*2
         | 
| 487 | 
            +
                    frame_idx_idx = num_cond_sketches_index + 1 + i*2
         | 
| 488 | 
            +
                    
         | 
| 489 | 
            +
                    if sketch_idx < len(args) and frame_idx_idx < len(args):
         | 
| 490 | 
            +
                        sketch = args[sketch_idx]
         | 
| 491 | 
            +
                        frame_idx = args[frame_idx_idx]
         | 
| 492 | 
            +
                        
         | 
| 493 | 
            +
                        # Check if sketch is provided
         | 
| 494 | 
            +
                        if sketch is None:
         | 
| 495 | 
            +
                            errors.append(f"❌ Sketch #{i+1} is missing. Please upload a sketch or reduce the number of keyframe sketches.")
         | 
| 496 | 
            +
                        else:
         | 
| 497 | 
            +
                            # For ImageMask components, check if background is provided
         | 
| 498 | 
            +
                            if isinstance(sketch, dict):
         | 
| 499 | 
            +
                                if "background" not in sketch or sketch["background"] is None:
         | 
| 500 | 
            +
                                    errors.append(f"❌ Sketch #{i+1} is missing. Please upload a sketch image.")
         | 
| 501 | 
            +
                                else:
         | 
| 502 | 
            +
                                    cond_sketches_count += 1
         | 
| 503 | 
            +
                            else:
         | 
| 504 | 
            +
                                cond_sketches_count += 1
         | 
| 505 | 
            +
                        
         | 
| 506 | 
            +
                        # Check frame index
         | 
| 507 | 
            +
                        if frame_idx is not None and (frame_idx < 0 or frame_idx >= num_frames):
         | 
| 508 | 
            +
                            errors.append(f"❌ Frame index for Sketch #{i+1} is {frame_idx}, which is out of range. Must be between 0 and {num_frames-1}.")
         | 
| 509 | 
            +
                        elif frame_idx is not None:
         | 
| 510 | 
            +
                            sketch_frame_indices.append(frame_idx)
         | 
| 511 | 
            +
                
         | 
| 512 | 
            +
                # Check for duplicate frame indices
         | 
| 513 | 
            +
                image_frame_indices = []
         | 
| 514 | 
            +
                for i in range(int(num_cond_images)):
         | 
| 515 | 
            +
                    frame_idx = args[i*2+1]
         | 
| 516 | 
            +
                    if frame_idx is not None:
         | 
| 517 | 
            +
                        image_frame_indices.append(frame_idx)
         | 
| 518 | 
            +
                
         | 
| 519 | 
            +
                all_frame_indices = image_frame_indices + sketch_frame_indices
         | 
| 520 | 
            +
                if len(all_frame_indices) != len(set(all_frame_indices)):
         | 
| 521 | 
            +
                    errors.append("❌ Duplicate frame indices detected. Each image and sketch must be placed at a different frame.")
         | 
| 522 | 
            +
                
         | 
| 523 | 
            +
                # Check minimum requirements
         | 
| 524 | 
            +
                if cond_images_count == 0:
         | 
| 525 | 
            +
                    errors.append("❌ At least one input image is required.")
         | 
| 526 | 
            +
                
         | 
| 527 | 
            +
                return errors
         | 
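The flat *args layout assumed above (four (image, frame_index) slots, then four (editor_value, frame_index) sketch slots starting at index 8) can be illustrated with a hedged sketch; all values are placeholders:

    from PIL import Image

    img1 = Image.new("RGB", (832, 480), "white")                  # placeholder keyframe image
    sketch1 = {"background": Image.new("RGB", (832, 480), "white"),
               "layers": [], "composite": None}                   # placeholder ImageMask value
    args = [img1, 0, None, 20, None, 40, None, 60,                # image slots 1-4
            sketch1, 30, None, 35, None, 40, None, 45]            # sketch slots 1-4
    errors = validate_inputs(61, 1, 1, "A short prompt.", *args)  # -> [] when inputs are consistent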
| 528 | 
            +
             | 
| 529 | 
            +
            def tooncomposer_inference(num_frames, num_cond_images, num_cond_sketches, text_prompt, cfg_scale, sequence_cond_residual_scale, resolution, *args):
         | 
| 530 | 
            +
                # Validate inputs first
         | 
| 531 | 
            +
                validation_errors = validate_inputs(num_frames, num_cond_images, num_cond_sketches, text_prompt, *args)
         | 
| 532 | 
            +
                
         | 
| 533 | 
            +
                if validation_errors:
         | 
| 534 | 
            +
                    error_message = "\n".join(validation_errors)
         | 
| 535 | 
            +
                    return gr.update(value=None), error_message
         | 
| 536 | 
            +
             | 
| 537 | 
            +
                try:
         | 
| 538 | 
            +
                    # Parse inputs
         | 
| 539 | 
            +
                    # Get the condition images
         | 
| 540 | 
            +
                    cond_images = []
         | 
| 541 | 
            +
                    for i in range(int(num_cond_images)):
         | 
| 542 | 
            +
                        img = args[i*2]
         | 
| 543 | 
            +
                        frame_idx = args[i*2+1]
         | 
| 544 | 
            +
                        if img is not None and frame_idx is not None:
         | 
| 545 | 
            +
                            cond_images.append((img, frame_idx))
         | 
| 546 | 
            +
                    
         | 
| 547 | 
            +
                    # Get num_cond_sketches
         | 
| 548 | 
            +
                    if num_cond_sketches is None:
         | 
| 549 | 
            +
                        num_cond_sketches = 0
         | 
| 550 | 
            +
                    else:
         | 
| 551 | 
            +
                        num_cond_sketches = int(num_cond_sketches)
         | 
| 552 | 
            +
                    
         | 
| 553 | 
            +
                    # Get condition sketches and masks
         | 
| 554 | 
            +
                    cond_sketches = []
         | 
| 555 | 
            +
                    sketch_masks = []
         | 
| 556 | 
            +
                    num_cond_sketches_index = 8  # Starting index for sketch inputs
         | 
| 557 | 
            +
                    
         | 
| 558 | 
            +
                    for i in range(num_cond_sketches):
         | 
| 559 | 
            +
                        sketch_idx = num_cond_sketches_index + i*2
         | 
| 560 | 
            +
                        frame_idx_idx = num_cond_sketches_index + 1 + i*2
         | 
| 561 | 
            +
                        
         | 
| 562 | 
            +
                        if sketch_idx < len(args) and frame_idx_idx < len(args):
         | 
| 563 | 
            +
                            editor_value = args[sketch_idx]
         | 
| 564 | 
            +
                            frame_idx = args[frame_idx_idx]
         | 
| 565 | 
            +
                            
         | 
| 566 | 
            +
                            if editor_value is not None and frame_idx is not None:
         | 
| 567 | 
            +
                                # Extract the sketch from the background of the editor value
         | 
| 568 | 
            +
                                if isinstance(editor_value, dict) and "background" in editor_value:
         | 
| 569 | 
            +
                                    sketch = editor_value["background"]
         | 
| 570 | 
            +
                                    if sketch is not None:
         | 
| 571 | 
            +
                                        cond_sketches.append((sketch, frame_idx))
         | 
| 572 | 
            +
                                        # Also add to sketch_masks for mask processing
         | 
| 573 | 
            +
                                        sketch_masks.append((editor_value, frame_idx))
         | 
| 574 | 
            +
                                else:
         | 
| 575 | 
            +
                                    # For regular image inputs (first sketch)
         | 
| 576 | 
            +
                                    if editor_value is not None:
         | 
| 577 | 
            +
                                        cond_sketches.append((editor_value, frame_idx))
         | 
| 578 | 
            +
                    
         | 
| 579 | 
            +
                    # Set target resolution based on selection
         | 
| 580 | 
            +
                    target_height, target_width = checkpoints_by_resolution[resolution]["target_height"], checkpoints_by_resolution[resolution]["target_width"]
         | 
| 581 | 
            +
                    
         | 
| 582 | 
            +
                    # Update model resolution
         | 
| 583 | 
            +
                    if not fast_dev:
         | 
| 584 | 
            +
                        model.update_height_width(target_height, target_width)
         | 
| 585 | 
            +
                    
         | 
| 586 | 
            +
                    # Process conditions
         | 
| 587 | 
            +
                    with torch.no_grad():
         | 
| 588 | 
            +
                        # Process image conditions
         | 
| 589 | 
            +
                        masked_cond_video, preserved_cond_mask = process_conditions(
         | 
| 590 | 
            +
                            num_cond_images, cond_images, num_frames, target_height=target_height, target_width=target_width
         | 
| 591 | 
            +
                        )
         | 
| 592 | 
            +
                        
         | 
| 593 | 
            +
                        # Process sketch conditions
         | 
| 594 | 
            +
                        masked_cond_sketch, preserved_sketch_mask = process_conditions(
         | 
| 595 | 
            +
                            len(cond_sketches), cond_sketches, num_frames, is_sketch=True, target_height=target_height, target_width=target_width
         | 
| 596 | 
            +
                        )
         | 
| 597 | 
            +
             | 
| 598 | 
            +
                        # Process sketch masks (if any)
         | 
| 599 | 
            +
                        sketch_local_mask = None
         | 
| 600 | 
            +
                        if len(sketch_masks) > 0:
         | 
| 601 | 
            +
                            sketch_local_mask = process_sketch_masks(
         | 
| 602 | 
            +
                                len(sketch_masks), sketch_masks, num_frames, target_height=target_height, target_width=target_width
         | 
| 603 | 
            +
                            )
         | 
| 604 | 
            +
                        else:
         | 
| 605 | 
            +
                            sketch_local_mask = torch.ones((1, 1, num_frames, target_height, target_width), device=device)
         | 
| 606 | 
            +
                         
         | 
| 607 | 
            +
                        if fast_dev:
         | 
| 608 | 
            +
                            print("Fast dev mode, returning dummy video")
         | 
| 609 | 
            +
                            # Create a simple dummy video for testing
         | 
| 610 | 
            +
                            temp_dir = tempfile.mkdtemp()
         | 
| 611 | 
            +
                            video_path = os.path.join(temp_dir, "dummy_video.mp4")
         | 
| 612 | 
            +
                            
         | 
| 613 | 
            +
                            # Create a simple test video
         | 
| 614 | 
            +
                            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
         | 
| 615 | 
            +
                            video_writer = cv2.VideoWriter(video_path, fourcc, 20.0, (target_width, target_height))
         | 
| 616 | 
            +
                            
         | 
| 617 | 
            +
                            for i in range(30):  # 30 frames
         | 
| 618 | 
            +
                                # Create a simple colored frame
         | 
| 619 | 
            +
                                frame = np.full((target_height, target_width, 3), (i * 8) % 255, dtype=np.uint8)
         | 
| 620 | 
            +
                                video_writer.write(frame)
         | 
| 621 | 
            +
                            
         | 
| 622 | 
            +
                            video_writer.release()
         | 
| 623 | 
            +
                            return video_path, "✅ Dummy video generated successfully in fast dev mode!"
         | 
| 624 | 
            +
                        
         | 
| 625 | 
            +
                        masked_cond_video = masked_cond_video.to(device=device, dtype=dtype)
         | 
| 626 | 
            +
                        preserved_cond_mask = preserved_cond_mask.to(device=device, dtype=dtype)
         | 
| 627 | 
            +
                        masked_cond_sketch = masked_cond_sketch.to(device=device, dtype=dtype)
         | 
| 628 | 
            +
                        preserved_sketch_mask = preserved_sketch_mask.to(device=device, dtype=dtype)
         | 
| 629 | 
            +
                        
         | 
| 630 | 
            +
                        with torch.amp.autocast(dtype=torch.bfloat16, device_type=torch.device(device).type):
         | 
| 631 | 
            +
                            # Generate video
         | 
| 632 | 
            +
                            model.pipe.device = device
         | 
| 633 | 
            +
                            generated_video = model.pipe(
         | 
| 634 | 
            +
                                prompt=[text_prompt],
         | 
| 635 | 
            +
                                negative_prompt=[model.negative_prompt],
         | 
| 636 | 
            +
                                input_image=None,
         | 
| 637 | 
            +
                                num_inference_steps=15,
         | 
| 638 | 
            +
                                num_frames=num_frames,
         | 
| 639 | 
            +
                                seed=42, tiled=True,
         | 
| 640 | 
            +
                                input_condition_video=masked_cond_video,
         | 
| 641 | 
            +
                                input_condition_preserved_mask=preserved_cond_mask,
         | 
| 642 | 
            +
                                input_condition_video_sketch=masked_cond_sketch,
         | 
| 643 | 
            +
                                input_condition_preserved_mask_sketch=preserved_sketch_mask,
         | 
| 644 | 
            +
                                sketch_local_mask=sketch_local_mask,
         | 
| 645 | 
            +
                                cfg_scale=cfg_scale,
         | 
| 646 | 
            +
                                sequence_cond_residual_scale=sequence_cond_residual_scale,
         | 
| 647 | 
            +
                                height=target_height,
         | 
| 648 | 
            +
                                width=target_width,
         | 
| 649 | 
            +
                            )
         | 
| 650 | 
            +
             | 
| 651 | 
            +
                        # Convert to PIL images
         | 
| 652 | 
            +
                        video_frames = model.pipe.tensor2video(generated_video[0].cpu())
         | 
| 653 | 
            +
                        
         | 
| 654 | 
            +
                        # Convert PIL images to an MP4 video
         | 
| 655 | 
            +
                        temp_dir = tempfile.mkdtemp()
         | 
| 656 | 
            +
                        video_path = os.path.join(temp_dir, "generated_video.mp4")
         | 
| 657 | 
            +
                        
         | 
| 658 | 
            +
                        width, height = video_frames[0].size
         | 
| 659 | 
            +
                        fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for MP4 video
         | 
| 660 | 
            +
                        video_writer = cv2.VideoWriter(video_path, fourcc, 20.0, (width, height))  # 20 fps
         | 
| 661 | 
            +
                        
         | 
| 662 | 
            +
                        for frame in video_frames:
         | 
| 663 | 
            +
                            # Convert PIL image to OpenCV BGR format
         | 
| 664 | 
            +
                            frame_bgr = cv2.cvtColor(np.array(frame), cv2.COLOR_RGB2BGR)
         | 
| 665 | 
            +
                            video_writer.write(frame_bgr)
         | 
| 666 | 
            +
                        
         | 
| 667 | 
            +
                        video_writer.release()
         | 
| 668 | 
            +
                        print(f"Generated video saved to {video_path}. Current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
         | 
| 669 | 
            +
                        
         | 
| 670 | 
            +
                        return video_path, f"✅ Video generated successfully! (with {len(cond_images)} keyframe images, {len(cond_sketches)} keyframe sketches)"
         | 
| 671 | 
            +
                
         | 
| 672 | 
            +
                except Exception as e:
         | 
| 673 | 
            +
                    error_msg = f"❌ Error during generation: {str(e)}"
         | 
| 674 | 
            +
                    print(error_msg)
         | 
| 675 | 
            +
                    return gr.update(value=None), error_msg
         | 
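A hedged end-to-end sketch of calling the handler directly, outside Gradio (editorial note, not part of app.py; asset paths come from this repo's samples/, the flat argument layout mirrors validate_inputs above, and a loaded model plus CUDA device are assumed):

    from PIL import Image

    image = Image.open("samples/1_image1.png").convert("RGB")
    bg = Image.open("samples/1_sketch1.jpg").convert("RGB")
    sketch = {"background": bg, "layers": [], "composite": bg}    # "layers" would hold any brushed motion mask
    flat_args = [image, 0, None, 20, None, 40, None, 60,          # 4 image slots
                 sketch, 30, None, 35, None, 40, None, 45]        # 4 sketch slots
    video_path, status = tooncomposer_inference(
        61, 1, 1, open("samples/1_prompt.txt").read().strip(),
        7.5, 1.0, "480p", *flat_args,
    )
    print(status)   # success message with the keyframe counts, or an error string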
| 676 | 
            +
             | 
| 677 | 
            +
            def create_sample_gallery():
         | 
| 678 | 
            +
                """Create gallery items for samples"""
         | 
| 679 | 
            +
                import os
         | 
| 680 | 
            +
                
         | 
| 681 | 
            +
                gallery_items = []
         | 
| 682 | 
            +
                sample_info = [
         | 
| 683 | 
            +
                    {
         | 
| 684 | 
            +
                        "id": 1,
         | 
| 685 | 
            +
                        "title": "Sample 1",
         | 
| 686 | 
            +
                        "description": "Man playing with blue fish underwater (3 sketches)",
         | 
| 687 | 
            +
                        "preview": "samples/1_image1.png"
         | 
| 688 | 
            +
                    },
         | 
| 689 | 
            +
                    {
         | 
| 690 | 
            +
                        "id": 2, 
         | 
| 691 | 
            +
                        "title": "Sample 2",
         | 
| 692 | 
            +
                        "description": "Girl and boy planting a growing flower (2 sketches)",
         | 
| 693 | 
            +
                        "preview": "samples/2_image1.jpg"
         | 
| 694 | 
            +
                    },
         | 
| 695 | 
            +
                    {
         | 
| 696 | 
            +
                        "id": 3,
         | 
| 697 | 
            +
                        "title": "Sample 3", 
         | 
| 698 | 
            +
                        "description": "Ancient Chinese boy giving apple to elder (1 sketch)",
         | 
| 699 | 
            +
                        "preview": "samples/3_image1.png"
         | 
| 700 | 
            +
                    }
         | 
| 701 | 
            +
                ]
         | 
| 702 | 
            +
                
         | 
| 703 | 
            +
                for sample in sample_info:
         | 
| 704 | 
            +
                    if os.path.exists(sample["preview"]):
         | 
| 705 | 
            +
                        gallery_items.append((sample["preview"], f"{sample['title']}: {sample['description']}"))
         | 
| 706 | 
            +
                
         | 
| 707 | 
            +
                return gallery_items
         | 
| 708 | 
            +
             | 
| 709 | 
            +
            def handle_gallery_select(evt: gr.SelectData):
         | 
| 710 | 
            +
                """Handle gallery selection and load the corresponding sample"""
         | 
| 711 | 
            +
                sample_id = evt.index + 1  # Gallery index starts from 0, sample IDs start from 1
         | 
| 712 | 
            +
                return apply_sample_to_ui(sample_id)
         | 
| 713 | 
            +
             | 
| 714 | 
            +
            def load_sample_data(sample_id):
         | 
| 715 | 
            +
                """Load sample data based on the selected sample"""
         | 
| 716 | 
            +
                import os
         | 
| 717 | 
            +
                
         | 
| 718 | 
            +
                samples_dir = "samples"
         | 
| 719 | 
            +
                
         | 
| 720 | 
            +
                # Sample configurations
         | 
| 721 | 
            +
                sample_configs = {
         | 
| 722 | 
            +
                    1: {
         | 
| 723 | 
            +
                        "prompt": "Underwater scene: A shirtless man plays with a spiraling blue fish. A whale follows a bag in the man's hand, swimming in circles as the man uses the bag to lure the blue fish forward. Anime. High quality.",
         | 
| 724 | 
            +
                        "num_sketches": 3,
         | 
| 725 | 
            +
                        "image_frame": 0,
         | 
| 726 | 
            +
                        "sketch_frames": [20, 40, 60],
         | 
| 727 | 
            +
                        "num_frames": 61
         | 
| 728 | 
            +
                    },
         | 
| 729 | 
            +
                    2: {
         | 
| 730 | 
            +
                        "prompt": "A girl and a silver-haired boy plant a huge flower. As the camera slowly moves up, the huge flower continues to grow and bloom. Anime. High quality.",
         | 
| 731 | 
            +
                        "num_sketches": 2,
         | 
| 732 | 
            +
                        "image_frame": 0,
         | 
| 733 | 
            +
                        "sketch_frames": [30, 60],
         | 
| 734 | 
            +
                        "num_frames": 61
         | 
| 735 | 
            +
                    },
         | 
| 736 | 
            +
                    3: {
         | 
| 737 | 
            +
                        "prompt": "An ancient Chinese boy holds an apple and smiles as he gives it to an elderly man nearby. Anime. High quality.",
         | 
| 738 | 
            +
                        "num_sketches": 1,
         | 
| 739 | 
            +
                        "image_frame": 0,
         | 
| 740 | 
            +
                        "sketch_frames": [30],
         | 
| 741 | 
            +
                        "num_frames": 33
         | 
| 742 | 
            +
                    }
         | 
| 743 | 
            +
                }
         | 
| 744 | 
            +
                
         | 
| 745 | 
            +
                if sample_id not in sample_configs:
         | 
| 746 | 
            +
                    return None
         | 
| 747 | 
            +
                
         | 
| 748 | 
            +
                config = sample_configs[sample_id]
         | 
| 749 | 
            +
                
         | 
| 750 | 
            +
                # Load image
         | 
| 751 | 
            +
                image_path = os.path.join(samples_dir, f"{sample_id}_image1.png")
         | 
| 752 | 
            +
                if not os.path.exists(image_path):
         | 
| 753 | 
            +
                    image_path = os.path.join(samples_dir, f"{sample_id}_image1.jpg")
         | 
| 754 | 
            +
                
         | 
| 755 | 
            +
                # Load sketches
         | 
| 756 | 
            +
                sketches = []
         | 
| 757 | 
            +
                for i in range(config["num_sketches"]):
         | 
| 758 | 
            +
                    sketch_path = os.path.join(samples_dir, f"{sample_id}_sketch{i+1}.jpg")
         | 
| 759 | 
            +
                    if os.path.exists(sketch_path):
         | 
| 760 | 
            +
                        sketches.append(sketch_path)
         | 
| 761 | 
            +
                
         | 
| 762 | 
            +
                # Load output video
         | 
| 763 | 
            +
                output_path = os.path.join(samples_dir, f"{sample_id}_out.mp4")
         | 
| 764 | 
            +
                
         | 
| 765 | 
            +
                return {
         | 
| 766 | 
            +
                    "prompt": config["prompt"],
         | 
| 767 | 
            +
                    "image": image_path if os.path.exists(image_path) else None,
         | 
| 768 | 
            +
                    "sketches": sketches,
         | 
| 769 | 
            +
                    "image_frame": config["image_frame"],
         | 
| 770 | 
            +
                    "sketch_frames": config["sketch_frames"][:len(sketches)],
         | 
| 771 | 
            +
                    "output_video": output_path if os.path.exists(output_path) else None,
         | 
| 772 | 
            +
                    "num_sketches": len(sketches),
         | 
| 773 | 
            +
                    "num_frames": config["num_frames"]
         | 
| 774 | 
            +
                }
         | 
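The on-disk naming convention this loader assumes, matching the files shipped under samples/ (editorial note):

    # samples/{id}_image1.png|jpg  -> first keyframe image
    # samples/{id}_sketch{k}.jpg   -> k-th keyframe sketch (k = 1..num_sketches)
    # samples/{id}_out.mp4         -> reference output video for the sample
    # (samples/{id}_prompt.txt exists in the repo, but the prompt is inlined in sample_configs above)
    data = load_sample_data(1)
    print(data["num_sketches"], data["sketch_frames"])   # -> 3 [20, 40, 60] when all three sketches exist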
| 775 | 
            +
             | 
| 776 | 
            +
            def apply_sample_to_ui(sample_id):
         | 
| 777 | 
            +
                """Apply sample data to UI components"""
         | 
| 778 | 
            +
                sample_data = load_sample_data(sample_id)
         | 
| 779 | 
            +
                
         | 
| 780 | 
            +
                if not sample_data:
         | 
| 781 | 
            +
                    return [gr.update() for _ in range(20)]  # Return no updates if sample not found
         | 
| 782 | 
            +
                
         | 
| 783 | 
            +
                updates = [gr.update(value=sample_data["num_frames"])]
         | 
| 784 | 
            +
                
         | 
| 785 | 
            +
                # Update prompt
         | 
| 786 | 
            +
                updates.append(gr.update(value=sample_data["prompt"]))
         | 
| 787 | 
            +
                
         | 
| 788 | 
            +
                # Update number of sketches
         | 
| 789 | 
            +
                updates.append(gr.update(value=sample_data["num_sketches"]))
         | 
| 790 | 
            +
                
         | 
| 791 | 
            +
                # Update condition image
         | 
| 792 | 
            +
                updates.append(gr.update(value=sample_data["image"]))
         | 
| 793 | 
            +
                updates.append(gr.update(value=sample_data["image_frame"]))
         | 
| 794 | 
            +
                
         | 
| 795 | 
            +
                # Update sketches (up to 4)
         | 
| 796 | 
            +
                for i in range(4):
         | 
| 797 | 
            +
                    if i < len(sample_data["sketches"]):
         | 
| 798 | 
            +
                        # Load sketch image
         | 
| 799 | 
            +
                        sketch_img = Image.open(sample_data["sketches"][i])
         | 
| 800 | 
            +
                        # Create ImageMask format
         | 
| 801 | 
            +
                        sketch_dict = {
         | 
| 802 | 
            +
                            "background": sketch_img,
         | 
| 803 | 
            +
                            "layers": [],
         | 
| 804 | 
            +
                            "composite": sketch_img
         | 
| 805 | 
            +
                        }
         | 
| 806 | 
            +
                        updates.append(gr.update(value=sketch_dict))
         | 
| 807 | 
            +
                        updates.append(gr.update(value=sample_data["sketch_frames"][i]))
         | 
| 808 | 
            +
                    else:
         | 
| 809 | 
            +
                        updates.append(gr.update(value=None))
         | 
| 810 | 
            +
                        updates.append(gr.update(value=30))
         | 
| 811 | 
            +
                
         | 
| 812 | 
            +
                # Update output video
         | 
| 813 | 
            +
                updates.append(gr.update(value=sample_data["output_video"]))
         | 
| 814 | 
            +
                
         | 
| 815 | 
            +
                # Update status
         | 
| 816 | 
            +
                updates.append(gr.update(value=f"✅ Loaded Sample {sample_id}: {sample_data['prompt'][:50]}..."))
         | 
| 817 | 
            +
                
         | 
| 818 | 
            +
                return updates
         | 
| 819 | 
            +
             | 
| 820 | 
            +
            if __name__ == "__main__":
         | 
| 821 | 
            +
                from util.stylesheets import css, pre_js, banner_image
         | 
| 822 | 
            +
                with gr.Blocks(title="🎨 ToonComposer Demo", css=css, js=pre_js) as iface:
         | 
| 823 | 
            +
                    with gr.Row():
         | 
| 824 | 
            +
                        with gr.Column(scale=1):
         | 
| 825 | 
            +
                            gr.HTML(banner_image)
         | 
| 826 | 
            +
                        with gr.Column(scale=1):
         | 
| 827 | 
            +
                            gr.Markdown("""
         | 
| 828 | 
            +
                            💡 **Quick Guide**
         | 
| 829 | 
            +
                            1. Set the prompt, the number of target frames, and the number of keyframe images/sketches.
         | 
| 830 | 
            +
                            2. Upload a keyframe image as the first frame (with its frame index set to 0).
         | 
| 831 | 
            +
                            3. Upload sketches with optional motion masks for controlled generation at specified frame indices.
         | 
| 832 | 
            +
                            4. Click the *Generate* button to create your cartoon video.
         | 
| 833 | 
            +
                            """)
         | 
| 834 | 
            +
                    
         | 
| 835 | 
            +
                    max_num_frames = 61
         | 
| 836 | 
            +
                    cond_images_inputs = []
         | 
| 837 | 
            +
                    cond_sketches_inputs = []
         | 
| 838 | 
            +
                    with gr.Row():
         | 
| 839 | 
            +
                        with gr.Column(scale=1):
         | 
| 840 | 
            +
                            with gr.Accordion("Video Settings", open=True):
         | 
| 841 | 
            +
                                num_frames = gr.Slider(
         | 
| 842 | 
            +
                                    minimum=17, maximum=max_num_frames, value=max_num_frames, step=1, label="🎥 Number of Frames",
         | 
| 843 | 
            +
                                    info="Select the total number of frames for the generated video. Should be 4N+"
         | 
| 844 | 
            +
                                )
         | 
| 845 | 
            +
                                
         | 
| 846 | 
            +
                                resolution = gr.Radio(
         | 
| 847 | 
            +
                                    choices=["480p", "608p"],
         | 
| 848 | 
            +
                                    value="480p",
         | 
| 849 | 
            +
                                    label="🎥 Resolution",
         | 
| 850 | 
            +
                                    info="Select the resolution for the generated video."
         | 
| 851 | 
            +
                                )
         | 
| 852 | 
            +
                                
         | 
| 853 | 
            +
                                text_prompt = gr.Textbox(
         | 
| 854 | 
            +
                                    label="📝 Text Prompt",
         | 
| 855 | 
            +
                                    placeholder="Enter a description for the video.",
         | 
| 856 | 
            +
                                    info="Describe what you want to generate in the video.",
         | 
| 857 | 
            +
                                    lines=5
         | 
| 858 | 
            +
                                )
         | 
| 859 | 
            +
                                cfg_scale = gr.Slider(
         | 
| 860 | 
            +
                                    minimum=1.0, maximum=15.0, value=7.5, label="⚙️ CFG Scale",
         | 
| 861 | 
            +
                                    info="Adjust the classifier-free guidance scale for generation."
         | 
| 862 | 
            +
                                )
         | 
| 863 | 
            +
                                sequence_cond_residual_scale = gr.Slider(
         | 
| 864 | 
            +
                                    minimum=0.0, maximum=1.2, value=1.0, label="⚙️ Pos-aware Residual Scale",
         | 
| 865 | 
            +
                                    info="Adjust the residual scale for the position-aware sequence condition."
         | 
| 866 | 
            +
                                )
         | 
| 867 | 
            +
                    
         | 
| 868 | 
            +
                        with gr.Column(scale=3):
         | 
| 869 | 
            +
                            with gr.Accordion("Keyframe Image(s)", open=True):
         | 
| 870 | 
            +
                                num_cond_images = gr.Slider(
         | 
| 871 | 
            +
                                    minimum=1, maximum=4, value=1, step=1, label="🖼️ Number of Keyframe Images",
         | 
| 872 | 
            +
                                    info="Specify how many keyframe color images to use (max 4 images)."
         | 
| 873 | 
            +
                                )
         | 
| 874 | 
            +
                                for i in range(4):  # Max 4 condition images
         | 
| 875 | 
            +
                                    with gr.Tab(label=f"Image {i+1}", interactive=i==0) as tab:
         | 
| 876 | 
            +
                                        gr.Markdown("At least one image is required. \n Each image or sketch will be used to control the cartoon geneartion at the given frame index.")
         | 
| 877 | 
            +
                                        image_input = gr.Image(
         | 
| 878 | 
            +
                                            label=f"Image {i+1}", type="pil",
         | 
| 879 | 
            +
                                            placeholder=f"Upload a keyframe image {i+1}..."
         | 
| 880 | 
            +
                                        )
         | 
| 881 | 
            +
                                        frame_index_input = gr.Slider(
         | 
| 882 | 
            +
                                            label=f"Frame Index for Image #{i+1}", minimum=0, maximum=max_num_frames - 1,
         | 
| 883 | 
            +
                                            value=i * (max_num_frames-1) // 3, step=1, 
         | 
| 884 | 
            +
                                            info=f"Frame position for Image {i+1} (0 to {max_num_frames-1})"
         | 
| 885 | 
            +
                                        )
         | 
| 886 | 
            +
                                        cond_images_inputs.append((image_input, frame_index_input, tab))
         | 
| 887 | 
            +
                                        
         | 
| 888 | 
            +
                    
         | 
| 889 | 
            +
                        with gr.Column(scale=3):
         | 
| 890 | 
            +
                            with gr.Accordion("Keyframe Sketch(es)", open=True): 
         | 
| 891 | 
            +
                                num_cond_sketches = gr.Slider(
         | 
| 892 | 
            +
                                    minimum=1, maximum=4, value=1, step=1, label="✏️ Number of Keyframe Sketch(es)",
         | 
| 893 | 
            +
                                    info="Specify how many keyframe sketches to use (max 4 sketches)."
         | 
| 894 | 
            +
                                )
         | 
| 895 | 
            +
                                for i in range(4):  # Max 4 condition sketches
         | 
| 896 | 
            +
                                    with gr.Tab(label=f"Sketch {i + 1}", interactive=i==0) as tab:
         | 
| 897 | 
            +
                                        
         | 
| 898 | 
            +
                                        gr.Markdown("At least one sketch is required. \n You can optionally draw black areas using the brush tool to mark regions where motion can be generated freely.")
         | 
| 899 | 
            +
                                        
         | 
| 900 | 
            +
                                        # Use ImageMask which allows uploading an image and drawing a mask
         | 
| 901 | 
            +
                                        sketch_input = gr.ImageMask(
         | 
| 902 | 
            +
                                            label=f"Sketch {i + 1} with Motion Mask",
         | 
| 903 | 
            +
                                            type="pil",
         | 
| 904 | 
            +
                                            elem_id=f"sketch_mask_{i + 1}"
         | 
| 905 | 
            +
                                        )
         | 
| 906 | 
            +
                                        
         | 
| 907 | 
            +
                                        # All sketches have a frame index input
         | 
| 908 | 
            +
                                        _frame_index_input = gr.Slider(
         | 
| 909 | 
            +
                                            label=f"Frame Index for Sketch #{i + 1}", minimum=0, maximum=max_num_frames - 1,
         | 
| 910 | 
            +
                                            value=max_num_frames-1, step=1,
         | 
| 911 | 
            +
                                            info=f"Frame position for Sketch {i + 1} (0 to {max_num_frames-1})"
         | 
| 912 | 
            +
                                        )
         | 
| 913 | 
            +
                                        
         | 
| 914 | 
            +
                                        cond_sketches_inputs.append((sketch_input, _frame_index_input, tab))
         | 
| 915 | 
            +
                        
         | 
| 916 | 
            +
                    with gr.Row():
         | 
| 917 | 
            +
                        with gr.Column(scale=1):
         | 
| 918 | 
            +
                            # Sample Gallery Section
         | 
| 919 | 
            +
                            with gr.Accordion("🔍 Sample Gallery", open=True):
         | 
| 920 | 
            +
                                gr.Markdown("Click on any sample image below to load the sample inputs.")
         | 
| 921 | 
            +
                                sample_gallery = gr.Gallery(
         | 
| 922 | 
            +
                                    value=create_sample_gallery(),
         | 
| 923 | 
            +
                                    label="Sample Examples",
         | 
| 924 | 
            +
                                    show_label=False,
         | 
| 925 | 
            +
                                    elem_id="sample-gallery",
         | 
| 926 | 
            +
                                    columns=3,
         | 
| 927 | 
            +
                                    rows=1,
         | 
| 928 | 
            +
                                    height=200,
         | 
| 929 | 
            +
                                    allow_preview=True,
         | 
| 930 | 
            +
                                    object_fit="contain")
         | 
| 931 | 
            +
                                
         | 
| 932 | 
            +
                            with gr.Accordion("🛠️ Tools", open=False):
         | 
| 933 | 
            +
                                tool_input = gr.Image(
         | 
| 934 | 
            +
                                    label=f"Input Image", type="pil",
         | 
| 935 | 
            +
                                    placeholder=f"Upload an image."
         | 
| 936 | 
            +
                                )
         | 
| 937 | 
            +
                                invert_btn = gr.Button(f"Invert Colors")
         | 
| 938 | 
            +
                                invert_btn.click(
         | 
| 939 | 
            +
                                    fn=invert_sketch,
         | 
| 940 | 
            +
                                    inputs=[tool_input],
         | 
| 941 | 
            +
                                    outputs=[tool_input]
         | 
| 942 | 
            +
                                )
         | 
| 943 | 
            +
                                
         | 
| 944 | 
            +
                        with gr.Column(scale=1):
         | 
| 945 | 
            +
                            status_text = gr.Textbox(
         | 
| 946 | 
            +
                                label="📊 Status",
         | 
| 947 | 
            +
                                value="Ready to generate. Please check your inputs and click Run.",
         | 
| 948 | 
            +
                                interactive=False,
         | 
| 949 | 
            +
                                lines=5
         | 
| 950 | 
            +
                            )
         | 
| 951 | 
            +
                            
         | 
| 952 | 
            +
                            with gr.Accordion("🎬 Generated Video", open=True):
         | 
| 953 | 
            +
                                output_video = gr.Video(
         | 
| 954 | 
            +
                                    label="Video Output",
         | 
| 955 | 
            +
                                    show_label=True
         | 
| 956 | 
            +
                                )
         | 
| 957 | 
            +
                                run_button = gr.Button("🚀 Generate Video", variant="primary", size="lg")
         | 
| 958 | 
            +
             | 
| 959 | 
            +
                    def update_visibility(num_items, num_frames):
         | 
| 960 | 
            +
                        # Update visibility for columns
         | 
| 961 | 
            +
                        updates_images = []
         | 
| 962 | 
            +
                        updates_indices = []
         | 
| 963 | 
            +
                        for i in range(4):
         | 
| 964 | 
            +
                            is_visible = i < num_items
         | 
| 965 | 
            +
                            # is_visible = True
         | 
| 966 | 
            +
                            updates_images.append(gr.update(interactive=is_visible))
         | 
| 967 | 
            +
                            updates_indices.append(gr.update(
         | 
| 968 | 
            +
                                value=((num_frames - 1) // max(num_items, 1)) * (i + 1),
         | 
| 969 | 
            +
                                minimum=0, maximum=num_frames-1,
         | 
| 970 | 
            +
                            ))
         | 
| 971 | 
            +
                        return updates_images + updates_indices
         | 
| 972 | 
            +
                    
         | 
| 973 | 
            +
                    def update_visibility_images(num_items, num_frames):
         | 
| 974 | 
            +
                        # Update visibility for columns
         | 
| 975 | 
            +
                        updates_images = []
         | 
| 976 | 
            +
                        updates_indices = []
         | 
| 977 | 
            +
                        for i in range(4):
         | 
| 978 | 
            +
                            is_visible = i < num_items
         | 
| 979 | 
            +
                            updates_images.append(gr.update(interactive=is_visible))
         | 
| 980 | 
            +
                            updates_indices.append(gr.update(
         | 
| 981 | 
            +
                                value=((num_frames - 1) // max(num_items, 1)) * i,
         | 
| 982 | 
            +
                                minimum=0, maximum=num_frames-1,
         | 
| 983 | 
            +
                            ))
         | 
| 984 | 
            +
                        return updates_images + updates_indices
         | 
| 985 | 
            +
                    
         | 
| 986 | 
            +
                    def update_frame_ranges(num_items_images, num_items_sketches, num_frames):
         | 
| 987 | 
            +
                        """Update the maximum values for all frame index sliders"""
         | 
| 988 | 
            +
                        updates = []
         | 
| 989 | 
            +
                        for i in range(4):  # Images
         | 
| 990 | 
            +
                            updates.append(gr.update(
         | 
| 991 | 
            +
                                value=((num_frames - 1) // max(num_items_images, 1)) * i,
         | 
| 992 | 
            +
                                maximum=num_frames-1
         | 
| 993 | 
            +
                                ))
         | 
| 994 | 
            +
                        for i in range(4):  # Sketches
         | 
| 995 | 
            +
                            updates.append(gr.update(
         | 
| 996 | 
            +
                                value=((num_frames - 1) // max(num_items_sketches, 1)) * (i + 1),
         | 
| 997 | 
            +
                                maximum=num_frames-1))
         | 
| 998 | 
            +
                        return updates
         | 
| 999 | 
            +
                    
         | 
| 1000 | 
            +
                    num_cond_images.change(
         | 
| 1001 | 
            +
                        fn=update_visibility_images,
         | 
| 1002 | 
            +
                        inputs=[num_cond_images, num_frames],
         | 
| 1003 | 
            +
                        outputs=[tab for _, _, tab in cond_images_inputs] \
         | 
| 1004 | 
            +
                            + [frame_index_input for _, frame_index_input, _ in cond_images_inputs],
         | 
| 1005 | 
            +
                    )
         | 
| 1006 | 
            +
             | 
| 1007 | 
            +
                    num_cond_sketches.change(
         | 
| 1008 | 
            +
                        fn=update_visibility,
         | 
| 1009 | 
            +
                        inputs=[num_cond_sketches, num_frames],
         | 
| 1010 | 
            +
                        outputs=[tab for _, _, tab in cond_sketches_inputs] \
         | 
| 1011 | 
            +
                            + [frame_index_input for _, frame_index_input, _ in cond_sketches_inputs],
         | 
| 1012 | 
            +
                    )
         | 
| 1013 | 
            +
             | 
| 1014 | 
            +
                    num_frames.change(
         | 
| 1015 | 
            +
                        fn=update_frame_ranges,
         | 
| 1016 | 
            +
                        inputs=[num_cond_images, num_cond_sketches, num_frames],
         | 
| 1017 | 
            +
                        outputs=[frame_index_input for _, frame_index_input, _ in cond_images_inputs] + \
         | 
| 1018 | 
            +
                                [frame_index_input for _, frame_index_input, _ in cond_sketches_inputs]
         | 
| 1019 | 
            +
                    )
         | 
| 1020 | 
            +
             | 
| 1021 | 
            +
                    def update_resolution(resolution):
         | 
| 1022 | 
            +
                        model.update_height_width(checkpoints_by_resolution[resolution]["target_height"], checkpoints_by_resolution[resolution]["target_width"])
         | 
| 1023 | 
            +
                        model.load_tooncomposer_checkpoint(checkpoints_by_resolution[resolution]["checkpoint_path"])
         | 
| 1024 | 
            +
                        return gr.update(), gr.update()
         | 
| 1025 | 
            +
             | 
| 1026 | 
            +
                    resolution.change(
         | 
| 1027 | 
            +
                        fn=update_resolution,
         | 
| 1028 | 
            +
                        inputs=[resolution],
         | 
| 1029 | 
            +
                        outputs=[output_video, run_button]
         | 
| 1030 | 
            +
                    )
         | 
| 1031 | 
            +
                    
         | 
| 1032 | 
            +
                    sample_outputs = [
         | 
| 1033 | 
            +
                        num_frames, text_prompt, num_cond_sketches,
         | 
| 1034 | 
            +
                        cond_images_inputs[0][0], cond_images_inputs[0][1],  # Image 1
         | 
| 1035 | 
            +
                        cond_sketches_inputs[0][0], cond_sketches_inputs[0][1],  # Sketch 1
         | 
| 1036 | 
            +
                        cond_sketches_inputs[1][0], cond_sketches_inputs[1][1],  # Sketch 2
         | 
| 1037 | 
            +
                        cond_sketches_inputs[2][0], cond_sketches_inputs[2][1],  # Sketch 3
         | 
| 1038 | 
            +
                        cond_sketches_inputs[3][0], cond_sketches_inputs[3][1],  # Sketch 4
         | 
| 1039 | 
            +
                        output_video, status_text
         | 
| 1040 | 
            +
                    ]
         | 
| 1041 | 
            +
                    
         | 
| 1042 | 
            +
                    sample_gallery.select(
         | 
| 1043 | 
            +
                        fn=handle_gallery_select,
         | 
| 1044 | 
            +
                        outputs=sample_outputs
         | 
| 1045 | 
            +
                    )
         | 
| 1046 | 
            +
             | 
| 1047 | 
            +
                    inputs = [num_frames, num_cond_images, num_cond_sketches, text_prompt, cfg_scale, sequence_cond_residual_scale, resolution]
         | 
| 1048 | 
            +
                    run_button.click(
         | 
| 1049 | 
            +
                        fn=tooncomposer_inference,
         | 
| 1050 | 
            +
                        inputs=inputs,
         | 
| 1051 | 
            +
                        outputs=[output_video, status_text]
         | 
| 1052 | 
            +
                    )
         | 
| 1053 | 
            +
                    
         | 
| 1054 | 
            +
                    # Add condition image inputs
         | 
| 1055 | 
            +
                    for image_input, frame_index_input, _ in cond_images_inputs:
         | 
| 1056 | 
            +
                        inputs.append(image_input)
         | 
| 1057 | 
            +
                        inputs.append(frame_index_input)
         | 
| 1058 | 
            +
                        
         | 
| 1059 | 
            +
                    # Add sketch inputs (both regular and ImageMask)
         | 
| 1060 | 
            +
                    for sketch_input, frame_index_input, _ in cond_sketches_inputs:
         | 
| 1061 | 
            +
                        inputs.append(sketch_input)
         | 
| 1062 | 
            +
                        inputs.append(frame_index_input)
         | 
| 1063 | 
            +
                        
         | 
| 1064 | 
            +
                    iface.launch(server_port=7860, server_name="0.0.0.0",
         | 
| 1065 | 
            +
                                 allowed_paths=[os.path.abspath(os.path.join(os.path.dirname(__file__), "gradio_cache")), 
         | 
| 1066 | 
            +
                                               os.path.abspath(os.path.join(os.path.dirname(__file__), "samples"))])
         | 
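
The frame-index sliders above default to an even spacing across the clip: images use ((num_frames - 1) // max(num_items, 1)) * i and sketches use the same step times (i + 1), so the first image sits at frame 0 and the last sketch at the final frame. A minimal standalone sketch of that spacing rule (a hypothetical helper for illustration only, not part of app.py):

    # Hypothetical helper mirroring the defaults computed in
    # update_visibility_images / update_visibility / update_frame_ranges above.
    def default_frame_indices(num_frames: int, num_items: int, for_sketches: bool = False) -> list:
        step = (num_frames - 1) // max(num_items, 1)
        offset = 1 if for_sketches else 0  # images start at frame 0, sketches one step in
        return [step * (i + offset) for i in range(num_items)]

    print(default_frame_indices(61, 1))                     # [0]
    print(default_frame_indices(61, 2, for_sketches=True))  # [30, 60]
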
    	
        model/__init__.py
    ADDED
    
    | 
            File without changes
         | 
    	
        model/dera.py
    ADDED
    
    | @@ -0,0 +1,195 @@ | |
| 1 | 
            +
            import torch
         | 
| 2 | 
            +
            import torch.nn as nn
         | 
| 3 | 
            +
            from einops import rearrange
         | 
| 4 | 
            +
            from .dit import flash_attention
         | 
| 5 | 
            +
            import torch.amp as amp
         | 
| 6 | 
            +
             | 
| 7 | 
            +
             | 
| 8 | 
            +
            class DeRAAttention(nn.Module):
         | 
| 9 | 
            +
             | 
| 10 | 
            +
                def __init__(self,
         | 
| 11 | 
            +
                             dim,
         | 
| 12 | 
            +
                             num_heads,
         | 
| 13 | 
            +
                             window_size=(-1, -1),
         | 
| 14 | 
            +
                             mode="spatial"):
         | 
| 15 | 
            +
                    assert dim % num_heads == 0
         | 
| 16 | 
            +
                    super().__init__()
         | 
| 17 | 
            +
                    self.dim = dim
         | 
| 18 | 
            +
                    self.num_heads = num_heads
         | 
| 19 | 
            +
                    self.head_dim = dim // num_heads
         | 
| 20 | 
            +
                    self.window_size = window_size
         | 
| 21 | 
            +
                    
         | 
| 22 | 
            +
                    self.q = nn.Linear(dim, dim)
         | 
| 23 | 
            +
                    self.k = nn.Linear(dim, dim)
         | 
| 24 | 
            +
                    self.v = nn.Linear(dim, dim)
         | 
| 25 | 
            +
                    self.o = nn.Linear(dim, dim)
         | 
| 26 | 
            +
                    self.visualize_attention = False
         | 
| 27 | 
            +
                    
         | 
| 28 | 
            +
                    if mode == 'spatial':
         | 
| 29 | 
            +
                        self.rope_apply = self.rope_apply_spatial
         | 
| 30 | 
            +
                    elif mode == 'temporal':
         | 
| 31 | 
            +
                        self.rope_apply = self.rope_apply_temporal
         | 
| 32 | 
            +
                    elif mode == 'spatial_temporal':
         | 
| 33 | 
            +
                        self.rope_apply = self.rope_apply_spatial_temporal
         | 
| 34 | 
            +
                    else:
         | 
| 35 | 
            +
                        raise ValueError("Invalid mode: {}".format(mode))
         | 
| 36 | 
            +
                
         | 
| 37 | 
            +
                @staticmethod
         | 
| 38 | 
            +
                @amp.autocast(enabled=False, device_type="cuda")
         | 
| 39 | 
            +
                def rope_apply_spatial(x, grid_size, freqs, sequence_cond_compressed_indices=None):
         | 
| 40 | 
            +
                    batch, _, n, c = x.size(0), x.size(1), x.size(2), x.size(3) // 2
         | 
| 41 | 
            +
                    freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
         | 
| 42 | 
            +
                assert len(grid_size) == 2, "grid_size must be [h, w]"
         | 
| 43 | 
            +
                    h, w = grid_size[0], grid_size[1]
         | 
| 44 | 
            +
                    seq_len = h * w
         | 
| 45 | 
            +
                    x_i = torch.view_as_complex(x[:, :seq_len].to(torch.float64).reshape(
         | 
| 46 | 
            +
                        batch, seq_len, n, -1, 2))
         | 
| 47 | 
            +
                    freqs_i = torch.cat([
         | 
| 48 | 
            +
                        freqs[1][:h].view(1, h, 1, -1).expand(1, h, w, -1),
         | 
| 49 | 
            +
                        freqs[2][:w].view(1, 1, w, -1).expand(1, h, w, -1)
         | 
| 50 | 
            +
                    ], dim=-1).reshape(seq_len, 1, -1).unsqueeze(0).repeat(batch, 1, 1, 1)
         | 
| 51 | 
            +
                    freqs_i = torch.concat([freqs_i.new_ones(batch, seq_len, 1, c//3), freqs_i], dim=3)
         | 
| 52 | 
            +
                    x_i = torch.view_as_real(x_i * freqs_i).flatten(3)
         | 
| 53 | 
            +
                    return x_i.float()
         | 
| 54 | 
            +
                
         | 
| 55 | 
            +
                @staticmethod
         | 
| 56 | 
            +
                @amp.autocast(enabled=False, device_type="cuda")
         | 
| 57 | 
            +
                def rope_apply_temporal(x, grid_size, freqs, sequence_cond_compressed_indices=None):
         | 
| 58 | 
            +
                    batch, seq_len_actual, n, c = x.size(0), x.size(1), x.size(2), x.size(3) // 2
         | 
| 59 | 
            +
                    freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
         | 
| 60 | 
            +
                    assert len(grid_size) == 1, "grid_size must be [t]"
         | 
| 61 | 
            +
                    seq_len = grid_size[0]
         | 
| 62 | 
            +
                    x_i = torch.view_as_complex(x[:, :seq_len].to(torch.float64).reshape(batch, seq_len, n, -1, 2))
         | 
| 63 | 
            +
                    freqs_i = torch.cat([
         | 
| 64 | 
            +
                        freqs[0][:seq_len].view(seq_len, 1, 1, -1)
         | 
| 65 | 
            +
                    ], dim=-1).reshape(seq_len, 1, -1).unsqueeze(0).repeat(batch, 1, 1, 1)
         | 
| 66 | 
            +
                    freqs_i = torch.concat([freqs_i, freqs_i.new_ones(batch, seq_len, 1, 2 * c//3)], dim=3)
         | 
| 67 | 
            +
                    x_i = torch.view_as_real(x_i * freqs_i).flatten(3)
         | 
| 68 | 
            +
                    if seq_len_actual > seq_len:
         | 
| 69 | 
            +
                        sequence_cond_seq_length = seq_len_actual - seq_len
         | 
| 70 | 
            +
                        if sequence_cond_seq_length == seq_len:
         | 
| 71 | 
            +
                            x_i_sequence_cond = torch.view_as_complex(x[:, seq_len:].to(torch.float64).reshape(batch, seq_len_actual - seq_len, n, -1, 2))
         | 
| 72 | 
            +
                            x_i_sequence_cond = torch.view_as_real(x_i_sequence_cond * freqs_i).flatten(3)
         | 
| 73 | 
            +
                        else:
         | 
| 74 | 
            +
                            sequence_cond_compressed_index = sequence_cond_compressed_indices[0]
         | 
| 75 | 
            +
                            sequence_cond_t_length = len(sequence_cond_compressed_index)
         | 
| 76 | 
            +
                            assert sequence_cond_t_length == sequence_cond_seq_length, "`sequence_cond_t_length` must be equal to `sequence_cond_seq_length`"
         | 
| 77 | 
            +
                            x_i_sequence_cond = torch.view_as_complex(x[:, seq_len:].to(torch.float64).reshape(batch, sequence_cond_seq_length, n, -1, 2))
         | 
| 78 | 
            +
                            freqs_i_sequence_cond = torch.cat([
         | 
| 79 | 
            +
                                freqs[0][sequence_cond_compressed_index].view(sequence_cond_t_length, 1, 1, -1),
         | 
| 80 | 
            +
                            ], dim=-1).reshape(sequence_cond_seq_length, 1, -1).unsqueeze(0).repeat(batch, 1, 1, 1)
         | 
| 81 | 
            +
                            freqs_i_sequence_cond = torch.concat([freqs_i_sequence_cond, freqs_i_sequence_cond.new_ones(batch, sequence_cond_t_length, 1, 2 * c//3)], dim=3)
         | 
| 82 | 
            +
                            x_i_sequence_cond = torch.view_as_real(x_i_sequence_cond * freqs_i_sequence_cond).flatten(3)
         | 
| 83 | 
            +
                        x_i = torch.cat([x_i, x_i_sequence_cond], dim=1)
         | 
| 84 | 
            +
                    
         | 
| 85 | 
            +
                    return x_i.float()
         | 
| 86 | 
            +
                
         | 
| 87 | 
            +
                @staticmethod
         | 
| 88 | 
            +
                @amp.autocast(enabled=False, device_type="cuda")
         | 
| 89 | 
            +
                def rope_apply_spatial_temporal(x, grid_sizes, freqs, sequence_cond_compressed_indices=None):
         | 
| 90 | 
            +
                    batch, seq_len_actual, n, c = x.size(0), x.size(1), x.size(2), x.size(3) // 2
         | 
| 91 | 
            +
                    freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
         | 
| 92 | 
            +
                    assert len(grid_sizes) == 3, "grid_sizes must be ([f, h, w])"
         | 
| 93 | 
            +
                    f, h, w = grid_sizes[0], grid_sizes[1], grid_sizes[2]
         | 
| 94 | 
            +
                    seq_len = f * h * w
         | 
| 95 | 
            +
                    x_i = torch.view_as_complex(x[:, :seq_len].to(torch.float64).reshape(
         | 
| 96 | 
            +
                        batch, seq_len, n, -1, 2))
         | 
| 97 | 
            +
                    freqs_i = torch.cat([
         | 
| 98 | 
            +
                        freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
         | 
| 99 | 
            +
                        freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
         | 
| 100 | 
            +
                        freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
         | 
| 101 | 
            +
                    ], dim=-1).reshape(seq_len, 1, -1)
         | 
| 102 | 
            +
                    x_i = torch.view_as_real(x_i * freqs_i).flatten(3)
         | 
| 103 | 
            +
                    if seq_len_actual > seq_len:
         | 
| 104 | 
            +
                        sequence_cond_seq_length = seq_len_actual - seq_len
         | 
| 105 | 
            +
                        if sequence_cond_seq_length == seq_len:
         | 
| 106 | 
            +
                            x_i_sequence_cond = torch.view_as_complex(x[:, seq_len:].to(torch.float64).reshape(batch, seq_len_actual - seq_len, n, -1, 2))
         | 
| 107 | 
            +
                            x_i_sequence_cond = torch.view_as_real(x_i_sequence_cond * freqs_i).flatten(3)
         | 
| 108 | 
            +
                        else:
         | 
| 109 | 
            +
                            sequence_cond_compressed_index = sequence_cond_compressed_indices[0]
         | 
| 110 | 
            +
                            sequence_cond_t_length = len(sequence_cond_compressed_index)
         | 
| 111 | 
            +
                            assert sequence_cond_t_length * h * w == sequence_cond_seq_length, "`sequence_cond_t_length * h * w` must be equal to `sequence_cond_seq_length`"
         | 
| 112 | 
            +
                            x_i_sequence_cond = torch.view_as_complex(x[:, seq_len:].to(torch.float64).reshape(batch, sequence_cond_seq_length, n, -1, 2))
         | 
| 113 | 
            +
                            freqs_i_sequence_cond = torch.cat([
         | 
| 114 | 
            +
                                freqs[0][sequence_cond_compressed_index].view(sequence_cond_t_length, 1, 1, -1).expand(sequence_cond_t_length, h, w, -1),
         | 
| 115 | 
            +
                                freqs[1][:h].view(1, h, 1, -1).expand(sequence_cond_t_length, h, w, -1),
         | 
| 116 | 
            +
                                freqs[2][:w].view(1, 1, w, -1).expand(sequence_cond_t_length, h, w, -1)
         | 
| 117 | 
            +
                            ], dim=-1).reshape(sequence_cond_seq_length, 1, -1)
         | 
| 118 | 
            +
                            x_i_sequence_cond = torch.view_as_real(x_i_sequence_cond * freqs_i_sequence_cond).flatten(3)
         | 
| 119 | 
            +
                        x_i = torch.cat([x_i, x_i_sequence_cond], dim=1)
         | 
| 120 | 
            +
                    return x_i.float()
         | 
| 121 | 
            +
             | 
| 122 | 
            +
             | 
| 123 | 
            +
                def forward(self, x, seq_lens, grid_size, freqs, sequence_cond_compressed_indices):
         | 
| 124 | 
            +
                    b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
         | 
| 125 | 
            +
                    def qkv_fn(x):
         | 
| 126 | 
            +
                        q = self.q(x).view(b, s, n, d)
         | 
| 127 | 
            +
                        k = self.k(x).view(b, s, n, d)
         | 
| 128 | 
            +
                        v = self.v(x).view(b, s, n, d)
         | 
| 129 | 
            +
                        return q, k, v
         | 
| 130 | 
            +
             | 
| 131 | 
            +
                    q, k, v = qkv_fn(x)
         | 
| 132 | 
            +
                    q_rope = self.rope_apply(q, grid_size, freqs, sequence_cond_compressed_indices)
         | 
| 133 | 
            +
                    k_rope = self.rope_apply(k, grid_size, freqs, sequence_cond_compressed_indices)
         | 
| 134 | 
            +
                    if self.visualize_attention:
         | 
| 135 | 
            +
                        with torch.no_grad():
         | 
| 136 | 
            +
                    self._last_attn_maps = self._compute_attention_for_visualization(q_rope, k_rope)  # CPU tensor of shape [S, S]
         | 
| 137 | 
            +
                            self._last_grid_sizes = grid_size
         | 
| 138 | 
            +
                            self._last_seq_lens = seq_lens
         | 
| 139 | 
            +
                    x = flash_attention(
         | 
| 140 | 
            +
                        q=q_rope,
         | 
| 141 | 
            +
                        k=k_rope,
         | 
| 142 | 
            +
                        v=v,
         | 
| 143 | 
            +
                        k_lens=None,
         | 
| 144 | 
            +
                        window_size=self.window_size)
         | 
| 145 | 
            +
                    x = x.flatten(2)
         | 
| 146 | 
            +
                    x = self.o(x)
         | 
| 147 | 
            +
                    return x
         | 
| 148 | 
            +
             | 
| 149 | 
            +
             | 
| 150 | 
            +
            class DeRA(nn.Module):
         | 
| 151 | 
            +
                def __init__(self, dim, rank, use_spatial=True, use_temporal=True):
         | 
| 152 | 
            +
                    super(DeRA, self).__init__()
         | 
| 153 | 
            +
                    self.dim = dim
         | 
| 154 | 
            +
                    self.rank = rank
         | 
| 155 | 
            +
                    self.use_spatial = use_spatial
         | 
| 156 | 
            +
                    self.use_temporal = use_temporal
         | 
| 157 | 
            +
                    
         | 
| 158 | 
            +
                    if not use_spatial and not use_temporal:
         | 
| 159 | 
            +
                        self.attention_mode = "none"
         | 
| 160 | 
            +
                    else:
         | 
| 161 | 
            +
                        self.attention_mode = "spatial_temporal" if use_spatial and use_temporal else "spatial" if use_spatial else "temporal"
         | 
| 162 | 
            +
                    
         | 
| 163 | 
            +
                    self.spatial_down_proj = nn.Linear(self.dim, rank, bias=False)
         | 
| 164 | 
            +
                    self.spatial_up_proj = nn.Linear(rank, self.dim, bias=False)
         | 
| 165 | 
            +
                    self.spatial_up_proj.weight.data.zero_()
         | 
| 166 | 
            +
                    if self.attention_mode != "none":
         | 
| 167 | 
            +
                        self.spatial_attn = DeRAAttention(dim=rank, num_heads=4, window_size=(-1, -1),
         | 
| 168 | 
            +
                                                          mode=self.attention_mode)
         | 
| 169 | 
            +
                    else:
         | 
| 170 | 
            +
                        self.spatial_attn = None
         | 
| 171 | 
            +
                            
         | 
| 172 | 
            +
                def forward(self, x, seq_lens, grid_sizes, freqs, sequence_cond_compressed_indices):
         | 
| 173 | 
            +
                    _, actual_seq, _ = x.shape
         | 
| 174 | 
            +
                    if isinstance(grid_sizes, torch.Tensor):
         | 
| 175 | 
            +
                        grid_sizes = tuple(grid_sizes[0].tolist())
         | 
| 176 | 
            +
                        
         | 
| 177 | 
            +
                    if len(grid_sizes) != 3:
         | 
| 178 | 
            +
                        raise ValueError("`grid_sizes` should contain time, spatial height, and width dimensions")
         | 
| 179 | 
            +
                    _, orig_h, orig_w = grid_sizes
         | 
| 180 | 
            +
                    actual_t = actual_seq // (orig_h * orig_w)
         | 
| 181 | 
            +
                    
         | 
| 182 | 
            +
                    x_low = self.spatial_down_proj(x)
         | 
| 183 | 
            +
                    if self.attention_mode == "spatial":
         | 
| 184 | 
            +
                        x_low_spatial = rearrange(x_low, 'b (t h w) r -> (b t) (h w) r', t=actual_t, h=orig_h, w=orig_w)
         | 
| 185 | 
            +
                        x_low_spatial = self.spatial_attn(x_low_spatial, seq_lens, grid_sizes[1:], freqs, sequence_cond_compressed_indices)
         | 
| 186 | 
            +
                        x_low = rearrange(x_low_spatial, '(b t) (h w) r -> b (t h w) r', t=actual_t, h=orig_h, w=orig_w)
         | 
| 187 | 
            +
                    elif self.attention_mode == "temporal":
         | 
| 188 | 
            +
                        x_low_temporal = rearrange(x_low, 'b (t h w) r -> (b h w) t r', t=actual_t, h=orig_h, w=orig_w)
         | 
| 189 | 
            +
                        x_low_temporal = self.spatial_attn(x_low_temporal, seq_lens, grid_sizes[:1], freqs, sequence_cond_compressed_indices)
         | 
| 190 | 
            +
                        x_low = rearrange(x_low_temporal, '(b h w) t r -> b (t h w) r', t=actual_t, h=orig_h, w=orig_w)
         | 
| 191 | 
            +
                    elif self.attention_mode == "spatial_temporal":
         | 
| 192 | 
            +
                        x_low = self.spatial_attn(x_low, seq_lens, grid_sizes, freqs, sequence_cond_compressed_indices)
         | 
| 193 | 
            +
                    x_out = self.spatial_up_proj(x_low)
         | 
| 194 | 
            +
                    return x_out
         | 
| 195 | 
            +
                
         | 
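
DeRA above implements a decomposed low-rank adapter: tokens are projected down to `rank` channels, run through a lightweight spatial, temporal, or joint spatial-temporal attention at that reduced width, and projected back up through a linear layer whose weights are zero-initialized, so the adapter contributes nothing at initialization and can learn a residual on top of the base model's features. A minimal conceptual sketch of that zero-init low-rank residual pattern (plain PyTorch with standard multi-head attention standing in for the repo's flash_attention/RoPE path; names are illustrative, not the repo's API):

    import torch
    import torch.nn as nn

    class LowRankResidualAdapter(nn.Module):
        """Down-project -> attention at reduced width -> zero-init up-project."""
        def __init__(self, dim: int, rank: int, num_heads: int = 4):
            super().__init__()
            self.down = nn.Linear(dim, rank, bias=False)
            self.attn = nn.MultiheadAttention(rank, num_heads, batch_first=True)
            self.up = nn.Linear(rank, dim, bias=False)
            nn.init.zeros_(self.up.weight)  # adapter output starts at exactly zero

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            # x: [batch, seq_len, dim]; the result is added to the base branch.
            h = self.down(x)
            h, _ = self.attn(h, h, h, need_weights=False)
            return self.up(h)

    x = torch.randn(2, 16, 128)
    adapter = LowRankResidualAdapter(dim=128, rank=32)
    print(adapter(x).abs().max().item())  # 0.0 before any training step

The zero-initialized up projection mirrors the `spatial_up_proj.weight.data.zero_()` call in DeRA and is the usual way to make an injected branch start as a no-op residual.
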
    	
        model/dit.py
    ADDED
    
    | @@ -0,0 +1,1090 @@ | |
| 1 | 
            +
            import math
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            import torch
         | 
| 4 | 
            +
            import torch.amp as amp
         | 
| 5 | 
            +
            import torch.nn as nn
         | 
| 6 | 
            +
            from util.model_util import hash_state_dict_keys
         | 
| 7 | 
            +
            from einops import rearrange
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            try:
         | 
| 10 | 
            +
                import flash_attn_interface
         | 
| 11 | 
            +
                FLASH_ATTN_3_AVAILABLE = True
         | 
| 12 | 
            +
            except ModuleNotFoundError:
         | 
| 13 | 
            +
                FLASH_ATTN_3_AVAILABLE = False
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            try:
         | 
| 16 | 
            +
                import flash_attn
         | 
| 17 | 
            +
                FLASH_ATTN_2_AVAILABLE = True
         | 
| 18 | 
            +
            except ModuleNotFoundError:
         | 
| 19 | 
            +
                FLASH_ATTN_2_AVAILABLE = False
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            try:
         | 
| 22 | 
            +
                from sageattention import sageattn
         | 
| 23 | 
            +
                SAGE_ATTN_AVAILABLE = True
         | 
| 24 | 
            +
            except ModuleNotFoundError:
         | 
| 25 | 
            +
                SAGE_ATTN_AVAILABLE = False
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            import warnings
         | 
| 28 | 
            +
             | 
| 29 | 
            +
             | 
| 30 | 
            +
            __all__ = ['WanModel']
         | 
| 31 | 
            +
             | 
| 32 | 
            +
             | 
| 33 | 
            +
            def flash_attention(
         | 
| 34 | 
            +
                q,
         | 
| 35 | 
            +
                k,
         | 
| 36 | 
            +
                v,
         | 
| 37 | 
            +
                q_lens=None,
         | 
| 38 | 
            +
                k_lens=None,
         | 
| 39 | 
            +
                dropout_p=0.,
         | 
| 40 | 
            +
                softmax_scale=None,
         | 
| 41 | 
            +
                q_scale=None,
         | 
| 42 | 
            +
                causal=False,
         | 
| 43 | 
            +
                window_size=(-1, -1),
         | 
| 44 | 
            +
                deterministic=False,
         | 
| 45 | 
            +
                dtype=torch.bfloat16,
         | 
| 46 | 
            +
                version=None,
         | 
| 47 | 
            +
            ):
         | 
| 48 | 
            +
                """
         | 
| 49 | 
            +
                q:              [B, Lq, Nq, C1].
         | 
| 50 | 
            +
                k:              [B, Lk, Nk, C1].
         | 
| 51 | 
            +
                v:              [B, Lk, Nk, C2]. Nq must be divisible by Nk.
         | 
| 52 | 
            +
                q_lens:         [B].
         | 
| 53 | 
            +
                k_lens:         [B].
         | 
| 54 | 
            +
                dropout_p:      float. Dropout probability.
         | 
| 55 | 
            +
                softmax_scale:  float. The scaling of QK^T before applying softmax.
         | 
| 56 | 
            +
                causal:         bool. Whether to apply causal attention mask.
         | 
| 57 | 
            +
    window_size:    (left, right). If not (-1, -1), apply sliding window local attention.
         | 
| 58 | 
            +
                deterministic:  bool. If True, slightly slower and uses more memory.
         | 
| 59 | 
            +
    dtype:          torch.dtype. Dtype to cast q/k/v to when they are not already float16/bfloat16.
         | 
| 60 | 
            +
                """
         | 
| 61 | 
            +
                half_dtypes = (torch.float16, torch.bfloat16)
         | 
| 62 | 
            +
                assert dtype in half_dtypes
         | 
| 63 | 
            +
                assert q.device.type == 'cuda' and q.size(-1) <= 256
         | 
| 64 | 
            +
             | 
| 65 | 
            +
                # params
         | 
| 66 | 
            +
                b, lq, lk, out_dtype = q.size(0), q.size(1), k.size(1), q.dtype
         | 
| 67 | 
            +
             | 
| 68 | 
            +
                def half(x):
         | 
| 69 | 
            +
                    return x if x.dtype in half_dtypes else x.to(dtype)
         | 
| 70 | 
            +
             | 
| 71 | 
            +
                # preprocess query
         | 
| 72 | 
            +
                if q_lens is None:
         | 
| 73 | 
            +
                    q = half(q.flatten(0, 1))
         | 
| 74 | 
            +
                    q_lens = torch.tensor(
         | 
| 75 | 
            +
                        [lq] * b, dtype=torch.int32).to(
         | 
| 76 | 
            +
                            device=q.device, non_blocking=True)
         | 
| 77 | 
            +
                else:
         | 
| 78 | 
            +
                    q = half(torch.cat([u[:v] for u, v in zip(q, q_lens)]))
         | 
| 79 | 
            +
             | 
| 80 | 
            +
                # preprocess key, value
         | 
| 81 | 
            +
                if k_lens is None:
         | 
| 82 | 
            +
                    k = half(k.flatten(0, 1))
         | 
| 83 | 
            +
                    v = half(v.flatten(0, 1))
         | 
| 84 | 
            +
                    k_lens = torch.tensor(
         | 
| 85 | 
            +
                        [lk] * b, dtype=torch.int32).to(
         | 
| 86 | 
            +
                            device=k.device, non_blocking=True)
         | 
| 87 | 
            +
                else:
         | 
| 88 | 
            +
                    k = half(torch.cat([u[:v] for u, v in zip(k, k_lens)]))
         | 
| 89 | 
            +
                    v = half(torch.cat([u[:v] for u, v in zip(v, k_lens)]))
         | 
| 90 | 
            +
             | 
| 91 | 
            +
                q = q.to(v.dtype)
         | 
| 92 | 
            +
                k = k.to(v.dtype)
         | 
| 93 | 
            +
             | 
| 94 | 
            +
                if q_scale is not None:
         | 
| 95 | 
            +
                    q = q * q_scale
         | 
| 96 | 
            +
             | 
| 97 | 
            +
                if version is not None and version == 3 and not FLASH_ATTN_3_AVAILABLE:
         | 
| 98 | 
            +
                    warnings.warn(
         | 
| 99 | 
            +
                        'Flash attention 3 is not available, use flash attention 2 instead.'
         | 
| 100 | 
            +
                    )
         | 
| 101 | 
            +
             | 
| 102 | 
            +
                # apply attention
         | 
| 103 | 
            +
                if (version is None or version == 3) and FLASH_ATTN_3_AVAILABLE:
         | 
| 104 | 
            +
                    # Note: dropout_p, window_size are not supported in FA3 now.
         | 
| 105 | 
            +
                    x = flash_attn_interface.flash_attn_varlen_func(
         | 
| 106 | 
            +
                        q=q,
         | 
| 107 | 
            +
                        k=k,
         | 
| 108 | 
            +
                        v=v,
         | 
| 109 | 
            +
                        cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum(
         | 
| 110 | 
            +
                            0, dtype=torch.int32).to(q.device, non_blocking=True),
         | 
| 111 | 
            +
                        cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum(
         | 
| 112 | 
            +
                            0, dtype=torch.int32).to(q.device, non_blocking=True),
         | 
| 113 | 
            +
                        seqused_q=None,
         | 
| 114 | 
            +
                        seqused_k=None,
         | 
| 115 | 
            +
                        max_seqlen_q=lq,
         | 
| 116 | 
            +
                        max_seqlen_k=lk,
         | 
| 117 | 
            +
                        softmax_scale=softmax_scale,
         | 
| 118 | 
            +
                        causal=causal,
         | 
| 119 | 
            +
                        deterministic=deterministic)[0].unflatten(0, (b, lq))
         | 
| 120 | 
            +
                elif FLASH_ATTN_2_AVAILABLE:
         | 
| 121 | 
            +
                    x = flash_attn.flash_attn_varlen_func(
         | 
| 122 | 
            +
                        q=q,
         | 
| 123 | 
            +
                        k=k,
         | 
| 124 | 
            +
                        v=v,
         | 
| 125 | 
            +
                        cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum(
         | 
| 126 | 
            +
                            0, dtype=torch.int32).to(q.device, non_blocking=True),
         | 
| 127 | 
            +
                        cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum(
         | 
| 128 | 
            +
                            0, dtype=torch.int32).to(q.device, non_blocking=True),
         | 
| 129 | 
            +
                        max_seqlen_q=lq,
         | 
| 130 | 
            +
                        max_seqlen_k=lk,
         | 
| 131 | 
            +
                        dropout_p=dropout_p,
         | 
| 132 | 
            +
                        softmax_scale=softmax_scale,
         | 
| 133 | 
            +
                        causal=causal,
         | 
| 134 | 
            +
                        window_size=window_size,
         | 
| 135 | 
            +
                        deterministic=deterministic).unflatten(0, (b, lq))
         | 
| 136 | 
            +
                elif SAGE_ATTN_AVAILABLE:
         | 
| 137 | 
            +
                    q = q.unsqueeze(0).transpose(1, 2).to(dtype)
         | 
| 138 | 
            +
                    k = k.unsqueeze(0).transpose(1, 2).to(dtype)
         | 
| 139 | 
            +
                    v = v.unsqueeze(0).transpose(1, 2).to(dtype)
         | 
| 140 | 
            +
                    x = sageattn(q, k, v, dropout_p=dropout_p, is_causal=causal)
         | 
| 141 | 
            +
                    x = x.transpose(1, 2).contiguous()
         | 
| 142 | 
            +
                else:
         | 
| 143 | 
            +
                    q = q.unsqueeze(0).transpose(1, 2).to(dtype)
         | 
| 144 | 
            +
                    k = k.unsqueeze(0).transpose(1, 2).to(dtype)
         | 
| 145 | 
            +
                    v = v.unsqueeze(0).transpose(1, 2).to(dtype)
         | 
| 146 | 
            +
                    x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
         | 
| 147 | 
            +
                    x = x.transpose(1, 2).contiguous()
         | 
| 148 | 
            +
             | 
| 149 | 
            +
                # output
         | 
| 150 | 
            +
                return x.type(out_dtype)
         | 
| 151 | 
            +
             | 
| 152 | 
            +
             | 
| 153 | 
            +
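# --- Example (editor's sketch, not part of the original model code) ------------------------
# `flash_attention` above packs the batch into one token stream and hands the varlen kernels
# cumulative sequence offsets. The helper below (a hypothetical name) shows what those
# `cu_seqlens_*` tensors look like for two sequences of length 5 and 3.
def _sketch_cu_seqlens():
    import torch
    q_lens = torch.tensor([5, 3], dtype=torch.int32)  # per-sample query lengths, [B]
    cu_seqlens_q = torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum(0, dtype=torch.int32)
    # tensor([0, 5, 8]): sequence i occupies packed tokens [cu_seqlens_q[i], cu_seqlens_q[i+1])
    return cu_seqlens_q
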
def create_sdpa_mask(q, k, q_lens, k_lens, causal=False):
    b, lq, lk = q.size(0), q.size(1), k.size(1)
    if q_lens is None:
        q_lens = torch.tensor([lq] * b, dtype=torch.int32)
    if k_lens is None:
        k_lens = torch.tensor([lk] * b, dtype=torch.int32)
    attn_mask = torch.zeros((b, lq, lk), dtype=torch.bool)
    for i in range(b):
        q_len, k_len = q_lens[i], k_lens[i]
        attn_mask[i, q_len:, :] = True
        attn_mask[i, :, k_len:] = True

        if causal:
            causal_mask = torch.triu(torch.ones((lq, lk), dtype=torch.bool), diagonal=1)
            attn_mask[i, :, :] = torch.logical_or(attn_mask[i, :, :], causal_mask)

    attn_mask = attn_mask.logical_not().to(q.device, non_blocking=True)
    return attn_mask

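# --- Example (editor's sketch, not part of the original model code) ------------------------
# `create_sdpa_mask` builds a boolean mask where True marks positions that may attend, in the
# form expected by `attn_mask` of scaled_dot_product_attention. A toy padded case:
def _sketch_sdpa_mask():
    import torch
    q = torch.zeros(1, 4, 8)          # [B, Lq, C]; only the shapes matter here
    k = torch.zeros(1, 6, 8)          # [B, Lk, C]
    mask = create_sdpa_mask(q, k,
                            q_lens=torch.tensor([3]),
                            k_lens=torch.tensor([5]),
                            causal=False)
    # mask.shape == (1, 4, 6); query row 3 and key column 5 are False (padding is masked out)
    return mask
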
def attention(
    q,
    k,
    v,
    q_lens=None,
    k_lens=None,
    dropout_p=0.,
    softmax_scale=None,
    q_scale=None,
    causal=False,
    window_size=(-1, -1),
    deterministic=False,
    dtype=torch.bfloat16,
    fa_version=None,
):
    if FLASH_ATTN_2_AVAILABLE or FLASH_ATTN_3_AVAILABLE:
        return flash_attention(
            q=q,
            k=k,
            v=v,
            q_lens=q_lens,
            k_lens=k_lens,
            dropout_p=dropout_p,
            softmax_scale=softmax_scale,
            q_scale=q_scale,
            causal=causal,
            window_size=window_size,
            deterministic=deterministic,
            dtype=dtype,
            version=fa_version,
        )
    else:
        if q_lens is not None or k_lens is not None:
            warnings.warn('Padding mask is disabled when using scaled_dot_product_attention. It can have a significant impact on performance.')
        attn_mask = None

        q = q.transpose(1, 2).to(dtype)
        k = k.transpose(1, 2).to(dtype)
        v = v.transpose(1, 2).to(dtype)

        out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, is_causal=causal, dropout_p=dropout_p)

        out = out.transpose(1, 2).contiguous()
        return out

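# --- Example (editor's sketch, not part of the original model code) ------------------------
# Typical call into the `attention` wrapper with toy shapes [B, L, num_heads, head_dim].
# With flash-attn installed the tensors must live on a CUDA device (see the assert in
# `flash_attention`); without it, the call falls through to the SDPA branch and also runs on CPU.
def _sketch_attention_call():
    import torch
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    q = torch.randn(2, 16, 4, 32, device=device)
    k = torch.randn(2, 16, 4, 32, device=device)
    v = torch.randn(2, 16, 4, 32, device=device)
    out = attention(q, k, v, dtype=torch.bfloat16)   # [2, 16, 4, 32]
    return out.shape
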
def sinusoidal_embedding_1d(dim, position):
    # preprocess
    assert dim % 2 == 0
    half = dim // 2
    position = position.type(torch.float64)

    # calculation
    sinusoid = torch.outer(
        position, torch.pow(10000, -torch.arange(half).to(position).div(half)))
    x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
    return x


@amp.autocast(enabled=False, device_type="cuda")
def rope_params(max_seq_len, dim, theta=10000):
    assert dim % 2 == 0
    freqs = torch.outer(
        torch.arange(max_seq_len),
        1.0 / torch.pow(theta,
                        torch.arange(0, dim, 2).to(torch.float64).div(dim)))
    freqs = torch.polar(torch.ones_like(freqs), freqs)
    return freqs

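# --- Example (editor's sketch, not part of the original model code) ------------------------
# `rope_params(L, d)` returns one complex rotation per (position, frequency pair), i.e. a
# complex tensor of shape [L, d // 2]. `WanModel.__init__` below builds three such tables
# (temporal, height, width) and concatenates them along the last dimension.
def _sketch_rope_params_shape():
    import torch
    freqs_t = rope_params(1024, 44)   # [1024, 22]
    freqs_h = rope_params(1024, 42)   # [1024, 21]
    freqs_w = rope_params(1024, 42)   # [1024, 21]
    freqs = torch.cat([freqs_t, freqs_h, freqs_w], dim=1)   # [1024, 64] == head_dim // 2 for head_dim 128
    return freqs.shape
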
@amp.autocast(enabled=False, device_type="cuda")
def rope_apply(x, grid_sizes, freqs, sequence_cond_compressed_indices=None):
    batch, seq_len_actual, n, c = x.size(0), x.size(1), x.size(2), x.size(3) // 2
    freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
    output = []
    assert len(grid_sizes) == batch, "grid_sizes must have the same length as the batch size ([B, 3=[f, h, w]])"
    for i, (f, h, w) in enumerate(grid_sizes.tolist()):
        seq_len = f * h * w
        x_i = torch.view_as_complex(x[i, :seq_len].to(torch.float64).reshape(
            seq_len, n, -1, 2))
        freqs_i = torch.cat([
            freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
            freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
            freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
        ], dim=-1).reshape(seq_len, 1, -1)
        x_i = torch.view_as_real(x_i * freqs_i).flatten(2)

        if seq_len_actual > seq_len:
            sequence_cond_seq_length = seq_len_actual - seq_len
            if sequence_cond_seq_length == seq_len:
                x_i_sequence_cond = torch.view_as_complex(x[i, seq_len:].to(torch.float64).reshape(seq_len_actual - seq_len, n, -1, 2))
                x_i_sequence_cond = torch.view_as_real(x_i_sequence_cond * freqs_i).flatten(2)
            else:
                sequence_cond_compressed_index = sequence_cond_compressed_indices[i]
                sequence_cond_t_length = len(sequence_cond_compressed_index)
                assert sequence_cond_t_length * h * w == sequence_cond_seq_length, "`sequence_cond_t_length * h * w` must be equal to `sequence_cond_seq_length`"
                x_i_sequence_cond = torch.view_as_complex(x[i, seq_len:].to(torch.float64).reshape(sequence_cond_seq_length, n, -1, 2))
                freqs_i_sequence_cond = torch.cat([
                    freqs[0][sequence_cond_compressed_index].view(sequence_cond_t_length, 1, 1, -1).expand(sequence_cond_t_length, h, w, -1),
                    freqs[1][:h].view(1, h, 1, -1).expand(sequence_cond_t_length, h, w, -1),
                    freqs[2][:w].view(1, 1, w, -1).expand(sequence_cond_t_length, h, w, -1)
                ], dim=-1).reshape(sequence_cond_seq_length, 1, -1)
                x_i_sequence_cond = torch.view_as_real(x_i_sequence_cond * freqs_i_sequence_cond).flatten(2)
            x_i = torch.cat([x_i, x_i_sequence_cond])

        output.append(x_i)
    return torch.stack(output).float()

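# --- Example (editor's sketch, not part of the original model code) ------------------------
# `rope_apply` splits the head's complex-pair dimension c = head_dim // 2 into three groups
# for the (frame, height, width) axes: (c - 2*(c//3), c//3, c//3). For the default
# head_dim = 128 (dim=2048, num_heads=16) that is (22, 21, 21), matching the (44, 42, 42)
# real-valued split used when `WanModel.__init__` builds `self.freqs`.
def _sketch_rope_split():
    head_dim = 128
    c = head_dim // 2
    split = (c - 2 * (c // 3), c // 3, c // 3)
    assert split == (22, 21, 21) and sum(split) == c
    return split
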
class WanRMSNorm(nn.Module):

    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        return self._norm(x.float()).type_as(x) * self.weight

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)


class WanLayerNorm(nn.LayerNorm):

    def __init__(self, dim, eps=1e-6, elementwise_affine=False):
        super().__init__(dim, elementwise_affine=elementwise_affine, eps=eps)

    def forward(self, x):
        return super().forward(x.float()).type_as(x)


class WanSelfAttention(nn.Module):

    def __init__(self,
                 dim,
                 num_heads,
                 window_size=(-1, -1),
                 qk_norm=True,
                 eps=1e-6):
        assert dim % num_heads == 0
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.window_size = window_size
        self.qk_norm = qk_norm
        self.eps = eps

        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)
        self.o = nn.Linear(dim, dim)
        self.norm_q = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
        self.norm_k = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
        self.visualize_attention = False

    def forward(self, x, seq_lens, grid_sizes, freqs, sequence_cond_compressed_indices):
        """
        Args:
            x:              [B, L, C].
            seq_lens:       [B].
            grid_sizes:     [B, 3=[f, h, w]].
            freqs:          [L, 2].
            sequence_cond_compressed_indices: [B, T_sequence_cond].

        `f` in `grid_sizes` can be less than the actual sequence length (L),
        which indicates that either a full in-context condition (when L = 2*f) or a
        sparse in-context condition (when f < L < 2*f and
        `sequence_cond_compressed_indices` is not None) is used.
        """
        b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim

        def qkv_fn(x):
            q = self.norm_q(self.q(x)).view(b, s, n, d)
            k = self.norm_k(self.k(x)).view(b, s, n, d)
            v = self.v(x).view(b, s, n, d)
            return q, k, v

        q, k, v = qkv_fn(x)

        q_rope = rope_apply(q, grid_sizes, freqs, sequence_cond_compressed_indices)
        k_rope = rope_apply(k, grid_sizes, freqs, sequence_cond_compressed_indices)

        if self.visualize_attention:
            with torch.no_grad():
                self._last_attn_maps = self._compute_attention_for_visualization(q_rope, k_rope)  # CPU tensor of [S, S]
                self._last_grid_sizes = grid_sizes
                self._last_seq_lens = seq_lens

        x = flash_attention(
            q=q_rope,
            k=k_rope,
            v=v,
            k_lens=None,
            window_size=self.window_size)

        # output
        x = x.flatten(2)
        x = self.o(x)
        return x

    def _compute_attention_for_visualization(self, q, k):
        """Compute attention maps for visualization purposes"""
        # b, _, n, d = q.shape
        print("Computing attention maps for visualization")
        # Reshape for attention computation
        q = q.permute(0, 2, 1, 3)  # [b, n, s, d]
        k = k.permute(0, 2, 1, 3)  # [b, n, s, d]
        # query: b, n, s, d
        print("q.shape=", q.shape)
        print("k.shape=", k.shape)
        attention_probs_list = []
        for i in range(0, q.shape[1], 20):
            print(f"Computing attention for head {i} to {i+20}")
            query_attention = q[-1][i : i + 20]
            key_attention = k[-1][i : i + 20]
            identity_matrix = torch.eye(
                query_attention.shape[-2],
                device=query_attention.device,
                dtype=query_attention.dtype,
            )  # shape=[s, s]
            attention_probs_temp = torch.nn.functional.scaled_dot_product_attention(
                query_attention,
                key_attention,
                identity_matrix,
                attn_mask=None,
                dropout_p=0.0,
                is_causal=False,
            )
            attention_probs_list.append(attention_probs_temp.detach().cpu())
            del (
                query_attention,
                key_attention,
                identity_matrix,
                attention_probs_temp,
            )
        attention_probs = torch.mean(torch.cat(attention_probs_list), dim=0).float().numpy()
        print("Attention maps computed. Shape=", attention_probs.shape)
        # Only keep attention maps, don't compute the output
        return attention_probs  # [s, s]

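# --- Example (editor's sketch, not part of the original model code) ------------------------
# Shape walk-through for WanSelfAttention on a toy 3D grid. The RoPE table is assembled the
# same way `WanModel.__init__` does below; the forward pass needs a CUDA device because
# `flash_attention` asserts one, so the call is guarded here.
def _sketch_self_attention():
    import torch
    dim, num_heads = 128, 4
    d = dim // num_heads                                  # head_dim = 32
    freqs = torch.cat([
        rope_params(64, d - 4 * (d // 6)),                # temporal
        rope_params(64, 2 * (d // 6)),                    # height
        rope_params(64, 2 * (d // 6)),                    # width
    ], dim=1)                                             # [64, d // 2]
    if not torch.cuda.is_available():
        return None
    attn = WanSelfAttention(dim, num_heads).cuda()
    grid_sizes = torch.tensor([[2, 4, 4]])                # f, h, w -> L = 2*4*4 = 32 tokens
    x = torch.randn(1, 32, dim, device='cuda')
    seq_lens = torch.tensor([32], device='cuda')
    out = attn(x, seq_lens, grid_sizes, freqs.cuda(), None)
    return out.shape                                      # torch.Size([1, 32, 128])
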
class WanT2VCrossAttention(WanSelfAttention):

    def forward(self, x, context, context_lens):
        """
        x:              [B, L1, C].
        context:        [B, L2, C].
        context_lens:   [B].
        """
        b, n, d = x.size(0), self.num_heads, self.head_dim

        # compute query, key, value
        q = self.norm_q(self.q(x)).view(b, -1, n, d)
        k = self.norm_k(self.k(context)).view(b, -1, n, d)
        v = self.v(context).view(b, -1, n, d)

        # compute attention
        x = flash_attention(q, k, v, k_lens=context_lens)

        # output
        x = x.flatten(2)
        x = self.o(x)
        return x


class WanI2VCrossAttention(WanSelfAttention):

    def __init__(self,
                 dim,
                 num_heads,
                 window_size=(-1, -1),
                 qk_norm=True,
                 eps=1e-6):
        super().__init__(dim, num_heads, window_size, qk_norm, eps)

        self.k_img = nn.Linear(dim, dim)
        self.v_img = nn.Linear(dim, dim)
        # self.alpha = nn.Parameter(torch.zeros((1, )))
        self.norm_k_img = WanRMSNorm(
            dim, eps=eps) if qk_norm else nn.Identity()

    def forward(self, x, context, context_lens):
        """
        x:              [B, L1, C].
        context:        [B, L2, C].
        context_lens:   [B].
        """
        context_img = context[:, :257]
        context = context[:, 257:]
        b, n, d = x.size(0), self.num_heads, self.head_dim

        # compute query, key, value
        q = self.norm_q(self.q(x)).view(b, -1, n, d)
        k = self.norm_k(self.k(context)).view(b, -1, n, d)
        v = self.v(context).view(b, -1, n, d)
        k_img = self.norm_k_img(self.k_img(context_img)).view(b, -1, n, d)
        v_img = self.v_img(context_img).view(b, -1, n, d)
        img_x = flash_attention(q, k_img, v_img, k_lens=None)
        # compute attention
        x = flash_attention(q, k, v, k_lens=context_lens)

        # output
        x = x.flatten(2)
        img_x = img_x.flatten(2)
        x = x + img_x
        x = self.o(x)
        return x

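# --- Example (editor's sketch, not part of the original model code) ------------------------
# WanI2VCrossAttention assumes the first 257 context tokens are the projected CLIP image
# embeddings (produced by `MLPProj` / `self.img_emb` further below) and everything after them
# is text. The slicing it performs is equivalent to:
def _sketch_i2v_context_layout():
    import torch
    dim = 2048
    img_ctx = torch.randn(1, 257, dim)    # e.g. self.img_emb(clip_fea)
    txt_ctx = torch.randn(1, 512, dim)    # e.g. self.text_embedding(context)
    context = torch.cat([img_ctx, txt_ctx], dim=1)        # [1, 769, dim]
    context_img, context_txt = context[:, :257], context[:, 257:]
    return context_img.shape, context_txt.shape           # ([1, 257, dim], [1, 512, dim])
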
WANX_CROSSATTENTION_CLASSES = {
    't2v_cross_attn': WanT2VCrossAttention,
    'i2v_cross_attn': WanI2VCrossAttention,
}


class WanAttentionBlock(nn.Module):

    def __init__(self,
                 cross_attn_type,
                 dim,
                 ffn_dim,
                 num_heads,
                 window_size=(-1, -1),
                 qk_norm=True,
                 cross_attn_norm=False,
                 eps=1e-6,
                 use_local_lora=False,
                 use_dera=False,
                 dera_rank=None,
                 use_dera_spatial=True,
                 use_dera_temporal=True):
        super().__init__()
        self.dim = dim
        self.ffn_dim = ffn_dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.qk_norm = qk_norm
        self.cross_attn_norm = cross_attn_norm
        self.eps = eps

        # layers
        self.norm1 = WanLayerNorm(dim, eps)
        self.self_attn = WanSelfAttention(dim, num_heads, window_size, qk_norm, eps)
        self.norm3 = WanLayerNorm(
            dim, eps,
            elementwise_affine=True) if cross_attn_norm else nn.Identity()
        self.cross_attn = WANX_CROSSATTENTION_CLASSES[cross_attn_type](
            dim, num_heads, (-1, -1), qk_norm, eps)
        self.norm2 = WanLayerNorm(dim, eps)
        self.ffn = nn.Sequential(
            nn.Linear(dim, ffn_dim), nn.GELU(approximate='tanh'),
            nn.Linear(ffn_dim, dim))

        # modulation
        self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)

        self.use_local_lora = use_local_lora
        if use_local_lora:
            from .local_lora import LocalLoRA
            self.local_lora = LocalLoRA(dim=dim, rank=64, kernel_size=(3, 3), stride=(1, 1))

        self.use_dera = use_dera
        if use_dera:
            from .dera import DeRA
            self.dera = DeRA(dim, rank=dera_rank, use_spatial=use_dera_spatial, use_temporal=use_dera_temporal)

    def forward(
        self,
        x,
        e,
        seq_lens,
        grid_sizes,
        freqs,
        context,
        context_lens,
        sequence_cond_compressed_indices,
        dera_freqs=None
    ):
        assert e.dtype == torch.float32
        with amp.autocast(dtype=torch.float32, device_type="cuda"):
            e = (self.modulation.to(dtype=e.dtype, device=e.device) + e).chunk(6, dim=1)
        assert e[0].dtype == torch.float32

        # self-attention
        x_self_attn_input = self.norm1(x).float() * (1 + e[1]) + e[0]
        y = self.self_attn(x_self_attn_input, seq_lens, grid_sizes, freqs, sequence_cond_compressed_indices)
        if self.use_local_lora:
            y = y + self.local_lora(x_self_attn_input, grid_sizes)

        if self.use_dera:
            y = y + self.dera(x_self_attn_input, seq_lens, grid_sizes, dera_freqs, sequence_cond_compressed_indices)

        with amp.autocast(dtype=torch.float32, device_type="cuda"):
            x = x + y * e[2]

        def cross_attn_ffn(x, context, context_lens, e):
            x = x + self.cross_attn(self.norm3(x), context, context_lens)
            y = self.ffn(self.norm2(x).float() * (1 + e[4]) + e[3])
            with amp.autocast(dtype=torch.float32, device_type="cuda"):
                x = x + y * e[5]
            return x

        x = cross_attn_ffn(x, context, context_lens, e)
        return x

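# --- Example (editor's sketch, not part of the original model code) ------------------------
# The adaLN-style modulation in WanAttentionBlock.forward: the per-timestep embedding `e`
# ([B, 6, dim]) is added to the learned `modulation` parameter and chunked into six
# [B, 1, dim] tensors: (shift, scale, gate) for self-attention and (shift, scale, gate) for
# the FFN, applied as `norm(x) * (1 + scale) + shift` and `x + y * gate`.
def _sketch_modulation_chunks():
    import torch
    dim = 2048
    modulation = torch.randn(1, 6, dim) / dim**0.5
    e = torch.randn(2, 6, dim, dtype=torch.float32)
    shift_sa, scale_sa, gate_sa, shift_ffn, scale_ffn, gate_ffn = (modulation + e).chunk(6, dim=1)
    return shift_sa.shape                                 # torch.Size([2, 1, 2048])
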
class Head(nn.Module):

    def __init__(self, dim, out_dim, patch_size, eps=1e-6):
        super().__init__()
        self.dim = dim
        self.out_dim = out_dim
        self.patch_size = patch_size
        self.eps = eps

        # layers
        out_dim = math.prod(patch_size) * out_dim
        self.norm = WanLayerNorm(dim, eps)
        self.head = nn.Linear(dim, out_dim)

        # modulation
        self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5)

    def forward(self, x, e):
        assert e.dtype == torch.float32
        with amp.autocast(dtype=torch.float32, device_type="cuda"):
            e = (self.modulation.to(dtype=e.dtype, device=e.device) + e.unsqueeze(1)).chunk(2, dim=1)
            x = (self.head(self.norm(x) * (1 + e[1]) + e[0]))
        return x


class MLPProj(torch.nn.Module):

    def __init__(self, in_dim, out_dim):
        super().__init__()

        self.proj = torch.nn.Sequential(
            torch.nn.LayerNorm(in_dim), torch.nn.Linear(in_dim, in_dim),
            torch.nn.GELU(), torch.nn.Linear(in_dim, out_dim),
            torch.nn.LayerNorm(out_dim))

    def forward(self, image_embeds):
        clip_extra_context_tokens = self.proj(image_embeds)
        return clip_extra_context_tokens

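# --- Example (editor's sketch, not part of the original model code) ------------------------
# MLPProj maps CLIP image features to model-width context tokens; in the i2v path the model
# instantiates it as `MLPProj(1280, dim)` and its output becomes the first 257 context tokens
# consumed by WanI2VCrossAttention above.
def _sketch_mlp_proj():
    import torch
    proj = MLPProj(in_dim=1280, out_dim=2048)
    clip_fea = torch.randn(1, 257, 1280)
    img_ctx = proj(clip_fea)       # [1, 257, 2048]
    return img_ctx.shape
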
            class WanModel(nn.Module):
         | 
| 623 | 
            +
             | 
| 624 | 
            +
                def __init__(self,
         | 
| 625 | 
            +
                             model_type='t2v',
         | 
| 626 | 
            +
                             patch_size=(1, 2, 2),
         | 
| 627 | 
            +
                             text_len=512,
         | 
| 628 | 
            +
                             in_dim=16,
         | 
| 629 | 
            +
                             dim=2048,
         | 
| 630 | 
            +
                             ffn_dim=8192,
         | 
| 631 | 
            +
                             freq_dim=256,
         | 
| 632 | 
            +
                             text_dim=4096,
         | 
| 633 | 
            +
                             out_dim=16,
         | 
| 634 | 
            +
                             num_heads=16,
         | 
| 635 | 
            +
                             num_layers=32,
         | 
| 636 | 
            +
                             window_size=(-1, -1),
         | 
| 637 | 
            +
                             qk_norm=True,
         | 
| 638 | 
            +
                             cross_attn_norm=False,
         | 
| 639 | 
            +
                             eps=1e-6,
         | 
| 640 | 
            +
                             use_local_lora=False,
         | 
| 641 | 
            +
                             use_dera=False,
         | 
| 642 | 
            +
                             dera_rank=None,
         | 
| 643 | 
            +
                             use_dera_spatial=True,
         | 
| 644 | 
            +
                             use_dera_temporal=True,
         | 
| 645 | 
            +
                             use_sequence_cond=False,
         | 
| 646 | 
            +
                             sequence_cond_in_dim=None,
         | 
| 647 | 
            +
                             sequence_cond_mode=None,
         | 
| 648 | 
            +
                             use_channel_cond=False,
         | 
| 649 | 
            +
                             channel_cond_in_dim=None,
         | 
| 650 | 
            +
                             use_sequence_cond_position_aware_residual=False,
         | 
| 651 | 
            +
                             use_sequence_cond_loss=False
         | 
| 652 | 
            +
                             ):
         | 
| 653 | 
            +
                    super().__init__()
         | 
| 654 | 
            +
             | 
| 655 | 
            +
                    assert model_type in ['t2v', 'i2v']
         | 
| 656 | 
            +
                    self.model_type = model_type
         | 
| 657 | 
            +
             | 
| 658 | 
            +
                    self.patch_size = patch_size
         | 
| 659 | 
            +
                    self.text_len = text_len
         | 
| 660 | 
            +
                    self.in_dim = in_dim
         | 
| 661 | 
            +
                    self.dim = dim
         | 
| 662 | 
            +
                    self.ffn_dim = ffn_dim
         | 
| 663 | 
            +
                    self.freq_dim = freq_dim
         | 
| 664 | 
            +
                    self.text_dim = text_dim
         | 
| 665 | 
            +
                    self.out_dim = out_dim
         | 
| 666 | 
            +
                    self.num_heads = num_heads
         | 
| 667 | 
            +
                    self.num_layers = num_layers
         | 
| 668 | 
            +
                    self.window_size = window_size
         | 
| 669 | 
            +
                    self.qk_norm = qk_norm
         | 
| 670 | 
            +
                    self.cross_attn_norm = cross_attn_norm
         | 
| 671 | 
            +
                    self.eps = eps
         | 
| 672 | 
            +
                    
         | 
| 673 | 
            +
                    self.use_local_lora = use_local_lora
         | 
| 674 | 
            +
                    self.use_dera = use_dera
         | 
| 675 | 
            +
             | 
| 676 | 
            +
                    # embeddings
         | 
| 677 | 
            +
                    self.patch_embedding = nn.Conv3d(in_dim, dim, kernel_size=patch_size, stride=patch_size)
         | 
| 678 | 
            +
                    self.text_embedding = nn.Sequential(
         | 
| 679 | 
            +
                        nn.Linear(text_dim, dim), nn.GELU(approximate='tanh'),
         | 
| 680 | 
            +
                        nn.Linear(dim, dim))
         | 
| 681 | 
            +
             | 
| 682 | 
            +
                    self.time_embedding = nn.Sequential(
         | 
| 683 | 
            +
                        nn.Linear(freq_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
         | 
| 684 | 
            +
                    self.time_projection = nn.Sequential(nn.SiLU(), nn.Linear(dim, dim * 6))
         | 
| 685 | 
            +
             | 
| 686 | 
            +
                    # blocks
         | 
| 687 | 
            +
                    cross_attn_type = 't2v_cross_attn' if model_type == 't2v' else 'i2v_cross_attn'
         | 
| 688 | 
            +
                    self.blocks = nn.ModuleList([
         | 
| 689 | 
            +
                        WanAttentionBlock(cross_attn_type, dim, ffn_dim, num_heads,
         | 
| 690 | 
            +
                                           window_size, qk_norm, cross_attn_norm, eps, use_local_lora=use_local_lora, 
         | 
| 691 | 
            +
                                           use_dera=use_dera, dera_rank=dera_rank, use_dera_spatial=use_dera_spatial, use_dera_temporal=use_dera_temporal)
         | 
| 692 | 
            +
                        for _ in range(num_layers)
         | 
| 693 | 
            +
                    ])
         | 
| 694 | 
            +
             | 
| 695 | 
            +
                    # head
         | 
| 696 | 
            +
                    self.head = Head(dim, out_dim, patch_size, eps)
         | 
| 697 | 
            +
             | 
| 698 | 
            +
                    # buffers (don't use register_buffer otherwise dtype will be changed in to())
         | 
| 699 | 
            +
                    assert (dim % num_heads) == 0 and (dim // num_heads) % 2 == 0
         | 
| 700 | 
            +
                    d = dim // num_heads
         | 
| 701 | 
            +
                    self.freqs = torch.cat([
         | 
| 702 | 
            +
                        rope_params(1024, d - 4 * (d // 6)),
         | 
| 703 | 
            +
                        rope_params(1024, 2 * (d // 6)),
         | 
| 704 | 
            +
                        rope_params(1024, 2 * (d // 6))
         | 
| 705 | 
            +
                    ], dim=1)
         | 
| 706 | 
            +
                    
         | 
| 707 | 
            +
                    if self.use_dera:
         | 
| 708 | 
            +
                        dera_d = dera_rank // 4  # (18)
         | 
| 709 | 
            +
                        self.dera_freqs = torch.cat([
         | 
| 710 | 
            +
                            rope_params(1024, dera_d - 4 * (dera_d // 6)),
         | 
| 711 | 
            +
                            rope_params(1024, 2 * (dera_d // 6)),
         | 
| 712 | 
            +
                            rope_params(1024, 2 * (dera_d // 6))
         | 
| 713 | 
            +
                        ], dim=1)
         | 
| 714 | 
            +
                    else:
         | 
| 715 | 
            +
                        self.dera_freqs = None
         | 
| 716 | 
            +
             | 
| 717 | 
            +
                    if model_type == 'i2v':
         | 
| 718 | 
            +
                        self.img_emb = MLPProj(1280, dim)
         | 
| 719 | 
            +
                        
         | 
| 720 | 
            +
                    self.init_weights()
         | 
| 721 | 
            +
                    
         | 
| 722 | 
            +
                    self.use_sequence_cond = use_sequence_cond
         | 
| 723 | 
            +
                    self.sequence_cond_in_dim = sequence_cond_in_dim
         | 
| 724 | 
            +
                    self.sequence_cond_mode = sequence_cond_mode
         | 
| 725 | 
            +
                    if use_sequence_cond:
         | 
| 726 | 
            +
                        assert sequence_cond_in_dim is not None, "`sequence_cond_in_dim` must be provided when `use_sequence_cond` is True"
         | 
| 727 | 
            +
                        self.sequence_cond_patch_embedding = nn.Conv3d(sequence_cond_in_dim, dim, kernel_size=patch_size, stride=patch_size)
         | 
| 728 | 
            +
                        self.sequence_cond_identifier = nn.Parameter(torch.randn(1, 1, dim) / dim**0.5)
         | 
| 729 | 
            +
                    
         | 
| 730 | 
            +
                    self.use_channel_cond = use_channel_cond
         | 
| 731 | 
            +
                    self.channel_cond_in_dim = channel_cond_in_dim
         | 
| 732 | 
            +
                    if use_channel_cond:
         | 
| 733 | 
            +
                        assert channel_cond_in_dim is not None, "`channel_cond_in_dim` must be provided when `use_channel_cond` is True"
         | 
| 734 | 
            +
                    self.use_sequence_cond_position_aware_residual = use_sequence_cond_position_aware_residual
         | 
| 735 | 
            +
                    if use_sequence_cond_position_aware_residual:
         | 
| 736 | 
            +
                        self.sequence_cond_residual_proj = nn.Linear(dim, dim, bias=False)
         | 
| 737 | 
            +
                        self.sequence_cond_residual_proj.weight.data.zero_()
         | 
| 738 | 
            +
                    
         | 
| 739 | 
            +
                    self.use_sequence_cond_loss = use_sequence_cond_loss
         | 
| 740 | 
            +
                    if self.use_sequence_cond_loss:
         | 
| 741 | 
            +
                        self.sequence_latent_to_cond_proj = nn.Linear(dim, dim, bias=False)
         | 
| 742 | 
            +
                        self.sequence_latent_to_cond_proj.weight.data.zero_()
         | 
| 743 | 
            +
                        self.head_sequence_cond_out = nn.Linear(dim, math.prod(patch_size) * out_dim)
         | 
| 744 | 
            +
                
         | 
| 745 | 
            +
                def copy_sequence_cond_patch_embedding_weights(self):
         | 
| 746 | 
            +
                    size_patch_embedding = self.patch_embedding.weight.size(1)
         | 
| 747 | 
            +
                    size_sequence_cond_patch_embedding = self.sequence_cond_patch_embedding.weight.size(1)
         | 
| 748 | 
            +
                    self.sequence_cond_patch_embedding.weight.data = self.patch_embedding.weight.data[:, size_patch_embedding - size_sequence_cond_patch_embedding:, :, :, :].clone()
         | 
| 749 | 
            +
                    if self.patch_embedding.bias is not None:
         | 
| 750 | 
            +
                        self.sequence_cond_patch_embedding.bias.data = self.patch_embedding.bias.data.clone()
         | 
| 751 | 
            +
                
         | 
| 752 | 
            +
                def copy_patch_embedding_weights_for_channel_cond(self):
         | 
| 753 | 
            +
                    original_patch_in_channels = self.patch_embedding.in_channels
         | 
| 754 | 
            +
                    new_patch_embedding = nn.Conv3d(in_channels=original_patch_in_channels + self.channel_cond_in_dim, 
         | 
| 755 | 
            +
                                                    out_channels=self.dim, kernel_size=self.patch_size, stride=self.patch_size)
         | 
| 756 | 
            +
                    new_patch_embedding.weight.data[:, :original_patch_in_channels, :, :, :] = self.patch_embedding.weight.data.clone()
         | 
| 757 | 
            +
                    if self.patch_embedding.bias is not None:
         | 
| 758 | 
            +
                        new_patch_embedding.bias.data = self.patch_embedding.bias.data.clone()
         | 
| 759 | 
            +
                    del self.patch_embedding
         | 
| 760 | 
            +
                    self.patch_embedding = new_patch_embedding
         | 
| 761 | 
            +
             | 
| 762 | 
            +
                def forward(
         | 
| 763 | 
            +
                    self,
         | 
| 764 | 
            +
                    x,
         | 
| 765 | 
            +
                    timestep,
         | 
| 766 | 
            +
                    context,
         | 
| 767 | 
            +
                    seq_len,
         | 
| 768 | 
            +
                    clip_fea=None,
         | 
| 769 | 
            +
                    y=None,
         | 
| 770 | 
            +
                    use_gradient_checkpointing=False,
         | 
| 771 | 
            +
                    sequence_cond=None,
         | 
| 772 | 
            +
                    sequence_cond_compressed_indices=None,
         | 
| 773 | 
            +
                    channel_cond=None,
         | 
| 774 | 
            +
                    sequence_cond_residual_scale=1.0,
         | 
| 775 | 
            +
                    **kwargs,
         | 
| 776 | 
            +
                ):
         | 
| 777 | 
            +
                    """
         | 
| 778 | 
            +
                    x:              A list of videos each with shape [C, T, H, W].
         | 
| 779 | 
            +
                    timestep:       [B].
         | 
| 780 | 
            +
                    context:        A list of text embeddings each with shape [L, C].
         | 
| 781 | 
            +
                    sequence_cond: A list of conditional frames each with shape [C, T_sequence_cond, H, W].
         | 
| 782 | 
            +
                    sequence_cond_compressed_indices: [B, T_sequence_cond] Indices of the latent frames that the sparse sequence condition corresponds to, where T_sequence_cond < T. For sparse mode only.
         | 
| 783 | 
            +
                    
         | 
| 784 | 
            +
                    
         | 
| 785 | 
            +
                        Note:
         | 
| 786 | 
            +
                        sequence_cond will be injected into the model as additional tokens, i.e., concatenated along the sequence dimension.
         | 
| 787 | 
            +
                        channel_cond will be injected into the model along the input's channel dimension.
         | 
| 788 | 
            +
                            
         | 
| 789 | 
            +
                        Examples:
         | 
| 790 | 
            +
                            1) for extra cond case:
         | 
| 791 | 
            +
                                # given x: [B, C, T, H, W] ----> [B, L=T*H*W, C] --patch_embedding--> [B, L, D]
         | 
| 792 | 
            +
                                # sequence_cond: [B, C_sequence_cond, T_sequence_cond, H, W] ----> [B, L_sequence_cond=T_sequence_cond*H*W, C_sequence_cond] --sequence_cond_embedding--> [B, L_sequence_cond, D]
         | 
| 793 | 
            +
                                x = torch.concat([x, sequence_cond], dim=2) # Concat on sequence dimension after patch/extra cond embedding
         | 
| 794 | 
            +
                                # after concat, x: [B, L+L_sequence_cond, D]
         | 
| 795 | 
            +
                            2) for channel cond case:
         | 
| 796 | 
            +
                                given x: [B, C, T, H, W]
         | 
| 797 | 
            +
                                channel_cond: [B, C_CHANNEL_COND, T, H, W]
         | 
| 798 | 
            +
                                x = torch.concat([x, channel_cond], dim=1) # Concat on channel dimension before patch/extra cond embedding    
         | 
| 799 | 
            +
                                # x: [B, C + C_CHANNEL_COND, T, H, W] --patch_embedding(requires param copy and tuning)--> [B, L=T*H*W, D]
         | 
| 800 | 
            +
                    """
         | 
| 801 | 
            +
                    if self.model_type == 'i2v':
         | 
| 802 | 
            +
                        assert clip_fea is not None and y is not None
         | 
| 803 | 
            +
                    # params
         | 
| 804 | 
            +
                    device = x[0].device
         | 
| 805 | 
            +
                    if self.freqs.device != device:
         | 
| 806 | 
            +
                        self.freqs = self.freqs.to(device)
         | 
| 807 | 
            +
                    if self.dera_freqs is not None and self.dera_freqs.device != device:
         | 
| 808 | 
            +
                        self.dera_freqs = self.dera_freqs.to(device)
         | 
| 809 | 
            +
                        
         | 
| 810 | 
            +
                    if y is not None:
         | 
| 811 | 
            +
                        x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
         | 
| 812 | 
            +
                    
         | 
| 813 | 
            +
                    if channel_cond is not None:
         | 
| 814 | 
            +
                        assert self.use_channel_cond, "forward argument `channel_cond` is provided but model property `self.use_channel_cond` is False"
         | 
| 815 | 
            +
                        x = [torch.cat([u, v], dim=0) for u, v in zip(x, channel_cond)]
         | 
| 816 | 
            +
             | 
| 817 | 
            +
                    # embeddings
         | 
| 818 | 
            +
                    x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
         | 
| 819 | 
            +
                    grid_sizes = torch.stack(
         | 
| 820 | 
            +
                        [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
         | 
| 821 | 
            +
                    x = [u.flatten(2).transpose(1, 2) for u in x]
         | 
| 822 | 
            +
                    x = torch.cat(x, dim=0)
         | 
| 823 | 
            +
                    
         | 
| 824 | 
            +
                    if sequence_cond is not None:
         | 
| 825 | 
            +
                        assert self.use_sequence_cond, "forward argument `sequence_cond` is provided but model property `self.use_sequence_cond` is False"
         | 
| 826 | 
            +
                        sequence_cond = [self.sequence_cond_patch_embedding(u.unsqueeze(0)) for u in sequence_cond]
         | 
| 827 | 
            +
                        sequence_cond = [u.flatten(2).transpose(1, 2) + self.sequence_cond_identifier for u in sequence_cond]
         | 
| 828 | 
            +
                        sequence_cond = torch.concat(sequence_cond, dim=0)
         | 
| 829 | 
            +
                        
         | 
| 830 | 
            +
                        x = torch.concat([x, sequence_cond], dim=1)
         | 
| 831 | 
            +
                    
         | 
| 832 | 
            +
                    actual_seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
         | 
| 833 | 
            +
                    
         | 
| 834 | 
            +
                    # time embeddings
         | 
| 835 | 
            +
                    with amp.autocast(dtype=torch.float32, device_type="cuda"):
         | 
| 836 | 
            +
                        e = self.time_embedding(
         | 
| 837 | 
            +
                            sinusoidal_embedding_1d(self.freq_dim, timestep).float())
         | 
| 838 | 
            +
                        e0 = self.time_projection(e).unflatten(1, (6, self.dim))
         | 
| 839 | 
            +
                        assert e.dtype == torch.float32 and e0.dtype == torch.float32
         | 
| 840 | 
            +
             | 
| 841 | 
            +
                    # context
         | 
| 842 | 
            +
                    context_lens = None
         | 
| 843 | 
            +
                    context = self.text_embedding(
         | 
| 844 | 
            +
                        torch.stack([
         | 
| 845 | 
            +
                            torch.cat(
         | 
| 846 | 
            +
                                [u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
         | 
| 847 | 
            +
                            for u in context
         | 
| 848 | 
            +
                        ]))
         | 
| 849 | 
            +
             | 
| 850 | 
            +
                    if clip_fea is not None:
         | 
| 851 | 
            +
                        context_clip = self.img_emb(clip_fea)  # bs x 257 x dim
         | 
| 852 | 
            +
                        context = torch.concat([context_clip, context], dim=1)
         | 
| 853 | 
            +
             | 
| 854 | 
            +
                    # arguments
         | 
| 855 | 
            +
                    kwargs = dict(e=e0, seq_lens=actual_seq_lens, grid_sizes=grid_sizes,
         | 
| 856 | 
            +
                        freqs=self.freqs, context=context, context_lens=context_lens,
         | 
| 857 | 
            +
                        sequence_cond_compressed_indices=sequence_cond_compressed_indices, dera_freqs=self.dera_freqs)
         | 
| 858 | 
            +
                    
         | 
| 859 | 
            +
                    def create_custom_forward(module):
         | 
| 860 | 
            +
                        def custom_forward(*inputs, **kwargs):
         | 
| 861 | 
            +
                            return module(*inputs, **kwargs)
         | 
| 862 | 
            +
                        return custom_forward
         | 
| 863 | 
            +
             | 
| 864 | 
            +
                    for block_idx, block in enumerate(self.blocks):
         | 
| 865 | 
            +
                        if self.training and use_gradient_checkpointing:
         | 
| 866 | 
            +
                            x = torch.utils.checkpoint.checkpoint(
         | 
| 867 | 
            +
                                create_custom_forward(block),
         | 
| 868 | 
            +
                                x, **kwargs,
         | 
| 869 | 
            +
                                use_reentrant=False,
         | 
| 870 | 
            +
                            )
         | 
| 871 | 
            +
                        else:
         | 
| 872 | 
            +
                            x = block(x, **kwargs)
         | 
| 873 | 
            +
                            
         | 
| 874 | 
            +
                        if self.use_sequence_cond_loss and block_idx == len(self.blocks) - 3:
         | 
| 875 | 
            +
                            # At this block, the context length is extended from (N+C) to 2N, where C is the length of the sparse sequence cond.
         | 
| 876 | 
            +
                            x_ori = x[:, :seq_len, :]
         | 
| 877 | 
            +
                            x_ori_projected = self.sequence_latent_to_cond_proj(x_ori)
         | 
| 878 | 
            +
                            x_seq_cond = x[:, seq_len:, :]
         | 
| 879 | 
            +
                            seq_cond_length = len(sequence_cond_compressed_indices[0])
         | 
| 880 | 
            +
                            x_ori_projected = rearrange(x_ori_projected, 'b (t h w) c -> b c t h w', t=grid_sizes[0, 0], h=grid_sizes[0, 1], w=grid_sizes[0, 2])
         | 
| 881 | 
            +
                            x_seq_cond = rearrange(x_seq_cond, 'b (t h w) c -> b c t h w', t=seq_cond_length, h=grid_sizes[0, 1], w=grid_sizes[0, 2])
         | 
| 882 | 
            +
                            x_ori_projected[:, :, sequence_cond_compressed_indices[0], :, :] += x_seq_cond
         | 
| 883 | 
            +
                            x_ori_projected = rearrange(x_ori_projected, 'b c t h w -> b (t h w) c')
         | 
| 884 | 
            +
                            x = torch.concat([x_ori, x_ori_projected], dim=1)
         | 
| 885 | 
            +
                            # Let the later blocks generate sketches at the full sequence length
         | 
| 886 | 
            +
                            
         | 
| 887 | 
            +
                        if self.use_sequence_cond_position_aware_residual and block_idx < len(self.blocks) - 1:
         | 
| 888 | 
            +
                            # Apply the sequence condition position-aware residual for all blocks except the last one
         | 
| 889 | 
            +
                            x_ori = x[:, :seq_len, :]
         | 
| 890 | 
            +
                            x_seq_cond = x[:, seq_len:, :]
         | 
| 891 | 
            +
                            x_seq_cond_projected = self.sequence_cond_residual_proj(x_seq_cond)
         | 
| 892 | 
            +
                            assert x_ori.shape[0] == 1, "Only support batch size 1 for `sequence_cond_position_aware_residual`."
         | 
| 893 | 
            +
                            seq_cond_length = len(sequence_cond_compressed_indices[0])
         | 
| 894 | 
            +
                            x_ori = rearrange(x_ori, 'b (t h w) c -> b c t h w', t=grid_sizes[0, 0], h=grid_sizes[0, 1], w=grid_sizes[0, 2])
         | 
| 895 | 
            +
                            x_seq_cond_projected = rearrange(x_seq_cond_projected, 'b (t h w) c -> b c t h w', t=seq_cond_length, h=grid_sizes[0, 1], w=grid_sizes[0, 2])
         | 
| 896 | 
            +
                            
         | 
| 897 | 
            +
                            x_ori[:, :, sequence_cond_compressed_indices[0], :, :] = x_ori[:, :, sequence_cond_compressed_indices[0], :, :] + x_seq_cond_projected * sequence_cond_residual_scale
         | 
| 898 | 
            +
                            x_ori = rearrange(x_ori, 'b c t h w -> b (t h w) c')
         | 
| 899 | 
            +
                            x = torch.concat([x_ori, x_seq_cond], dim=1)
         | 
| 900 | 
            +
                            
         | 
| 901 | 
            +
                    if sequence_cond is not None:
         | 
| 902 | 
            +
                        if self.use_sequence_cond_loss:
         | 
| 903 | 
            +
                            sequence_cond_out = x[:, seq_len:, :]
         | 
| 904 | 
            +
                            sequence_cond_out = self.unpatchify(sequence_cond_out, grid_sizes)  # sequence_cond_grid_sizes
         | 
| 905 | 
            +
                            sequence_cond_out = torch.stack(sequence_cond_out).float()  # b, c, t, h, w
         | 
| 906 | 
            +
                        else:
         | 
| 907 | 
            +
                            sequence_cond_out = None
         | 
| 908 | 
            +
                        x = x[:, :seq_len, :]
         | 
| 909 | 
            +
                    # head
         | 
| 910 | 
            +
                    x = self.head(x, e)
         | 
| 911 | 
            +
             | 
| 912 | 
            +
                    # unpatchify
         | 
| 913 | 
            +
                    x = self.unpatchify(x, grid_sizes)
         | 
| 914 | 
            +
                    x = torch.stack(x).float()
         | 
| 915 | 
            +
                    if sequence_cond is not None and self.use_sequence_cond_loss:
         | 
| 916 | 
            +
                        return x, sequence_cond_out
         | 
| 917 | 
            +
                    return x
         | 
| 918 | 
            +
             | 
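A minimal sketch (hypothetical sizes, not the repository forward pass) of the token bookkeeping in the forward method above: sequence-cond tokens are appended after the video tokens, processed jointly by the blocks, and sliced off again before the output head.

```python
import torch

B, D = 1, 64                            # hypothetical batch and model dim
t, h, w = 8, 4, 4                       # latent token grid
cond_frames = [0, 3, 7]                 # sparsely conditioned frame indices
seq_len = t * h * w                     # number of video tokens
cond_len = len(cond_frames) * h * w     # number of sequence-cond tokens

x = torch.randn(B, seq_len, D)
seq_cond = torch.randn(B, cond_len, D)
tokens = torch.cat([x, seq_cond], dim=1)    # [B, seq_len + cond_len, D]

# ... transformer blocks attend over the joint token sequence ...

video_tokens = tokens[:, :seq_len]          # goes to the output head
cond_tokens = tokens[:, seq_len:]           # optional sketch-prediction branch
print(video_tokens.shape, cond_tokens.shape)
```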
| 919 | 
            +
                def unpatchify(self, x, grid_sizes):
         | 
| 920 | 
            +
                    c = self.out_dim
         | 
| 921 | 
            +
                    out = []
         | 
| 922 | 
            +
                    for u, v in zip(x, grid_sizes.tolist()):
         | 
| 923 | 
            +
                        u = u[:math.prod(v)].view(*v, *self.patch_size, c)
         | 
| 924 | 
            +
                        u = torch.einsum('fhwpqrc->cfphqwr', u)
         | 
| 925 | 
            +
                        u = u.reshape(c, *[i * j for i, j in zip(v, self.patch_size)])
         | 
| 926 | 
            +
                        out.append(u)
         | 
| 927 | 
            +
                    return out
         | 
| 928 | 
            +
             | 
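Quick shape check for the unpatchify einsum above, using small hypothetical sizes: each token carries prod(patch_size) * out_dim values, and the einsum interleaves the token grid with the per-token patch before folding everything back into a video tensor.

```python
import math
import torch

c, patch = 4, (1, 2, 2)                           # out_dim and patch_size
grid = (3, 5, 5)                                  # (t, h, w) token grid
tokens = torch.randn(math.prod(grid), math.prod(patch) * c)

u = tokens.view(*grid, *patch, c)                 # [t, h, w, pt, ph, pw, c]
u = torch.einsum('fhwpqrc->cfphqwr', u)           # interleave grid and patch axes
video = u.reshape(c, *[g * p for g, p in zip(grid, patch)])
print(video.shape)                                # torch.Size([4, 3, 10, 10])
```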
| 929 | 
            +
                def init_weights(self):
         | 
| 930 | 
            +
                    for m in self.modules():
         | 
| 931 | 
            +
                        if isinstance(m, nn.Linear):
         | 
| 932 | 
            +
                            nn.init.xavier_uniform_(m.weight)
         | 
| 933 | 
            +
                            if m.bias is not None:
         | 
| 934 | 
            +
                                nn.init.zeros_(m.bias)
         | 
| 935 | 
            +
             | 
| 936 | 
            +
                    # init embeddings
         | 
| 937 | 
            +
                    nn.init.xavier_uniform_(self.patch_embedding.weight.flatten(1))
         | 
| 938 | 
            +
                    for m in self.text_embedding.modules():
         | 
| 939 | 
            +
                        if isinstance(m, nn.Linear):
         | 
| 940 | 
            +
                            nn.init.normal_(m.weight, std=.02)
         | 
| 941 | 
            +
                    for m in self.time_embedding.modules():
         | 
| 942 | 
            +
                        if isinstance(m, nn.Linear):
         | 
| 943 | 
            +
                            nn.init.normal_(m.weight, std=.02)
         | 
| 944 | 
            +
             | 
| 945 | 
            +
                    # init output layer
         | 
| 946 | 
            +
                    nn.init.zeros_(self.head.head.weight)
         | 
| 947 | 
            +
             | 
| 948 | 
            +
                @staticmethod
         | 
| 949 | 
            +
                def state_dict_converter():
         | 
| 950 | 
            +
                    return WanModelStateDictConverter()
         | 
| 951 | 
            +
                
         | 
| 952 | 
            +
                
         | 
| 953 | 
            +
            class WanModelStateDictConverter:
         | 
| 954 | 
            +
                def __init__(self):
         | 
| 955 | 
            +
                    pass
         | 
| 956 | 
            +
             | 
| 957 | 
            +
                def from_diffusers(self, state_dict):
         | 
| 958 | 
            +
                    rename_dict = {"blocks.0.attn1.norm_k.weight": "blocks.0.self_attn.norm_k.weight",
         | 
| 959 | 
            +
                        "blocks.0.attn1.norm_q.weight": "blocks.0.self_attn.norm_q.weight",
         | 
| 960 | 
            +
                        "blocks.0.attn1.to_k.bias": "blocks.0.self_attn.k.bias",
         | 
| 961 | 
            +
                        "blocks.0.attn1.to_k.weight": "blocks.0.self_attn.k.weight",
         | 
| 962 | 
            +
                        "blocks.0.attn1.to_out.0.bias": "blocks.0.self_attn.o.bias",
         | 
| 963 | 
            +
                        "blocks.0.attn1.to_out.0.weight": "blocks.0.self_attn.o.weight",
         | 
| 964 | 
            +
                        "blocks.0.attn1.to_q.bias": "blocks.0.self_attn.q.bias",
         | 
| 965 | 
            +
                        "blocks.0.attn1.to_q.weight": "blocks.0.self_attn.q.weight",
         | 
| 966 | 
            +
                        "blocks.0.attn1.to_v.bias": "blocks.0.self_attn.v.bias",
         | 
| 967 | 
            +
                        "blocks.0.attn1.to_v.weight": "blocks.0.self_attn.v.weight",
         | 
| 968 | 
            +
                        "blocks.0.attn2.norm_k.weight": "blocks.0.cross_attn.norm_k.weight",
         | 
| 969 | 
            +
                        "blocks.0.attn2.norm_q.weight": "blocks.0.cross_attn.norm_q.weight",
         | 
| 970 | 
            +
                        "blocks.0.attn2.to_k.bias": "blocks.0.cross_attn.k.bias",
         | 
| 971 | 
            +
                        "blocks.0.attn2.to_k.weight": "blocks.0.cross_attn.k.weight",
         | 
| 972 | 
            +
                        "blocks.0.attn2.to_out.0.bias": "blocks.0.cross_attn.o.bias",
         | 
| 973 | 
            +
                        "blocks.0.attn2.to_out.0.weight": "blocks.0.cross_attn.o.weight",
         | 
| 974 | 
            +
                        "blocks.0.attn2.to_q.bias": "blocks.0.cross_attn.q.bias",
         | 
| 975 | 
            +
                        "blocks.0.attn2.to_q.weight": "blocks.0.cross_attn.q.weight",
         | 
| 976 | 
            +
                        "blocks.0.attn2.to_v.bias": "blocks.0.cross_attn.v.bias",
         | 
| 977 | 
            +
                        "blocks.0.attn2.to_v.weight": "blocks.0.cross_attn.v.weight",
         | 
| 978 | 
            +
                        "blocks.0.ffn.net.0.proj.bias": "blocks.0.ffn.0.bias",
         | 
| 979 | 
            +
                        "blocks.0.ffn.net.0.proj.weight": "blocks.0.ffn.0.weight",
         | 
| 980 | 
            +
                        "blocks.0.ffn.net.2.bias": "blocks.0.ffn.2.bias",
         | 
| 981 | 
            +
                        "blocks.0.ffn.net.2.weight": "blocks.0.ffn.2.weight",
         | 
| 982 | 
            +
                        "blocks.0.norm2.bias": "blocks.0.norm3.bias",
         | 
| 983 | 
            +
                        "blocks.0.norm2.weight": "blocks.0.norm3.weight",
         | 
| 984 | 
            +
                        "blocks.0.scale_shift_table": "blocks.0.modulation",
         | 
| 985 | 
            +
                        "condition_embedder.text_embedder.linear_1.bias": "text_embedding.0.bias",
         | 
| 986 | 
            +
                        "condition_embedder.text_embedder.linear_1.weight": "text_embedding.0.weight",
         | 
| 987 | 
            +
                        "condition_embedder.text_embedder.linear_2.bias": "text_embedding.2.bias",
         | 
| 988 | 
            +
                        "condition_embedder.text_embedder.linear_2.weight": "text_embedding.2.weight",
         | 
| 989 | 
            +
                        "condition_embedder.time_embedder.linear_1.bias": "time_embedding.0.bias",
         | 
| 990 | 
            +
                        "condition_embedder.time_embedder.linear_1.weight": "time_embedding.0.weight",
         | 
| 991 | 
            +
                        "condition_embedder.time_embedder.linear_2.bias": "time_embedding.2.bias",
         | 
| 992 | 
            +
                        "condition_embedder.time_embedder.linear_2.weight": "time_embedding.2.weight",
         | 
| 993 | 
            +
                        "condition_embedder.time_proj.bias": "time_projection.1.bias",
         | 
| 994 | 
            +
                        "condition_embedder.time_proj.weight": "time_projection.1.weight",
         | 
| 995 | 
            +
                        "patch_embedding.bias": "patch_embedding.bias",
         | 
| 996 | 
            +
                        "patch_embedding.weight": "patch_embedding.weight",
         | 
| 997 | 
            +
                        "scale_shift_table": "head.modulation",
         | 
| 998 | 
            +
                        "proj_out.bias": "head.head.bias",
         | 
| 999 | 
            +
                        "proj_out.weight": "head.head.weight",
         | 
| 1000 | 
            +
                    }
         | 
| 1001 | 
            +
                    state_dict_ = {}
         | 
| 1002 | 
            +
                    for name, param in state_dict.items():
         | 
| 1003 | 
            +
                        if name in rename_dict:
         | 
| 1004 | 
            +
                            state_dict_[rename_dict[name]] = param
         | 
| 1005 | 
            +
                        else:
         | 
| 1006 | 
            +
                            name_ = ".".join(name.split(".")[:1] + ["0"] + name.split(".")[2:])
         | 
| 1007 | 
            +
                            if name_ in rename_dict:
         | 
| 1008 | 
            +
                                name_ = rename_dict[name_]
         | 
| 1009 | 
            +
                                name_ = ".".join(name_.split(".")[:1] + [name.split(".")[1]] + name_.split(".")[2:])
         | 
| 1010 | 
            +
                                state_dict_[name_] = param
         | 
| 1011 | 
            +
                    if hash_state_dict_keys(state_dict) == "cb104773c6c2cb6df4f9529ad5c60d0b":
         | 
| 1012 | 
            +
                        config = {
         | 
| 1013 | 
            +
                            "model_type": "t2v",
         | 
| 1014 | 
            +
                            "patch_size": (1, 2, 2),
         | 
| 1015 | 
            +
                            "text_len": 512,
         | 
| 1016 | 
            +
                            "in_dim": 16,
         | 
| 1017 | 
            +
                            "dim": 5120,
         | 
| 1018 | 
            +
                            "ffn_dim": 13824,
         | 
| 1019 | 
            +
                            "freq_dim": 256,
         | 
| 1020 | 
            +
                            "text_dim": 4096,
         | 
| 1021 | 
            +
                            "out_dim": 16,
         | 
| 1022 | 
            +
                            "num_heads": 40,
         | 
| 1023 | 
            +
                            "num_layers": 40,
         | 
| 1024 | 
            +
                            "window_size": (-1, -1),
         | 
| 1025 | 
            +
                            "qk_norm": True,
         | 
| 1026 | 
            +
                            "cross_attn_norm": True,
         | 
| 1027 | 
            +
                            "eps": 1e-6,
         | 
| 1028 | 
            +
                        }
         | 
| 1029 | 
            +
                    else:
         | 
| 1030 | 
            +
                        config = {}
         | 
| 1031 | 
            +
                    return state_dict_, config
         | 
| 1032 | 
            +
                
         | 
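The fallback branch in from_diffusers generalizes the block-0 rename table to every block index by temporarily collapsing the index to 0. A small sketch with a single hypothetical key:

```python
# One hypothetical key, not the full rename table.
rename_dict = {"blocks.0.attn1.to_q.weight": "blocks.0.self_attn.q.weight"}

name = "blocks.17.attn1.to_q.weight"
# Collapse the block index to 0 to look up the block-0 template ...
name_ = ".".join(name.split(".")[:1] + ["0"] + name.split(".")[2:])
template = rename_dict[name_]                     # "blocks.0.self_attn.q.weight"
# ... then restore the original block index in the renamed key.
renamed = ".".join(template.split(".")[:1] + [name.split(".")[1]] + template.split(".")[2:])
print(renamed)                                    # blocks.17.self_attn.q.weight
```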
| 1033 | 
            +
                def from_civitai(self, state_dict):
         | 
| 1034 | 
            +
                    if hash_state_dict_keys(state_dict) == "9269f8db9040a9d860eaca435be61814":
         | 
| 1035 | 
            +
                        config = {
         | 
| 1036 | 
            +
                            "model_type": "t2v",
         | 
| 1037 | 
            +
                            "patch_size": (1, 2, 2),
         | 
| 1038 | 
            +
                            "text_len": 512,
         | 
| 1039 | 
            +
                            "in_dim": 16,
         | 
| 1040 | 
            +
                            "dim": 1536,
         | 
| 1041 | 
            +
                            "ffn_dim": 8960,
         | 
| 1042 | 
            +
                            "freq_dim": 256,
         | 
| 1043 | 
            +
                            "text_dim": 4096,
         | 
| 1044 | 
            +
                            "out_dim": 16,
         | 
| 1045 | 
            +
                            "num_heads": 12,
         | 
| 1046 | 
            +
                            "num_layers": 30,
         | 
| 1047 | 
            +
                            "window_size": (-1, -1),
         | 
| 1048 | 
            +
                            "qk_norm": True,
         | 
| 1049 | 
            +
                            "cross_attn_norm": True,
         | 
| 1050 | 
            +
                            "eps": 1e-6,
         | 
| 1051 | 
            +
                        }
         | 
| 1052 | 
            +
                    elif hash_state_dict_keys(state_dict) == "aafcfd9672c3a2456dc46e1cb6e52c70":
         | 
| 1053 | 
            +
                        config = {
         | 
| 1054 | 
            +
                            "model_type": "t2v",
         | 
| 1055 | 
            +
                            "patch_size": (1, 2, 2),
         | 
| 1056 | 
            +
                            "text_len": 512,
         | 
| 1057 | 
            +
                            "in_dim": 16,
         | 
| 1058 | 
            +
                            "dim": 5120,
         | 
| 1059 | 
            +
                            "ffn_dim": 13824,
         | 
| 1060 | 
            +
                            "freq_dim": 256,
         | 
| 1061 | 
            +
                            "text_dim": 4096,
         | 
| 1062 | 
            +
                            "out_dim": 16,
         | 
| 1063 | 
            +
                            "num_heads": 40,
         | 
| 1064 | 
            +
                            "num_layers": 40,
         | 
| 1065 | 
            +
                            "window_size": (-1, -1),
         | 
| 1066 | 
            +
                            "qk_norm": True,
         | 
| 1067 | 
            +
                            "cross_attn_norm": True,
         | 
| 1068 | 
            +
                            "eps": 1e-6,
         | 
| 1069 | 
            +
                        }
         | 
| 1070 | 
            +
                    elif hash_state_dict_keys(state_dict) == "6bfcfb3b342cb286ce886889d519a77e":
         | 
| 1071 | 
            +
                        config = {
         | 
| 1072 | 
            +
                            "model_type": "i2v",
         | 
| 1073 | 
            +
                            "patch_size": (1, 2, 2),
         | 
| 1074 | 
            +
                            "text_len": 512,
         | 
| 1075 | 
            +
                            "in_dim": 36,
         | 
| 1076 | 
            +
                            "dim": 5120,
         | 
| 1077 | 
            +
                            "ffn_dim": 13824,
         | 
| 1078 | 
            +
                            "freq_dim": 256,
         | 
| 1079 | 
            +
                            "text_dim": 4096,
         | 
| 1080 | 
            +
                            "out_dim": 16,
         | 
| 1081 | 
            +
                            "num_heads": 40,
         | 
| 1082 | 
            +
                            "num_layers": 40,
         | 
| 1083 | 
            +
                            "window_size": (-1, -1),
         | 
| 1084 | 
            +
                            "qk_norm": True,
         | 
| 1085 | 
            +
                            "cross_attn_norm": True,
         | 
| 1086 | 
            +
                            "eps": 1e-6,
         | 
| 1087 | 
            +
                        }
         | 
| 1088 | 
            +
                    else:
         | 
| 1089 | 
            +
                        config = {}
         | 
| 1090 | 
            +
                    return state_dict, config
         | 
    	
        model/image_encoder.py
    ADDED
    
    | @@ -0,0 +1,903 @@ | |
| 1 | 
            +
            """
         | 
| 2 | 
            +
            Concise re-implementation of
         | 
| 3 | 
            +
            ``https://github.com/openai/CLIP'' and
         | 
| 4 | 
            +
            ``https://github.com/mlfoundations/open_clip''.
         | 
| 5 | 
            +
            """
         | 
| 6 | 
            +
            import math
         | 
| 7 | 
            +
            import torch
         | 
| 8 | 
            +
            import torch.nn as nn
         | 
| 9 | 
            +
            import torch.nn.functional as F
         | 
| 10 | 
            +
            import torchvision.transforms as T
         | 
| 11 | 
            +
            from .dit import flash_attention
         | 
| 12 | 
            +
             | 
| 13 | 
            +
             | 
| 14 | 
            +
            class SelfAttention(nn.Module):
         | 
| 15 | 
            +
             | 
| 16 | 
            +
                def __init__(self, dim, num_heads, dropout=0.1, eps=1e-5):
         | 
| 17 | 
            +
                    assert dim % num_heads == 0
         | 
| 18 | 
            +
                    super().__init__()
         | 
| 19 | 
            +
                    self.dim = dim
         | 
| 20 | 
            +
                    self.num_heads = num_heads
         | 
| 21 | 
            +
                    self.head_dim = dim // num_heads
         | 
| 22 | 
            +
                    self.eps = eps
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                    # layers
         | 
| 25 | 
            +
                    self.q = nn.Linear(dim, dim)
         | 
| 26 | 
            +
                    self.k = nn.Linear(dim, dim)
         | 
| 27 | 
            +
                    self.v = nn.Linear(dim, dim)
         | 
| 28 | 
            +
                    self.o = nn.Linear(dim, dim)
         | 
| 29 | 
            +
                    self.dropout = nn.Dropout(dropout)
         | 
| 30 | 
            +
             | 
| 31 | 
            +
                def forward(self, x, mask):
         | 
| 32 | 
            +
                    """
         | 
| 33 | 
            +
                    x:   [B, L, C].
         | 
| 34 | 
            +
                    """
         | 
| 35 | 
            +
                    b, s, c, n, d = *x.size(), self.num_heads, self.head_dim
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                    # compute query, key, value
         | 
| 38 | 
            +
                    q = self.q(x).reshape(b, s, n, d).permute(0, 2, 1, 3)
         | 
| 39 | 
            +
                    k = self.k(x).reshape(b, s, n, d).permute(0, 2, 1, 3)
         | 
| 40 | 
            +
                    v = self.v(x).reshape(b, s, n, d).permute(0, 2, 1, 3)
         | 
| 41 | 
            +
             | 
| 42 | 
            +
                    # compute attention
         | 
| 43 | 
            +
                    p = self.dropout.p if self.training else 0.0
         | 
| 44 | 
            +
                    x = F.scaled_dot_product_attention(q, k, v, mask, p)
         | 
| 45 | 
            +
                    x = x.permute(0, 2, 1, 3).reshape(b, s, c)
         | 
| 46 | 
            +
             | 
| 47 | 
            +
                    # output
         | 
| 48 | 
            +
                    x = self.o(x)
         | 
| 49 | 
            +
                    x = self.dropout(x)
         | 
| 50 | 
            +
                    return x
         | 
| 51 | 
            +
             | 
| 52 | 
            +
             | 
| 53 | 
            +
            class AttentionBlock(nn.Module):
         | 
| 54 | 
            +
             | 
| 55 | 
            +
                def __init__(self, dim, num_heads, post_norm, dropout=0.1, eps=1e-5):
         | 
| 56 | 
            +
                    super().__init__()
         | 
| 57 | 
            +
                    self.dim = dim
         | 
| 58 | 
            +
                    self.num_heads = num_heads
         | 
| 59 | 
            +
                    self.post_norm = post_norm
         | 
| 60 | 
            +
                    self.eps = eps
         | 
| 61 | 
            +
             | 
| 62 | 
            +
                    # layers
         | 
| 63 | 
            +
                    self.attn = SelfAttention(dim, num_heads, dropout, eps)
         | 
| 64 | 
            +
                    self.norm1 = nn.LayerNorm(dim, eps=eps)
         | 
| 65 | 
            +
                    self.ffn = nn.Sequential(
         | 
| 66 | 
            +
                        nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim),
         | 
| 67 | 
            +
                        nn.Dropout(dropout))
         | 
| 68 | 
            +
                    self.norm2 = nn.LayerNorm(dim, eps=eps)
         | 
| 69 | 
            +
             | 
| 70 | 
            +
                def forward(self, x, mask):
         | 
| 71 | 
            +
                    if self.post_norm:
         | 
| 72 | 
            +
                        x = self.norm1(x + self.attn(x, mask))
         | 
| 73 | 
            +
                        x = self.norm2(x + self.ffn(x))
         | 
| 74 | 
            +
                    else:
         | 
| 75 | 
            +
                        x = x + self.attn(self.norm1(x), mask)
         | 
| 76 | 
            +
                        x = x + self.ffn(self.norm2(x))
         | 
| 77 | 
            +
                    return x
         | 
| 78 | 
            +
             | 
| 79 | 
            +
             | 
| 80 | 
            +
            class XLMRoberta(nn.Module):
         | 
| 81 | 
            +
                """
         | 
| 82 | 
            +
                XLMRobertaModel with no pooler and no LM head.
         | 
| 83 | 
            +
                """
         | 
| 84 | 
            +
             | 
| 85 | 
            +
                def __init__(self,
         | 
| 86 | 
            +
                             vocab_size=250002,
         | 
| 87 | 
            +
                             max_seq_len=514,
         | 
| 88 | 
            +
                             type_size=1,
         | 
| 89 | 
            +
                             pad_id=1,
         | 
| 90 | 
            +
                             dim=1024,
         | 
| 91 | 
            +
                             num_heads=16,
         | 
| 92 | 
            +
                             num_layers=24,
         | 
| 93 | 
            +
                             post_norm=True,
         | 
| 94 | 
            +
                             dropout=0.1,
         | 
| 95 | 
            +
                             eps=1e-5):
         | 
| 96 | 
            +
                    super().__init__()
         | 
| 97 | 
            +
                    self.vocab_size = vocab_size
         | 
| 98 | 
            +
                    self.max_seq_len = max_seq_len
         | 
| 99 | 
            +
                    self.type_size = type_size
         | 
| 100 | 
            +
                    self.pad_id = pad_id
         | 
| 101 | 
            +
                    self.dim = dim
         | 
| 102 | 
            +
                    self.num_heads = num_heads
         | 
| 103 | 
            +
                    self.num_layers = num_layers
         | 
| 104 | 
            +
                    self.post_norm = post_norm
         | 
| 105 | 
            +
                    self.eps = eps
         | 
| 106 | 
            +
             | 
| 107 | 
            +
                    # embeddings
         | 
| 108 | 
            +
                    self.token_embedding = nn.Embedding(vocab_size, dim, padding_idx=pad_id)
         | 
| 109 | 
            +
                    self.type_embedding = nn.Embedding(type_size, dim)
         | 
| 110 | 
            +
                    self.pos_embedding = nn.Embedding(max_seq_len, dim, padding_idx=pad_id)
         | 
| 111 | 
            +
                    self.dropout = nn.Dropout(dropout)
         | 
| 112 | 
            +
             | 
| 113 | 
            +
                    # blocks
         | 
| 114 | 
            +
                    self.blocks = nn.ModuleList([
         | 
| 115 | 
            +
                        AttentionBlock(dim, num_heads, post_norm, dropout, eps)
         | 
| 116 | 
            +
                        for _ in range(num_layers)
         | 
| 117 | 
            +
                    ])
         | 
| 118 | 
            +
             | 
| 119 | 
            +
                    # norm layer
         | 
| 120 | 
            +
                    self.norm = nn.LayerNorm(dim, eps=eps)
         | 
| 121 | 
            +
             | 
| 122 | 
            +
                def forward(self, ids):
         | 
| 123 | 
            +
                    """
         | 
| 124 | 
            +
                    ids: [B, L] of torch.LongTensor.
         | 
| 125 | 
            +
                    """
         | 
| 126 | 
            +
                    b, s = ids.shape
         | 
| 127 | 
            +
                    mask = ids.ne(self.pad_id).long()
         | 
| 128 | 
            +
             | 
| 129 | 
            +
                    # embeddings
         | 
| 130 | 
            +
                    x = self.token_embedding(ids) + \
         | 
| 131 | 
            +
                        self.type_embedding(torch.zeros_like(ids)) + \
         | 
| 132 | 
            +
                        self.pos_embedding(self.pad_id + torch.cumsum(mask, dim=1) * mask)
         | 
| 133 | 
            +
                    if self.post_norm:
         | 
| 134 | 
            +
                        x = self.norm(x)
         | 
| 135 | 
            +
                    x = self.dropout(x)
         | 
| 136 | 
            +
             | 
| 137 | 
            +
                    # blocks
         | 
| 138 | 
            +
                    mask = torch.where(
         | 
| 139 | 
            +
                        mask.view(b, 1, 1, s).gt(0), 0.0,
         | 
| 140 | 
            +
                        torch.finfo(x.dtype).min)
         | 
| 141 | 
            +
                    for block in self.blocks:
         | 
| 142 | 
            +
                        x = block(x, mask)
         | 
| 143 | 
            +
             | 
| 144 | 
            +
                    # output
         | 
| 145 | 
            +
                    if not self.post_norm:
         | 
| 146 | 
            +
                        x = self.norm(x)
         | 
| 147 | 
            +
                    return x
         | 
| 148 | 
            +
             | 
| 149 | 
            +
             | 
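The position ids in XLMRoberta.forward follow the RoBERTa convention: non-pad tokens are numbered from pad_id + 1 onward, while pad tokens keep pad_id, which is also the padding_idx of pos_embedding. A small numeric sketch:

```python
import torch

pad_id = 1
ids = torch.tensor([[5, 9, 23, pad_id, pad_id]])      # hypothetical token ids
mask = ids.ne(pad_id).long()                          # [[1, 1, 1, 0, 0]]
pos = pad_id + torch.cumsum(mask, dim=1) * mask       # [[2, 3, 4, 1, 1]]
print(pos)
```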
| 150 | 
            +
            def xlm_roberta_large(pretrained=False,
         | 
| 151 | 
            +
                                  return_tokenizer=False,
         | 
| 152 | 
            +
                                  device='cpu',
         | 
| 153 | 
            +
                                  **kwargs):
         | 
| 154 | 
            +
                """
         | 
| 155 | 
            +
                XLMRobertaLarge adapted from Huggingface.
         | 
| 156 | 
            +
                """
         | 
| 157 | 
            +
                # params
         | 
| 158 | 
            +
                cfg = dict(
         | 
| 159 | 
            +
                    vocab_size=250002,
         | 
| 160 | 
            +
                    max_seq_len=514,
         | 
| 161 | 
            +
                    type_size=1,
         | 
| 162 | 
            +
                    pad_id=1,
         | 
| 163 | 
            +
                    dim=1024,
         | 
| 164 | 
            +
                    num_heads=16,
         | 
| 165 | 
            +
                    num_layers=24,
         | 
| 166 | 
            +
                    post_norm=True,
         | 
| 167 | 
            +
                    dropout=0.1,
         | 
| 168 | 
            +
                    eps=1e-5)
         | 
| 169 | 
            +
                cfg.update(**kwargs)
         | 
| 170 | 
            +
             | 
| 171 | 
            +
                # init model
         | 
| 172 | 
            +
                if pretrained:
         | 
| 173 | 
            +
                    from sora import DOWNLOAD_TO_CACHE
         | 
| 174 | 
            +
             | 
| 175 | 
            +
                    # init a meta model
         | 
| 176 | 
            +
                    with torch.device('meta'):
         | 
| 177 | 
            +
                        model = XLMRoberta(**cfg)
         | 
| 178 | 
            +
             | 
| 179 | 
            +
                    # load checkpoint
         | 
| 180 | 
            +
                    model.load_state_dict(
         | 
| 181 | 
            +
                        torch.load(
         | 
| 182 | 
            +
                            DOWNLOAD_TO_CACHE('models/xlm_roberta/xlm_roberta_large.pth'),
         | 
| 183 | 
            +
                            map_location=device),
         | 
| 184 | 
            +
                        assign=True)
         | 
| 185 | 
            +
                else:
         | 
| 186 | 
            +
                    # init a model on device
         | 
| 187 | 
            +
                    with torch.device(device):
         | 
| 188 | 
            +
                        model = XLMRoberta(**cfg)
         | 
| 189 | 
            +
             | 
| 190 | 
            +
                # init tokenizer
         | 
| 191 | 
            +
                if return_tokenizer:
         | 
| 192 | 
            +
                    from sora.data import HuggingfaceTokenizer
         | 
| 193 | 
            +
                    tokenizer = HuggingfaceTokenizer(
         | 
| 194 | 
            +
                        name='xlm-roberta-large',
         | 
| 195 | 
            +
                    seq_len=model.max_seq_len,
         | 
| 196 | 
            +
                        clean='whitespace')
         | 
| 197 | 
            +
                    return model, tokenizer
         | 
| 198 | 
            +
                else:
         | 
| 199 | 
            +
                    return model
         | 
| 200 | 
            +
             | 
| 201 | 
            +
             | 
| 202 | 
            +
             | 
| 203 | 
            +
            def pos_interpolate(pos, seq_len):
         | 
| 204 | 
            +
                if pos.size(1) == seq_len:
         | 
| 205 | 
            +
                    return pos
         | 
| 206 | 
            +
                else:
         | 
| 207 | 
            +
                    src_grid = int(math.sqrt(pos.size(1)))
         | 
| 208 | 
            +
                    tar_grid = int(math.sqrt(seq_len))
         | 
| 209 | 
            +
                    n = pos.size(1) - src_grid * src_grid
         | 
| 210 | 
            +
                    return torch.cat([
         | 
| 211 | 
            +
                        pos[:, :n],
         | 
| 212 | 
            +
                        F.interpolate(
         | 
| 213 | 
            +
                            pos[:, n:].float().reshape(1, src_grid, src_grid, -1).permute(
         | 
| 214 | 
            +
                                0, 3, 1, 2),
         | 
| 215 | 
            +
                            size=(tar_grid, tar_grid),
         | 
| 216 | 
            +
                            mode='bicubic',
         | 
| 217 | 
            +
                            align_corners=False).flatten(2).transpose(1, 2)
         | 
| 218 | 
            +
                    ],
         | 
| 219 | 
            +
                                     dim=1)
         | 
| 220 | 
            +
             | 
| 221 | 
            +
             | 
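pos_interpolate resizes a pretrained positional table to a new grid by bicubically interpolating only the grid part and passing the leading (e.g. class-token) embeddings through unchanged. A hedged sketch calling the function defined above, with hypothetical sizes:

```python
import torch

dim = 32
pos = torch.randn(1, 1 + 7 * 7, dim)       # class token + 7x7 grid (hypothetical)
resized = pos_interpolate(pos, 1 + 14 * 14)
print(resized.shape)                       # torch.Size([1, 197, 32])
```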
| 222 | 
            +
            class QuickGELU(nn.Module):
         | 
| 223 | 
            +
             | 
| 224 | 
            +
                def forward(self, x):
         | 
| 225 | 
            +
                    return x * torch.sigmoid(1.702 * x)
         | 
| 226 | 
            +
             | 
| 227 | 
            +
             | 
| 228 | 
            +
            class LayerNorm(nn.LayerNorm):
         | 
| 229 | 
            +
             | 
| 230 | 
            +
                def forward(self, x):
         | 
| 231 | 
            +
                    return super().forward(x.float()).type_as(x)
         | 
| 232 | 
            +
             | 
| 233 | 
            +
             | 
| 234 | 
            +
            class SelfAttention(nn.Module):
         | 
| 235 | 
            +
             | 
| 236 | 
            +
                def __init__(self,
         | 
| 237 | 
            +
                             dim,
         | 
| 238 | 
            +
                             num_heads,
         | 
| 239 | 
            +
                             causal=False,
         | 
| 240 | 
            +
                             attn_dropout=0.0,
         | 
| 241 | 
            +
                             proj_dropout=0.0):
         | 
| 242 | 
            +
                    assert dim % num_heads == 0
         | 
| 243 | 
            +
                    super().__init__()
         | 
| 244 | 
            +
                    self.dim = dim
         | 
| 245 | 
            +
                    self.num_heads = num_heads
         | 
| 246 | 
            +
                    self.head_dim = dim // num_heads
         | 
| 247 | 
            +
                    self.causal = causal
         | 
| 248 | 
            +
                    self.attn_dropout = attn_dropout
         | 
| 249 | 
            +
                    self.proj_dropout = proj_dropout
         | 
| 250 | 
            +
             | 
| 251 | 
            +
                    # layers
         | 
| 252 | 
            +
                    self.to_qkv = nn.Linear(dim, dim * 3)
         | 
| 253 | 
            +
                    self.proj = nn.Linear(dim, dim)
         | 
| 254 | 
            +
             | 
| 255 | 
            +
                def forward(self, x):
         | 
| 256 | 
            +
                    """
         | 
| 257 | 
            +
                    x:   [B, L, C].
         | 
| 258 | 
            +
                    """
         | 
| 259 | 
            +
                    b, s, c, n, d = *x.size(), self.num_heads, self.head_dim
         | 
| 260 | 
            +
             | 
| 261 | 
            +
                    # compute query, key, value
         | 
| 262 | 
            +
                    q, k, v = self.to_qkv(x).view(b, s, 3, n, d).unbind(2)
         | 
| 263 | 
            +
             | 
| 264 | 
            +
                    # compute attention
         | 
| 265 | 
            +
                    p = self.attn_dropout if self.training else 0.0
         | 
| 266 | 
            +
                    x = flash_attention(q, k, v, dropout_p=p, causal=self.causal, version=2)
         | 
| 267 | 
            +
                    x = x.reshape(b, s, c)
         | 
| 268 | 
            +
             | 
| 269 | 
            +
                    # output
         | 
| 270 | 
            +
                    x = self.proj(x)
         | 
| 271 | 
            +
                    x = F.dropout(x, self.proj_dropout, self.training)
         | 
| 272 | 
            +
                    return x
         | 
| 273 | 
            +
             | 
| 274 | 
            +
             | 
| 275 | 
            +
            class SwiGLU(nn.Module):
         | 
| 276 | 
            +
             | 
| 277 | 
            +
                def __init__(self, dim, mid_dim):
         | 
| 278 | 
            +
                    super().__init__()
         | 
| 279 | 
            +
                    self.dim = dim
         | 
| 280 | 
            +
                    self.mid_dim = mid_dim
         | 
| 281 | 
            +
             | 
| 282 | 
            +
                    # layers
         | 
| 283 | 
            +
                    self.fc1 = nn.Linear(dim, mid_dim)
         | 
| 284 | 
            +
                    self.fc2 = nn.Linear(dim, mid_dim)
         | 
| 285 | 
            +
                    self.fc3 = nn.Linear(mid_dim, dim)
         | 
| 286 | 
            +
             | 
| 287 | 
            +
                def forward(self, x):
         | 
| 288 | 
            +
                    x = F.silu(self.fc1(x)) * self.fc2(x)
         | 
| 289 | 
            +
                    x = self.fc3(x)
         | 
| 290 | 
            +
                    return x
         | 
| 291 | 
            +
             | 
| 292 | 
            +
             | 
| 293 | 
            +
            class AttentionBlock(nn.Module):
         | 
| 294 | 
            +
             | 
| 295 | 
            +
                def __init__(self,
         | 
| 296 | 
            +
                             dim,
         | 
| 297 | 
            +
                             mlp_ratio,
         | 
| 298 | 
            +
                             num_heads,
         | 
| 299 | 
            +
                             post_norm=False,
         | 
| 300 | 
            +
                             causal=False,
         | 
| 301 | 
            +
                             activation='quick_gelu',
         | 
| 302 | 
            +
                             attn_dropout=0.0,
         | 
| 303 | 
            +
                             proj_dropout=0.0,
         | 
| 304 | 
            +
                             norm_eps=1e-5):
         | 
| 305 | 
            +
                    assert activation in ['quick_gelu', 'gelu', 'swi_glu']
         | 
| 306 | 
            +
                    super().__init__()
         | 
| 307 | 
            +
                    self.dim = dim
         | 
| 308 | 
            +
                    self.mlp_ratio = mlp_ratio
         | 
| 309 | 
            +
                    self.num_heads = num_heads
         | 
| 310 | 
            +
                    self.post_norm = post_norm
         | 
| 311 | 
            +
                    self.causal = causal
         | 
| 312 | 
            +
                    self.norm_eps = norm_eps
         | 
| 313 | 
            +
             | 
| 314 | 
            +
                    # layers
         | 
| 315 | 
            +
                    self.norm1 = LayerNorm(dim, eps=norm_eps)
         | 
| 316 | 
            +
                    self.attn = SelfAttention(dim, num_heads, causal, attn_dropout,
         | 
| 317 | 
            +
                                              proj_dropout)
         | 
| 318 | 
            +
                    self.norm2 = LayerNorm(dim, eps=norm_eps)
         | 
| 319 | 
            +
                    if activation == 'swi_glu':
         | 
| 320 | 
            +
                        self.mlp = SwiGLU(dim, int(dim * mlp_ratio))
         | 
| 321 | 
            +
                    else:
         | 
| 322 | 
            +
                        self.mlp = nn.Sequential(
         | 
| 323 | 
            +
                            nn.Linear(dim, int(dim * mlp_ratio)),
         | 
| 324 | 
            +
                            QuickGELU() if activation == 'quick_gelu' else nn.GELU(),
         | 
| 325 | 
            +
                            nn.Linear(int(dim * mlp_ratio), dim), nn.Dropout(proj_dropout))
         | 
| 326 | 
            +
             | 
| 327 | 
            +
                def forward(self, x):
         | 
| 328 | 
            +
                    if self.post_norm:
         | 
| 329 | 
            +
                        x = x + self.norm1(self.attn(x))
         | 
| 330 | 
            +
                        x = x + self.norm2(self.mlp(x))
         | 
| 331 | 
            +
                    else:
         | 
| 332 | 
            +
                        x = x + self.attn(self.norm1(x))
         | 
| 333 | 
            +
                        x = x + self.mlp(self.norm2(x))
         | 
| 334 | 
            +
                    return x
         | 
| 335 | 
            +
             | 
| 336 | 
            +
             | 
| 337 | 
            +
            class AttentionPool(nn.Module):
         | 
| 338 | 
            +
             | 
| 339 | 
            +
                def __init__(self,
         | 
| 340 | 
            +
                             dim,
         | 
| 341 | 
            +
                             mlp_ratio,
         | 
| 342 | 
            +
                             num_heads,
         | 
| 343 | 
            +
                             activation='gelu',
         | 
| 344 | 
            +
                             proj_dropout=0.0,
         | 
| 345 | 
            +
                             norm_eps=1e-5):
         | 
| 346 | 
            +
                    assert dim % num_heads == 0
         | 
| 347 | 
            +
                    super().__init__()
         | 
| 348 | 
            +
                    self.dim = dim
         | 
| 349 | 
            +
                    self.mlp_ratio = mlp_ratio
         | 
| 350 | 
            +
                    self.num_heads = num_heads
         | 
| 351 | 
            +
                    self.head_dim = dim // num_heads
         | 
| 352 | 
            +
                    self.proj_dropout = proj_dropout
         | 
| 353 | 
            +
                    self.norm_eps = norm_eps
         | 
| 354 | 
            +
             | 
| 355 | 
            +
                    # layers
         | 
| 356 | 
            +
                    gain = 1.0 / math.sqrt(dim)
         | 
| 357 | 
            +
                    self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
         | 
| 358 | 
            +
                    self.to_q = nn.Linear(dim, dim)
         | 
| 359 | 
            +
                    self.to_kv = nn.Linear(dim, dim * 2)
         | 
| 360 | 
            +
                    self.proj = nn.Linear(dim, dim)
         | 
| 361 | 
            +
                    self.norm = LayerNorm(dim, eps=norm_eps)
         | 
| 362 | 
            +
                    self.mlp = nn.Sequential(
         | 
| 363 | 
            +
                        nn.Linear(dim, int(dim * mlp_ratio)),
         | 
| 364 | 
            +
                        QuickGELU() if activation == 'quick_gelu' else nn.GELU(),
         | 
| 365 | 
            +
                        nn.Linear(int(dim * mlp_ratio), dim), nn.Dropout(proj_dropout))
         | 
| 366 | 
            +
             | 
| 367 | 
            +
                def forward(self, x):
         | 
| 368 | 
            +
                    """
         | 
| 369 | 
            +
                    x:  [B, L, C].
         | 
| 370 | 
            +
                    """
         | 
| 371 | 
            +
                    b, s, c, n, d = *x.size(), self.num_heads, self.head_dim
         | 
| 372 | 
            +
             | 
| 373 | 
            +
                    # compute query, key, value
         | 
| 374 | 
            +
                    q = self.to_q(self.cls_embedding).view(1, 1, n, d).expand(b, -1, -1, -1)
         | 
| 375 | 
            +
                    k, v = self.to_kv(x).view(b, s, 2, n, d).unbind(2)
         | 
| 376 | 
            +
             | 
| 377 | 
            +
                    # compute attention
         | 
| 378 | 
            +
                    x = flash_attention(q, k, v, version=2)
         | 
| 379 | 
            +
                    x = x.reshape(b, 1, c)
         | 
| 380 | 
            +
             | 
| 381 | 
            +
                    # output
         | 
| 382 | 
            +
                    x = self.proj(x)
         | 
| 383 | 
            +
                    x = F.dropout(x, self.proj_dropout, self.training)
         | 
| 384 | 
            +
             | 
| 385 | 
            +
                    # mlp
         | 
| 386 | 
            +
                    x = x + self.mlp(self.norm(x))
         | 
| 387 | 
            +
                    return x[:, 0]
         | 
| 388 | 
            +
             | 
| 389 | 
            +
             | 
| 390 | 
            +
            class VisionTransformer(nn.Module):
         | 
| 391 | 
            +
             | 
| 392 | 
            +
                def __init__(self,
         | 
| 393 | 
            +
                             image_size=224,
         | 
| 394 | 
            +
                             patch_size=16,
         | 
| 395 | 
            +
                             dim=768,
         | 
| 396 | 
            +
                             mlp_ratio=4,
         | 
| 397 | 
            +
                             out_dim=512,
         | 
| 398 | 
            +
                             num_heads=12,
         | 
| 399 | 
            +
                             num_layers=12,
         | 
| 400 | 
            +
                             pool_type='token',
         | 
| 401 | 
            +
                             pre_norm=True,
         | 
| 402 | 
            +
                             post_norm=False,
         | 
| 403 | 
            +
                             activation='quick_gelu',
         | 
| 404 | 
            +
                             attn_dropout=0.0,
         | 
| 405 | 
            +
                             proj_dropout=0.0,
         | 
| 406 | 
            +
                             embedding_dropout=0.0,
         | 
| 407 | 
            +
                             norm_eps=1e-5):
         | 
| 408 | 
            +
                    if image_size % patch_size != 0:
         | 
| 409 | 
            +
                        print(
         | 
| 410 | 
            +
                            '[WARNING] image_size is not divisible by patch_size',
         | 
| 411 | 
            +
                            flush=True)
         | 
| 412 | 
            +
                    assert pool_type in ('token', 'token_fc', 'attn_pool')
         | 
| 413 | 
            +
                    out_dim = out_dim or dim
         | 
| 414 | 
            +
                    super().__init__()
         | 
| 415 | 
            +
                    self.image_size = image_size
         | 
| 416 | 
            +
                    self.patch_size = patch_size
         | 
| 417 | 
            +
                    self.num_patches = (image_size // patch_size)**2
         | 
| 418 | 
            +
                    self.dim = dim
         | 
| 419 | 
            +
                    self.mlp_ratio = mlp_ratio
         | 
| 420 | 
            +
                    self.out_dim = out_dim
         | 
| 421 | 
            +
                    self.num_heads = num_heads
         | 
| 422 | 
            +
                    self.num_layers = num_layers
         | 
| 423 | 
            +
                    self.pool_type = pool_type
         | 
| 424 | 
            +
                    self.post_norm = post_norm
         | 
| 425 | 
            +
                    self.norm_eps = norm_eps
         | 
| 426 | 
            +
             | 
| 427 | 
            +
                    # embeddings
         | 
| 428 | 
            +
                    gain = 1.0 / math.sqrt(dim)
         | 
| 429 | 
            +
                    self.patch_embedding = nn.Conv2d(
         | 
| 430 | 
            +
                        3,
         | 
| 431 | 
            +
                        dim,
         | 
| 432 | 
            +
                        kernel_size=patch_size,
         | 
| 433 | 
            +
                        stride=patch_size,
         | 
| 434 | 
            +
                        bias=not pre_norm)
         | 
| 435 | 
            +
                    if pool_type in ('token', 'token_fc'):
         | 
| 436 | 
            +
                        self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
         | 
| 437 | 
            +
                    self.pos_embedding = nn.Parameter(gain * torch.randn(
         | 
| 438 | 
            +
                        1, self.num_patches +
         | 
| 439 | 
            +
                        (1 if pool_type in ('token', 'token_fc') else 0), dim))
         | 
| 440 | 
            +
                    self.dropout = nn.Dropout(embedding_dropout)
         | 
| 441 | 
            +
             | 
| 442 | 
            +
                    # transformer
         | 
| 443 | 
            +
                    self.pre_norm = LayerNorm(dim, eps=norm_eps) if pre_norm else None
         | 
| 444 | 
            +
                    self.transformer = nn.Sequential(*[
         | 
| 445 | 
            +
                        AttentionBlock(dim, mlp_ratio, num_heads, post_norm, False,
         | 
| 446 | 
            +
                                       activation, attn_dropout, proj_dropout, norm_eps)
         | 
| 447 | 
            +
                        for _ in range(num_layers)
         | 
| 448 | 
            +
                    ])
         | 
| 449 | 
            +
                    self.post_norm = LayerNorm(dim, eps=norm_eps)
         | 
| 450 | 
            +
             | 
| 451 | 
            +
                    # head
         | 
| 452 | 
            +
                    if pool_type == 'token':
         | 
| 453 | 
            +
                        self.head = nn.Parameter(gain * torch.randn(dim, out_dim))
         | 
| 454 | 
            +
                    elif pool_type == 'token_fc':
         | 
| 455 | 
            +
                        self.head = nn.Linear(dim, out_dim)
         | 
| 456 | 
            +
                    elif pool_type == 'attn_pool':
         | 
| 457 | 
            +
                        self.head = AttentionPool(dim, mlp_ratio, num_heads, activation,
         | 
| 458 | 
            +
                                                  proj_dropout, norm_eps)
         | 
| 459 | 
            +
             | 
| 460 | 
            +
                def forward(self, x, interpolation=False, use_31_block=False):  # use_31_block: return features from all but the last transformer block
         | 
| 461 | 
            +
                    b = x.size(0)
         | 
| 462 | 
            +
             | 
| 463 | 
            +
                    # embeddings
         | 
| 464 | 
            +
                    x = self.patch_embedding(x).flatten(2).permute(0, 2, 1)
         | 
| 465 | 
            +
                    if self.pool_type in ('token', 'token_fc'):
         | 
| 466 | 
            +
                        x = torch.cat([self.cls_embedding.expand(b, -1, -1).to(dtype=x.dtype, device=x.device), x], dim=1)
         | 
| 467 | 
            +
                    if interpolation:
         | 
| 468 | 
            +
                        e = pos_interpolate(self.pos_embedding, x.size(1))
         | 
| 469 | 
            +
                    else:
         | 
| 470 | 
            +
                        e = self.pos_embedding
         | 
| 471 | 
            +
                    e = e.to(dtype=x.dtype, device=x.device)
         | 
| 472 | 
            +
                    x = self.dropout(x + e)
         | 
| 473 | 
            +
                    if self.pre_norm is not None:
         | 
| 474 | 
            +
                        x = self.pre_norm(x)
         | 
| 475 | 
            +
             | 
| 476 | 
            +
                    # transformer
         | 
| 477 | 
            +
                    if use_31_block:
         | 
| 478 | 
            +
                        x = self.transformer[:-1](x)
         | 
| 479 | 
            +
                        return x
         | 
| 480 | 
            +
                    else:
         | 
| 481 | 
            +
                        x = self.transformer(x)
         | 
| 482 | 
            +
                        return x
         | 
| 483 | 
            +
             | 
| 484 | 
            +
             | 
| 485 | 
            +
            class CLIP(nn.Module):
         | 
| 486 | 
            +
             | 
| 487 | 
            +
                def __init__(self,
         | 
| 488 | 
            +
                             embed_dim=512,
         | 
| 489 | 
            +
                             image_size=224,
         | 
| 490 | 
            +
                             patch_size=16,
         | 
| 491 | 
            +
                             vision_dim=768,
         | 
| 492 | 
            +
                             vision_mlp_ratio=4,
         | 
| 493 | 
            +
                             vision_heads=12,
         | 
| 494 | 
            +
                             vision_layers=12,
         | 
| 495 | 
            +
                             vision_pool='token',
         | 
| 496 | 
            +
                             vision_pre_norm=True,
         | 
| 497 | 
            +
                             vision_post_norm=False,
         | 
| 498 | 
            +
                             vocab_size=49408,
         | 
| 499 | 
            +
                             text_len=77,
         | 
| 500 | 
            +
                             text_dim=512,
         | 
| 501 | 
            +
                             text_mlp_ratio=4,
         | 
| 502 | 
            +
                             text_heads=8,
         | 
| 503 | 
            +
                             text_layers=12,
         | 
| 504 | 
            +
                             text_causal=True,
         | 
| 505 | 
            +
                             text_pool='argmax',
         | 
| 506 | 
            +
                             text_head_bias=False,
         | 
| 507 | 
            +
                             logit_bias=None,
         | 
| 508 | 
            +
                             activation='quick_gelu',
         | 
| 509 | 
            +
                             attn_dropout=0.0,
         | 
| 510 | 
            +
                             proj_dropout=0.0,
         | 
| 511 | 
            +
                             embedding_dropout=0.0,
         | 
| 512 | 
            +
                             norm_eps=1e-5):
         | 
| 513 | 
            +
                    super().__init__()
         | 
| 514 | 
            +
                    self.embed_dim = embed_dim
         | 
| 515 | 
            +
                    self.image_size = image_size
         | 
| 516 | 
            +
                    self.patch_size = patch_size
         | 
| 517 | 
            +
                    self.vision_dim = vision_dim
         | 
| 518 | 
            +
                    self.vision_mlp_ratio = vision_mlp_ratio
         | 
| 519 | 
            +
                    self.vision_heads = vision_heads
         | 
| 520 | 
            +
                    self.vision_layers = vision_layers
         | 
| 521 | 
            +
                    self.vision_pool = vision_pool
         | 
| 522 | 
            +
                    self.vision_pre_norm = vision_pre_norm
         | 
| 523 | 
            +
                    self.vision_post_norm = vision_post_norm
         | 
| 524 | 
            +
                    self.vocab_size = vocab_size
         | 
| 525 | 
            +
                    self.text_len = text_len
         | 
| 526 | 
            +
                    self.text_dim = text_dim
         | 
| 527 | 
            +
                    self.text_mlp_ratio = text_mlp_ratio
         | 
| 528 | 
            +
                    self.text_heads = text_heads
         | 
| 529 | 
            +
                    self.text_layers = text_layers
         | 
| 530 | 
            +
                    self.text_causal = text_causal
         | 
| 531 | 
            +
                    self.text_pool = text_pool
         | 
| 532 | 
            +
                    self.text_head_bias = text_head_bias
         | 
| 533 | 
            +
                    self.norm_eps = norm_eps
         | 
| 534 | 
            +
             | 
| 535 | 
            +
                    # models
         | 
| 536 | 
            +
                    self.visual = VisionTransformer(
         | 
| 537 | 
            +
                        image_size=image_size,
         | 
| 538 | 
            +
                        patch_size=patch_size,
         | 
| 539 | 
            +
                        dim=vision_dim,
         | 
| 540 | 
            +
                        mlp_ratio=vision_mlp_ratio,
         | 
| 541 | 
            +
                        out_dim=embed_dim,
         | 
| 542 | 
            +
                        num_heads=vision_heads,
         | 
| 543 | 
            +
                        num_layers=vision_layers,
         | 
| 544 | 
            +
                        pool_type=vision_pool,
         | 
| 545 | 
            +
                        pre_norm=vision_pre_norm,
         | 
| 546 | 
            +
                        post_norm=vision_post_norm,
         | 
| 547 | 
            +
                        activation=activation,
         | 
| 548 | 
            +
                        attn_dropout=attn_dropout,
         | 
| 549 | 
            +
                        proj_dropout=proj_dropout,
         | 
| 550 | 
            +
                        embedding_dropout=embedding_dropout,
         | 
| 551 | 
            +
                        norm_eps=norm_eps)
         | 
| 552 | 
            +
                    self.textual = TextTransformer(
         | 
| 553 | 
            +
                        vocab_size=vocab_size,
         | 
| 554 | 
            +
                        text_len=text_len,
         | 
| 555 | 
            +
                        dim=text_dim,
         | 
| 556 | 
            +
                        mlp_ratio=text_mlp_ratio,
         | 
| 557 | 
            +
                        out_dim=embed_dim,
         | 
| 558 | 
            +
                        num_heads=text_heads,
         | 
| 559 | 
            +
                        num_layers=text_layers,
         | 
| 560 | 
            +
                        causal=text_causal,
         | 
| 561 | 
            +
                        pool_type=text_pool,
         | 
| 562 | 
            +
                        head_bias=text_head_bias,
         | 
| 563 | 
            +
                        activation=activation,
         | 
| 564 | 
            +
                        attn_dropout=attn_dropout,
         | 
| 565 | 
            +
                        proj_dropout=proj_dropout,
         | 
| 566 | 
            +
                        embedding_dropout=embedding_dropout,
         | 
| 567 | 
            +
                        norm_eps=norm_eps)
         | 
| 568 | 
            +
                    self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([]))
         | 
| 569 | 
            +
                    if logit_bias is not None:
         | 
| 570 | 
            +
                        self.logit_bias = nn.Parameter(logit_bias * torch.ones([]))
         | 
| 571 | 
            +
             | 
| 572 | 
            +
                    # initialize weights
         | 
| 573 | 
            +
                    self.init_weights()
         | 
| 574 | 
            +
             | 
| 575 | 
            +
                def forward(self, imgs, txt_ids):
         | 
| 576 | 
            +
                    """
         | 
| 577 | 
            +
                    imgs:       [B, 3, H, W] of torch.float32.
         | 
| 578 | 
            +
                    - mean:     [0.48145466, 0.4578275, 0.40821073]
         | 
| 579 | 
            +
                    - std:      [0.26862954, 0.26130258, 0.27577711]
         | 
| 580 | 
            +
                    txt_ids:    [B, L] of torch.long. Encoded by data.CLIPTokenizer.
         | 
| 581 | 
            +
                    """
         | 
| 582 | 
            +
                    xi = self.visual(imgs)
         | 
| 583 | 
            +
                    xt = self.textual(txt_ids)
         | 
| 584 | 
            +
                    return xi, xt
         | 
| 585 | 
            +
             | 
| 586 | 
            +
                def init_weights(self):
         | 
| 587 | 
            +
                    # embeddings
         | 
| 588 | 
            +
                    nn.init.normal_(self.textual.token_embedding.weight, std=0.02)
         | 
| 589 | 
            +
                    nn.init.normal_(self.visual.patch_embedding.weight, std=0.1)
         | 
| 590 | 
            +
             | 
| 591 | 
            +
                    # attentions
         | 
| 592 | 
            +
                    for modality in ['visual', 'textual']:
         | 
| 593 | 
            +
                        dim = self.vision_dim if modality == 'visual' else self.text_dim
         | 
| 594 | 
            +
                        transformer = getattr(self, modality).transformer
         | 
| 595 | 
            +
                        proj_gain = (1.0 / math.sqrt(dim)) * (
         | 
| 596 | 
            +
                            1.0 / math.sqrt(2 * len(transformer)))
         | 
| 597 | 
            +
                        attn_gain = 1.0 / math.sqrt(dim)
         | 
| 598 | 
            +
                        mlp_gain = 1.0 / math.sqrt(2.0 * dim)
         | 
| 599 | 
            +
                        for block in transformer:
         | 
| 600 | 
            +
                            nn.init.normal_(block.attn.to_qkv.weight, std=attn_gain)
         | 
| 601 | 
            +
                            nn.init.normal_(block.attn.proj.weight, std=proj_gain)
         | 
| 602 | 
            +
                            nn.init.normal_(block.mlp[0].weight, std=mlp_gain)
         | 
| 603 | 
            +
                            nn.init.normal_(block.mlp[2].weight, std=proj_gain)
         | 
| 604 | 
            +
             | 
| 605 | 
            +
                def param_groups(self):
         | 
| 606 | 
            +
                    groups = [{
         | 
| 607 | 
            +
                        'params': [
         | 
| 608 | 
            +
                            p for n, p in self.named_parameters()
         | 
| 609 | 
            +
                            if 'norm' in n or n.endswith('bias')
         | 
| 610 | 
            +
                        ],
         | 
| 611 | 
            +
                        'weight_decay': 0.0
         | 
| 612 | 
            +
                    }, {
         | 
| 613 | 
            +
                        'params': [
         | 
| 614 | 
            +
                            p for n, p in self.named_parameters()
         | 
| 615 | 
            +
                            if not ('norm' in n or n.endswith('bias'))
         | 
| 616 | 
            +
                        ]
         | 
| 617 | 
            +
                    }]
         | 
| 618 | 
            +
                    return groups
         | 
| 619 | 
            +
             | 
| 620 | 
            +
             | 
| 621 | 
            +
            class XLMRobertaWithHead(XLMRoberta):
         | 
| 622 | 
            +
             | 
| 623 | 
            +
                def __init__(self, **kwargs):
         | 
| 624 | 
            +
                    self.out_dim = kwargs.pop('out_dim')
         | 
| 625 | 
            +
                    super().__init__(**kwargs)
         | 
| 626 | 
            +
             | 
| 627 | 
            +
                    # head
         | 
| 628 | 
            +
                    mid_dim = (self.dim + self.out_dim) // 2
         | 
| 629 | 
            +
                    self.head = nn.Sequential(
         | 
| 630 | 
            +
                        nn.Linear(self.dim, mid_dim, bias=False), nn.GELU(),
         | 
| 631 | 
            +
                        nn.Linear(mid_dim, self.out_dim, bias=False))
         | 
| 632 | 
            +
             | 
| 633 | 
            +
                def forward(self, ids):
         | 
| 634 | 
            +
                    # xlm-roberta
         | 
| 635 | 
            +
                    x = super().forward(ids)
         | 
| 636 | 
            +
             | 
| 637 | 
            +
                    # average pooling
         | 
| 638 | 
            +
                    mask = ids.ne(self.pad_id).unsqueeze(-1).to(x)
         | 
| 639 | 
            +
                    x = (x * mask).sum(dim=1) / mask.sum(dim=1)
         | 
| 640 | 
            +
             | 
| 641 | 
            +
                    # head
         | 
| 642 | 
            +
                    x = self.head(x)
         | 
| 643 | 
            +
                    return x
         | 
| 644 | 
            +
             | 
| 645 | 
            +
             | 
| 646 | 
            +
            class XLMRobertaCLIP(nn.Module):
         | 
| 647 | 
            +
             | 
| 648 | 
            +
                def __init__(self,
         | 
| 649 | 
            +
                             embed_dim=1024,
         | 
| 650 | 
            +
                             image_size=224,
         | 
| 651 | 
            +
                             patch_size=14,
         | 
| 652 | 
            +
                             vision_dim=1280,
         | 
| 653 | 
            +
                             vision_mlp_ratio=4,
         | 
| 654 | 
            +
                             vision_heads=16,
         | 
| 655 | 
            +
                             vision_layers=32,
         | 
| 656 | 
            +
                             vision_pool='token',
         | 
| 657 | 
            +
                             vision_pre_norm=True,
         | 
| 658 | 
            +
                             vision_post_norm=False,
         | 
| 659 | 
            +
                             activation='gelu',
         | 
| 660 | 
            +
                             vocab_size=250002,
         | 
| 661 | 
            +
                             max_text_len=514,
         | 
| 662 | 
            +
                             type_size=1,
         | 
| 663 | 
            +
                             pad_id=1,
         | 
| 664 | 
            +
                             text_dim=1024,
         | 
| 665 | 
            +
                             text_heads=16,
         | 
| 666 | 
            +
                             text_layers=24,
         | 
| 667 | 
            +
                             text_post_norm=True,
         | 
| 668 | 
            +
                             text_dropout=0.1,
         | 
| 669 | 
            +
                             attn_dropout=0.0,
         | 
| 670 | 
            +
                             proj_dropout=0.0,
         | 
| 671 | 
            +
                             embedding_dropout=0.0,
         | 
| 672 | 
            +
                             norm_eps=1e-5):
         | 
| 673 | 
            +
                    super().__init__()
         | 
| 674 | 
            +
                    self.embed_dim = embed_dim
         | 
| 675 | 
            +
                    self.image_size = image_size
         | 
| 676 | 
            +
                    self.patch_size = patch_size
         | 
| 677 | 
            +
                    self.vision_dim = vision_dim
         | 
| 678 | 
            +
                    self.vision_mlp_ratio = vision_mlp_ratio
         | 
| 679 | 
            +
                    self.vision_heads = vision_heads
         | 
| 680 | 
            +
                    self.vision_layers = vision_layers
         | 
| 681 | 
            +
                    self.vision_pre_norm = vision_pre_norm
         | 
| 682 | 
            +
                    self.vision_post_norm = vision_post_norm
         | 
| 683 | 
            +
                    self.activation = activation
         | 
| 684 | 
            +
                    self.vocab_size = vocab_size
         | 
| 685 | 
            +
                    self.max_text_len = max_text_len
         | 
| 686 | 
            +
                    self.type_size = type_size
         | 
| 687 | 
            +
                    self.pad_id = pad_id
         | 
| 688 | 
            +
                    self.text_dim = text_dim
         | 
| 689 | 
            +
                    self.text_heads = text_heads
         | 
| 690 | 
            +
                    self.text_layers = text_layers
         | 
| 691 | 
            +
                    self.text_post_norm = text_post_norm
         | 
| 692 | 
            +
                    self.norm_eps = norm_eps
         | 
| 693 | 
            +
             | 
| 694 | 
            +
                    # models
         | 
| 695 | 
            +
                    self.visual = VisionTransformer(
         | 
| 696 | 
            +
                        image_size=image_size,
         | 
| 697 | 
            +
                        patch_size=patch_size,
         | 
| 698 | 
            +
                        dim=vision_dim,
         | 
| 699 | 
            +
                        mlp_ratio=vision_mlp_ratio,
         | 
| 700 | 
            +
                        out_dim=embed_dim,
         | 
| 701 | 
            +
                        num_heads=vision_heads,
         | 
| 702 | 
            +
                        num_layers=vision_layers,
         | 
| 703 | 
            +
                        pool_type=vision_pool,
         | 
| 704 | 
            +
                        pre_norm=vision_pre_norm,
         | 
| 705 | 
            +
                        post_norm=vision_post_norm,
         | 
| 706 | 
            +
                        activation=activation,
         | 
| 707 | 
            +
                        attn_dropout=attn_dropout,
         | 
| 708 | 
            +
                        proj_dropout=proj_dropout,
         | 
| 709 | 
            +
                        embedding_dropout=embedding_dropout,
         | 
| 710 | 
            +
                        norm_eps=norm_eps)
         | 
| 711 | 
            +
                    self.textual = None  # the text tower is dropped in this vision-only copy; forward() cannot run without it
         | 
| 712 | 
            +
                    self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([]))
         | 
| 713 | 
            +
             | 
| 714 | 
            +
                def forward(self, imgs, txt_ids):
         | 
| 715 | 
            +
                    """
         | 
| 716 | 
            +
                    imgs:       [B, 3, H, W] of torch.float32.
         | 
| 717 | 
            +
                    - mean:     [0.48145466, 0.4578275, 0.40821073]
         | 
| 718 | 
            +
                    - std:      [0.26862954, 0.26130258, 0.27577711]
         | 
| 719 | 
            +
                    txt_ids:    [B, L] of torch.long.
         | 
| 720 | 
            +
                                Encoded by the XLM-Roberta tokenizer.
         | 
| 721 | 
            +
                    """
         | 
| 722 | 
            +
                    xi = self.visual(imgs)
         | 
| 723 | 
            +
                    xt = self.textual(txt_ids)
         | 
| 724 | 
            +
                    return xi, xt
         | 
| 725 | 
            +
             | 
| 726 | 
            +
                def param_groups(self):
         | 
| 727 | 
            +
                    groups = [{
         | 
| 728 | 
            +
                        'params': [
         | 
| 729 | 
            +
                            p for n, p in self.named_parameters()
         | 
| 730 | 
            +
                            if 'norm' in n or n.endswith('bias')
         | 
| 731 | 
            +
                        ],
         | 
| 732 | 
            +
                        'weight_decay': 0.0
         | 
| 733 | 
            +
                    }, {
         | 
| 734 | 
            +
                        'params': [
         | 
| 735 | 
            +
                            p for n, p in self.named_parameters()
         | 
| 736 | 
            +
                            if not ('norm' in n or n.endswith('bias'))
         | 
| 737 | 
            +
                        ]
         | 
| 738 | 
            +
                    }]
         | 
| 739 | 
            +
                    return groups
         | 
| 740 | 
            +
             | 
| 741 | 
            +
             | 
| 742 | 
            +
            def _clip(pretrained=False,
         | 
| 743 | 
            +
                      pretrained_name=None,
         | 
| 744 | 
            +
                      model_cls=CLIP,
         | 
| 745 | 
            +
                      return_transforms=False,
         | 
| 746 | 
            +
                      return_tokenizer=False,
         | 
| 747 | 
            +
                      tokenizer_padding='eos',
         | 
| 748 | 
            +
                      dtype=torch.float32,
         | 
| 749 | 
            +
                      device='cpu',
         | 
| 750 | 
            +
                      **kwargs):
         | 
| 751 | 
            +
                # init model
         | 
| 752 | 
            +
                if pretrained and pretrained_name:
         | 
| 753 | 
            +
                    from sora import BUCKET, DOWNLOAD_TO_CACHE
         | 
| 754 | 
            +
             | 
| 755 | 
            +
                    # init a meta model
         | 
| 756 | 
            +
                    with torch.device('meta'):
         | 
| 757 | 
            +
                        model = model_cls(**kwargs)
         | 
| 758 | 
            +
             | 
| 759 | 
            +
                    # checkpoint path
         | 
| 760 | 
            +
                    checkpoint = f'models/clip/{pretrained_name}'
         | 
| 761 | 
            +
                    if dtype in (torch.float16, torch.bfloat16):
         | 
| 762 | 
            +
                        suffix = '-' + {
         | 
| 763 | 
            +
                            torch.float16: 'fp16',
         | 
| 764 | 
            +
                            torch.bfloat16: 'bf16'
         | 
| 765 | 
            +
                        }[dtype]
         | 
| 766 | 
            +
                        if object_exists(BUCKET, f'{checkpoint}{suffix}.pth'):
         | 
| 767 | 
            +
                            checkpoint = f'{checkpoint}{suffix}'
         | 
| 768 | 
            +
                    checkpoint += '.pth'
         | 
| 769 | 
            +
             | 
| 770 | 
            +
                    # load
         | 
| 771 | 
            +
                    model.load_state_dict(
         | 
| 772 | 
            +
                        torch.load(DOWNLOAD_TO_CACHE(checkpoint), map_location=device),
         | 
| 773 | 
            +
                        assign=True,
         | 
| 774 | 
            +
                        strict=False)
         | 
| 775 | 
            +
                else:
         | 
| 776 | 
            +
                    # init a model on device
         | 
| 777 | 
            +
                    with torch.device(device):
         | 
| 778 | 
            +
                        model = model_cls(**kwargs)
         | 
| 779 | 
            +
             | 
| 780 | 
            +
                # set device
         | 
| 781 | 
            +
                output = (model,)
         | 
| 782 | 
            +
             | 
| 783 | 
            +
                # init transforms
         | 
| 784 | 
            +
                if return_transforms:
         | 
| 785 | 
            +
                    # mean and std
         | 
| 786 | 
            +
                    if 'siglip' in pretrained_name.lower():
         | 
| 787 | 
            +
                        mean, std = [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]
         | 
| 788 | 
            +
                    else:
         | 
| 789 | 
            +
                        mean = [0.48145466, 0.4578275, 0.40821073]
         | 
| 790 | 
            +
                        std = [0.26862954, 0.26130258, 0.27577711]
         | 
| 791 | 
            +
             | 
| 792 | 
            +
                    # transforms
         | 
| 793 | 
            +
                    transforms = T.Compose([
         | 
| 794 | 
            +
                        T.Resize((model.image_size, model.image_size),
         | 
| 795 | 
            +
                                 interpolation=T.InterpolationMode.BICUBIC),
         | 
| 796 | 
            +
                        T.ToTensor(),
         | 
| 797 | 
            +
                        T.Normalize(mean=mean, std=std)
         | 
| 798 | 
            +
                    ])
         | 
| 799 | 
            +
                    output += (transforms,)
         | 
| 800 | 
            +
             | 
| 801 | 
            +
                # init tokenizer
         | 
| 802 | 
            +
                if return_tokenizer:
         | 
| 803 | 
            +
                    from sora import data
         | 
| 804 | 
            +
                    if 'siglip' in pretrained_name.lower():
         | 
| 805 | 
            +
                        tokenizer = data.HuggingfaceTokenizer(
         | 
| 806 | 
            +
                            name=f'timm/{pretrained_name}',
         | 
| 807 | 
            +
                            seq_len=model.text_len,
         | 
| 808 | 
            +
                            clean='canonicalize')
         | 
| 809 | 
            +
                    elif 'xlm' in pretrained_name.lower():
         | 
| 810 | 
            +
                        tokenizer = data.HuggingfaceTokenizer(
         | 
| 811 | 
            +
                            name='xlm-roberta-large',
         | 
| 812 | 
            +
                            seq_len=model.max_text_len - 2,
         | 
| 813 | 
            +
                            clean='whitespace')
         | 
| 814 | 
            +
                    elif 'mba' in pretrained_name.lower():
         | 
| 815 | 
            +
                        tokenizer = data.HuggingfaceTokenizer(
         | 
| 816 | 
            +
                            name='facebook/xlm-roberta-xl',
         | 
| 817 | 
            +
                            seq_len=model.max_text_len - 2,
         | 
| 818 | 
            +
                            clean='whitespace')
         | 
| 819 | 
            +
                    else:
         | 
| 820 | 
            +
                        tokenizer = data.CLIPTokenizer(
         | 
| 821 | 
            +
                            seq_len=model.text_len, padding=tokenizer_padding)
         | 
| 822 | 
            +
                    output += (tokenizer,)
         | 
| 823 | 
            +
                return output[0] if len(output) == 1 else output
         | 
| 824 | 
            +
             | 
| 825 | 
            +
             | 
| 826 | 
            +
            def clip_xlm_roberta_vit_h_14(
         | 
| 827 | 
            +
                    pretrained=False,
         | 
| 828 | 
            +
                    pretrained_name='open-clip-xlm-roberta-large-vit-huge-14',
         | 
| 829 | 
            +
                    **kwargs):
         | 
| 830 | 
            +
                cfg = dict(
         | 
| 831 | 
            +
                    embed_dim=1024,
         | 
| 832 | 
            +
                    image_size=224,
         | 
| 833 | 
            +
                    patch_size=14,
         | 
| 834 | 
            +
                    vision_dim=1280,
         | 
| 835 | 
            +
                    vision_mlp_ratio=4,
         | 
| 836 | 
            +
                    vision_heads=16,
         | 
| 837 | 
            +
                    vision_layers=32,
         | 
| 838 | 
            +
                    vision_pool='token',
         | 
| 839 | 
            +
                    activation='gelu',
         | 
| 840 | 
            +
                    vocab_size=250002,
         | 
| 841 | 
            +
                    max_text_len=514,
         | 
| 842 | 
            +
                    type_size=1,
         | 
| 843 | 
            +
                    pad_id=1,
         | 
| 844 | 
            +
                    text_dim=1024,
         | 
| 845 | 
            +
                    text_heads=16,
         | 
| 846 | 
            +
                    text_layers=24,
         | 
| 847 | 
            +
                    text_post_norm=True,
         | 
| 848 | 
            +
                    text_dropout=0.1,
         | 
| 849 | 
            +
                    attn_dropout=0.0,
         | 
| 850 | 
            +
                    proj_dropout=0.0,
         | 
| 851 | 
            +
                    embedding_dropout=0.0)
         | 
| 852 | 
            +
                cfg.update(**kwargs)
         | 
| 853 | 
            +
                return _clip(pretrained, pretrained_name, XLMRobertaCLIP, **cfg)
         | 
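The factory above wires the Wan image-encoder configuration together. A minimal sketch of inspecting that configuration without downloading weights, assuming this file is importable as `model.image_encoder` (the package layout used by `model/prompter.py`):

```python
import torch
from model.image_encoder import clip_xlm_roberta_vit_h_14  # assumed module path

# `device` is forwarded through **kwargs into _clip(), so the model is built on
# the storage-less `meta` device: structure only, no weight allocation.
model = clip_xlm_roberta_vit_h_14(pretrained=False, device="meta")

n_params = sum(p.numel() for p in model.visual.parameters())
print(f"ViT-H/14 vision tower: {n_params / 1e6:.0f}M parameters")  # roughly 630M
```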
| 854 | 
            +
             | 
| 855 | 
            +
             | 
| 856 | 
            +
            class WanImageEncoder(torch.nn.Module):
         | 
| 857 | 
            +
             | 
| 858 | 
            +
                def __init__(self):
         | 
| 859 | 
            +
                    super().__init__()
         | 
| 860 | 
            +
                    # init model
         | 
| 861 | 
            +
                    self.model, self.transforms = clip_xlm_roberta_vit_h_14(
         | 
| 862 | 
            +
                        pretrained=False,
         | 
| 863 | 
            +
                        return_transforms=True,
         | 
| 864 | 
            +
                        return_tokenizer=False,
         | 
| 865 | 
            +
                        dtype=torch.float32,
         | 
| 866 | 
            +
                        device="cpu")
         | 
| 867 | 
            +
             | 
| 868 | 
            +
                def encode_image(self, videos):
         | 
| 869 | 
            +
                    # preprocess
         | 
| 870 | 
            +
                    size = (self.model.image_size,) * 2
         | 
| 871 | 
            +
                    videos = torch.cat([
         | 
| 872 | 
            +
                        F.interpolate(
         | 
| 873 | 
            +
                            u,
         | 
| 874 | 
            +
                            size=size,
         | 
| 875 | 
            +
                            mode='bicubic',
         | 
| 876 | 
            +
                            align_corners=False) for u in videos
         | 
| 877 | 
            +
                    ])
         | 
| 878 | 
            +
                    videos = self.transforms.transforms[-1](videos.mul_(0.5).add_(0.5))
         | 
| 879 | 
            +
             | 
| 880 | 
            +
                    # forward
         | 
| 881 | 
            +
                    out = self.model.visual(videos, use_31_block=True)
         | 
| 882 | 
            +
                    return out
         | 
| 883 | 
            +
                    
         | 
| 884 | 
            +
                @staticmethod
         | 
| 885 | 
            +
                def state_dict_converter():
         | 
| 886 | 
            +
                    return WanImageEncoderStateDictConverter()
         | 
| 887 | 
            +
                
         | 
| 888 | 
            +
                
         | 
| 889 | 
            +
            class WanImageEncoderStateDictConverter:
         | 
| 890 | 
            +
                def __init__(self):
         | 
| 891 | 
            +
                    pass
         | 
| 892 | 
            +
             | 
| 893 | 
            +
                def from_diffusers(self, state_dict):
         | 
| 894 | 
            +
                    return state_dict
         | 
| 895 | 
            +
                
         | 
| 896 | 
            +
                def from_civitai(self, state_dict):
         | 
| 897 | 
            +
                    state_dict_ = {}
         | 
| 898 | 
            +
                    for name, param in state_dict.items():
         | 
| 899 | 
            +
                        if name.startswith("textual."):
         | 
| 900 | 
            +
                            continue
         | 
| 901 | 
            +
                        name = "model." + name
         | 
| 902 | 
            +
                        state_dict_[name] = param
         | 
| 903 | 
            +
                    return state_dict_
         | 
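A minimal usage sketch for `WanImageEncoder` (not from the repository): it assumes the module path `model.image_encoder` and that the `flash_attention` helper defined earlier in this file falls back to a standard attention kernel on CPU. In real use, converted CLIP weights are loaded via `state_dict_converter()` before encoding.

```python
import torch
from model.image_encoder import WanImageEncoder  # assumed module path

encoder = WanImageEncoder().eval()  # randomly initialized; load converted weights for real use

# encode_image expects an iterable of [N, C, H, W] tensors scaled to [-1, 1];
# each element is resized to 224x224 and CLIP-normalized internally.
frames = torch.rand(1, 3, 256, 256) * 2 - 1

with torch.no_grad():
    tokens = encoder.encode_image([frames])

# Penultimate-block ViT-H/14 features: [1, 257, 1280] (CLS token + 16x16 patches).
print(tokens.shape)
```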
    	
        model/prompter.py
    ADDED
    
    | @@ -0,0 +1,107 @@ | |
| 1 | 
            +
            from diffsynth.prompters.base_prompter import BasePrompter
         | 
| 2 | 
            +
            from model.text_encoder import WanTextEncoder
         | 
| 3 | 
            +
            from transformers import AutoTokenizer
         | 
| 4 | 
            +
            import ftfy
         | 
| 5 | 
            +
            import html
         | 
| 6 | 
            +
            import string
         | 
| 7 | 
            +
            import regex as re
         | 
| 8 | 
            +
             | 
| 9 | 
            +
             | 
| 10 | 
            +
            def basic_clean(text):
         | 
| 11 | 
            +
                text = ftfy.fix_text(text)
         | 
| 12 | 
            +
                text = html.unescape(html.unescape(text))
         | 
| 13 | 
            +
                return text.strip()
         | 
| 14 | 
            +
             | 
| 15 | 
            +
             | 
| 16 | 
            +
            def whitespace_clean(text):
         | 
| 17 | 
            +
                text = re.sub(r'\s+', ' ', text)
         | 
| 18 | 
            +
                text = text.strip()
         | 
| 19 | 
            +
                return text
         | 
| 20 | 
            +
             | 
| 21 | 
            +
             | 
| 22 | 
            +
            def canonicalize(text, keep_punctuation_exact_string=None):
         | 
| 23 | 
            +
                text = text.replace('_', ' ')
         | 
| 24 | 
            +
                if keep_punctuation_exact_string:
         | 
| 25 | 
            +
                    text = keep_punctuation_exact_string.join(
         | 
| 26 | 
            +
                        part.translate(str.maketrans('', '', string.punctuation))
         | 
| 27 | 
            +
                        for part in text.split(keep_punctuation_exact_string))
         | 
| 28 | 
            +
                else:
         | 
| 29 | 
            +
                    text = text.translate(str.maketrans('', '', string.punctuation))
         | 
| 30 | 
            +
                text = text.lower()
         | 
| 31 | 
            +
                text = re.sub(r'\s+', ' ', text)
         | 
| 32 | 
            +
                return text.strip()
         | 
| 33 | 
            +
             | 
| 34 | 
            +
             | 
| 35 | 
            +
            class HuggingfaceTokenizer:
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                def __init__(self, name, seq_len=None, clean=None, **kwargs):
         | 
| 38 | 
            +
                    assert clean in (None, 'whitespace', 'lower', 'canonicalize')
         | 
| 39 | 
            +
                    self.name = name
         | 
| 40 | 
            +
                    self.seq_len = seq_len
         | 
| 41 | 
            +
                    self.clean = clean
         | 
| 42 | 
            +
             | 
| 43 | 
            +
                    # init tokenizer
         | 
| 44 | 
            +
                    self.tokenizer = AutoTokenizer.from_pretrained(name, **kwargs)
         | 
| 45 | 
            +
                    self.vocab_size = self.tokenizer.vocab_size
         | 
| 46 | 
            +
             | 
| 47 | 
            +
                def __call__(self, sequence, **kwargs):
         | 
| 48 | 
            +
                    return_mask = kwargs.pop('return_mask', False)
         | 
| 49 | 
            +
             | 
| 50 | 
            +
                    # arguments
         | 
| 51 | 
            +
                    _kwargs = {'return_tensors': 'pt'}
         | 
| 52 | 
            +
                    if self.seq_len is not None:
         | 
| 53 | 
            +
                        _kwargs.update({
         | 
| 54 | 
            +
                            'padding': 'max_length',
         | 
| 55 | 
            +
                            'truncation': True,
         | 
| 56 | 
            +
                            'max_length': self.seq_len
         | 
| 57 | 
            +
                        })
         | 
| 58 | 
            +
                    _kwargs.update(**kwargs)
         | 
| 59 | 
            +
             | 
| 60 | 
            +
                    # tokenization
         | 
| 61 | 
            +
                    if isinstance(sequence, str):
         | 
| 62 | 
            +
                        sequence = [sequence]
         | 
| 63 | 
            +
                    if self.clean:
         | 
| 64 | 
            +
                        sequence = [self._clean(u) for u in sequence]
         | 
| 65 | 
            +
                    ids = self.tokenizer(sequence, **_kwargs)
         | 
| 66 | 
            +
             | 
| 67 | 
            +
                    # output
         | 
| 68 | 
            +
                    if return_mask:
         | 
| 69 | 
            +
                        return ids.input_ids, ids.attention_mask
         | 
| 70 | 
            +
                    else:
         | 
| 71 | 
            +
                        return ids.input_ids
         | 
| 72 | 
            +
             | 
| 73 | 
            +
                def _clean(self, text):
         | 
| 74 | 
            +
                    if self.clean == 'whitespace':
         | 
| 75 | 
            +
                        text = whitespace_clean(basic_clean(text))
         | 
| 76 | 
            +
                    elif self.clean == 'lower':
         | 
| 77 | 
            +
                        text = whitespace_clean(basic_clean(text)).lower()
         | 
| 78 | 
            +
                    elif self.clean == 'canonicalize':
         | 
| 79 | 
            +
                        text = canonicalize(basic_clean(text))
         | 
| 80 | 
            +
                    return text
         | 
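A short sketch of the `HuggingfaceTokenizer` wrapper above (assumed usage; `xlm-roberta-large` is only an example checkpoint id, borrowed from the image-encoder code):

```python
from model.prompter import HuggingfaceTokenizer  # this file

tok = HuggingfaceTokenizer(name="xlm-roberta-large", seq_len=16, clean="whitespace")

# Returns padded/truncated token ids plus the attention mask as [1, 16] tensors.
ids, mask = tok("A  cat   jumps over\tthe lazy dog", return_mask=True)
print(ids.shape, int(mask.sum()))  # mask.sum() = number of non-padding tokens
```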
| 81 | 
            +
             | 
| 82 | 
            +
             | 
| 83 | 
            +
            class WanPrompter(BasePrompter):
         | 
| 84 | 
            +
             | 
| 85 | 
            +
                def __init__(self, tokenizer_path=None, text_len=512):
         | 
| 86 | 
            +
                    super().__init__()
         | 
| 87 | 
            +
                    self.text_len = text_len
         | 
| 88 | 
            +
                    self.text_encoder = None
         | 
| 89 | 
            +
                    self.fetch_tokenizer(tokenizer_path)
         | 
| 90 | 
            +
                    
         | 
| 91 | 
            +
                def fetch_tokenizer(self, tokenizer_path=None):
         | 
| 92 | 
            +
                    if tokenizer_path is not None:
         | 
| 93 | 
            +
                        self.tokenizer = HuggingfaceTokenizer(name=tokenizer_path, seq_len=self.text_len, clean='whitespace')
         | 
| 94 | 
            +
             | 
| 95 | 
            +
                def fetch_models(self, text_encoder: WanTextEncoder = None):
         | 
| 96 | 
            +
                    self.text_encoder = text_encoder
         | 
| 97 | 
            +
             | 
| 98 | 
            +
                def encode_prompt(self, prompt, positive=True, device="cuda"):
         | 
| 99 | 
            +
                    prompt = self.process_prompt(prompt, positive=positive)
         | 
| 100 | 
            +
                    
         | 
| 101 | 
            +
                    ids, mask = self.tokenizer(prompt, return_mask=True, add_special_tokens=True)
         | 
| 102 | 
            +
                    ids = ids.to(device)
         | 
| 103 | 
            +
                    mask = mask.to(device)
         | 
| 104 | 
            +
                    seq_lens = mask.gt(0).sum(dim=1).long()
         | 
| 105 | 
            +
                    prompt_emb = self.text_encoder(ids, mask)
         | 
| 106 | 
            +
                    prompt_emb = [u[:v] for u, v in zip(prompt_emb, seq_lens)]
         | 
| 107 | 
            +
                    return prompt_emb
         | 
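For orientation, here is a minimal usage sketch of the prompter added above. It is illustrative only: the module paths `model.prompter` / `model.text_encoder`, the tokenizer id `google/umt5-xxl`, and the CUDA device are assumptions rather than anything this commit pins down; `process_prompt` comes from `BasePrompter` earlier in the file.

# Hypothetical usage of WanPrompter (sketch; paths and device are assumptions).
import torch

from model.prompter import WanPrompter          # assumed module path
from model.text_encoder import WanTextEncoder   # assumed module path

prompter = WanPrompter(tokenizer_path="google/umt5-xxl", text_len=512)  # placeholder tokenizer path
text_encoder = WanTextEncoder().eval().to("cuda")  # weights would be loaded separately
prompter.fetch_models(text_encoder)

with torch.no_grad():
    # encode_prompt returns a list with one [seq_len_i, dim] tensor per prompt,
    # trimmed to that prompt's number of non-padding tokens.
    prompt_emb = prompter.encode_prompt(
        "a hand-drawn character turns around", device="cuda")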
    	
        model/text_encoder.py
    ADDED
    
@@ -0,0 +1,269 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


def fp16_clamp(x):
    if x.dtype == torch.float16 and torch.isinf(x).any():
        clamp = torch.finfo(x.dtype).max - 1000
        x = torch.clamp(x, min=-clamp, max=clamp)
    return x


class GELU(nn.Module):

    def forward(self, x):
        return 0.5 * x * (1.0 + torch.tanh(
            math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))


class T5LayerNorm(nn.Module):

    def __init__(self, dim, eps=1e-6):
        super(T5LayerNorm, self).__init__()
        self.dim = dim
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        x = x * torch.rsqrt(x.float().pow(2).mean(dim=-1, keepdim=True) +
                            self.eps)
        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            x = x.type_as(self.weight)
        return self.weight * x


class T5Attention(nn.Module):

    def __init__(self, dim, dim_attn, num_heads, dropout=0.1):
        assert dim_attn % num_heads == 0
        super(T5Attention, self).__init__()
        self.dim = dim
        self.dim_attn = dim_attn
        self.num_heads = num_heads
        self.head_dim = dim_attn // num_heads

        # layers
        self.q = nn.Linear(dim, dim_attn, bias=False)
        self.k = nn.Linear(dim, dim_attn, bias=False)
        self.v = nn.Linear(dim, dim_attn, bias=False)
        self.o = nn.Linear(dim_attn, dim, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, context=None, mask=None, pos_bias=None):
        """
        x:          [B, L1, C].
        context:    [B, L2, C] or None.
        mask:       [B, L2] or [B, L1, L2] or None.
        """
        # check inputs
        context = x if context is None else context
        b, n, c = x.size(0), self.num_heads, self.head_dim

        # compute query, key, value
        q = self.q(x).view(b, -1, n, c)
        k = self.k(context).view(b, -1, n, c)
        v = self.v(context).view(b, -1, n, c)

        # attention bias
        attn_bias = x.new_zeros(b, n, q.size(1), k.size(1))
        if pos_bias is not None:
            attn_bias += pos_bias
        if mask is not None:
            assert mask.ndim in [2, 3]
            mask = mask.view(b, 1, 1,
                             -1) if mask.ndim == 2 else mask.unsqueeze(1)
            attn_bias.masked_fill_(mask == 0, torch.finfo(x.dtype).min)

        # compute attention (T5 does not use scaling)
        attn = torch.einsum('binc,bjnc->bnij', q, k) + attn_bias
        attn = F.softmax(attn.float(), dim=-1).type_as(attn)
        x = torch.einsum('bnij,bjnc->binc', attn, v)

        # output
        x = x.reshape(b, -1, n * c)
        x = self.o(x)
        x = self.dropout(x)
        return x


class T5FeedForward(nn.Module):

    def __init__(self, dim, dim_ffn, dropout=0.1):
        super(T5FeedForward, self).__init__()
        self.dim = dim
        self.dim_ffn = dim_ffn

        # layers
        self.gate = nn.Sequential(nn.Linear(dim, dim_ffn, bias=False), GELU())
        self.fc1 = nn.Linear(dim, dim_ffn, bias=False)
        self.fc2 = nn.Linear(dim_ffn, dim, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fc1(x) * self.gate(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.dropout(x)
        return x


class T5SelfAttention(nn.Module):

    def __init__(self,
                 dim,
                 dim_attn,
                 dim_ffn,
                 num_heads,
                 num_buckets,
                 shared_pos=True,
                 dropout=0.1):
        super(T5SelfAttention, self).__init__()
        self.dim = dim
        self.dim_attn = dim_attn
        self.dim_ffn = dim_ffn
        self.num_heads = num_heads
        self.num_buckets = num_buckets
        self.shared_pos = shared_pos

        # layers
        self.norm1 = T5LayerNorm(dim)
        self.attn = T5Attention(dim, dim_attn, num_heads, dropout)
        self.norm2 = T5LayerNorm(dim)
        self.ffn = T5FeedForward(dim, dim_ffn, dropout)
        self.pos_embedding = None if shared_pos else T5RelativeEmbedding(
            num_buckets, num_heads, bidirectional=True)

    def forward(self, x, mask=None, pos_bias=None):
        e = pos_bias if self.shared_pos else self.pos_embedding(
            x.size(1), x.size(1))
        x = fp16_clamp(x + self.attn(self.norm1(x), mask=mask, pos_bias=e))
        x = fp16_clamp(x + self.ffn(self.norm2(x)))
        return x


class T5RelativeEmbedding(nn.Module):

    def __init__(self, num_buckets, num_heads, bidirectional, max_dist=128):
        super(T5RelativeEmbedding, self).__init__()
        self.num_buckets = num_buckets
        self.num_heads = num_heads
        self.bidirectional = bidirectional
        self.max_dist = max_dist

        # layers
        self.embedding = nn.Embedding(num_buckets, num_heads)

    def forward(self, lq, lk):
        device = self.embedding.weight.device
        # rel_pos = torch.arange(lk).unsqueeze(0).to(device) - \
        #     torch.arange(lq).unsqueeze(1).to(device)
        rel_pos = torch.arange(lk, device=device).unsqueeze(0) - \
            torch.arange(lq, device=device).unsqueeze(1)
        rel_pos = self._relative_position_bucket(rel_pos)
        rel_pos_embeds = self.embedding(rel_pos)
        rel_pos_embeds = rel_pos_embeds.permute(2, 0, 1).unsqueeze(
            0)  # [1, N, Lq, Lk]
        return rel_pos_embeds.contiguous()

    def _relative_position_bucket(self, rel_pos):
        # preprocess
        if self.bidirectional:
            num_buckets = self.num_buckets // 2
            rel_buckets = (rel_pos > 0).long() * num_buckets
            rel_pos = torch.abs(rel_pos)
        else:
            num_buckets = self.num_buckets
            rel_buckets = 0
            rel_pos = -torch.min(rel_pos, torch.zeros_like(rel_pos))

        # embeddings for small and large positions
        max_exact = num_buckets // 2
        rel_pos_large = max_exact + (torch.log(rel_pos.float() / max_exact) /
                                     math.log(self.max_dist / max_exact) *
                                     (num_buckets - max_exact)).long()
        rel_pos_large = torch.min(
            rel_pos_large, torch.full_like(rel_pos_large, num_buckets - 1))
        rel_buckets += torch.where(rel_pos < max_exact, rel_pos, rel_pos_large)
        return rel_buckets


def init_weights(m):
    if isinstance(m, T5LayerNorm):
        nn.init.ones_(m.weight)
    elif isinstance(m, T5FeedForward):
        nn.init.normal_(m.gate[0].weight, std=m.dim**-0.5)
        nn.init.normal_(m.fc1.weight, std=m.dim**-0.5)
        nn.init.normal_(m.fc2.weight, std=m.dim_ffn**-0.5)
    elif isinstance(m, T5Attention):
        nn.init.normal_(m.q.weight, std=(m.dim * m.dim_attn)**-0.5)
        nn.init.normal_(m.k.weight, std=m.dim**-0.5)
        nn.init.normal_(m.v.weight, std=m.dim**-0.5)
        nn.init.normal_(m.o.weight, std=(m.num_heads * m.dim_attn)**-0.5)
    elif isinstance(m, T5RelativeEmbedding):
        nn.init.normal_(
            m.embedding.weight, std=(2 * m.num_buckets * m.num_heads)**-0.5)


class WanTextEncoder(torch.nn.Module):

    def __init__(self,
                 vocab=256384,
                 dim=4096,
                 dim_attn=4096,
                 dim_ffn=10240,
                 num_heads=64,
                 num_layers=24,
                 num_buckets=32,
                 shared_pos=False,
                 dropout=0.1):
        super(WanTextEncoder, self).__init__()
        self.dim = dim
        self.dim_attn = dim_attn
        self.dim_ffn = dim_ffn
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.num_buckets = num_buckets
        self.shared_pos = shared_pos

        # layers
        self.token_embedding = vocab if isinstance(vocab, nn.Embedding) \
            else nn.Embedding(vocab, dim)
        self.pos_embedding = T5RelativeEmbedding(
            num_buckets, num_heads, bidirectional=True) if shared_pos else None
        self.dropout = nn.Dropout(dropout)
        self.blocks = nn.ModuleList([
            T5SelfAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets,
                            shared_pos, dropout) for _ in range(num_layers)
        ])
        self.norm = T5LayerNorm(dim)

        # initialize weights
        self.apply(init_weights)

    def forward(self, ids, mask=None):
        x = self.token_embedding(ids)
        x = self.dropout(x)
        e = self.pos_embedding(x.size(1),
                               x.size(1)) if self.shared_pos else None
        for block in self.blocks:
            x = block(x, mask, pos_bias=e)
        x = self.norm(x)
        x = self.dropout(x)
        return x

    @staticmethod
    def state_dict_converter():
        return WanTextEncoderStateDictConverter()


class WanTextEncoderStateDictConverter:
    def __init__(self):
        pass

    def from_diffusers(self, state_dict):
        return state_dict

    def from_civitai(self, state_dict):
        return state_dict
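As a quick sanity check of the encoder above, the following sketch runs a tiny, randomly initialized configuration through the forward pass. The small dimensions are assumptions chosen only to make the shapes easy to follow, not the shipped 4096-dim / 24-layer defaults, and the module path is assumed.

# Shape sketch for WanTextEncoder (tiny assumed config, random ids; CPU only).
import torch

from model.text_encoder import WanTextEncoder   # assumed module path

enc = WanTextEncoder(vocab=1000, dim=64, dim_attn=64, dim_ffn=128,
                     num_heads=4, num_layers=2, num_buckets=32).eval()
ids = torch.randint(0, 1000, (2, 16))           # [B, L] token ids
mask = torch.ones(2, 16, dtype=torch.long)      # [B, L]; 1 marks real tokens
with torch.no_grad():
    out = enc(ids, mask)                        # [B, L, dim] -> torch.Size([2, 16, 64])

With the default shared_pos=False, each T5SelfAttention block builds its own [1, num_heads, L, L] relative-position bias through T5RelativeEmbedding instead of reusing a shared one.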
    	
        model/vae.py
    ADDED
    
@@ -0,0 +1,809 @@
| 1 | 
            +
            from einops import rearrange, repeat
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            import torch
         | 
| 4 | 
            +
            import torch.nn as nn
         | 
| 5 | 
            +
            import torch.nn.functional as F
         | 
| 6 | 
            +
            from tqdm import tqdm
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            CACHE_T = 2
         | 
| 9 | 
            +
             | 
| 10 | 
            +
             | 
| 11 | 
            +
            def check_is_instance(model, module_class):
         | 
| 12 | 
            +
                if isinstance(model, module_class):
         | 
| 13 | 
            +
                    return True
         | 
| 14 | 
            +
                if hasattr(model, "module") and isinstance(model.module, module_class):
         | 
| 15 | 
            +
                    return True
         | 
| 16 | 
            +
                return False
         | 
| 17 | 
            +
             | 
| 18 | 
            +
             | 
| 19 | 
            +
            def block_causal_mask(x, block_size):
         | 
| 20 | 
            +
                # params
         | 
| 21 | 
            +
                b, n, s, _, device = *x.size(), x.device
         | 
| 22 | 
            +
                assert s % block_size == 0
         | 
| 23 | 
            +
                num_blocks = s // block_size
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                # build mask
         | 
| 26 | 
            +
                mask = torch.zeros(b, n, s, s, dtype=torch.bool, device=device)
         | 
| 27 | 
            +
                for i in range(num_blocks):
         | 
| 28 | 
            +
                    mask[:, :,
         | 
| 29 | 
            +
                         i * block_size:(i + 1) * block_size, :(i + 1) * block_size] = 1
         | 
| 30 | 
            +
                return mask
         | 
| 31 | 
            +
             | 
| 32 | 
            +
             | 
| 33 | 
            +
            class CausalConv3d(nn.Conv3d):
         | 
| 34 | 
            +
                """
         | 
| 35 | 
            +
                Causal 3d convolusion.
         | 
| 36 | 
            +
                """
         | 
| 37 | 
            +
             | 
| 38 | 
            +
                def __init__(self, *args, **kwargs):
         | 
| 39 | 
            +
                    super().__init__(*args, **kwargs)
         | 
| 40 | 
            +
                    self._padding = (self.padding[2], self.padding[2], self.padding[1],
         | 
| 41 | 
            +
                                     self.padding[1], 2 * self.padding[0], 0)
         | 
| 42 | 
            +
                    self.padding = (0, 0, 0)
         | 
| 43 | 
            +
             | 
| 44 | 
            +
                def forward(self, x, cache_x=None):
         | 
| 45 | 
            +
                    padding = list(self._padding)
         | 
| 46 | 
            +
                    if cache_x is not None and self._padding[4] > 0:
         | 
| 47 | 
            +
                        cache_x = cache_x.to(x.device)
         | 
| 48 | 
            +
                        x = torch.cat([cache_x, x], dim=2)
         | 
| 49 | 
            +
                        padding[4] -= cache_x.shape[2]
         | 
| 50 | 
            +
                    x = F.pad(x, padding)
         | 
| 51 | 
            +
             | 
| 52 | 
            +
                    return super().forward(x)
         | 
| 53 | 
            +
             | 
| 54 | 
            +
             | 
| 55 | 
            +
            class RMS_norm(nn.Module):
         | 
| 56 | 
            +
             | 
| 57 | 
            +
                def __init__(self, dim, channel_first=True, images=True, bias=False):
         | 
| 58 | 
            +
                    super().__init__()
         | 
| 59 | 
            +
                    broadcastable_dims = (1, 1, 1) if not images else (1, 1)
         | 
| 60 | 
            +
                    shape = (dim, *broadcastable_dims) if channel_first else (dim,)
         | 
| 61 | 
            +
             | 
| 62 | 
            +
                    self.channel_first = channel_first
         | 
| 63 | 
            +
                    self.scale = dim**0.5
         | 
| 64 | 
            +
                    self.gamma = nn.Parameter(torch.ones(shape))
         | 
| 65 | 
            +
                    self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.
         | 
| 66 | 
            +
             | 
| 67 | 
            +
                def forward(self, x):
         | 
| 68 | 
            +
                    return F.normalize(
         | 
| 69 | 
            +
                        x, dim=(1 if self.channel_first else
         | 
| 70 | 
            +
                                -1)) * self.scale * self.gamma + self.bias
         | 
| 71 | 
            +
             | 
| 72 | 
            +
             | 
| 73 | 
            +
            class Upsample(nn.Upsample):
         | 
| 74 | 
            +
             | 
| 75 | 
            +
                def forward(self, x):
         | 
| 76 | 
            +
                    """
         | 
| 77 | 
            +
                    Fix bfloat16 support for nearest neighbor interpolation.
         | 
| 78 | 
            +
                    """
         | 
| 79 | 
            +
                    return super().forward(x.float()).type_as(x)
         | 
| 80 | 
            +
             | 
| 81 | 
            +
             | 
| 82 | 
            +
            class Resample(nn.Module):
         | 
| 83 | 
            +
             | 
| 84 | 
            +
                def __init__(self, dim, mode):
         | 
| 85 | 
            +
                    assert mode in ('none', 'upsample2d', 'upsample3d', 'downsample2d',
         | 
| 86 | 
            +
                                    'downsample3d')
         | 
| 87 | 
            +
                    super().__init__()
         | 
| 88 | 
            +
                    self.dim = dim
         | 
| 89 | 
            +
                    self.mode = mode
         | 
| 90 | 
            +
             | 
| 91 | 
            +
                    # layers
         | 
| 92 | 
            +
                    if mode == 'upsample2d':
         | 
| 93 | 
            +
                        self.resample = nn.Sequential(
         | 
| 94 | 
            +
                            Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
         | 
| 95 | 
            +
                            nn.Conv2d(dim, dim // 2, 3, padding=1))
         | 
| 96 | 
            +
                    elif mode == 'upsample3d':
         | 
| 97 | 
            +
                        self.resample = nn.Sequential(
         | 
| 98 | 
            +
                            Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
         | 
| 99 | 
            +
                            nn.Conv2d(dim, dim // 2, 3, padding=1))
         | 
| 100 | 
            +
                        self.time_conv = CausalConv3d(dim,
         | 
| 101 | 
            +
                                                      dim * 2, (3, 1, 1),
         | 
| 102 | 
            +
                                                      padding=(1, 0, 0))
         | 
| 103 | 
            +
             | 
| 104 | 
            +
                    elif mode == 'downsample2d':
         | 
| 105 | 
            +
                        self.resample = nn.Sequential(
         | 
| 106 | 
            +
                            nn.ZeroPad2d((0, 1, 0, 1)),
         | 
| 107 | 
            +
                            nn.Conv2d(dim, dim, 3, stride=(2, 2)))
         | 
| 108 | 
            +
                    elif mode == 'downsample3d':
         | 
| 109 | 
            +
                        self.resample = nn.Sequential(
         | 
| 110 | 
            +
                            nn.ZeroPad2d((0, 1, 0, 1)),
         | 
| 111 | 
            +
                            nn.Conv2d(dim, dim, 3, stride=(2, 2)))
         | 
| 112 | 
            +
                        self.time_conv = CausalConv3d(dim,
         | 
| 113 | 
            +
                                                      dim, (3, 1, 1),
         | 
| 114 | 
            +
                                                      stride=(2, 1, 1),
         | 
| 115 | 
            +
                                                      padding=(0, 0, 0))
         | 
| 116 | 
            +
             | 
| 117 | 
            +
                    else:
         | 
| 118 | 
            +
                        self.resample = nn.Identity()
         | 
| 119 | 
            +
             | 
| 120 | 
            +
                def forward(self, x, feat_cache=None, feat_idx=[0]):
         | 
| 121 | 
            +
                    b, c, t, h, w = x.size()
         | 
| 122 | 
            +
                    if self.mode == 'upsample3d':
         | 
| 123 | 
            +
                        if feat_cache is not None:
         | 
| 124 | 
            +
                            idx = feat_idx[0]
         | 
| 125 | 
            +
                            if feat_cache[idx] is None:
         | 
| 126 | 
            +
                                feat_cache[idx] = 'Rep'
         | 
| 127 | 
            +
                                feat_idx[0] += 1
         | 
| 128 | 
            +
                            else:
         | 
| 129 | 
            +
             | 
| 130 | 
            +
                                cache_x = x[:, :, -CACHE_T:, :, :].clone()
         | 
| 131 | 
            +
                                if cache_x.shape[2] < 2 and feat_cache[
         | 
| 132 | 
            +
                                        idx] is not None and feat_cache[idx] != 'Rep':
         | 
| 133 | 
            +
                                    # cache last frame of last two chunk
         | 
| 134 | 
            +
                                    cache_x = torch.cat([
         | 
| 135 | 
            +
                                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
         | 
| 136 | 
            +
                                            cache_x.device), cache_x
         | 
| 137 | 
            +
                                    ],
         | 
| 138 | 
            +
                                                        dim=2)
         | 
| 139 | 
            +
                                if cache_x.shape[2] < 2 and feat_cache[
         | 
| 140 | 
            +
                                        idx] is not None and feat_cache[idx] == 'Rep':
         | 
| 141 | 
            +
                                    cache_x = torch.cat([
         | 
| 142 | 
            +
                                        torch.zeros_like(cache_x).to(cache_x.device),
         | 
| 143 | 
            +
                                        cache_x
         | 
| 144 | 
            +
                                    ],
         | 
| 145 | 
            +
                                                        dim=2)
         | 
| 146 | 
            +
                                if feat_cache[idx] == 'Rep':
         | 
| 147 | 
            +
                                    x = self.time_conv(x)
         | 
| 148 | 
            +
                                else:
         | 
| 149 | 
            +
                                    x = self.time_conv(x, feat_cache[idx])
         | 
| 150 | 
            +
                                feat_cache[idx] = cache_x
         | 
| 151 | 
            +
                                feat_idx[0] += 1
         | 
| 152 | 
            +
             | 
| 153 | 
            +
                                x = x.reshape(b, 2, c, t, h, w)
         | 
| 154 | 
            +
                                x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]),
         | 
| 155 | 
            +
                                                3)
         | 
| 156 | 
            +
                                x = x.reshape(b, c, t * 2, h, w)
         | 
| 157 | 
            +
                    t = x.shape[2]
         | 
| 158 | 
            +
                    x = rearrange(x, 'b c t h w -> (b t) c h w')
         | 
| 159 | 
            +
                    x = self.resample(x)
         | 
| 160 | 
            +
                    x = rearrange(x, '(b t) c h w -> b c t h w', t=t)
         | 
| 161 | 
            +
             | 
| 162 | 
            +
                    if self.mode == 'downsample3d':
         | 
| 163 | 
            +
                        if feat_cache is not None:
         | 
| 164 | 
            +
                            idx = feat_idx[0]
         | 
| 165 | 
            +
                            if feat_cache[idx] is None:
         | 
| 166 | 
            +
                                feat_cache[idx] = x.clone()
         | 
| 167 | 
            +
                                feat_idx[0] += 1
         | 
| 168 | 
            +
                            else:
         | 
| 169 | 
            +
                                cache_x = x[:, :, -1:, :, :].clone()
         | 
| 170 | 
            +
                                x = self.time_conv(
         | 
| 171 | 
            +
                                    torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
         | 
| 172 | 
            +
                                feat_cache[idx] = cache_x
         | 
| 173 | 
            +
                                feat_idx[0] += 1
         | 
| 174 | 
            +
                    return x
         | 
| 175 | 
            +
             | 
| 176 | 
            +
                def init_weight(self, conv):
         | 
| 177 | 
            +
                    conv_weight = conv.weight
         | 
| 178 | 
            +
                    nn.init.zeros_(conv_weight)
         | 
| 179 | 
            +
                    c1, c2, t, h, w = conv_weight.size()
         | 
| 180 | 
            +
                    one_matrix = torch.eye(c1, c2)
         | 
| 181 | 
            +
                    init_matrix = one_matrix
         | 
| 182 | 
            +
                    nn.init.zeros_(conv_weight)
         | 
| 183 | 
            +
                    conv_weight.data[:, :, 1, 0, 0] = init_matrix
         | 
| 184 | 
            +
                    conv.weight.data.copy_(conv_weight)
         | 
| 185 | 
            +
                    nn.init.zeros_(conv.bias.data)
         | 
| 186 | 
            +
             | 
| 187 | 
            +
                def init_weight2(self, conv):
         | 
| 188 | 
            +
                    conv_weight = conv.weight.data
         | 
| 189 | 
            +
                    nn.init.zeros_(conv_weight)
         | 
| 190 | 
            +
                    c1, c2, t, h, w = conv_weight.size()
         | 
| 191 | 
            +
                    init_matrix = torch.eye(c1 // 2, c2)
         | 
| 192 | 
            +
                    conv_weight[:c1 // 2, :, -1, 0, 0] = init_matrix
         | 
| 193 | 
            +
                    conv_weight[c1 // 2:, :, -1, 0, 0] = init_matrix
         | 
| 194 | 
            +
                    conv.weight.data.copy_(conv_weight)
         | 
| 195 | 
            +
                    nn.init.zeros_(conv.bias.data)
         | 
| 196 | 
            +
             | 
| 197 | 
            +
             | 
| 198 | 
            +
            class ResidualBlock(nn.Module):
         | 
| 199 | 
            +
             | 
| 200 | 
            +
                def __init__(self, in_dim, out_dim, dropout=0.0):
         | 
| 201 | 
            +
                    super().__init__()
         | 
| 202 | 
            +
                    self.in_dim = in_dim
         | 
| 203 | 
            +
                    self.out_dim = out_dim
         | 
| 204 | 
            +
             | 
| 205 | 
            +
                    # layers
         | 
| 206 | 
            +
                    self.residual = nn.Sequential(
         | 
| 207 | 
            +
                        RMS_norm(in_dim, images=False), nn.SiLU(),
         | 
| 208 | 
            +
                        CausalConv3d(in_dim, out_dim, 3, padding=1),
         | 
| 209 | 
            +
                        RMS_norm(out_dim, images=False), nn.SiLU(), nn.Dropout(dropout),
         | 
| 210 | 
            +
                        CausalConv3d(out_dim, out_dim, 3, padding=1))
         | 
| 211 | 
            +
                    self.shortcut = CausalConv3d(in_dim, out_dim, 1) \
         | 
| 212 | 
            +
                        if in_dim != out_dim else nn.Identity()
         | 
| 213 | 
            +
             | 
| 214 | 
            +
                def forward(self, x, feat_cache=None, feat_idx=[0]):
         | 
| 215 | 
            +
                    h = self.shortcut(x)
         | 
| 216 | 
            +
                    for layer in self.residual:
         | 
| 217 | 
            +
                        if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
         | 
| 218 | 
            +
                            idx = feat_idx[0]
         | 
| 219 | 
            +
                            cache_x = x[:, :, -CACHE_T:, :, :].clone()
         | 
| 220 | 
            +
                            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
         | 
| 221 | 
            +
                                # cache last frame of last two chunk
         | 
| 222 | 
            +
                                cache_x = torch.cat([
         | 
| 223 | 
            +
                                    feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
         | 
| 224 | 
            +
                                        cache_x.device), cache_x
         | 
| 225 | 
            +
                                ],
         | 
| 226 | 
            +
                                                    dim=2)
         | 
| 227 | 
            +
                            x = layer(x, feat_cache[idx])
         | 
| 228 | 
            +
                            feat_cache[idx] = cache_x
         | 
| 229 | 
            +
                            feat_idx[0] += 1
         | 
| 230 | 
            +
                        else:
         | 
| 231 | 
            +
                            x = layer(x)
         | 
| 232 | 
            +
                    return x + h
         | 
| 233 | 
            +
             | 
| 234 | 
            +
             | 
| 235 | 
            +
            class AttentionBlock(nn.Module):
         | 
| 236 | 
            +
                """
         | 
| 237 | 
            +
                Causal self-attention with a single head.
         | 
| 238 | 
            +
                """
         | 
| 239 | 
            +
             | 
| 240 | 
            +
                def __init__(self, dim):
         | 
| 241 | 
            +
                    super().__init__()
         | 
| 242 | 
            +
                    self.dim = dim
         | 
| 243 | 
            +
             | 
| 244 | 
            +
                    # layers
         | 
| 245 | 
            +
                    self.norm = RMS_norm(dim)
         | 
| 246 | 
            +
                    self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
         | 
| 247 | 
            +
                    self.proj = nn.Conv2d(dim, dim, 1)
         | 
| 248 | 
            +
             | 
| 249 | 
            +
                    # zero out the last layer params
         | 
| 250 | 
            +
                    nn.init.zeros_(self.proj.weight)
         | 
| 251 | 
            +
             | 
| 252 | 
            +
                def forward(self, x):
         | 
| 253 | 
            +
                    identity = x
         | 
| 254 | 
            +
                    b, c, t, h, w = x.size()
         | 
| 255 | 
            +
                    x = rearrange(x, 'b c t h w -> (b t) c h w')
         | 
| 256 | 
            +
                    x = self.norm(x)
         | 
| 257 | 
            +
                    # compute query, key, value
         | 
| 258 | 
            +
                    q, k, v = self.to_qkv(x).reshape(b * t, 1, c * 3, -1).permute(
         | 
| 259 | 
            +
                        0, 1, 3, 2).contiguous().chunk(3, dim=-1)
         | 
| 260 | 
            +
             | 
| 261 | 
            +
                    # apply attention
         | 
| 262 | 
            +
                    x = F.scaled_dot_product_attention(
         | 
| 263 | 
            +
                        q,
         | 
| 264 | 
            +
                        k,
         | 
| 265 | 
            +
                        v,
         | 
| 266 | 
            +
                        #attn_mask=block_causal_mask(q, block_size=h * w)
         | 
| 267 | 
            +
                    )
         | 
| 268 | 
            +
                    x = x.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w)
         | 
| 269 | 
            +
             | 
| 270 | 
            +
                    # output
         | 
| 271 | 
            +
                    x = self.proj(x)
         | 
| 272 | 
            +
                    x = rearrange(x, '(b t) c h w-> b c t h w', t=t)
         | 
| 273 | 
            +
                    return x + identity
         | 
| 274 | 
            +
             | 
| 275 | 
            +
             | 
| 276 | 
            +
            class Encoder3d(nn.Module):
         | 
| 277 | 
            +
             | 
| 278 | 
            +
                def __init__(self,
         | 
| 279 | 
            +
                             dim=128,
         | 
| 280 | 
            +
                             z_dim=4,
         | 
| 281 | 
            +
                             dim_mult=[1, 2, 4, 4],
         | 
| 282 | 
            +
                             num_res_blocks=2,
         | 
| 283 | 
            +
                             attn_scales=[],
         | 
| 284 | 
            +
                             temperal_downsample=[True, True, False],
         | 
| 285 | 
            +
                             dropout=0.0):
         | 
| 286 | 
            +
                    super().__init__()
         | 
| 287 | 
            +
                    self.dim = dim
         | 
| 288 | 
            +
                    self.z_dim = z_dim
         | 
| 289 | 
            +
                    self.dim_mult = dim_mult
         | 
| 290 | 
            +
                    self.num_res_blocks = num_res_blocks
         | 
| 291 | 
            +
                    self.attn_scales = attn_scales
         | 
| 292 | 
            +
                    self.temperal_downsample = temperal_downsample
         | 
| 293 | 
            +
             | 
| 294 | 
            +
                    # dimensions
         | 
| 295 | 
            +
                    dims = [dim * u for u in [1] + dim_mult]
         | 
| 296 | 
            +
                    scale = 1.0
         | 
| 297 | 
            +
             | 
| 298 | 
            +
                    # init block
         | 
| 299 | 
            +
                    self.conv1 = CausalConv3d(3, dims[0], 3, padding=1)
         | 
| 300 | 
            +
             | 
| 301 | 
            +
                    # downsample blocks
         | 
| 302 | 
            +
                    downsamples = []
         | 
| 303 | 
            +
                    for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
         | 
| 304 | 
            +
                        # residual (+attention) blocks
         | 
| 305 | 
            +
                        for _ in range(num_res_blocks):
         | 
| 306 | 
            +
                            downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
         | 
| 307 | 
            +
                            if scale in attn_scales:
         | 
| 308 | 
            +
                                downsamples.append(AttentionBlock(out_dim))
         | 
| 309 | 
            +
                            in_dim = out_dim
         | 
| 310 | 
            +
             | 
| 311 | 
            +
                        # downsample block
         | 
| 312 | 
            +
                        if i != len(dim_mult) - 1:
         | 
| 313 | 
            +
                            mode = 'downsample3d' if temperal_downsample[
         | 
| 314 | 
            +
                                i] else 'downsample2d'
         | 
| 315 | 
            +
                            downsamples.append(Resample(out_dim, mode=mode))
         | 
| 316 | 
            +
                            scale /= 2.0
         | 
| 317 | 
            +
                    self.downsamples = nn.Sequential(*downsamples)
         | 
| 318 | 
            +
             | 
| 319 | 
            +
                    # middle blocks
         | 
| 320 | 
            +
                    self.middle = nn.Sequential(ResidualBlock(out_dim, out_dim, dropout),
         | 
| 321 | 
            +
                                                AttentionBlock(out_dim),
         | 
| 322 | 
            +
                                                ResidualBlock(out_dim, out_dim, dropout))
         | 
| 323 | 
            +
             | 
| 324 | 
            +
                    # output blocks
         | 
| 325 | 
            +
                    self.head = nn.Sequential(RMS_norm(out_dim, images=False), nn.SiLU(),
         | 
| 326 | 
            +
                                              CausalConv3d(out_dim, z_dim, 3, padding=1))
         | 
| 327 | 
            +
             | 
| 328 | 
            +
                def forward(self, x, feat_cache=None, feat_idx=[0]):
         | 
| 329 | 
            +
                    if feat_cache is not None:
         | 
| 330 | 
            +
                        idx = feat_idx[0]
         | 
| 331 | 
            +
                        cache_x = x[:, :, -CACHE_T:, :, :].clone()
         | 
| 332 | 
            +
                        if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
         | 
| 333 | 
            +
                        # keep the last frame of the previous chunk too, so the cache still spans two frames
         | 
| 334 | 
            +
                            cache_x = torch.cat([
         | 
| 335 | 
            +
                                feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
         | 
| 336 | 
            +
                                    cache_x.device), cache_x
         | 
| 337 | 
            +
                            ],
         | 
| 338 | 
            +
                                                dim=2)
         | 
| 339 | 
            +
                        x = self.conv1(x, feat_cache[idx])
         | 
| 340 | 
            +
                        feat_cache[idx] = cache_x
         | 
| 341 | 
            +
                        feat_idx[0] += 1
         | 
| 342 | 
            +
                    else:
         | 
| 343 | 
            +
                        x = self.conv1(x)
         | 
| 344 | 
            +
             | 
| 345 | 
            +
                    ## downsamples
         | 
| 346 | 
            +
                    for layer in self.downsamples:
         | 
| 347 | 
            +
                        if feat_cache is not None:
         | 
| 348 | 
            +
                            x = layer(x, feat_cache, feat_idx)
         | 
| 349 | 
            +
                        else:
         | 
| 350 | 
            +
                            x = layer(x)
         | 
| 351 | 
            +
             | 
| 352 | 
            +
                    ## middle
         | 
| 353 | 
            +
                    for layer in self.middle:
         | 
| 354 | 
            +
                        if check_is_instance(layer, ResidualBlock) and feat_cache is not None:
         | 
| 355 | 
            +
                            x = layer(x, feat_cache, feat_idx)
         | 
| 356 | 
            +
                        else:
         | 
| 357 | 
            +
                            x = layer(x)
         | 
| 358 | 
            +
             | 
| 359 | 
            +
                    ## head
         | 
| 360 | 
            +
                    for layer in self.head:
         | 
| 361 | 
            +
                        if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
         | 
| 362 | 
            +
                            idx = feat_idx[0]
         | 
| 363 | 
            +
                            cache_x = x[:, :, -CACHE_T:, :, :].clone()
         | 
| 364 | 
            +
                            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
         | 
| 365 | 
            +
                            # keep the last frame of the previous chunk too, so the cache still spans two frames
         | 
| 366 | 
            +
                                cache_x = torch.cat([
         | 
| 367 | 
            +
                                    feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
         | 
| 368 | 
            +
                                        cache_x.device), cache_x
         | 
| 369 | 
            +
                                ],
         | 
| 370 | 
            +
                                                    dim=2)
         | 
| 371 | 
            +
                            x = layer(x, feat_cache[idx])
         | 
| 372 | 
            +
                            feat_cache[idx] = cache_x
         | 
| 373 | 
            +
                            feat_idx[0] += 1
         | 
| 374 | 
            +
                        else:
         | 
| 375 | 
            +
                            x = layer(x)
         | 
| 376 | 
            +
                    return x
         | 
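
The `feat_cache`/`feat_idx` bookkeeping above lets the encoder run over a long clip in short temporal chunks while every `CausalConv3d` still sees the tail of the previous chunk. A minimal sketch of the same rolling-cache idea, reduced to a toy 1D causal convolution; `causal_conv_chunked` and its arguments are illustrative names, not part of this repository:

    import torch
    import torch.nn.functional as F

    CACHE_T = 2  # frames carried over between chunks (kernel size 3 -> 2 frames of left context)

    def causal_conv_chunked(chunks, weight, cache=None):
        # chunks: list of [B, C_in, T_i] tensors, weight: [C_out, C_in, 3]
        outs = []
        for chunk in chunks:
            if cache is None:
                x = F.pad(chunk, (weight.shape[-1] - 1, 0))  # zero left-padding for the first chunk
            else:
                x = torch.cat([cache, chunk], dim=2)         # reuse cached frames as left context
            outs.append(F.conv1d(x, weight))
            cache = chunk[:, :, -CACHE_T:]                   # keep the last CACHE_T input frames
        return torch.cat(outs, dim=2), cache

As long as every chunk has at least `CACHE_T` frames, this produces the same output as convolving the whole sequence in one pass; the extra `cache_x.shape[2] < 2` branch in `forward` handles the shorter chunks.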
| 377 | 
            +
             | 
| 378 | 
            +
             | 
| 379 | 
            +
            class Decoder3d(nn.Module):
         | 
| 380 | 
            +
             | 
| 381 | 
            +
                def __init__(self,
         | 
| 382 | 
            +
                             dim=128,
         | 
| 383 | 
            +
                             z_dim=4,
         | 
| 384 | 
            +
                             dim_mult=[1, 2, 4, 4],
         | 
| 385 | 
            +
                             num_res_blocks=2,
         | 
| 386 | 
            +
                             attn_scales=[],
         | 
| 387 | 
            +
                             temperal_upsample=[False, True, True],
         | 
| 388 | 
            +
                             dropout=0.0):
         | 
| 389 | 
            +
                    super().__init__()
         | 
| 390 | 
            +
                    self.dim = dim
         | 
| 391 | 
            +
                    self.z_dim = z_dim
         | 
| 392 | 
            +
                    self.dim_mult = dim_mult
         | 
| 393 | 
            +
                    self.num_res_blocks = num_res_blocks
         | 
| 394 | 
            +
                    self.attn_scales = attn_scales
         | 
| 395 | 
            +
                    self.temperal_upsample = temperal_upsample
         | 
| 396 | 
            +
             | 
| 397 | 
            +
                    # dimensions
         | 
| 398 | 
            +
                    dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
         | 
| 399 | 
            +
                    scale = 1.0 / 2**(len(dim_mult) - 2)
         | 
| 400 | 
            +
             | 
| 401 | 
            +
                    # init block
         | 
| 402 | 
            +
                    self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
         | 
| 403 | 
            +
             | 
| 404 | 
            +
                    # middle blocks
         | 
| 405 | 
            +
                    self.middle = nn.Sequential(ResidualBlock(dims[0], dims[0], dropout),
         | 
| 406 | 
            +
                                                AttentionBlock(dims[0]),
         | 
| 407 | 
            +
                                                ResidualBlock(dims[0], dims[0], dropout))
         | 
| 408 | 
            +
             | 
| 409 | 
            +
                    # upsample blocks
         | 
| 410 | 
            +
                    upsamples = []
         | 
| 411 | 
            +
                    for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
         | 
| 412 | 
            +
                        # residual (+attention) blocks
         | 
| 413 | 
            +
                        if i == 1 or i == 2 or i == 3:
         | 
| 414 | 
            +
                            in_dim = in_dim // 2
         | 
| 415 | 
            +
                        for _ in range(num_res_blocks + 1):
         | 
| 416 | 
            +
                            upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
         | 
| 417 | 
            +
                            if scale in attn_scales:
         | 
| 418 | 
            +
                                upsamples.append(AttentionBlock(out_dim))
         | 
| 419 | 
            +
                            in_dim = out_dim
         | 
| 420 | 
            +
             | 
| 421 | 
            +
                        # upsample block
         | 
| 422 | 
            +
                        if i != len(dim_mult) - 1:
         | 
| 423 | 
            +
                            mode = 'upsample3d' if temperal_upsample[i] else 'upsample2d'
         | 
| 424 | 
            +
                            upsamples.append(Resample(out_dim, mode=mode))
         | 
| 425 | 
            +
                            scale *= 2.0
         | 
| 426 | 
            +
                    self.upsamples = nn.Sequential(*upsamples)
         | 
| 427 | 
            +
             | 
| 428 | 
            +
                    # output blocks
         | 
| 429 | 
            +
                    self.head = nn.Sequential(RMS_norm(out_dim, images=False), nn.SiLU(),
         | 
| 430 | 
            +
                                              CausalConv3d(out_dim, 3, 3, padding=1))
         | 
| 431 | 
            +
             | 
| 432 | 
            +
                def forward(self, x, feat_cache=None, feat_idx=[0]):
         | 
| 433 | 
            +
                    ## conv1
         | 
| 434 | 
            +
                    if feat_cache is not None:
         | 
| 435 | 
            +
                        idx = feat_idx[0]
         | 
| 436 | 
            +
                        cache_x = x[:, :, -CACHE_T:, :, :].clone()
         | 
| 437 | 
            +
                        if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
         | 
| 438 | 
            +
                        # keep the last frame of the previous chunk too, so the cache still spans two frames
         | 
| 439 | 
            +
                            cache_x = torch.cat([
         | 
| 440 | 
            +
                                feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
         | 
| 441 | 
            +
                                    cache_x.device), cache_x
         | 
| 442 | 
            +
                            ],
         | 
| 443 | 
            +
                                                dim=2)
         | 
| 444 | 
            +
                        x = self.conv1(x, feat_cache[idx])
         | 
| 445 | 
            +
                        feat_cache[idx] = cache_x
         | 
| 446 | 
            +
                        feat_idx[0] += 1
         | 
| 447 | 
            +
                    else:
         | 
| 448 | 
            +
                        x = self.conv1(x)
         | 
| 449 | 
            +
             | 
| 450 | 
            +
                    ## middle
         | 
| 451 | 
            +
                    for layer in self.middle:
         | 
| 452 | 
            +
                        if check_is_instance(layer, ResidualBlock) and feat_cache is not None:
         | 
| 453 | 
            +
                            x = layer(x, feat_cache, feat_idx)
         | 
| 454 | 
            +
                        else:
         | 
| 455 | 
            +
                            x = layer(x)
         | 
| 456 | 
            +
             | 
| 457 | 
            +
                    ## upsamples
         | 
| 458 | 
            +
                    for layer in self.upsamples:
         | 
| 459 | 
            +
                        if feat_cache is not None:
         | 
| 460 | 
            +
                            x = layer(x, feat_cache, feat_idx)
         | 
| 461 | 
            +
                        else:
         | 
| 462 | 
            +
                            x = layer(x)
         | 
| 463 | 
            +
             | 
| 464 | 
            +
                    ## head
         | 
| 465 | 
            +
                    for layer in self.head:
         | 
| 466 | 
            +
                        if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
         | 
| 467 | 
            +
                            idx = feat_idx[0]
         | 
| 468 | 
            +
                            cache_x = x[:, :, -CACHE_T:, :, :].clone()
         | 
| 469 | 
            +
                            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
         | 
| 470 | 
            +
                            # keep the last frame of the previous chunk too, so the cache still spans two frames
         | 
| 471 | 
            +
                                cache_x = torch.cat([
         | 
| 472 | 
            +
                                    feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
         | 
| 473 | 
            +
                                        cache_x.device), cache_x
         | 
| 474 | 
            +
                                ],
         | 
| 475 | 
            +
                                                    dim=2)
         | 
| 476 | 
            +
                            x = layer(x, feat_cache[idx])
         | 
| 477 | 
            +
                            feat_cache[idx] = cache_x
         | 
| 478 | 
            +
                            feat_idx[0] += 1
         | 
| 479 | 
            +
                        else:
         | 
| 480 | 
            +
                            x = layer(x)
         | 
| 481 | 
            +
                    return x
         | 
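
With the default `temperal_upsample=[False, True, True]`, two of the `Resample` stages double the temporal length, so T latent frames decode to 4*T - 3 video frames (the first latent frame contributes one frame, every later one contributes four). A quick sanity check of that bookkeeping, mirroring the `out_T = T * 4 - 3` and `iter_ = 1 + (t - 1) // 4` formulas used later in this file; the helper names are illustrative:

    def n_latent_frames(n_video_frames: int) -> int:
        # encoder: one latent frame for frame 0, then one per 4 video frames
        return 1 + (n_video_frames - 1) // 4

    def n_video_frames(n_latent_frames: int) -> int:
        # decoder: inverse of the mapping above for well-formed clips (4k + 1 frames)
        return n_latent_frames * 4 - 3

    assert n_latent_frames(81) == 21 and n_video_frames(21) == 81
    assert n_latent_frames(1) == 1 and n_video_frames(1) == 1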
| 482 | 
            +
             | 
| 483 | 
            +
             | 
| 484 | 
            +
            def count_conv3d(model):
         | 
| 485 | 
            +
                count = 0
         | 
| 486 | 
            +
                for m in model.modules():
         | 
| 487 | 
            +
                    if check_is_instance(m, CausalConv3d):
         | 
| 488 | 
            +
                        count += 1
         | 
| 489 | 
            +
                return count
         | 
| 490 | 
            +
             | 
| 491 | 
            +
             | 
| 492 | 
            +
            class VideoVAE_(nn.Module):
         | 
| 493 | 
            +
             | 
| 494 | 
            +
                def __init__(self,
         | 
| 495 | 
            +
                             dim=96,
         | 
| 496 | 
            +
                             z_dim=16,
         | 
| 497 | 
            +
                             dim_mult=[1, 2, 4, 4],
         | 
| 498 | 
            +
                             num_res_blocks=2,
         | 
| 499 | 
            +
                             attn_scales=[],
         | 
| 500 | 
            +
                             temperal_downsample=[False, True, True],
         | 
| 501 | 
            +
                             dropout=0.0):
         | 
| 502 | 
            +
                    super().__init__()
         | 
| 503 | 
            +
                    self.dim = dim
         | 
| 504 | 
            +
                    self.z_dim = z_dim
         | 
| 505 | 
            +
                    self.dim_mult = dim_mult
         | 
| 506 | 
            +
                    self.num_res_blocks = num_res_blocks
         | 
| 507 | 
            +
                    self.attn_scales = attn_scales
         | 
| 508 | 
            +
                    self.temperal_downsample = temperal_downsample
         | 
| 509 | 
            +
                    self.temperal_upsample = temperal_downsample[::-1]
         | 
| 510 | 
            +
             | 
| 511 | 
            +
                    # modules
         | 
| 512 | 
            +
                    self.encoder = Encoder3d(dim, z_dim * 2, dim_mult, num_res_blocks,
         | 
| 513 | 
            +
                                             attn_scales, self.temperal_downsample, dropout)
         | 
| 514 | 
            +
                    self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
         | 
| 515 | 
            +
                    self.conv2 = CausalConv3d(z_dim, z_dim, 1)
         | 
| 516 | 
            +
                    self.decoder = Decoder3d(dim, z_dim, dim_mult, num_res_blocks,
         | 
| 517 | 
            +
                                             attn_scales, self.temperal_upsample, dropout)
         | 
| 518 | 
            +
             | 
| 519 | 
            +
                def forward(self, x):
         | 
| 520 | 
            +
                    mu, log_var = self.encode(x)
         | 
| 521 | 
            +
                    z = self.reparameterize(mu, log_var)
         | 
| 522 | 
            +
                    x_recon = self.decode(z)
         | 
| 523 | 
            +
                    return x_recon, mu, log_var
         | 
| 524 | 
            +
             | 
| 525 | 
            +
                def encode(self, x, scale):
         | 
| 526 | 
            +
                    self.clear_cache()
         | 
| 527 | 
            +
                    ## cache
         | 
| 528 | 
            +
                    t = x.shape[2]
         | 
| 529 | 
            +
                    iter_ = 1 + (t - 1) // 4
         | 
| 530 | 
            +
             | 
| 531 | 
            +
                    for i in range(iter_):
         | 
| 532 | 
            +
                        self._enc_conv_idx = [0]
         | 
| 533 | 
            +
                        if i == 0:
         | 
| 534 | 
            +
                            out = self.encoder(x[:, :, :1, :, :],
         | 
| 535 | 
            +
                                               feat_cache=self._enc_feat_map,
         | 
| 536 | 
            +
                                               feat_idx=self._enc_conv_idx)
         | 
| 537 | 
            +
                        else:
         | 
| 538 | 
            +
                            out_ = self.encoder(x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
         | 
| 539 | 
            +
                                                feat_cache=self._enc_feat_map,
         | 
| 540 | 
            +
                                                feat_idx=self._enc_conv_idx)
         | 
| 541 | 
            +
                            out = torch.cat([out, out_], 2)
         | 
| 542 | 
            +
                    mu, log_var = self.conv1(out).chunk(2, dim=1)
         | 
| 543 | 
            +
                    if isinstance(scale[0], torch.Tensor):
         | 
| 544 | 
            +
                        scale = [s.to(dtype=mu.dtype, device=mu.device) for s in scale]
         | 
| 545 | 
            +
                        mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(
         | 
| 546 | 
            +
                            1, self.z_dim, 1, 1, 1)
         | 
| 547 | 
            +
                    else:
         | 
| 548 | 
            +
                        scale = scale.to(dtype=mu.dtype, device=mu.device)
         | 
| 549 | 
            +
                        mu = (mu - scale[0]) * scale[1]
         | 
| 550 | 
            +
                    return mu
         | 
| 551 | 
            +
             | 
| 552 | 
            +
                def decode(self, z, scale):
         | 
| 553 | 
            +
                    self.clear_cache()
         | 
| 554 | 
            +
                    # z: [b,c,t,h,w]
         | 
| 555 | 
            +
                    if isinstance(scale[0], torch.Tensor):
         | 
| 556 | 
            +
                        scale = [s.to(dtype=z.dtype, device=z.device) for s in scale]
         | 
| 557 | 
            +
                        z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
         | 
| 558 | 
            +
                            1, self.z_dim, 1, 1, 1)
         | 
| 559 | 
            +
                    else:
         | 
| 560 | 
            +
                        scale = scale.to(dtype=z.dtype, device=z.device)
         | 
| 561 | 
            +
                        z = z / scale[1] + scale[0]
         | 
| 562 | 
            +
                    iter_ = z.shape[2]
         | 
| 563 | 
            +
                    x = self.conv2(z)
         | 
| 564 | 
            +
                    for i in range(iter_):
         | 
| 565 | 
            +
                        self._conv_idx = [0]
         | 
| 566 | 
            +
                        if i == 0:
         | 
| 567 | 
            +
                            out = self.decoder(x[:, :, i:i + 1, :, :],
         | 
| 568 | 
            +
                                               feat_cache=self._feat_map,
         | 
| 569 | 
            +
                                               feat_idx=self._conv_idx)
         | 
| 570 | 
            +
                        else:
         | 
| 571 | 
            +
                            out_ = self.decoder(x[:, :, i:i + 1, :, :],
         | 
| 572 | 
            +
                                                feat_cache=self._feat_map,
         | 
| 573 | 
            +
                                                feat_idx=self._conv_idx)
         | 
| 574 | 
            +
                        out = torch.cat([out, out_], 2)  # intermediate frames could be offloaded to CPU here to cap peak VRAM
         | 
| 575 | 
            +
                    return out
         | 
| 576 | 
            +
             | 
| 577 | 
            +
                def reparameterize(self, mu, log_var):
         | 
| 578 | 
            +
                    std = torch.exp(0.5 * log_var)
         | 
| 579 | 
            +
                    eps = torch.randn_like(std)
         | 
| 580 | 
            +
                    return eps * std + mu
         | 
| 581 | 
            +
             | 
| 582 | 
            +
                def sample(self, imgs, deterministic=False):
         | 
| 583 | 
            +
                    mu, log_var = self.encode(imgs)
         | 
| 584 | 
            +
                    if deterministic:
         | 
| 585 | 
            +
                        return mu
         | 
| 586 | 
            +
                    std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
         | 
| 587 | 
            +
                    return mu + std * torch.randn_like(std)
         | 
| 588 | 
            +
             | 
| 589 | 
            +
                def clear_cache(self):
         | 
| 590 | 
            +
                    self._conv_num = count_conv3d(self.decoder)
         | 
| 591 | 
            +
                    self._conv_idx = [0]
         | 
| 592 | 
            +
                    self._feat_map = [None] * self._conv_num
         | 
| 593 | 
            +
                    # cache encode
         | 
| 594 | 
            +
                    self._enc_conv_num = count_conv3d(self.encoder)
         | 
| 595 | 
            +
                    self._enc_conv_idx = [0]
         | 
| 596 | 
            +
                    self._enc_feat_map = [None] * self._enc_conv_num
         | 
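
`encode` above walks the input as one 1-frame chunk followed by 4-frame chunks, and `clear_cache` resets the per-convolution caches so the causal convolutions stay consistent across those chunk boundaries. A sketch of the slicing pattern, with an illustrative helper name:

    def temporal_chunks(t: int):
        # frame 0 goes alone, every later chunk holds 4 frames
        chunks = [(0, 1)]
        for i in range(1, 1 + (t - 1) // 4):
            chunks.append((1 + 4 * (i - 1), 1 + 4 * i))
        return chunks

    # a 17-frame clip is encoded as x[:, :, 0:1], x[:, :, 1:5], x[:, :, 5:9], x[:, :, 9:13], x[:, :, 13:17]
    assert temporal_chunks(17) == [(0, 1), (1, 5), (5, 9), (9, 13), (13, 17)]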
| 597 | 
            +
             | 
| 598 | 
            +
             | 
| 599 | 
            +
            class WanVideoVAE(nn.Module):
         | 
| 600 | 
            +
             | 
| 601 | 
            +
                def __init__(self, z_dim=16):
         | 
| 602 | 
            +
                    super().__init__()
         | 
| 603 | 
            +
             | 
| 604 | 
            +
                    mean = [
         | 
| 605 | 
            +
                        -0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508,
         | 
| 606 | 
            +
                        0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921
         | 
| 607 | 
            +
                    ]
         | 
| 608 | 
            +
                    std = [
         | 
| 609 | 
            +
                        2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743,
         | 
| 610 | 
            +
                        3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160
         | 
| 611 | 
            +
                    ]
         | 
| 612 | 
            +
                    self.mean = torch.tensor(mean)
         | 
| 613 | 
            +
                    self.std = torch.tensor(std)
         | 
| 614 | 
            +
                    self.scale = [self.mean, 1.0 / self.std]
         | 
| 615 | 
            +
             | 
| 616 | 
            +
                    # init model
         | 
| 617 | 
            +
                    self.model = VideoVAE_(z_dim=z_dim).eval().requires_grad_(False)
         | 
| 618 | 
            +
                    self.upsampling_factor = 8
         | 
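
`self.scale` stores the per-channel latent statistics as `[mean, 1.0 / std]`; `VideoVAE_.encode` returns `(mu - mean) / std` and `decode` undoes it before running the decoder. A minimal sketch of that normalization for a latent of shape [B, 16, T, H, W]; the function names are illustrative:

    import torch

    def normalize_latent(mu, mean, std):
        # matches (mu - scale[0]) * scale[1] with scale = [mean, 1 / std]
        mean = mean.view(1, -1, 1, 1, 1).to(mu)
        std = std.view(1, -1, 1, 1, 1).to(mu)
        return (mu - mean) / std

    def denormalize_latent(z, mean, std):
        # matches z / scale[1] + scale[0]
        mean = mean.view(1, -1, 1, 1, 1).to(z)
        std = std.view(1, -1, 1, 1, 1).to(z)
        return z * std + mean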
| 619 | 
            +
             | 
| 620 | 
            +
             | 
| 621 | 
            +
                def build_1d_mask(self, length, left_bound, right_bound, border_width):
         | 
| 622 | 
            +
                    x = torch.ones((length,))
         | 
| 623 | 
            +
                    if not left_bound:
         | 
| 624 | 
            +
                        x[:border_width] = (torch.arange(border_width) + 1) / border_width
         | 
| 625 | 
            +
                    if not right_bound:
         | 
| 626 | 
            +
                        x[-border_width:] = torch.flip((torch.arange(border_width) + 1) / border_width, dims=(0,))
         | 
| 627 | 
            +
                    return x
         | 
| 628 | 
            +
             | 
| 629 | 
            +
             | 
| 630 | 
            +
                def build_mask(self, data, is_bound, border_width):
         | 
| 631 | 
            +
                    _, _, _, H, W = data.shape
         | 
| 632 | 
            +
                    h = self.build_1d_mask(H, is_bound[0], is_bound[1], border_width[0])
         | 
| 633 | 
            +
                    w = self.build_1d_mask(W, is_bound[2], is_bound[3], border_width[1])
         | 
| 634 | 
            +
             | 
| 635 | 
            +
                    h = repeat(h, "H -> H W", H=H, W=W)
         | 
| 636 | 
            +
                    w = repeat(w, "W -> H W", H=H, W=W)
         | 
| 637 | 
            +
             | 
| 638 | 
            +
                    mask = torch.stack([h, w]).min(dim=0).values
         | 
| 639 | 
            +
                    mask = rearrange(mask, "H W -> 1 1 1 H W")
         | 
| 640 | 
            +
                    return mask
         | 
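
`build_mask` produces a feathering weight that ramps linearly from 1/border_width up to 1 along any tile edge that is not on the image boundary; overlapping tiles are blended with these weights and then renormalized by the accumulated `weight` tensor. The 1D ramp, for example:

    import torch

    # interior (non-boundary) left edge with border_width=4
    x = torch.ones(8)
    x[:4] = (torch.arange(4) + 1) / 4
    print(x)  # tensor([0.2500, 0.5000, 0.7500, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])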
| 641 | 
            +
             | 
| 642 | 
            +
             | 
| 643 | 
            +
                def tiled_decode(self, hidden_states, device, tile_size, tile_stride):
         | 
| 644 | 
            +
                    _, _, T, H, W = hidden_states.shape
         | 
| 645 | 
            +
                    size_h, size_w = tile_size
         | 
| 646 | 
            +
                    stride_h, stride_w = tile_stride
         | 
| 647 | 
            +
             | 
| 648 | 
            +
                    # Split tasks
         | 
| 649 | 
            +
                    tasks = []
         | 
| 650 | 
            +
                    for h in range(0, H, stride_h):
         | 
| 651 | 
            +
                        if (h-stride_h >= 0 and h-stride_h+size_h >= H): continue
         | 
| 652 | 
            +
                        for w in range(0, W, stride_w):
         | 
| 653 | 
            +
                            if (w-stride_w >= 0 and w-stride_w+size_w >= W): continue
         | 
| 654 | 
            +
                            h_, w_ = h + size_h, w + size_w
         | 
| 655 | 
            +
                            tasks.append((h, h_, w, w_))
         | 
| 656 | 
            +
             | 
| 657 | 
            +
                    data_device = "cpu"
         | 
| 658 | 
            +
                    computation_device = device
         | 
| 659 | 
            +
             | 
| 660 | 
            +
                    out_T = T * 4 - 3
         | 
| 661 | 
            +
                    weight = torch.zeros((1, 1, out_T, H * self.upsampling_factor, W * self.upsampling_factor), dtype=hidden_states.dtype, device=data_device)
         | 
| 662 | 
            +
                    values = torch.zeros((1, 3, out_T, H * self.upsampling_factor, W * self.upsampling_factor), dtype=hidden_states.dtype, device=data_device)
         | 
| 663 | 
            +
             | 
| 664 | 
            +
                    for h, h_, w, w_ in tqdm(tasks, desc="VAE decoding"):
         | 
| 665 | 
            +
                        hidden_states_batch = hidden_states[:, :, :, h:h_, w:w_].to(computation_device)
         | 
| 666 | 
            +
                        hidden_states_batch = self.model.decode(hidden_states_batch, self.scale).to(data_device)
         | 
| 667 | 
            +
             | 
| 668 | 
            +
                        mask = self.build_mask(
         | 
| 669 | 
            +
                            hidden_states_batch,
         | 
| 670 | 
            +
                            is_bound=(h==0, h_>=H, w==0, w_>=W),
         | 
| 671 | 
            +
                            border_width=((size_h - stride_h) * self.upsampling_factor, (size_w - stride_w) * self.upsampling_factor)
         | 
| 672 | 
            +
                        ).to(dtype=hidden_states.dtype, device=data_device)
         | 
| 673 | 
            +
             | 
| 674 | 
            +
                        target_h = h * self.upsampling_factor
         | 
| 675 | 
            +
                        target_w = w * self.upsampling_factor
         | 
| 676 | 
            +
                        values[
         | 
| 677 | 
            +
                            :,
         | 
| 678 | 
            +
                            :,
         | 
| 679 | 
            +
                            :,
         | 
| 680 | 
            +
                            target_h:target_h + hidden_states_batch.shape[3],
         | 
| 681 | 
            +
                            target_w:target_w + hidden_states_batch.shape[4],
         | 
| 682 | 
            +
                        ] += hidden_states_batch * mask
         | 
| 683 | 
            +
                        weight[
         | 
| 684 | 
            +
                            :,
         | 
| 685 | 
            +
                            :,
         | 
| 686 | 
            +
                            :,
         | 
| 687 | 
            +
                            target_h: target_h + hidden_states_batch.shape[3],
         | 
| 688 | 
            +
                            target_w: target_w + hidden_states_batch.shape[4],
         | 
| 689 | 
            +
                        ] += mask
         | 
| 690 | 
            +
                    values = values / weight
         | 
| 691 | 
            +
                    values = values.float().clamp_(-1, 1)
         | 
| 692 | 
            +
                    return values
         | 
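
The task loop above slides a (size_h, size_w) latent window with stride (stride_h, stride_w), decodes each tile on `device`, and accumulates the feathered results on the CPU. A rough sketch of the tile enumeration under the defaults that `decode` passes further below (tile_size=(34, 34), tile_stride=(18, 16)); `tile_tasks` is an illustrative name:

    def tile_tasks(H, W, size_h=34, size_w=34, stride_h=18, stride_w=16):
        tasks = []
        for h in range(0, H, stride_h):
            if h - stride_h >= 0 and h - stride_h + size_h >= H:
                continue
            for w in range(0, W, stride_w):
                if w - stride_w >= 0 and w - stride_w + size_w >= W:
                    continue
                tasks.append((h, h + size_h, w, w + size_w))
        return tasks

    # a 60x104 latent grid (480x832 pixels) is covered by 3 x 6 = 18 overlapping tiles
    print(len(tile_tasks(60, 104)))  # 18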
| 693 | 
            +
             | 
| 694 | 
            +
             | 
| 695 | 
            +
                def tiled_encode(self, video, device, tile_size, tile_stride):
         | 
| 696 | 
            +
                    _, _, T, H, W = video.shape
         | 
| 697 | 
            +
                    size_h, size_w = tile_size
         | 
| 698 | 
            +
                    stride_h, stride_w = tile_stride
         | 
| 699 | 
            +
             | 
| 700 | 
            +
                    # Split tasks
         | 
| 701 | 
            +
                    tasks = []
         | 
| 702 | 
            +
                    for h in range(0, H, stride_h):
         | 
| 703 | 
            +
                        if (h-stride_h >= 0 and h-stride_h+size_h >= H): continue
         | 
| 704 | 
            +
                        for w in range(0, W, stride_w):
         | 
| 705 | 
            +
                            if (w-stride_w >= 0 and w-stride_w+size_w >= W): continue
         | 
| 706 | 
            +
                            h_, w_ = h + size_h, w + size_w
         | 
| 707 | 
            +
                            tasks.append((h, h_, w, w_))
         | 
| 708 | 
            +
             | 
| 709 | 
            +
                    data_device = "cpu"
         | 
| 710 | 
            +
                    computation_device = device
         | 
| 711 | 
            +
             | 
| 712 | 
            +
                    out_T = (T + 3) // 4
         | 
| 713 | 
            +
                    weight = torch.zeros((1, 1, out_T, H // self.upsampling_factor, W // self.upsampling_factor), dtype=video.dtype, device=data_device)
         | 
| 714 | 
            +
                    values = torch.zeros((1, 16, out_T, H // self.upsampling_factor, W // self.upsampling_factor), dtype=video.dtype, device=data_device)
         | 
| 715 | 
            +
             | 
| 716 | 
            +
                    for h, h_, w, w_ in tqdm(tasks, desc="VAE encoding"):
         | 
| 717 | 
            +
                        hidden_states_batch = video[:, :, :, h:h_, w:w_].to(computation_device)
         | 
| 718 | 
            +
                        hidden_states_batch = self.model.encode(hidden_states_batch, self.scale).to(data_device)
         | 
| 719 | 
            +
             | 
| 720 | 
            +
                        mask = self.build_mask(
         | 
| 721 | 
            +
                            hidden_states_batch,
         | 
| 722 | 
            +
                            is_bound=(h==0, h_>=H, w==0, w_>=W),
         | 
| 723 | 
            +
                            border_width=((size_h - stride_h) // self.upsampling_factor, (size_w - stride_w) // self.upsampling_factor)
         | 
| 724 | 
            +
                        ).to(dtype=video.dtype, device=data_device)
         | 
| 725 | 
            +
             | 
| 726 | 
            +
                        target_h = h // self.upsampling_factor
         | 
| 727 | 
            +
                        target_w = w // self.upsampling_factor
         | 
| 728 | 
            +
                        values[
         | 
| 729 | 
            +
                            :,
         | 
| 730 | 
            +
                            :,
         | 
| 731 | 
            +
                            :,
         | 
| 732 | 
            +
                            target_h:target_h + hidden_states_batch.shape[3],
         | 
| 733 | 
            +
                            target_w:target_w + hidden_states_batch.shape[4],
         | 
| 734 | 
            +
                        ] += hidden_states_batch * mask
         | 
| 735 | 
            +
                        weight[
         | 
| 736 | 
            +
                            :,
         | 
| 737 | 
            +
                            :,
         | 
| 738 | 
            +
                            :,
         | 
| 739 | 
            +
                            target_h: target_h + hidden_states_batch.shape[3],
         | 
| 740 | 
            +
                            target_w: target_w + hidden_states_batch.shape[4],
         | 
| 741 | 
            +
                        ] += mask
         | 
| 742 | 
            +
                    values = values / weight
         | 
| 743 | 
            +
                    values = values.float()
         | 
| 744 | 
            +
                    return values
         | 
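
`tiled_encode` mirrors `tiled_decode` in the other direction: a video of shape [1, 3, T, H, W] becomes a latent of shape [1, 16, (T + 3) // 4, H // 8, W // 8]. A small shape check under those assumptions; the helper name is illustrative:

    def latent_shape(T, H, W, z_dim=16, factor=8):
        return (1, z_dim, (T + 3) // 4, H // factor, W // factor)

    # 81 frames at 480x832 -> 21 latent frames at 60x104
    assert latent_shape(81, 480, 832) == (1, 16, 21, 60, 104)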
| 745 | 
            +
             | 
| 746 | 
            +
             | 
| 747 | 
            +
                def single_encode(self, video, device):
         | 
| 748 | 
            +
                    video = video.to(device)
         | 
| 749 | 
            +
                    x = self.model.encode(video, self.scale)
         | 
| 750 | 
            +
                    return x.float()
         | 
| 751 | 
            +
             | 
| 752 | 
            +
             | 
| 753 | 
            +
                def single_decode(self, hidden_state, device):
         | 
| 754 | 
            +
                    hidden_state = hidden_state.to(device)
         | 
| 755 | 
            +
                    video = self.model.decode(hidden_state, self.scale)
         | 
| 756 | 
            +
                    return video.float().clamp_(-1, 1)
         | 
| 757 | 
            +
             | 
| 758 | 
            +
             | 
| 759 | 
            +
                def encode(self, videos, device, tiled=False, tile_size=(34, 34), tile_stride=(18, 16)):
         | 
| 760 | 
            +
             | 
| 761 | 
            +
                    videos = [video.to("cpu") for video in videos]
         | 
| 762 | 
            +
                    hidden_states = []
         | 
| 763 | 
            +
                    for video in videos:
         | 
| 764 | 
            +
                        video = video.unsqueeze(0)
         | 
| 765 | 
            +
                        if tiled:
         | 
| 766 | 
            +
                            tile_size = (tile_size[0] * 8, tile_size[1] * 8)
         | 
| 767 | 
            +
                            tile_stride = (tile_stride[0] * 8, tile_stride[1] * 8)
         | 
| 768 | 
            +
                            hidden_state = self.tiled_encode(video, device, tile_size, tile_stride)
         | 
| 769 | 
            +
                        else:
         | 
| 770 | 
            +
                            hidden_state = self.single_encode(video, device)
         | 
| 771 | 
            +
                        hidden_state = hidden_state.squeeze(0)
         | 
| 772 | 
            +
                        hidden_states.append(hidden_state)
         | 
| 773 | 
            +
                    hidden_states = torch.stack(hidden_states)
         | 
| 774 | 
            +
                    return hidden_states
         | 
| 775 | 
            +
             | 
| 776 | 
            +
             | 
| 777 | 
            +
                def decode(self, hidden_states, device, tiled=False, tile_size=(34, 34), tile_stride=(18, 16)):
         | 
| 778 | 
            +
                    hidden_states = [hidden_state.to("cpu") for hidden_state in hidden_states]
         | 
| 779 | 
            +
                    videos = []
         | 
| 780 | 
            +
                    for hidden_state in hidden_states:
         | 
| 781 | 
            +
                        hidden_state = hidden_state.unsqueeze(0)
         | 
| 782 | 
            +
                        if tiled:
         | 
| 783 | 
            +
                            video = self.tiled_decode(hidden_state, device, tile_size, tile_stride)
         | 
| 784 | 
            +
                        else:
         | 
| 785 | 
            +
                            video = self.single_decode(hidden_state, device)
         | 
| 786 | 
            +
                        video = video.squeeze(0)
         | 
| 787 | 
            +
                        videos.append(video)
         | 
| 788 | 
            +
                    videos = torch.stack(videos)
         | 
| 789 | 
            +
                    return videos
         | 
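
Taken together, `encode` and `decode` take a batch (list or stacked tensor) of clips in [-1, 1] with per-clip shape [C, T, H, W], keep the accumulation buffers on the CPU, and only move individual tiles to the compute device. A hedged usage sketch on random data; it assumes the module has been moved to the compute device and that real weights would normally be loaded first:

    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    vae = WanVideoVAE().to(device)                 # untrained here; load converted weights in practice
    video = torch.rand(3, 17, 256, 256) * 2 - 1    # one clip in [-1, 1], shape [C, T, H, W]

    latents = vae.encode([video], device=device, tiled=True)
    print(latents.shape)                           # torch.Size([1, 16, 5, 32, 32])
    recon = vae.decode(latents, device=device, tiled=True)
    print(recon.shape)                             # torch.Size([1, 3, 17, 256, 256])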
| 790 | 
            +
             | 
| 791 | 
            +
             | 
| 792 | 
            +
                @staticmethod
         | 
| 793 | 
            +
                def state_dict_converter():
         | 
| 794 | 
            +
                    return WanVideoVAEStateDictConverter()
         | 
| 795 | 
            +
             | 
| 796 | 
            +
             | 
| 797 | 
            +
            class WanVideoVAEStateDictConverter:
         | 
| 798 | 
            +
             | 
| 799 | 
            +
                def __init__(self):
         | 
| 800 | 
            +
                    pass
         | 
| 801 | 
            +
             | 
| 802 | 
            +
                def from_civitai(self, state_dict):
         | 
| 803 | 
            +
                    state_dict_ = {}
         | 
| 804 | 
            +
                    if 'model_state' in state_dict:
         | 
| 805 | 
            +
                        state_dict = state_dict['model_state']
         | 
| 806 | 
            +
                    for name in state_dict:
         | 
| 807 | 
            +
                        state_dict_['model.' + name] = state_dict[name]
         | 
| 808 | 
            +
                    return state_dict_
         | 
| 809 | 
            +
                
         | 
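
The converter simply prefixes every parameter name with `model.` so that a raw Wan VAE checkpoint lines up with the `WanVideoVAE.model` submodule. A hedged loading sketch; the checkpoint filename is an assumption and the exact key layout depends on the weights you use:

    import torch

    vae = WanVideoVAE(z_dim=16)
    raw = torch.load("Wan2.1_VAE.pth", map_location="cpu")             # illustrative path
    converted = WanVideoVAE.state_dict_converter().from_civitai(raw)   # adds the "model." prefix
    vae.load_state_dict(converted)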
    	
        pipeline/__init__.py
    ADDED
    
    | 
            File without changes
         | 
    	
        pipeline/i2v_pipeline.py
    ADDED
    
    | @@ -0,0 +1,511 @@ | |
| 1 | 
            +
            from diffsynth import ModelManager
         | 
| 2 | 
            +
            from diffsynth.pipelines.base import BasePipeline
         | 
| 3 | 
            +
            from diffsynth.vram_management import enable_vram_management, AutoWrappedModule, AutoWrappedLinear
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            from model.dit import WanModel
         | 
| 6 | 
            +
            from model.text_encoder import WanTextEncoder
         | 
| 7 | 
            +
            from model.vae import WanVideoVAE
         | 
| 8 | 
            +
            from model.image_encoder import WanImageEncoder
         | 
| 9 | 
            +
            from model.prompter import WanPrompter
         | 
| 10 | 
            +
            from scheduler.flow_match import FlowMatchScheduler
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            import torch, os
         | 
| 13 | 
            +
            from einops import rearrange, repeat
         | 
| 14 | 
            +
            import numpy as np
         | 
| 15 | 
            +
            import PIL.Image
         | 
| 16 | 
            +
            from tqdm import tqdm
         | 
| 17 | 
            +
            from safetensors import safe_open
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            from model.text_encoder import T5RelativeEmbedding, T5LayerNorm
         | 
| 20 | 
            +
            from model.dit import WanLayerNorm, WanRMSNorm, WanSelfAttention
         | 
| 21 | 
            +
            from model.vae import RMS_norm, CausalConv3d, Upsample
         | 
| 22 | 
            +
             | 
| 23 | 
            +
             | 
| 24 | 
            +
            def binary_tensor_to_indices(tensor):
         | 
| 25 | 
            +
                assert tensor.dim() == 2, "Input tensor must have shape [b, t]"
         | 
| 26 | 
            +
                indices = [(row == 1).nonzero(as_tuple=True)[0] for row in tensor]
         | 
| 27 | 
            +
                return indices
         | 
| 28 | 
            +
             | 
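
`binary_tensor_to_indices` turns a per-frame binary condition mask into one index tensor per batch element, e.g.:

    import torch

    mask = torch.tensor([[1, 0, 0, 1],
                         [0, 1, 0, 0]])
    print(binary_tensor_to_indices(mask))  # [tensor([0, 3]), tensor([1])]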
| 29 | 
            +
            def propagate_visualize_attention_arg(model, visualize_attention=False):
         | 
| 30 | 
            +
                    """
         | 
| 31 | 
            +
                    Recursively set the visualize_attention flag on the WanSelfAttention modules of blocks 0, 19 and 39
         | 
| 32 | 
            +
                    Intended for inference/visualization only
         | 
| 33 | 
            +
                    """
         | 
| 34 | 
            +
                    for name, module in model.named_modules():
         | 
| 35 | 
            +
                        if isinstance(module, WanSelfAttention):
         | 
| 36 | 
            +
                            if "blocks.0.self_attn" in name or "blocks.19.self_attn" in name or "blocks.39.self_attn" in name:
         | 
| 37 | 
            +
                                print(f"Set `visualize_attention` to {visualize_attention} for {name}")
         | 
| 38 | 
            +
                                module.visualize_attention = visualize_attention
         | 
| 39 | 
            +
             | 
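
The helper only flips the flag on the self-attention of blocks 0, 19 and 39, which is enough to dump attention maps from the start, middle and end of the DiT. A hedged usage sketch, assuming `pipe.dit` holds the loaded WanModel:

    # turn attention visualization on for one debug run, then switch it back off
    propagate_visualize_attention_arg(pipe.dit, visualize_attention=True)
    # ... run a single inference pass here ...
    propagate_visualize_attention_arg(pipe.dit, visualize_attention=False)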
| 40 | 
            +
            class WanVideoPipeline(BasePipeline):
         | 
| 41 | 
            +
             | 
| 42 | 
            +
                def __init__(self, device="cuda", torch_dtype=torch.float16, tokenizer_path=None):
         | 
| 43 | 
            +
                    super().__init__(device=device, torch_dtype=torch_dtype)
         | 
| 44 | 
            +
                    self.scheduler = FlowMatchScheduler(shift=5, sigma_min=0.0, extra_one_step=True)
         | 
| 45 | 
            +
                    self.prompter = WanPrompter(tokenizer_path=tokenizer_path)
         | 
| 46 | 
            +
                    self.text_encoder: WanTextEncoder = None
         | 
| 47 | 
            +
                    self.image_encoder: WanImageEncoder = None
         | 
| 48 | 
            +
                    self.dit: WanModel = None
         | 
| 49 | 
            +
                    self.vae: WanVideoVAE = None
         | 
| 50 | 
            +
                    self.model_names = ['text_encoder', 'dit', 'vae']
         | 
| 51 | 
            +
                    self.height_division_factor = 16
         | 
| 52 | 
            +
                    self.width_division_factor = 16
         | 
| 53 | 
            +
             | 
| 54 | 
            +
             | 
| 55 | 
            +
                def enable_vram_management(self, num_persistent_param_in_dit=None):
         | 
| 56 | 
            +
                    dtype = next(iter(self.text_encoder.parameters())).dtype
         | 
| 57 | 
            +
                    enable_vram_management(
         | 
| 58 | 
            +
                        self.text_encoder,
         | 
| 59 | 
            +
                        module_map = {
         | 
| 60 | 
            +
                            torch.nn.Linear: AutoWrappedLinear,
         | 
| 61 | 
            +
                            torch.nn.Embedding: AutoWrappedModule,
         | 
| 62 | 
            +
                            T5RelativeEmbedding: AutoWrappedModule,
         | 
| 63 | 
            +
                            T5LayerNorm: AutoWrappedModule,
         | 
| 64 | 
            +
                        },
         | 
| 65 | 
            +
                        module_config = dict(
         | 
| 66 | 
            +
                            offload_dtype=dtype,
         | 
| 67 | 
            +
                            offload_device="cpu",
         | 
| 68 | 
            +
                            onload_dtype=dtype,
         | 
| 69 | 
            +
                            onload_device="cpu",
         | 
| 70 | 
            +
                            computation_dtype=self.torch_dtype,
         | 
| 71 | 
            +
                            computation_device=self.device,
         | 
| 72 | 
            +
                        ),
         | 
| 73 | 
            +
                    )
         | 
| 74 | 
            +
                    dtype = next(iter(self.dit.parameters())).dtype
         | 
| 75 | 
            +
                    enable_vram_management(
         | 
| 76 | 
            +
                        self.dit,
         | 
| 77 | 
            +
                        module_map = {
         | 
| 78 | 
            +
                            torch.nn.Linear: AutoWrappedLinear,
         | 
| 79 | 
            +
                            torch.nn.Conv3d: AutoWrappedModule,
         | 
| 80 | 
            +
                            torch.nn.LayerNorm: AutoWrappedModule,
         | 
| 81 | 
            +
                            WanLayerNorm: AutoWrappedModule,
         | 
| 82 | 
            +
                            WanRMSNorm: AutoWrappedModule,
         | 
| 83 | 
            +
                        },
         | 
| 84 | 
            +
                        module_config = dict(
         | 
| 85 | 
            +
                            offload_dtype=dtype,
         | 
| 86 | 
            +
                            offload_device="cpu",
         | 
| 87 | 
            +
                            onload_dtype=dtype,
         | 
| 88 | 
            +
                            onload_device=self.device,
         | 
| 89 | 
            +
                            computation_dtype=self.torch_dtype,
         | 
| 90 | 
            +
                            computation_device=self.device,
         | 
| 91 | 
            +
                        ),
         | 
| 92 | 
            +
                        max_num_param=num_persistent_param_in_dit,
         | 
| 93 | 
            +
                        overflow_module_config = dict(
         | 
| 94 | 
            +
                            offload_dtype=dtype,
         | 
| 95 | 
            +
                            offload_device="cpu",
         | 
| 96 | 
            +
                            onload_dtype=dtype,
         | 
| 97 | 
            +
                            onload_device="cpu",
         | 
| 98 | 
            +
                            computation_dtype=self.torch_dtype,
         | 
| 99 | 
            +
                            computation_device=self.device,
         | 
| 100 | 
            +
                        ),
         | 
| 101 | 
            +
                    )
         | 
| 102 | 
            +
                    dtype = next(iter(self.vae.parameters())).dtype
         | 
| 103 | 
            +
                    enable_vram_management(
         | 
| 104 | 
            +
                        self.vae,
         | 
| 105 | 
            +
                        module_map = {
         | 
| 106 | 
            +
                            torch.nn.Linear: AutoWrappedLinear,
         | 
| 107 | 
            +
                            torch.nn.Conv2d: AutoWrappedModule,
         | 
| 108 | 
            +
                            RMS_norm: AutoWrappedModule,
         | 
| 109 | 
            +
                            CausalConv3d: AutoWrappedModule,
         | 
| 110 | 
            +
                            Upsample: AutoWrappedModule,
         | 
| 111 | 
            +
                            torch.nn.SiLU: AutoWrappedModule,
         | 
| 112 | 
            +
                            torch.nn.Dropout: AutoWrappedModule,
         | 
| 113 | 
            +
                        },
         | 
| 114 | 
            +
                        module_config = dict(
         | 
| 115 | 
            +
                            offload_dtype=dtype,
         | 
| 116 | 
            +
                            offload_device="cpu",
         | 
| 117 | 
            +
                            onload_dtype=dtype,
         | 
| 118 | 
            +
                            onload_device=self.device,
         | 
| 119 | 
            +
                            computation_dtype=self.torch_dtype,
         | 
| 120 | 
            +
                            computation_device=self.device,
         | 
| 121 | 
            +
                        ),
         | 
| 122 | 
            +
                    )
         | 
| 123 | 
            +
                    if self.image_encoder is not None:
         | 
| 124 | 
            +
                        dtype = next(iter(self.image_encoder.parameters())).dtype
         | 
| 125 | 
            +
                        enable_vram_management(
         | 
| 126 | 
            +
                            self.image_encoder,
         | 
| 127 | 
            +
                            module_map = {
         | 
| 128 | 
            +
                                torch.nn.Linear: AutoWrappedLinear,
         | 
| 129 | 
            +
                                torch.nn.Conv2d: AutoWrappedModule,
         | 
| 130 | 
            +
                                torch.nn.LayerNorm: AutoWrappedModule,
         | 
| 131 | 
            +
                            },
         | 
| 132 | 
            +
                            module_config = dict(
         | 
| 133 | 
            +
                                offload_dtype=dtype,
         | 
| 134 | 
            +
                                offload_device="cpu",
         | 
| 135 | 
            +
                                onload_dtype=dtype,
         | 
| 136 | 
            +
                                onload_device="cpu",
         | 
| 137 | 
            +
                                computation_dtype=self.torch_dtype,
         | 
| 138 | 
            +
                                computation_device=self.device,
         | 
| 139 | 
            +
                            ),
         | 
| 140 | 
            +
                        )
         | 
| 141 | 
            +
                    self.enable_cpu_offload()
         | 
| 142 | 
            +
             | 
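
`enable_vram_management` wraps the large submodules so that weights stay offloaded (mostly on the CPU) and are streamed to the GPU for computation; `num_persistent_param_in_dit` caps how many DiT parameters remain resident on the GPU. A hedged usage sketch; the parameter budget is an illustrative value:

    pipe = WanVideoPipeline(device="cuda", torch_dtype=torch.bfloat16)
    # ... fetch/load the text encoder, DiT and VAE into `pipe` first ...
    pipe.enable_vram_management(num_persistent_param_in_dit=6 * 10**9)  # keep ~6B DiT params on the GPU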
    def fetch_models_from_model_manager(self, model_manager: ModelManager):
        text_encoder_model_and_path = model_manager.fetch_model("wan_video_text_encoder", require_model_path=True)
        if text_encoder_model_and_path is not None:
            self.text_encoder, tokenizer_path = text_encoder_model_and_path
            self.prompter.fetch_models(self.text_encoder)
            self.prompter.fetch_tokenizer(os.path.join(os.path.dirname(tokenizer_path), "google/umt5-xxl"))
        self.dit = model_manager.fetch_model("wan_video_dit")
        self.vae = model_manager.fetch_model("wan_video_vae")
        self.image_encoder = model_manager.fetch_model("wan_video_image_encoder")

    def _init_component_from_checkpoint_path(self, model_cls, state_dict_path, strict=True, config_dict=None):
        config = {}
        # Guard against `config_dict=None`: membership tests below would otherwise raise a TypeError.
        config_dict = config_dict or {}
        state_dict = self._load_state_dict(state_dict_path)
        if hasattr(model_cls, "state_dict_converter"):
            state_dict_converter = model_cls.state_dict_converter()
            state_dict = state_dict_converter.from_civitai(state_dict)
            if isinstance(state_dict, tuple):
                state_dict, config = state_dict
        config.update(config_dict)
        model = model_cls(**config)
        if "use_local_lora" in config_dict or "use_dera" in config_dict:
            strict = False
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=strict)
        print(f"Missing keys: {missing_keys}")
        print(f"Unexpected keys: {unexpected_keys}")
        return model

    def _load_state_dict(self, state_dict_paths):
        if isinstance(state_dict_paths, str):
            state_dict_paths = [state_dict_paths]
        state_dict = {}
        for state_dict_path in tqdm(state_dict_paths, desc="Reading file(s) from disk"):
            state_dict.update(self._load_single_file(state_dict_path))
        return state_dict

    def _load_single_file(self, file_path):
        if file_path.endswith(".safetensors"):
            return self._load_state_dict_from_safetensors(file_path)
        else:
            return torch.load(file_path, map_location='cpu')

    def _load_state_dict_from_safetensors(self, file_path, torch_dtype=None):
        state_dict = {}
        with safe_open(file_path, framework="pt", device="cpu") as f:
            for k in f.keys():
                state_dict[k] = f.get_tensor(k)
                if torch_dtype is not None:
                    state_dict[k] = state_dict[k].to(torch_dtype)
        return state_dict

    def initialize_dummy_dit(self, config):
        print("Initializing a dummy DIT model.")
        self.dit = WanModel(**config)
        print("Dummy DIT model is initialized.")

    def fetch_models_from_checkpoints(self, path_dict, config_dict=None):
        default_config = {"text_encoder": {}, "dit": {}, "vae": {}, "image_encoder": {}}
        config_dict = {**default_config, **(config_dict or {})}
        components = {
            "text_encoder": WanTextEncoder,
            "dit": WanModel,
            "vae": WanVideoVAE,
            "image_encoder": WanImageEncoder,
        }
        for name, model_cls in components.items():
            if name not in path_dict:
                print(f"Component {name} is not found in the checkpoint path dict. Skipping.")
                continue
            path = path_dict[name]
            config = config_dict.get(name, {})
            print(f"Loading {name} from {path} with config {config}.")
            setattr(self, name, self._init_component_from_checkpoint_path(model_cls, path, config_dict=config))
            print(f"Initialized {name} from checkpoint.")
        if "text_encoder" in path_dict:
            self.prompter.fetch_models(self.text_encoder)
            self.prompter.fetch_tokenizer(os.path.join(os.path.dirname(path_dict["text_encoder"]), "google/umt5-xxl"))
        print("Initialized prompter from checkpoint.")
        print("All components are initialized from checkpoints.")

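The checkpoint loader above only needs a name-to-path mapping plus optional per-component constructor kwargs. A minimal usage sketch follows; the checkpoint file names, the `use_dera` flag value, and the import path are placeholders chosen for illustration, not files or settings shipped with this commit.

# Hypothetical sketch: driving fetch_models_from_checkpoints() with placeholder paths.
import torch
from pipeline.i2v_pipeline import WanVideoPipeline  # assumed module path for the class defined in this file

pipe = WanVideoPipeline(device="cuda", torch_dtype=torch.bfloat16)
pipe.fetch_models_from_checkpoints(
    path_dict={
        "text_encoder": "checkpoints/wan_text_encoder.safetensors",   # tokenizer is looked up at checkpoints/google/umt5-xxl
        "dit": "checkpoints/wan_dit.safetensors",
        "vae": "checkpoints/wan_vae.safetensors",
        "image_encoder": "checkpoints/wan_image_encoder.safetensors",
    },
    config_dict={
        # Extra kwargs are forwarded to each component's constructor; flags such as
        # `use_dera` / `use_local_lora` also relax strict state-dict loading above.
        "dit": {"use_dera": True},
    },
)
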
    @staticmethod
    def from_model_manager(model_manager: ModelManager, torch_dtype=None, device=None):
        if device is None: device = model_manager.device
        if torch_dtype is None: torch_dtype = model_manager.torch_dtype
        pipe = WanVideoPipeline(device=device, torch_dtype=torch_dtype)
        pipe.fetch_models_from_model_manager(model_manager)
        return pipe

    def denoising_model(self):
        return self.dit

    def encode_prompt(self, prompt, positive=True):
        prompt_emb = self.prompter.encode_prompt(prompt, positive=positive)
        return {"context": prompt_emb}

            +
                def check_and_fix_image_or_video_tensor_input(self, _tensor):
         | 
| 251 | 
            +
                    assert isinstance(_tensor, torch.Tensor), "Input must be a tensor."
         | 
| 252 | 
            +
                    if _tensor.max() <= 255 and _tensor.max() > 1.0:
         | 
| 253 | 
            +
                        _tensor = _tensor.to(self.device) / 127.5 - 1
         | 
| 254 | 
            +
                        print("Input tensor is converted from [0, 255] to [-1, 1].")
         | 
| 255 | 
            +
                    elif _tensor.min() >= 0 and _tensor.max() <= 1:
         | 
| 256 | 
            +
                        _tensor = _tensor.to(self.device) * 2 - 1
         | 
| 257 | 
            +
                        print("Input tensor is converted from [0, 1] to [-1, 1].")
         | 
| 258 | 
            +
                    return _tensor
         | 
| 259 | 
            +
                
         | 
    def encode_video_with_mask(self, video, num_frames, height, width, condition_preserved_mask):
        with torch.amp.autocast(dtype=torch.bfloat16, device_type=torch.device(self.device).type):
            video = video.to(self.device)
            y = self.vae.encode(video, device=self.device)
            msk = condition_preserved_mask
            assert msk is not None, "The mask must be provided for the masked video input."
            assert msk.dim() == 2, "The mask must be a 2D tensor in [b, t]."
            assert msk.shape[0] == video.shape[0], "The batch size of the mask must be the same as the input video."
            assert msk.shape[1] == num_frames, "The number of frames in the mask must be the same as the input video."
            msk = msk.to(self.device)
            msk = msk.unsqueeze(-1).unsqueeze(-1)
            msk = repeat(msk, 'b t 1 1 -> b t h w', h=height//8, w=width//8)
            msk = torch.concat([torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1)
            msk = msk.view(video.shape[0], msk.shape[1] // 4, 4, height//8, width//8)  # b, t, c, h, w
            msk = msk.transpose(1, 2)  # b, c, t, h, w
            y = torch.concat([msk, y], dim=1)
        return y

    def encode_video_with_mask_sparse(self, video, height, width, condition_preserved_mask, sketch_local_mask=None):
        with torch.amp.autocast(dtype=torch.bfloat16, device_type=torch.device(self.device).type):
            batch_size = video.shape[0]
            cond_indices = binary_tensor_to_indices(condition_preserved_mask)
            sequence_cond_compressed_indices = [(cond_index + 3) // 4 for cond_index in cond_indices]
            video = video.to(self.device)
            video_latent = self.vae.encode(video, device=self.device)
            video_latent = video_latent[:, :, sequence_cond_compressed_indices[0], :, :]
            msk = condition_preserved_mask.to(self.device)
            msk = msk.unsqueeze(-1).unsqueeze(-1)  # b, t, 1, 1
            msk = repeat(msk, 'b t 1 1 -> b t h w', h=height//8, w=width//8)
            msk = torch.concat([torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1)
            msk = msk.view(batch_size, msk.shape[1] // 4, 4, height//8, width//8)  # b, t, 4, h//8, w//8
            msk = msk.transpose(1, 2)  # b, 4, t, h//8, w//8
            msk = msk[:, :, sequence_cond_compressed_indices[0], :, :]

            if sketch_local_mask is not None:
                sketch_local_mask = sketch_local_mask.to(self.device)
                if sketch_local_mask.shape[-2:] != (height//8, width//8):
                    sk_batch_t = sketch_local_mask.shape[0] * sketch_local_mask.shape[2]
                    sketch_local_mask_reshaped = sketch_local_mask.reshape(sk_batch_t, 1, sketch_local_mask.shape[3], sketch_local_mask.shape[4])
                    sketch_local_mask_resized = torch.nn.functional.interpolate(
                        sketch_local_mask_reshaped,
                        size=(height//8, width//8),
                        mode='nearest'
                    )
                    sketch_local_mask_resized = sketch_local_mask_resized.reshape(
                        sketch_local_mask.shape[0],
                        sketch_local_mask.shape[1],
                        sketch_local_mask.shape[2],
                        height//8, width//8
                    )
                else:
                    sketch_local_mask_resized = sketch_local_mask

                sketch_mask = sketch_local_mask_resized
                sketch_mask = torch.concat([torch.repeat_interleave(sketch_mask[:, :, 0:1], repeats=4, dim=2), sketch_mask[:, :, 1:]], dim=2)
                sketch_mask = sketch_mask.view(batch_size, sketch_mask.shape[1], sketch_mask.shape[2] // 4, 4, height//8, width//8)
                sketch_mask = sketch_mask.permute(0, 1, 3, 2, 4, 5)  # [b, 1, 4, t//4, h//8, w//8]
                sketch_mask = sketch_mask.view(batch_size, 4, sketch_mask.shape[3], height//8, width//8)  # [b, 4, t//4, h//8, w//8]
                sketch_mask = sketch_mask[:, :, sequence_cond_compressed_indices[0], :, :]  # [b, 4, len(indices), h//8, w//8]

                combined_latent = torch.cat([msk, video_latent, sketch_mask], dim=1)
            else:
                combined_latent = torch.concat([msk, video_latent], dim=1)

        return combined_latent, sequence_cond_compressed_indices  # b, c=(4+16+4=24), t, h, w when sketch_local_mask is provided

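The two methods above pack a per-frame keep-mask into the latent grid: the first frame is repeated 4 times so that the pixel-space frame count lines up with the VAE's 4x temporal compression, and groups of 4 frames are folded into a channel axis. The following standalone sketch reproduces that packing with toy values (the 81-frame, 480x832 setting mirrors this pipeline's defaults and is used here purely for illustration):

import torch
from einops import repeat

b, t, height, width = 1, 81, 480, 832              # defaults used by this pipeline
msk = torch.zeros(b, t)
msk[:, 0] = 1                                      # keep only the first frame as a condition
msk = repeat(msk.unsqueeze(-1).unsqueeze(-1), 'b t 1 1 -> b t h w', h=height // 8, w=width // 8)
# Repeat the first frame 4x so t maps onto the VAE's temporal compression (4 frames -> 1 latent frame).
msk = torch.concat([torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1)
msk = msk.view(b, msk.shape[1] // 4, 4, height // 8, width // 8).transpose(1, 2)
print(msk.shape)  # torch.Size([1, 4, 21, 60, 104]); 21 latent frames = (81 - 1) // 4 + 1
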
    def encode_image_or_masked_video(self, image_or_masked_video, num_frames, height, width, condition_preserved_mask=None):
        with torch.amp.autocast(dtype=torch.bfloat16, device_type=torch.device(self.device).type):
            if isinstance(image_or_masked_video, PIL.Image.Image) or (isinstance(image_or_masked_video, torch.Tensor) and image_or_masked_video.dim() <= 4):
                if isinstance(image_or_masked_video, PIL.Image.Image):
                    image_or_masked_video = self.preprocess_image(image_or_masked_video.resize((width, height))).to(self.device)
                else:
                    if image_or_masked_video.dim() == 3:
                        image_or_masked_video = image_or_masked_video.unsqueeze(0)  # b=1, c, h, w
                    image_or_masked_video = image_or_masked_video.to(self.device)
                # Compute the batch size only after the input has been converted to a [b, c, h, w] tensor:
                # a PIL image has no `.shape`, and a 3D tensor only gains its batch dimension above.
                batch_size = image_or_masked_video.shape[0]
                y = self.vae.encode([torch.concat([image_or_masked_video.transpose(0, 1), torch.zeros(3, num_frames-1, height, width).to(image_or_masked_video.device)], dim=1)], device=self.device)
                msk_idx_to_be_zero = range(1, num_frames)
                clip_context = self.image_encoder.encode_image(image_or_masked_video.unsqueeze(1))  # needs to be [b, 1, c, h, w]
                msk = torch.ones(batch_size, num_frames, height//8, width//8, device=self.device)
                msk[:, msk_idx_to_be_zero] = 0
                msk = torch.concat([torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1)
                msk = msk.view(batch_size, msk.shape[1] // 4, 4, height//8, width//8)
                msk = msk.transpose(1, 2)
            elif isinstance(image_or_masked_video, torch.Tensor) and image_or_masked_video.dim() == 5:
                batch_size = image_or_masked_video.shape[0]
                image_or_masked_video = image_or_masked_video.to(self.device)
                first_image = image_or_masked_video[:, :, 0, :, :].unsqueeze(1)
                clip_context = self.image_encoder.encode_image(first_image)
                y = self.vae.encode(image_or_masked_video, device=self.device)
                msk = condition_preserved_mask  # b, t
                assert msk is not None, "The mask must be provided for the masked video input."
                assert msk.dim() == 2, "The mask must be a 2D tensor in [b, t]."
                assert msk.shape[0] == batch_size, "The batch size of the mask must be the same as the input video."
                assert msk.shape[1] == num_frames, "The number of frames in the mask must be the same as the input video."
                msk = msk.to(self.device)
                msk = msk.unsqueeze(-1).unsqueeze(-1)  # b, t, 1, 1
                msk = repeat(msk, 'b t 1 1 -> b t h w', h=height//8, w=width//8)
                msk = torch.concat([torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1)
                msk = msk.view(batch_size, msk.shape[1] // 4, 4, height//8, width//8)  # b, t, 4, h//8, w//8
                msk = msk.transpose(1, 2)  # b, 4, t, h//8, w//8
            else:
                raise ValueError("Input must be an image (PIL/Tensor in [b, c, h, w]) or a masked video (Tensor in [b, c, t, h, w]).")

        y = torch.concat([msk, y], dim=1)
        return {"clip_fea": clip_context, "y": y}

    def tensor2video(self, frames):
        frames = rearrange(frames, "C T H W -> T H W C")
        frames = ((frames.float() + 1) * 127.5).clip(0, 255).cpu().numpy().astype(np.uint8)
        frames = [PIL.Image.fromarray(frame) for frame in frames]
        return frames

    def prepare_extra_input(self, latents=None):
        return {"seq_len": latents.shape[2] * latents.shape[3] * latents.shape[4] // 4}

    def encode_video(self, input_video, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)):
        with torch.amp.autocast(dtype=torch.bfloat16, device_type=torch.device(self.device).type):
            latents = self.vae.encode(input_video, device=self.device, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride)
        return latents

    def decode_video(self, latents, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)):
        with torch.amp.autocast(dtype=torch.bfloat16, device_type=torch.device(self.device).type):
            frames = self.vae.decode(latents, device=self.device, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride)
        return frames

                @torch.no_grad()
         | 
| 386 | 
            +
                def __call__(
         | 
| 387 | 
            +
                    self,
         | 
| 388 | 
            +
                    prompt,
         | 
| 389 | 
            +
                    negative_prompt="",
         | 
| 390 | 
            +
                    input_image=None,
         | 
| 391 | 
            +
                    input_video=None,
         | 
| 392 | 
            +
                    denoising_strength=1.0,
         | 
| 393 | 
            +
                    seed=None,
         | 
| 394 | 
            +
                    rand_device="cpu",
         | 
| 395 | 
            +
                    height=480,
         | 
| 396 | 
            +
                    width=832,
         | 
| 397 | 
            +
                    num_frames=81,
         | 
| 398 | 
            +
                    cfg_scale=5.0,
         | 
| 399 | 
            +
                    num_inference_steps=50,
         | 
| 400 | 
            +
                    sigma_shift=5.0,
         | 
| 401 | 
            +
                    tiled=True,
         | 
| 402 | 
            +
                    tile_size=(30, 52),
         | 
| 403 | 
            +
                    tile_stride=(15, 26),
         | 
| 404 | 
            +
                    progress_bar_cmd=tqdm,
         | 
| 405 | 
            +
                    # progress_bar_st=None,
         | 
| 406 | 
            +
                    input_condition_video=None,
         | 
| 407 | 
            +
                    input_condition_preserved_mask=None,
         | 
| 408 | 
            +
                    input_condition_video_sketch=None,
         | 
| 409 | 
            +
                    input_condition_preserved_mask_sketch=None,
         | 
| 410 | 
            +
                    sketch_local_mask=None,
         | 
| 411 | 
            +
                    visualize_attention=False,
         | 
| 412 | 
            +
                    output_path=None,
         | 
| 413 | 
            +
                    batch_idx=None,
         | 
| 414 | 
            +
                    sequence_cond_residual_scale=1.0,
         | 
| 415 | 
            +
                ):
         | 
| 416 | 
            +
                    height, width = self.check_resize_height_width(height, width)
         | 
| 417 | 
            +
                    if num_frames % 4 != 1:
         | 
| 418 | 
            +
                        num_frames = (num_frames + 2) // 4 * 4 + 1
         | 
| 419 | 
            +
                        print(f"Only `num_frames % 4 != 1` is acceptable. We round it up to {num_frames}.")
         | 
| 420 | 
            +
                    
         | 
| 421 | 
            +
                    tiler_kwargs = {"tiled": tiled, "tile_size": tile_size, "tile_stride": tile_stride}
         | 
| 422 | 
            +
             | 
| 423 | 
            +
                    self.scheduler.set_timesteps(num_inference_steps, denoising_strength, shift=sigma_shift)
         | 
| 424 | 
            +
             | 
| 425 | 
            +
                    noise = self.generate_noise((1, 16, (num_frames - 1) // 4 + 1, height//8, width//8), seed=seed, device=rand_device, dtype=torch.float32).to(self.device)
         | 
| 426 | 
            +
                    if input_video is not None:
         | 
| 427 | 
            +
                        self.load_models_to_device(['vae'])
         | 
| 428 | 
            +
                        input_video = self.preprocess_images(input_video)
         | 
| 429 | 
            +
                        input_video = torch.stack(input_video, dim=2)
         | 
| 430 | 
            +
                        latents = self.encode_video(input_video, **tiler_kwargs).to(dtype=noise.dtype, device=noise.device)
         | 
| 431 | 
            +
                        latents = self.scheduler.add_noise(latents, noise, timestep=self.scheduler.timesteps[0])
         | 
| 432 | 
            +
                    else:
         | 
| 433 | 
            +
                        latents = noise
         | 
| 434 | 
            +
                    
         | 
| 435 | 
            +
                    self.load_models_to_device(["text_encoder"])
         | 
| 436 | 
            +
                    prompt_emb_posi = self.encode_prompt(prompt, positive=True)
         | 
| 437 | 
            +
                    if cfg_scale != 1.0:
         | 
| 438 | 
            +
                        prompt_emb_nega = self.encode_prompt(negative_prompt, positive=False)
         | 
| 439 | 
            +
                        
         | 
| 440 | 
            +
                    self.load_models_to_device(["image_encoder", "vae"])
         | 
| 441 | 
            +
                    if input_image is not None and self.image_encoder is not None:
         | 
| 442 | 
            +
                        image_emb = self.encode_image(input_image, num_frames, height, width)
         | 
| 443 | 
            +
                    elif input_condition_video is not None and self.image_encoder is not None:
         | 
| 444 | 
            +
                        assert input_condition_preserved_mask is not None, "`input_condition_preserved_mask` must not be None when `input_condition_video` is given."
         | 
| 445 | 
            +
                        image_emb = self.encode_image_or_masked_video(input_condition_video, num_frames, height, width, input_condition_preserved_mask)
         | 
| 446 | 
            +
                    else:
         | 
| 447 | 
            +
                        image_emb = {}
         | 
| 448 | 
            +
                        
         | 
| 449 | 
            +
                    # Extra input
         | 
| 450 | 
            +
                    extra_input = self.prepare_extra_input(latents)
         | 
| 451 | 
            +
                    if self.dit.use_sequence_cond:
         | 
| 452 | 
            +
                        assert input_condition_video_sketch is not None, "`input_condition_video_sketch` must not be None when `use_sequence_cond` is True."
         | 
| 453 | 
            +
                        assert input_condition_preserved_mask_sketch is not None, "`input_condition_preserved_mask_sketch` must not be None when `input_condition_video_sketch` is given."
         | 
| 454 | 
            +
                        
         | 
| 455 | 
            +
                        if self.dit.sequence_cond_mode == "sparse":
         | 
| 456 | 
            +
                            sequence_cond, sequence_cond_compressed_indices = self.encode_video_with_mask_sparse(input_condition_video_sketch, height, width, input_condition_preserved_mask_sketch, sketch_local_mask)
         | 
| 457 | 
            +
                            extra_input.update({"sequence_cond": sequence_cond,
         | 
| 458 | 
            +
                                                "sequence_cond_compressed_indices": sequence_cond_compressed_indices})
         | 
| 459 | 
            +
                        elif self.dit.sequence_cond_mode == "full":
         | 
| 460 | 
            +
                            sequence_cond = self.encode_video_with_mask(input_condition_video_sketch, num_frames, height, width, input_condition_preserved_mask_sketch)
         | 
| 461 | 
            +
                            extra_input.update({"sequence_cond": sequence_cond})
         | 
| 462 | 
            +
                        else:
         | 
| 463 | 
            +
                            raise ValueError(f"Invalid `sequence_cond_model`={self.dit.sequence_cond_mode} in the DIT model.")
         | 
| 464 | 
            +
                        
         | 
| 465 | 
            +
                    elif self.dit.use_channel_cond:
         | 
| 466 | 
            +
                        sequence_cond = self.encode_video_with_mask(input_condition_video_sketch, num_frames, height, width, input_condition_preserved_mask_sketch)
         | 
| 467 | 
            +
                        extra_input.update({"channel_cond": sequence_cond})
         | 
| 468 | 
            +
                        
         | 
| 469 | 
            +
                    self.load_models_to_device([])
         | 
| 470 | 
            +
                    
         | 
| 471 | 
            +
                    if sequence_cond_residual_scale != 1.0:
         | 
| 472 | 
            +
                        extra_input.update({"sequence_cond_residual_scale": sequence_cond_residual_scale})
         | 
| 473 | 
            +
             | 
| 474 | 
            +
                    # Denoise
         | 
| 475 | 
            +
                    self.load_models_to_device(["dit"])
         | 
| 476 | 
            +
                    with torch.amp.autocast(dtype=torch.bfloat16, device_type=torch.device(self.device).type):
         | 
| 477 | 
            +
                        for progress_id, timestep in enumerate(progress_bar_cmd(self.scheduler.timesteps)):
         | 
| 478 | 
            +
                            timestep = timestep.unsqueeze(0).to(dtype=torch.float32, device=self.device)
         | 
| 479 | 
            +
                            _should_visualize_attention = visualize_attention and (progress_id == len(self.scheduler.timesteps) - 1)
         | 
| 480 | 
            +
                            if _should_visualize_attention:
         | 
| 481 | 
            +
                                print(f"Visualizing attention maps (Step {progress_id + 1}/{len(self.scheduler.timesteps)}).")
         | 
| 482 | 
            +
                                propagate_visualize_attention_arg(self.dit, True)
         | 
| 483 | 
            +
                    
         | 
| 484 | 
            +
                            # Inference
         | 
| 485 | 
            +
                            noise_pred_posi = self.dit(latents, timestep=timestep, **prompt_emb_posi, **image_emb, **extra_input)
         | 
| 486 | 
            +
                            if isinstance(noise_pred_posi, tuple):
         | 
| 487 | 
            +
                                noise_pred_posi = noise_pred_posi[0]
         | 
| 488 | 
            +
                            if cfg_scale != 1.0:
         | 
| 489 | 
            +
                                noise_pred_nega = self.dit(latents, timestep=timestep, **prompt_emb_nega, **image_emb, **extra_input)
         | 
| 490 | 
            +
                                if isinstance(noise_pred_nega, tuple):
         | 
| 491 | 
            +
                                    noise_pred_nega = noise_pred_nega[0]
         | 
| 492 | 
            +
                                noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega)
         | 
| 493 | 
            +
                            else:
         | 
| 494 | 
            +
                                noise_pred = noise_pred_posi
         | 
| 495 | 
            +
             | 
| 496 | 
            +
                            # Scheduler
         | 
| 497 | 
            +
                            latents = self.scheduler.step(noise_pred, self.scheduler.timesteps[progress_id], latents)
         | 
| 498 | 
            +
                            
         | 
| 499 | 
            +
                            # If visualization is enabled, save the attention maps
         | 
| 500 | 
            +
                            if _should_visualize_attention:
         | 
| 501 | 
            +
                                print("Saving attention maps...")
         | 
| 502 | 
            +
                                from util.model_util import save_attention_maps
         | 
| 503 | 
            +
                                save_attention_maps(self.dit, output_path, batch_idx, timestep.squeeze().cpu().numpy().item())
         | 
| 504 | 
            +
                                propagate_visualize_attention_arg(self.dit, False)
         | 
| 505 | 
            +
             | 
| 506 | 
            +
                    # Decode
         | 
| 507 | 
            +
                    self.load_models_to_device(['vae'])
         | 
| 508 | 
            +
                    frames = self.decode_video(latents, **tiler_kwargs)
         | 
| 509 | 
            +
                    self.load_models_to_device([])
         | 
| 510 | 
            +
             | 
| 511 | 
            +
                    return frames
         | 
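Putting the pieces together, a minimal end-to-end call might look like the sketch below. It assumes `pipe` is a WanVideoPipeline whose models were loaded as shown earlier and whose DiT has `use_sequence_cond` enabled; the sketch tensor, keep-mask, prompt, and frame indices are placeholders, while the shape conventions ([b, t] masks, [b, c, t, h, w] sketch video) follow the assertions in the methods above.

import torch
from PIL import Image

num_frames, height, width = 81, 480, 832
sketch_video = torch.zeros(1, 3, num_frames, height, width)   # placeholder sketch frames in [-1, 1]
sketch_keep_mask = torch.zeros(1, num_frames)
sketch_keep_mask[:, [0, 40, 80]] = 1                          # frames that actually carry a sketch

frames = pipe(
    prompt="A hand-drawn character turns around. Anime. High quality.",
    input_image=Image.open("samples/1_image1.png"),           # colored keyframe shipped with this Space
    input_condition_video_sketch=sketch_video,
    input_condition_preserved_mask_sketch=sketch_keep_mask,
    num_frames=num_frames, height=height, width=width,
    cfg_scale=5.0, num_inference_steps=50, seed=0,
)
# Assuming the decoder returns one [C, T, H, W] item per batch element,
# tensor2video() converts it into a list of PIL frames for saving.
video_frames = pipe.tensor2video(frames[0])
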
    	
        requirements.txt
    ADDED
    
absl-py==2.2.2
accelerate==1.6.0
beartype==0.20.2
beautifulsoup4==4.13.4
braceexpand==0.1.7
cached-property==2.0.1
certifi==2025.1.31
charset-normalizer==3.4.1
click==8.1.8
clip==0.2.0
comm==0.2.3
contourpy==1.3.2
controlnet_aux==0.0.7
crcmod==1.7
cycler==0.12.1
datasets==3.5.0
debugpy==1.8.15
decorator==5.2.1
decord==0.6.0
deepspeed==0.16.7
diffsynth==1.1.7
diffusers==0.33.1
dill==0.3.8
docker-pycreds==0.4.0
dulwich==0.22.8
easydict==1.13
einops==0.8.1
exceptiongroup==1.2.2
executing==2.2.0
fairscale==0.4.13
fastapi==0.115.12
fastrlock==0.8.3
ffmpy==0.5.0
filelock==3.13.1
flash_attn==2.8.0.post2 --global-option="--no-build-isolation"
fonttools==4.57.0
frozenlist==1.6.0
fsspec==2024.12.0
ftfy==6.3.1
func_timeout==4.3.5
fuzzywuzzy==0.18.0
gitdb==4.0.12
GitPython==3.1.44
gradio==5.25.2
gradio_client==1.8.0
groovy==0.1.2
grpcio==1.71.0
h11==0.14.0
hjson==3.1.0
httpcore==1.0.8
httpx==0.28.1
huggingface-hub==0.30.2
idna==3.10
imageio==2.37.0
imageio-ffmpeg==0.6.0
importlib_metadata==8.6.1
ipykernel==6.30.0
ipython==8.37.0
jedi==0.19.2
Jinja2==3.1.4
joblib==1.4.2
kiwisolver==1.4.8
kornia==0.8.0
kornia_rs==0.1.8
lazy_loader==0.4
lightning==2.5.1
lightning-utilities==0.14.3
lpips==0.1.4
matplotlib==3.10.1
matplotlib-inline==0.1.7
mdurl==0.1.2
modelscope==1.25.0
moviepy==2.1.2
mpmath==1.3.0
msgpack==1.1.0
multidict==6.4.3
multiprocess==0.70.16
ninja==1.11.1.4
numpy==2.2.5
omegaconf==2.3.0
opencv-python==4.11.0.86
orjson==3.10.16
packaging==24.2
pandas==2.2.3
parso==0.8.4
peft==0.15.2
pexpect==4.9.0
pillow==10.4.0
platformdirs==4.3.7
proglog==0.1.11
prompt_toolkit==3.0.51
propcache==0.3.1
protobuf==5.29.4
psutil==7.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
py-cpuinfo==9.0.0
pyarrow==19.0.1
pycryptodome==3.22.0
pydantic==2.11.3
pydantic_core==2.33.1
pydub==0.25.1
Pygments==2.19.1
pynvml==12.0.0
pyparsing==3.2.3
python-dateutil==2.9.0.post0
python-dotenv==1.1.0
python-multipart==0.0.20
pytorch-fid==0.3.0
pytorch-lightning==2.5.1
pytz==2025.2
PyYAML==6.0.2
pyzmq==27.0.0
regex==2024.11.6
requests==2.32.3
rich==14.0.0
ruff==0.11.6
safehttpx==0.1.6
safetensors==0.5.3
scikit-image==0.25.2
scikit-learn==1.6.1
scipy==1.15.2
semantic-version==2.10.0
sentencepiece==0.2.0
sentry-sdk==2.26.1
setproctitle==1.3.5
shellingham==1.5.4
simplejson==3.20.1
six==1.17.0
smmap==5.0.2
sniffio==1.3.1
soupsieve==2.7
stack-data==0.6.3
starlette==0.46.2
sympy==1.13.1
taming-transformers==0.0.1
tensorboard==2.19.0
tokenizers==0.20.3
torch==2.6.0
torchaudio==2.6.0
torchdiffeq==0.2.5
torchmetrics==1.7.1
torchsde==0.2.6
torchvision==0.21.0
tqdm==4.67.1
transformers==4.46.2
triton==3.2.0
xformers==0.0.29.post2
    	
samples/1_image1.png
ADDED
Git LFS Details

samples/1_out.mp4
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:fa51ac0653a18dc20b9a6946aaa1a7923d58fe291e926908703c300a4d13c4a2
size 356550

samples/1_prompt.txt
ADDED
['Under the sea, a bare-chested man plays with a blue fish that swims in spirals. The whale circles around, following the bag held by the man, who uses the bag to lure the blue fish to swim forward. Anime. High quality.']

samples/1_sketch1.jpg
ADDED
Git LFS Details

samples/1_sketch2.jpg
ADDED
Git LFS Details

samples/1_sketch3.jpg
ADDED
Git LFS Details

samples/2_image1.jpg
ADDED
Git LFS Details

samples/2_out.mp4
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:c9f28ab63b4fc5b07c0ed01f715ec671f8d839b8783dc8a432c7764bd35605f5
size 151565

samples/2_prompt.txt
ADDED
['A girl and a silver-haired boy plant a huge flower; as the camera slowly moves upward, the giant flower keeps growing larger and blooms. Anime. High quality.']

samples/2_sketch1.jpg
ADDED
Git LFS Details

samples/2_sketch2.jpg
ADDED
Git LFS Details

samples/3_image1.png
ADDED
Git LFS Details

samples/3_out.mp4
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:fdb131043289d831f7c4e0d3dd4a21ecc3c4eecca1bf3ae539bb14414c439cde
size 87909

samples/3_prompt.txt
ADDED
['A boy in ancient China holds an apple and, with a smile, offers it to the elderly man beside him. Anime. High quality.']

samples/3_sketch1.jpg
ADDED
Git LFS Details

samples/ToonComposer-Icon.png
ADDED
Git LFS Details

samples/ToonComposer-Method.jpg
ADDED
Git LFS Details

samples/ToonComposer-TLDR.jpg
ADDED
Git LFS Details

scheduler/__init__.py
ADDED
File without changes
    	
        scheduler/flow_match.py
    ADDED
    
    | @@ -0,0 +1,78 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import torch
         | 
| 2 | 
            +
             | 
| 3 | 
            +
             | 
| 4 | 
            +
            class FlowMatchScheduler():
         | 
| 5 | 
            +
             | 
| 6 | 
            +
                def __init__(self, num_inference_steps=100, num_train_timesteps=1000, shift=3.0, sigma_max=1.0, sigma_min=0.003/1.002, inverse_timesteps=False, extra_one_step=False, reverse_sigmas=False):
         | 
| 7 | 
            +
                    self.num_train_timesteps = num_train_timesteps
         | 
| 8 | 
            +
                    self.shift = shift
         | 
| 9 | 
            +
                    self.sigma_max = sigma_max
         | 
| 10 | 
            +
                    self.sigma_min = sigma_min
         | 
| 11 | 
            +
                    self.inverse_timesteps = inverse_timesteps
         | 
| 12 | 
            +
                    self.extra_one_step = extra_one_step
         | 
| 13 | 
            +
                    self.reverse_sigmas = reverse_sigmas
         | 
| 14 | 
            +
                    self.set_timesteps(num_inference_steps)
         | 
| 15 | 
            +
             | 
| 16 | 
            +
             | 
| 17 | 
            +
                def set_timesteps(self, num_inference_steps=100, denoising_strength=1.0, training=False, shift=None):
         | 
| 18 | 
            +
                    if shift is not None:
         | 
| 19 | 
            +
                        self.shift = shift
         | 
| 20 | 
            +
                    sigma_start = self.sigma_min + (self.sigma_max - self.sigma_min) * denoising_strength
         | 
| 21 | 
            +
                    if self.extra_one_step:
         | 
| 22 | 
            +
                        self.sigmas = torch.linspace(sigma_start, self.sigma_min, num_inference_steps + 1)[:-1]
         | 
| 23 | 
            +
                    else:
         | 
| 24 | 
            +
                        self.sigmas = torch.linspace(sigma_start, self.sigma_min, num_inference_steps)
         | 
| 25 | 
            +
                    if self.inverse_timesteps:
         | 
| 26 | 
            +
                        self.sigmas = torch.flip(self.sigmas, dims=[0])
         | 
| 27 | 
            +
                    self.sigmas = self.shift * self.sigmas / (1 + (self.shift - 1) * self.sigmas)
         | 
| 28 | 
            +
                    if self.reverse_sigmas:
         | 
| 29 | 
            +
                        self.sigmas = 1 - self.sigmas
         | 
| 30 | 
            +
                    self.timesteps = self.sigmas * self.num_train_timesteps
         | 
| 31 | 
            +
                    if training:
         | 
| 32 | 
            +
                        x = self.timesteps
         | 
| 33 | 
            +
                        y = torch.exp(-2 * ((x - num_inference_steps / 2) / num_inference_steps) ** 2)
         | 
| 34 | 
            +
                        y_shifted = y - y.min()
         | 
| 35 | 
            +
                        bsmntw_weighing = y_shifted * (num_inference_steps / y_shifted.sum())
         | 
| 36 | 
            +
                        self.linear_timesteps_weights = bsmntw_weighing
         | 
| 37 | 
            +
             | 
| 38 | 
            +
             | 
| 39 | 
            +
                def step(self, model_output, timestep, sample, to_final=False):
         | 
| 40 | 
            +
                    if isinstance(timestep, torch.Tensor):
         | 
| 41 | 
            +
                        timestep = timestep.cpu()
         | 
| 42 | 
            +
                    timestep_id = torch.argmin((self.timesteps - timestep).abs())
         | 
| 43 | 
            +
                    sigma = self.sigmas[timestep_id]
         | 
| 44 | 
            +
                    if to_final or timestep_id + 1 >= len(self.timesteps):
         | 
| 45 | 
            +
                        sigma_ = 1 if (self.inverse_timesteps or self.reverse_sigmas) else 0
         | 
| 46 | 
            +
                    else:
         | 
| 47 | 
            +
                        sigma_ = self.sigmas[timestep_id + 1]
         | 
| 48 | 
            +
                    prev_sample = sample + model_output * (sigma_ - sigma)
         | 
| 49 | 
            +
                    return prev_sample
         | 
| 50 | 
            +
                
         | 
| 51 | 
            +
             | 
| 52 | 
            +
                def return_to_timestep(self, timestep, sample, sample_stablized):
         | 
| 53 | 
            +
                    if isinstance(timestep, torch.Tensor):
         | 
| 54 | 
            +
                        timestep = timestep.cpu()
         | 
| 55 | 
            +
                    timestep_id = torch.argmin((self.timesteps - timestep).abs())
         | 
| 56 | 
            +
                    sigma = self.sigmas[timestep_id]
         | 
| 57 | 
            +
                    model_output = (sample - sample_stablized) / sigma
         | 
| 58 | 
            +
                    return model_output
         | 
| 59 | 
            +
                
         | 
| 60 | 
            +
                
         | 
| 61 | 
            +
                def add_noise(self, original_samples, noise, timestep):
         | 
| 62 | 
            +
                    if isinstance(timestep, torch.Tensor):
         | 
| 63 | 
            +
                        timestep = timestep.cpu()
         | 
| 64 | 
            +
                    timestep_id = torch.argmin((self.timesteps - timestep).abs())
         | 
| 65 | 
            +
                    sigma = self.sigmas[timestep_id]
         | 
| 66 | 
            +
                    sample = (1 - sigma) * original_samples + sigma * noise
         | 
| 67 | 
            +
                    return sample
         | 
| 68 | 
            +
                
         | 
| 69 | 
            +
             | 
| 70 | 
            +
                def training_target(self, sample, noise, timestep):
         | 
| 71 | 
            +
                    target = noise - sample
         | 
| 72 | 
            +
                    return target
         | 
| 73 | 
            +
                
         | 
| 74 | 
            +
             | 
| 75 | 
            +
                def training_weight(self, timestep):
         | 
| 76 | 
            +
                    timestep_id = torch.argmin((self.timesteps - timestep.to(self.timesteps.device)).abs())
         | 
| 77 | 
            +
                    weights = self.linear_timesteps_weights[timestep_id]
         | 
| 78 | 
            +
                    return weights
         | 
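The scheduler above is a rectified-flow formulation: `add_noise` interpolates `(1 - sigma) * sample + sigma * noise`, the training target is the velocity `noise - sample`, and `step` is a plain Euler update `x += v * (sigma_next - sigma)` over the shifted sigma schedule. A minimal self-contained sketch of that update rule, with illustrative schedule endpoints and a stand-in velocity function rather than the repo's pipeline:

```python
import torch

def euler_flow_matching_sample(velocity_model, shape, num_steps=15, shift=5.0):
    # Illustrative schedule: linspace from ~1 to ~0, then the same shift warp as set_timesteps above.
    sigmas = torch.linspace(1.0, 0.001, num_steps + 1)[:-1]
    sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
    x = torch.randn(shape)                            # start from pure noise (sigma close to 1)
    for i, sigma in enumerate(sigmas):
        v = velocity_model(x, sigma)                  # stand-in for the predicted velocity (noise - sample)
        sigma_next = sigmas[i + 1] if i + 1 < len(sigmas) else torch.tensor(0.0)
        x = x + v * (sigma_next - sigma)              # same Euler update as step() above
    return x

# Toy check: if the clean sample is all zeros, the true velocity is x / sigma and sampling recovers ~0.
out = euler_flow_matching_sample(lambda x, s: x / s, shape=(1, 4))
print(out.abs().max())
```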
    	
        tooncomposer.py
    ADDED
    
    | @@ -0,0 +1,234 @@ | |
| 1 | 
            +
            import os, torch, lightning, imageio
         | 
| 2 | 
            +
            from peft import LoraConfig, inject_adapter_in_model
         | 
| 3 | 
            +
            import numpy as np
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            from pipeline.i2v_pipeline import WanVideoPipeline
         | 
| 6 | 
            +
             | 
| 7 | 
            +
             | 
| 8 | 
            +
            os.environ["TOKENIZERS_PARALLELISM"] = "false"
         | 
| 9 | 
            +
            torch.set_float32_matmul_precision('medium')
         | 
| 10 | 
            +
             | 
| 11 | 
            +
             | 
| 12 | 
            +
            def save_video(frames, save_path, fps, quality=9, ffmpeg_params=None):
         | 
| 13 | 
            +
                writer = imageio.get_writer(save_path, fps=fps, quality=quality, ffmpeg_params=ffmpeg_params)
         | 
| 14 | 
            +
                for frame in frames:
         | 
| 15 | 
            +
                    frame = np.array(frame)
         | 
| 16 | 
            +
                    writer.append_data(frame)
         | 
| 17 | 
            +
                writer.close()
         | 
| 18 | 
            +
                
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            def get_base_model_paths(base_model_name, format='dict', model_root="./weights"):
         | 
| 21 | 
            +
                    if base_model_name == "Wan2.1-I2V-14B-480P":
         | 
| 22 | 
            +
                        if format == 'list':
         | 
| 23 | 
            +
                            return [
         | 
| 24 | 
            +
                                [os.path.join(model_root, f"diffusion_pytorch_model-0000{_idx}-of-00007.safetensors") for _idx in range(1, 8)],
         | 
| 25 | 
            +
                                os.path.join(model_root, "models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"),
         | 
| 26 | 
            +
                                os.path.join(model_root, "models_t5_umt5-xxl-enc-bf16.pth"),
         | 
| 27 | 
            +
                                os.path.join(model_root, "Wan2.1_VAE.pth")
         | 
| 28 | 
            +
                            ]
         | 
| 29 | 
            +
                        elif format == 'dict':
         | 
| 30 | 
            +
                            return {
         | 
| 31 | 
            +
                                "dit": [os.path.join(model_root, f"diffusion_pytorch_model-0000{_idx}-of-00007.safetensors") for _idx in range(1, 8)],
         | 
| 32 | 
            +
                                "image_encoder": os.path.join(model_root, "models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"),
         | 
| 33 | 
            +
                                "text_encoder": os.path.join(model_root, "models_t5_umt5-xxl-enc-bf16.pth"),
         | 
| 34 | 
            +
                                "vae": os.path.join(model_root, "Wan2.1_VAE.pth")
         | 
| 35 | 
            +
                            }
         | 
| 36 | 
            +
                        else:
         | 
| 37 | 
            +
                            raise ValueError(f"Unsupported format: {format}")
         | 
| 38 | 
            +
                    else:
         | 
| 39 | 
            +
                        raise ValueError(f"Unsupported base model name: {base_model_name}")
         | 
| 40 | 
            +
             | 
| 41 | 
            +
             | 
| 42 | 
            +
            class ToonComposer(lightning.LightningModule):
         | 
| 43 | 
            +
                def __init__(self, base_model_name="Wan2.1-I2V-14B-480P", model_root=None, learning_rate=1e-5, lora_rank=4, lora_alpha=4, 
         | 
| 44 | 
            +
                             train_architecture=None, lora_target_modules="q,k,v,o,ffn.0,ffn.2", 
         | 
| 45 | 
            +
                             init_lora_weights="kaiming", use_gradient_checkpointing=True, 
         | 
| 46 | 
            +
                             checkpoint_path=None, video_condition_preservation_mode="first_and_last", 
         | 
| 47 | 
            +
                             tiled=False, tile_size=(34, 34), tile_stride=(18, 16), output_path=None,
         | 
| 48 | 
            +
                             use_local_lora=False, use_dera=False, dera_rank=None, use_dera_spatial=True, use_dera_temporal=True, use_sequence_cond=False, sequence_cond_mode="sparse",
         | 
| 49 | 
            +
                             use_channel_cond=False,
         | 
| 50 | 
            +
                             use_sequence_cond_position_aware_residual=False,
         | 
| 51 | 
            +
                             use_sequence_cond_loss=False, fast_dev=False,
         | 
| 52 | 
            +
                             max_num_cond_images=1, max_num_cond_sketches=2, visualize_attention=False,
         | 
| 53 | 
            +
                             random_spaced_cond_frames=False, use_sketch_mask=False, sketch_mask_ratio=0.2, no_first_sketch=False,
         | 
| 54 | 
            +
                             test_sampling_steps=15, test_sequence_cond_residual_scale=0.5, height=480, width=832):
         | 
| 55 | 
            +
                    super().__init__()
         | 
| 56 | 
            +
                    
         | 
| 57 | 
            +
                    self.pipe = WanVideoPipeline(device="cpu", torch_dtype=torch.bfloat16)
         | 
| 58 | 
            +
                    self.use_local_lora = use_local_lora
         | 
| 59 | 
            +
                    self.use_dera = use_dera
         | 
| 60 | 
            +
                    self.use_dera_spatial = use_dera_spatial
         | 
| 61 | 
            +
                    self.use_dera_temporal = use_dera_temporal
         | 
| 62 | 
            +
                    self.use_sequence_cond = use_sequence_cond
         | 
| 63 | 
            +
                    self.sequence_cond_mode = sequence_cond_mode
         | 
| 64 | 
            +
                    self.use_channel_cond = use_channel_cond
         | 
| 65 | 
            +
                    self.use_sequence_cond_position_aware_residual = use_sequence_cond_position_aware_residual
         | 
| 66 | 
            +
                    assert not (use_sequence_cond and use_channel_cond), "Cannot use both sequence condition and channel condition."
         | 
| 67 | 
            +
                    self.use_sequence_cond_loss = use_sequence_cond_loss
         | 
| 68 | 
            +
                    
         | 
| 69 | 
            +
                    self.max_num_cond_images = max_num_cond_images
         | 
| 70 | 
            +
                    self.max_num_cond_sketches = max_num_cond_sketches
         | 
| 71 | 
            +
                    
         | 
| 72 | 
            +
                    self.visualize_attention = visualize_attention
         | 
| 73 | 
            +
                    self.random_spaced_cond_frames = random_spaced_cond_frames
         | 
| 74 | 
            +
                    self.use_sketch_mask = use_sketch_mask
         | 
| 75 | 
            +
                    self.sketch_mask_ratio = sketch_mask_ratio
         | 
| 76 | 
            +
                    self.no_first_sketch = no_first_sketch
         | 
| 77 | 
            +
                    self.test_sampling_steps = test_sampling_steps
         | 
| 78 | 
            +
                    self.test_sequence_cond_residual_scale = test_sequence_cond_residual_scale
         | 
| 79 | 
            +
                    
         | 
| 80 | 
            +
                    self.height = height
         | 
| 81 | 
            +
                    self.width = width
         | 
| 82 | 
            +
                    
         | 
| 83 | 
            +
                    self.current_checkpoint_path = None
         | 
| 84 | 
            +
                    
         | 
| 85 | 
            +
                    paths = get_base_model_paths(base_model_name, format='dict', model_root=model_root)
         | 
| 86 | 
            +
                    if use_sequence_cond:
         | 
| 87 | 
            +
            assert sequence_cond_mode in ["sparse", "full"], f"Unsupported sequence condition mode: {sequence_cond_mode}"
         | 
| 88 | 
            +
                        if sequence_cond_mode == "sparse":
         | 
| 89 | 
            +
                            if use_sketch_mask:
         | 
| 90 | 
            +
                                sequence_cond_in_dim = 24
         | 
| 91 | 
            +
                            else:
         | 
| 92 | 
            +
                                sequence_cond_in_dim = 20
         | 
| 93 | 
            +
                        else:
         | 
| 94 | 
            +
                            sequence_cond_in_dim = 20
         | 
| 95 | 
            +
                        use_channel_cond = False
         | 
| 96 | 
            +
                        channel_cond_in_dim = None
         | 
| 97 | 
            +
                    elif use_channel_cond:
         | 
| 98 | 
            +
                        channel_cond_in_dim = 20
         | 
| 99 | 
            +
                        sequence_cond_in_dim = None
         | 
| 100 | 
            +
                        use_sequence_cond = False
         | 
| 101 | 
            +
                    
         | 
| 102 | 
            +
                    dit_config = {
         | 
| 103 | 
            +
                        "use_local_lora": use_local_lora,
         | 
| 104 | 
            +
                        "use_dera": use_dera,
         | 
| 105 | 
            +
                        "dera_rank": dera_rank,
         | 
| 106 | 
            +
                        "use_dera_spatial": use_dera_spatial,
         | 
| 107 | 
            +
                        "use_dera_temporal": use_dera_temporal,
         | 
| 108 | 
            +
                        "use_sequence_cond": use_sequence_cond,
         | 
| 109 | 
            +
                        "sequence_cond_mode": sequence_cond_mode,
         | 
| 110 | 
            +
                        "sequence_cond_in_dim": sequence_cond_in_dim,
         | 
| 111 | 
            +
                        "use_channel_cond": use_channel_cond,
         | 
| 112 | 
            +
                        "channel_cond_in_dim": channel_cond_in_dim,
         | 
| 113 | 
            +
                        "use_sequence_cond_position_aware_residual": use_sequence_cond_position_aware_residual,
         | 
| 114 | 
            +
                        "use_sequence_cond_loss": use_sequence_cond_loss
         | 
| 115 | 
            +
                    }
         | 
| 116 | 
            +
                    if fast_dev:
         | 
| 117 | 
            +
                        del paths["dit"]
         | 
| 118 | 
            +
                        dit_config.update({
         | 
| 119 | 
            +
                            "model_type": "i2v",
         | 
| 120 | 
            +
                            "patch_size": (1, 2, 2),
         | 
| 121 | 
            +
                            "text_len": 512,
         | 
| 122 | 
            +
                            "in_dim": 36,
         | 
| 123 | 
            +
                            "dim": 512,
         | 
| 124 | 
            +
                            "ffn_dim": 512,
         | 
| 125 | 
            +
                            "freq_dim": 256,
         | 
| 126 | 
            +
                            "text_dim": 4096,
         | 
| 127 | 
            +
                            "out_dim": 16,
         | 
| 128 | 
            +
                            "num_heads": 2,  # 40
         | 
| 129 | 
            +
                            "num_layers": 40,
         | 
| 130 | 
            +
                            "window_size": (-1, -1),
         | 
| 131 | 
            +
                            "qk_norm": True,
         | 
| 132 | 
            +
                            "cross_attn_norm": True,
         | 
| 133 | 
            +
                            "eps": 1e-6,
         | 
| 134 | 
            +
                        })
         | 
| 135 | 
            +
                        self.pipe.initialize_dummy_dit(dit_config)
         | 
| 136 | 
            +
                        
         | 
| 137 | 
            +
                    self.pipe.fetch_models_from_checkpoints(
         | 
| 138 | 
            +
                        paths,
         | 
| 139 | 
            +
                        config_dict={
         | 
| 140 | 
            +
                            "dit": dit_config
         | 
| 141 | 
            +
                        })
         | 
| 142 | 
            +
                    
         | 
| 143 | 
            +
                    if use_sequence_cond:
         | 
| 144 | 
            +
                        self.pipe.denoising_model().copy_sequence_cond_patch_embedding_weights()
         | 
| 145 | 
            +
                    elif use_channel_cond:
         | 
| 146 | 
            +
                        self.pipe.denoising_model().copy_patch_embedding_weights_for_channel_cond()
         | 
| 147 | 
            +
                    
         | 
| 148 | 
            +
                    self.freeze_parameters()
         | 
| 149 | 
            +
                    if train_architecture == "lora":
         | 
| 150 | 
            +
                        self.add_lora_to_model(
         | 
| 151 | 
            +
                            self.pipe.denoising_model(),
         | 
| 152 | 
            +
                            lora_rank=lora_rank,
         | 
| 153 | 
            +
                            lora_alpha=lora_alpha,
         | 
| 154 | 
            +
                            lora_target_modules=lora_target_modules,
         | 
| 155 | 
            +
                            init_lora_weights=init_lora_weights
         | 
| 156 | 
            +
                        )
         | 
| 157 | 
            +
                    elif train_architecture == "full":
         | 
| 158 | 
            +
                        self.pipe.denoising_model().requires_grad_(True)
         | 
| 159 | 
            +
                        
         | 
| 160 | 
            +
                    if checkpoint_path is not None:
         | 
| 161 | 
            +
                        self.load_tooncomposer_checkpoint(checkpoint_path)
         | 
| 162 | 
            +
                    
         | 
| 163 | 
            +
                    self.learning_rate = learning_rate
         | 
| 164 | 
            +
                    self.use_gradient_checkpointing = use_gradient_checkpointing
         | 
| 165 | 
            +
                    
         | 
| 166 | 
            +
                    self.pipe.scheduler.set_timesteps(1000, training=True)
         | 
| 167 | 
            +
                    self.vae_tiler_kwargs = {"tiled": tiled, "tile_size": tile_size, "tile_stride": tile_stride}
         | 
| 168 | 
            +
                    self.video_condition_preservation_mode = video_condition_preservation_mode
         | 
| 169 | 
            +
                    self.negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"  
         | 
| 170 | 
            +
                    
         | 
| 171 | 
            +
                    if output_path is None:
         | 
| 172 | 
            +
                        output_path = "./"
         | 
| 173 | 
            +
                    self.output_path = output_path
         | 
| 174 | 
            +
                
         | 
| 175 | 
            +
                def load_tooncomposer_checkpoint(self, checkpoint_path):
         | 
| 176 | 
            +
                    if checkpoint_path == self.current_checkpoint_path:
         | 
| 177 | 
            +
                        print(f"Skipping loading checkpoint {checkpoint_path} because it is the same as the current checkpoint.")
         | 
| 178 | 
            +
                        return
         | 
| 179 | 
            +
                    self.current_checkpoint_path = checkpoint_path
         | 
| 180 | 
            +
                    self.load_patch_to_model(
         | 
| 181 | 
            +
                        self.pipe.denoising_model(),
         | 
| 182 | 
            +
                        checkpoint_path
         | 
| 183 | 
            +
                    )
         | 
| 184 | 
            +
                    
         | 
| 185 | 
            +
                def update_height_width(self, height, width):
         | 
| 186 | 
            +
                    self.height = height
         | 
| 187 | 
            +
                    self.width = width
         | 
| 188 | 
            +
                    
         | 
| 189 | 
            +
                def freeze_parameters(self):
         | 
| 190 | 
            +
                    self.pipe.requires_grad_(False)
         | 
| 191 | 
            +
                    self.pipe.eval()
         | 
| 192 | 
            +
                    self.pipe.denoising_model().train()
         | 
| 193 | 
            +
                    
         | 
| 194 | 
            +
                def add_lora_to_model(self, model, lora_rank=4, lora_alpha=4, lora_target_modules="q,k,v,o,ffn.0,ffn.2", init_lora_weights="kaiming"):
         | 
| 195 | 
            +
                    self.lora_alpha = lora_alpha
         | 
| 196 | 
            +
                    if init_lora_weights == "kaiming":
         | 
| 197 | 
            +
                        init_lora_weights = True
         | 
| 198 | 
            +
                        
         | 
| 199 | 
            +
                    lora_config = LoraConfig(
         | 
| 200 | 
            +
                        r=lora_rank,
         | 
| 201 | 
            +
                        lora_alpha=lora_alpha,
         | 
| 202 | 
            +
                        init_lora_weights=init_lora_weights,
         | 
| 203 | 
            +
                        target_modules=lora_target_modules.split(","),
         | 
| 204 | 
            +
                    )
         | 
| 205 | 
            +
                    model = inject_adapter_in_model(lora_config, model)
         | 
| 206 | 
            +
                    for param in model.parameters():
         | 
| 207 | 
            +
                        if param.requires_grad:
         | 
| 208 | 
            +
                            param.data = param.to(torch.float32)
         | 
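`add_lora_to_model` above delegates to peft's `inject_adapter_in_model`: the base weights stay frozen and only the injected LoRA A/B matrices require gradients (and are then upcast to fp32). A standalone sketch of the same pattern on a toy block; the module and dimension names are made up for illustration:

```python
import torch
from peft import LoraConfig, inject_adapter_in_model

class ToyBlock(torch.nn.Module):
    # Attention-style projections named like the default LoRA targets ("q", "k", "v", "o").
    def __init__(self, dim=64):
        super().__init__()
        self.q = torch.nn.Linear(dim, dim)
        self.k = torch.nn.Linear(dim, dim)
        self.v = torch.nn.Linear(dim, dim)
        self.o = torch.nn.Linear(dim, dim)

    def forward(self, x):
        return self.o(self.q(x) + self.k(x) + self.v(x))

model = ToyBlock()
model.requires_grad_(False)                      # freeze the base, as freeze_parameters() does
config = LoraConfig(r=4, lora_alpha=4, init_lora_weights=True,
                    target_modules="q,k,v,o".split(","))
model = inject_adapter_in_model(config, model)   # adds lora_A / lora_B to the targeted Linears

trainable = [n for n, p in model.named_parameters() if p.requires_grad]
print(len(trainable), trainable[:2])             # only LoRA parameters remain trainable
```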
| 209 | 
            +
                
         | 
| 210 | 
            +
                def load_patch_to_model(self, model, pretrained_path, state_dict_converter=None):
         | 
| 211 | 
            +
                    if pretrained_path is not None:
         | 
| 212 | 
            +
                        state_dict = torch.load(pretrained_path, map_location="cpu", weights_only=True)
         | 
| 213 | 
            +
                        self.loaded_global_step = 0
         | 
| 214 | 
            +
                        self.loaded_current_epoch = 0
         | 
| 215 | 
            +
                        if self.use_sketch_mask:
         | 
| 216 | 
            +
                            seq_cond_embed_weight = state_dict['sequence_cond_patch_embedding.weight']
         | 
| 217 | 
            +
                            current_in_channels = self.pipe.denoising_model().sequence_cond_patch_embedding.in_channels
         | 
| 218 | 
            +
                            if current_in_channels == 24 and seq_cond_embed_weight.shape[1] == 20:
         | 
| 219 | 
            +
                                new_weight = torch.zeros(
         | 
| 220 | 
            +
                                    seq_cond_embed_weight.shape[0],
         | 
| 221 | 
            +
                                    4,
         | 
| 222 | 
            +
                                    *seq_cond_embed_weight.shape[2:],
         | 
| 223 | 
            +
                                    dtype=seq_cond_embed_weight.dtype
         | 
| 224 | 
            +
                                )
         | 
| 225 | 
            +
                                state_dict['sequence_cond_patch_embedding.weight'] = torch.cat([
         | 
| 226 | 
            +
                                    seq_cond_embed_weight, new_weight], dim=1)
         | 
| 227 | 
            +
                        
         | 
| 228 | 
            +
                        if state_dict_converter is not None:
         | 
| 229 | 
            +
                            state_dict = state_dict_converter(state_dict)
         | 
| 230 | 
            +
                        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
         | 
| 231 | 
            +
                        all_keys = [i for i, _ in model.named_parameters()]
         | 
| 232 | 
            +
                        num_updated_keys = len(all_keys) - len(missing_keys)
         | 
| 233 | 
            +
                        num_unexpected_keys = len(unexpected_keys)
         | 
| 234 | 
            +
                        print(f"[Checkpoint] {num_updated_keys} parameters are loaded from {pretrained_path}. {num_unexpected_keys} parameters are unexpected.")
         | 
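One detail worth noting in `load_patch_to_model`: when `use_sketch_mask` widens the sequence-cond patch embedding from 20 to 24 input channels, the stored weight is zero-padded along the input-channel axis so an older checkpoint still loads and the new channels start with no influence. The same trick in isolation, on a hypothetical Conv3d standing in for the patch embedding (shapes are illustrative, not the real model's):

```python
import torch

old_weight = torch.randn(1536, 20, 1, 2, 2)               # weight as saved in the checkpoint
new_conv = torch.nn.Conv3d(24, 1536, kernel_size=(1, 2, 2))

pad = torch.zeros(old_weight.shape[0], 24 - old_weight.shape[1], *old_weight.shape[2:],
                  dtype=old_weight.dtype)                  # zeros for the 4 new input channels
padded = torch.cat([old_weight, pad], dim=1)
new_conv.weight.data.copy_(padded)
assert new_conv.weight.shape == (1536, 24, 1, 2, 2)
```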
    	
        util/model_util.py
    ADDED
    
    | @@ -0,0 +1,241 @@ | |
| 1 | 
            +
            import torch, os
         | 
| 2 | 
            +
            from safetensors import safe_open
         | 
| 3 | 
            +
            from contextlib import contextmanager
         | 
| 4 | 
            +
            import hashlib
         | 
| 5 | 
            +
            import matplotlib.pyplot as plt
         | 
| 6 | 
            +
            from matplotlib.colors import LinearSegmentedColormap
         | 
| 7 | 
            +
            import numpy as np
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            @contextmanager
         | 
| 10 | 
            +
def init_weights_on_device(device=torch.device("meta"), include_buffers: bool = False):
         | 
| 11 | 
            +
                
         | 
| 12 | 
            +
                old_register_parameter = torch.nn.Module.register_parameter
         | 
| 13 | 
            +
                if include_buffers:
         | 
| 14 | 
            +
                    old_register_buffer = torch.nn.Module.register_buffer
         | 
| 15 | 
            +
                
         | 
| 16 | 
            +
                def register_empty_parameter(module, name, param):
         | 
| 17 | 
            +
                    old_register_parameter(module, name, param)
         | 
| 18 | 
            +
                    if param is not None:
         | 
| 19 | 
            +
                        param_cls = type(module._parameters[name])
         | 
| 20 | 
            +
                        kwargs = module._parameters[name].__dict__
         | 
| 21 | 
            +
                        kwargs["requires_grad"] = param.requires_grad
         | 
| 22 | 
            +
                        module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                def register_empty_buffer(module, name, buffer, persistent=True):
         | 
| 25 | 
            +
                    old_register_buffer(module, name, buffer, persistent=persistent)
         | 
| 26 | 
            +
                    if buffer is not None:
         | 
| 27 | 
            +
                        module._buffers[name] = module._buffers[name].to(device)
         | 
| 28 | 
            +
                        
         | 
| 29 | 
            +
                def patch_tensor_constructor(fn):
         | 
| 30 | 
            +
                    def wrapper(*args, **kwargs):
         | 
| 31 | 
            +
                        kwargs["device"] = device
         | 
| 32 | 
            +
                        return fn(*args, **kwargs)
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                    return wrapper
         | 
| 35 | 
            +
                
         | 
| 36 | 
            +
                if include_buffers:
         | 
| 37 | 
            +
                    tensor_constructors_to_patch = {
         | 
| 38 | 
            +
                        torch_function_name: getattr(torch, torch_function_name)
         | 
| 39 | 
            +
                        for torch_function_name in ["empty", "zeros", "ones", "full"]
         | 
| 40 | 
            +
                    }
         | 
| 41 | 
            +
                else:
         | 
| 42 | 
            +
                    tensor_constructors_to_patch = {}
         | 
| 43 | 
            +
                
         | 
| 44 | 
            +
                try:
         | 
| 45 | 
            +
                    torch.nn.Module.register_parameter = register_empty_parameter
         | 
| 46 | 
            +
                    if include_buffers:
         | 
| 47 | 
            +
                        torch.nn.Module.register_buffer = register_empty_buffer
         | 
| 48 | 
            +
                    for torch_function_name in tensor_constructors_to_patch.keys():
         | 
| 49 | 
            +
                        setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name)))
         | 
| 50 | 
            +
                    yield
         | 
| 51 | 
            +
                finally:
         | 
| 52 | 
            +
                    torch.nn.Module.register_parameter = old_register_parameter
         | 
| 53 | 
            +
                    if include_buffers:
         | 
| 54 | 
            +
                        torch.nn.Module.register_buffer = old_register_buffer
         | 
| 55 | 
            +
                    for torch_function_name, old_torch_function in tensor_constructors_to_patch.items():
         | 
| 56 | 
            +
                        setattr(torch, torch_function_name, old_torch_function)
         | 
| 57 | 
            +
             | 
| 58 | 
            +
            def load_state_dict_from_folder(file_path, torch_dtype=None):
         | 
| 59 | 
            +
                state_dict = {}
         | 
| 60 | 
            +
                for file_name in os.listdir(file_path):
         | 
| 61 | 
            +
                    if "." in file_name and file_name.split(".")[-1] in [
         | 
| 62 | 
            +
                        "safetensors", "bin", "ckpt", "pth", "pt"
         | 
| 63 | 
            +
                    ]:
         | 
| 64 | 
            +
                        state_dict.update(load_state_dict(os.path.join(file_path, file_name), torch_dtype=torch_dtype))
         | 
| 65 | 
            +
                return state_dict
         | 
| 66 | 
            +
             | 
| 67 | 
            +
             | 
| 68 | 
            +
            def load_state_dict(file_path, torch_dtype=None):
         | 
| 69 | 
            +
                if file_path.endswith(".safetensors"):
         | 
| 70 | 
            +
                    return load_state_dict_from_safetensors(file_path, torch_dtype=torch_dtype)
         | 
| 71 | 
            +
                else:
         | 
| 72 | 
            +
                    return load_state_dict_from_bin(file_path, torch_dtype=torch_dtype)
         | 
| 73 | 
            +
             | 
| 74 | 
            +
             | 
| 75 | 
            +
            def load_state_dict_from_safetensors(file_path, torch_dtype=None):
         | 
| 76 | 
            +
                state_dict = {}
         | 
| 77 | 
            +
                with safe_open(file_path, framework="pt", device="cpu") as f:
         | 
| 78 | 
            +
                    for k in f.keys():
         | 
| 79 | 
            +
                        state_dict[k] = f.get_tensor(k)
         | 
| 80 | 
            +
                        if torch_dtype is not None:
         | 
| 81 | 
            +
                            state_dict[k] = state_dict[k].to(torch_dtype)
         | 
| 82 | 
            +
                return state_dict
         | 
| 83 | 
            +
             | 
| 84 | 
            +
             | 
| 85 | 
            +
            def load_state_dict_from_bin(file_path, torch_dtype=None):
         | 
| 86 | 
            +
                state_dict = torch.load(file_path, map_location="cpu", weights_only=True)
         | 
| 87 | 
            +
                if torch_dtype is not None:
         | 
| 88 | 
            +
                    for i in state_dict:
         | 
| 89 | 
            +
                        if isinstance(state_dict[i], torch.Tensor):
         | 
| 90 | 
            +
                            state_dict[i] = state_dict[i].to(torch_dtype)
         | 
| 91 | 
            +
                return state_dict
         | 
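`load_state_dict` dispatches on the file extension: `.safetensors` files are read tensor by tensor through `safe_open`, everything else goes through `torch.load(..., weights_only=True)`, with an optional dtype cast in both paths. A quick round-trip check (the file name is arbitrary):

```python
import torch
from safetensors.torch import save_file
from util.model_util import load_state_dict

# Write a tiny safetensors file, then read it back through the loader above with a dtype cast.
save_file({"layer.weight": torch.ones(2, 3)}, "demo.safetensors")
sd = load_state_dict("demo.safetensors", torch_dtype=torch.float16)
print(sd["layer.weight"].dtype)   # torch.float16
```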
| 92 | 
            +
             | 
| 93 | 
            +
             | 
| 94 | 
            +
            def search_for_embeddings(state_dict):
         | 
| 95 | 
            +
                embeddings = []
         | 
| 96 | 
            +
                for k in state_dict:
         | 
| 97 | 
            +
                    if isinstance(state_dict[k], torch.Tensor):
         | 
| 98 | 
            +
                        embeddings.append(state_dict[k])
         | 
| 99 | 
            +
                    elif isinstance(state_dict[k], dict):
         | 
| 100 | 
            +
                        embeddings += search_for_embeddings(state_dict[k])
         | 
| 101 | 
            +
                return embeddings
         | 
| 102 | 
            +
             | 
| 103 | 
            +
             | 
| 104 | 
            +
            def search_parameter(param, state_dict):
         | 
| 105 | 
            +
                for name, param_ in state_dict.items():
         | 
| 106 | 
            +
                    if param.numel() == param_.numel():
         | 
| 107 | 
            +
                        if param.shape == param_.shape:
         | 
| 108 | 
            +
                            if torch.dist(param, param_) < 1e-3:
         | 
| 109 | 
            +
                                return name
         | 
| 110 | 
            +
                        else:
         | 
| 111 | 
            +
                            if torch.dist(param.flatten(), param_.flatten()) < 1e-3:
         | 
| 112 | 
            +
                                return name
         | 
| 113 | 
            +
                return None
         | 
| 114 | 
            +
             | 
| 115 | 
            +
             | 
| 116 | 
            +
            def build_rename_dict(source_state_dict, target_state_dict, split_qkv=False):
         | 
| 117 | 
            +
                matched_keys = set()
         | 
| 118 | 
            +
                with torch.no_grad():
         | 
| 119 | 
            +
                    for name in source_state_dict:
         | 
| 120 | 
            +
                        rename = search_parameter(source_state_dict[name], target_state_dict)
         | 
| 121 | 
            +
                        if rename is not None:
         | 
| 122 | 
            +
                            print(f'"{name}": "{rename}",')
         | 
| 123 | 
            +
                            matched_keys.add(rename)
         | 
| 124 | 
            +
                        elif split_qkv and len(source_state_dict[name].shape)>=1 and source_state_dict[name].shape[0]%3==0:
         | 
| 125 | 
            +
                            length = source_state_dict[name].shape[0] // 3
         | 
| 126 | 
            +
                            rename = []
         | 
| 127 | 
            +
                            for i in range(3):
         | 
| 128 | 
            +
                                rename.append(search_parameter(source_state_dict[name][i*length: i*length+length], target_state_dict))
         | 
| 129 | 
            +
                            if None not in rename:
         | 
| 130 | 
            +
                                print(f'"{name}": {rename},')
         | 
| 131 | 
            +
                                for rename_ in rename:
         | 
| 132 | 
            +
                                    matched_keys.add(rename_)
         | 
| 133 | 
            +
                for name in target_state_dict:
         | 
| 134 | 
            +
                    if name not in matched_keys:
         | 
| 135 | 
            +
                        print("Cannot find", name, target_state_dict[name].shape)
         | 
| 136 | 
            +
             | 
| 137 | 
            +
             | 
| 138 | 
            +
            def search_for_files(folder, extensions):
         | 
| 139 | 
            +
                files = []
         | 
| 140 | 
            +
                if os.path.isdir(folder):
         | 
| 141 | 
            +
                    for file in sorted(os.listdir(folder)):
         | 
| 142 | 
            +
                        files += search_for_files(os.path.join(folder, file), extensions)
         | 
| 143 | 
            +
                elif os.path.isfile(folder):
         | 
| 144 | 
            +
                    for extension in extensions:
         | 
| 145 | 
            +
                        if folder.endswith(extension):
         | 
| 146 | 
            +
                            files.append(folder)
         | 
| 147 | 
            +
                            break
         | 
| 148 | 
            +
                return files
         | 
| 149 | 
            +
             | 
| 150 | 
            +
             | 
| 151 | 
            +
            def convert_state_dict_keys_to_single_str(state_dict, with_shape=True):
         | 
| 152 | 
            +
                keys = []
         | 
| 153 | 
            +
                for key, value in state_dict.items():
         | 
| 154 | 
            +
                    if isinstance(key, str):
         | 
| 155 | 
            +
                        if isinstance(value, torch.Tensor):
         | 
| 156 | 
            +
                            if with_shape:
         | 
| 157 | 
            +
                                shape = "_".join(map(str, list(value.shape)))
         | 
| 158 | 
            +
                                keys.append(key + ":" + shape)
         | 
| 159 | 
            +
                            keys.append(key)
         | 
| 160 | 
            +
                        elif isinstance(value, dict):
         | 
| 161 | 
            +
                            keys.append(key + "|" + convert_state_dict_keys_to_single_str(value, with_shape=with_shape))
         | 
| 162 | 
            +
                keys.sort()
         | 
| 163 | 
            +
                keys_str = ",".join(keys)
         | 
| 164 | 
            +
                return keys_str
         | 
| 165 | 
            +
             | 
| 166 | 
            +
             | 
| 167 | 
            +
            def split_state_dict_with_prefix(state_dict):
         | 
| 168 | 
            +
                keys = sorted([key for key in state_dict if isinstance(key, str)])
         | 
| 169 | 
            +
                prefix_dict = {}
         | 
| 170 | 
            +
    for key in keys:
         | 
| 171 | 
            +
                    prefix = key if "." not in key else key.split(".")[0]
         | 
| 172 | 
            +
                    if prefix not in prefix_dict:
         | 
| 173 | 
            +
                        prefix_dict[prefix] = []
         | 
| 174 | 
            +
                    prefix_dict[prefix].append(key)
         | 
| 175 | 
            +
                state_dicts = []
         | 
| 176 | 
            +
                for prefix, keys in prefix_dict.items():
         | 
| 177 | 
            +
                    sub_state_dict = {key: state_dict[key] for key in keys}
         | 
| 178 | 
            +
                    state_dicts.append(sub_state_dict)
         | 
| 179 | 
            +
                return state_dicts
         | 
| 180 | 
            +
             | 
| 181 | 
            +
             | 
| 182 | 
            +
            def hash_state_dict_keys(state_dict, with_shape=True):
         | 
| 183 | 
            +
                keys_str = convert_state_dict_keys_to_single_str(state_dict, with_shape=with_shape)
         | 
| 184 | 
            +
                keys_str = keys_str.encode(encoding="UTF-8")
         | 
| 185 | 
            +
                return hashlib.md5(keys_str).hexdigest()
         | 
| 186 | 
            +
             | 
| 187 | 
            +
             | 
| 188 | 
            +
            def save_attention_maps(model, output_path, batch_idx, timestep, layer_indices=None):
         | 
| 189 | 
            +
                """
         | 
| 190 | 
            +
                Visualize and save the attention maps from selected layers of the model
         | 
| 191 | 
            +
                
         | 
| 192 | 
            +
                Args:
         | 
| 193 | 
            +
                    model: The DiT model with attention maps stored
         | 
| 194 | 
            +
                    output_path: Directory to save visualizations
         | 
| 195 | 
            +
        batch_idx: Current batch index for file naming
        timestep: Current denoising timestep, used in the output file names
         | 
| 196 | 
            +
                    layer_indices: List of layer indices to visualize (if None, visualize all)
         | 
| 197 | 
            +
                """
         | 
| 198 | 
            +
                timestep = int(float(str(timestep)))
         | 
| 199 | 
            +
                os.makedirs(os.path.join(output_path, "attention_maps"), exist_ok=True)
         | 
| 200 | 
            +
                
         | 
| 201 | 
            +
                # If layer indices not specified, visualize all layers
         | 
| 202 | 
            +
                if layer_indices is None:
         | 
| 203 | 
            +
                    layer_indices = range(len(model.blocks))
         | 
| 204 | 
            +
                
         | 
| 205 | 
            +
                # Create a custom colormap (similar to the ones used in attention visualization papers)
         | 
| 206 | 
            +
                colors = [(0, 0, 0.5), (0, 0, 1), (0, 0.5, 1), (0, 1, 1), 
         | 
| 207 | 
            +
                          (0.5, 1, 0.5), (1, 1, 0), (1, 0.5, 0), (1, 0, 0), (0.5, 0, 0)]
         | 
| 208 | 
            +
                attention_cmap = LinearSegmentedColormap.from_list('attention_cmap', colors)
         | 
| 209 | 
            +
                
         | 
| 210 | 
            +
                for i in layer_indices:
         | 
| 211 | 
            +
                    if not hasattr(model.blocks[i].self_attn, '_last_attn_maps'):
         | 
| 212 | 
            +
                        continue
         | 
| 213 | 
            +
                        
         | 
| 214 | 
            +
                    attn_map = model.blocks[i].self_attn._last_attn_maps
         | 
| 215 | 
            +
                    grid_size = model.blocks[i].self_attn._last_grid_sizes
         | 
| 216 | 
            +
                    seq_len = model.blocks[i].self_attn._last_seq_lens
         | 
| 217 | 
            +
                    # attn_maps.shape=[s, s]
         | 
| 218 | 
            +
                    np.savez_compressed(os.path.join(output_path,
         | 
| 219 | 
            +
                            "attention_maps",
         | 
| 220 | 
            +
                            f"attn_maps_layer{i}_batch{batch_idx}_t{timestep}.npz"),
         | 
| 221 | 
            +
                                        attn_map=attn_map, grid_size=grid_size, seq_len=seq_len)
         | 
| 222 | 
            +
                    
         | 
| 223 | 
            +
                    print(f"Saving Layer {i}, Batch {batch_idx} attention maps")
         | 
| 224 | 
            +
                    attn_map -= attn_map.min()
         | 
| 225 | 
            +
                    attn_map /= attn_map.max()
         | 
| 226 | 
            +
                    plt.figure(figsize=(10, 8))
         | 
| 227 | 
            +
                    plt.imshow(attn_map ** 0.25, cmap=attention_cmap)
         | 
| 228 | 
            +
                    plt.colorbar(label='Attention Weight')
         | 
| 229 | 
            +
                    plt.title(f'Layer {i}, Batch {batch_idx} (Average)')
         | 
| 230 | 
            +
                    save_path = os.path.join(
         | 
| 231 | 
            +
                        output_path, 
         | 
| 232 | 
            +
                        "attention_maps", 
         | 
| 233 | 
            +
                        f"attn_map_layer{i}_average_batch{batch_idx}_t{timestep}.png"
         | 
| 234 | 
            +
                    )
         | 
| 235 | 
            +
                    plt.savefig(save_path, dpi=300, bbox_inches='tight')
         | 
| 236 | 
            +
                    plt.close()
         | 
| 237 | 
            +
             | 
| 238 | 
            +
                # Clean up the stored attention maps to free memory
         | 
| 239 | 
            +
                for i in layer_indices:
         | 
| 240 | 
            +
                    if hasattr(model.blocks[i].self_attn, '_last_attn_maps'):
         | 
| 241 | 
            +
                        del model.blocks[i].self_attn._last_attn_maps
         | 
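`init_weights_on_device` at the top of this file is the usual meta-device trick: while the context manager is active, newly registered parameters land on `torch.device("meta")`, so a large module can be constructed without allocating real memory and materialized only when actual weights arrive. A small usage sketch; the `to_empty` + `load_state_dict` step is one possible way to materialize, not necessarily what the pipeline does:

```python
import torch
from util.model_util import init_weights_on_device

with init_weights_on_device():
    big = torch.nn.Linear(4096, 4096)        # parameters are created on the meta device
print(big.weight.device)                     # meta

big = big.to_empty(device="cpu")             # allocate real (uninitialized) storage
state = {"weight": torch.zeros(4096, 4096), "bias": torch.zeros(4096)}
big.load_state_dict(state)                   # fill it from a real state dict
```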
    	
        util/optical_flow.py
    ADDED
    
    | @@ -0,0 +1,140 @@ | |
| 1 | 
            +
            import cv2
         | 
| 2 | 
            +
            import numpy as np
         | 
| 3 | 
            +
            import torch
         | 
| 4 | 
            +
            import torch.nn.functional as F
         | 
| 5 | 
            +
            from torchvision.models.optical_flow import Raft_Large_Weights, raft_large
         | 
| 6 | 
            +
            from typing import List, Tuple, Dict
         | 
| 7 | 
            +
            import argparse
         | 
| 8 | 
            +
            from pathlib import Path
         | 
| 9 | 
            +
            from sklearn.cluster import KMeans
         | 
| 10 | 
            +
            from tqdm import tqdm
         | 
| 11 | 
            +
            import os
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            os.environ['OPENBLAS_NUM_THREADS'] = '64'
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            class OpticalFlowAnalyzer:
         | 
| 16 | 
            +
                def __init__(self, device: str = 'cuda' if torch.cuda.is_available() else 'cpu'):
         | 
| 17 | 
            +
                    self.device = device
         | 
| 18 | 
            +
                    self.model = raft_large(weights=Raft_Large_Weights.DEFAULT, progress=False).to(device)
         | 
| 19 | 
            +
                    self.model.eval()
         | 
| 20 | 
            +
                    
         | 
| 21 | 
            +
                def preprocess_frame(self, frame: np.ndarray) -> torch.Tensor:
         | 
| 22 | 
            +
                    """Preprocess a frame for RAFT model."""
         | 
| 23 | 
            +
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
         | 
| 24 | 
            +
                    frame = torch.from_numpy(frame).permute(2, 0, 1).float()
         | 
| 25 | 
            +
                    frame = frame.unsqueeze(0) / 255.0
         | 
| 26 | 
            +
                    return frame.to(self.device)
         | 
| 27 | 
            +
                
         | 
| 28 | 
            +
                def compute_optical_flow(self, frame1: np.ndarray, frame2: np.ndarray) -> np.ndarray:
         | 
| 29 | 
            +
                    """Compute optical flow between two consecutive frames."""
         | 
| 30 | 
            +
                    with torch.no_grad():
         | 
| 31 | 
            +
                        frame1_tensor = self.preprocess_frame(frame1)
         | 
| 32 | 
            +
                        frame2_tensor = self.preprocess_frame(frame2)
         | 
| 33 | 
            +
                        
         | 
| 34 | 
            +
                        flow = self.model(frame1_tensor, frame2_tensor)[-1]
         | 
| 35 | 
            +
                        flow = flow[0].permute(1, 2, 0).cpu().numpy()
         | 
| 36 | 
            +
                        
         | 
| 37 | 
            +
                    return flow
         | 
| 38 | 
            +
                
         | 
| 39 | 
            +
                def analyze_motion_regions(self, flow: np.ndarray, num_clusters: int = 3) -> Tuple[np.ndarray, Dict]:
         | 
| 40 | 
            +
                    """Cluster motion regions based on optical flow magnitude and direction."""
         | 
| 41 | 
            +
                    h, w = flow.shape[:2]
         | 
| 42 | 
            +
                    magnitude = np.sqrt(flow[..., 0]**2 + flow[..., 1]**2)
         | 
| 43 | 
            +
                    direction = np.arctan2(flow[..., 1], flow[..., 0])
         | 
| 44 | 
            +
                    
         | 
| 45 | 
            +
                    # Create feature matrix for clustering
         | 
| 46 | 
            +
                    features = np.zeros((h * w, 3))
         | 
| 47 | 
            +
                    features[:, 0] = magnitude.ravel()
         | 
| 48 | 
            +
                    features[:, 1] = np.cos(direction).ravel()
         | 
| 49 | 
            +
                    features[:, 2] = np.sin(direction).ravel()
         | 
| 50 | 
            +
                    
         | 
| 51 | 
            +
                    # Normalize features
         | 
| 52 | 
            +
                    features = (features - features.mean(axis=0)) / features.std(axis=0)
         | 
| 53 | 
            +
                    
         | 
| 54 | 
            +
                    # Perform clustering
         | 
| 55 | 
            +
                    kmeans = KMeans(n_clusters=num_clusters, random_state=42,)
         | 
| 56 | 
            +
                    labels = kmeans.fit_predict(features)
         | 
| 57 | 
            +
                    labels = labels.reshape(h, w)
         | 
| 58 | 
            +
                    
         | 
| 59 | 
            +
                    # Analyze clusters
         | 
| 60 | 
            +
                    cluster_stats = {}
         | 
| 61 | 
            +
                    for i in range(num_clusters):
         | 
| 62 | 
            +
                        cluster_mask = (labels == i)
         | 
| 63 | 
            +
                        cluster_magnitude = magnitude[cluster_mask]
         | 
| 64 | 
            +
                        cluster_stats[i] = {
         | 
| 65 | 
            +
                            'mean_magnitude': np.mean(cluster_magnitude),
         | 
| 66 | 
            +
                            'std_magnitude': np.std(cluster_magnitude),
         | 
| 67 | 
            +
                            'pixel_count': np.sum(cluster_mask),
         | 
| 68 | 
            +
                            'is_static': np.mean(cluster_magnitude) < 0.1  # Threshold for static regions
         | 
| 69 | 
            +
                        }
         | 
| 70 | 
            +
                    
         | 
| 71 | 
            +
                    return labels, cluster_stats
         | 
| 72 | 
            +
                
         | 
| 73 | 
            +
                def process_video(self, video_path: str, output_path: str = None) -> List[Tuple[np.ndarray, Dict]]:
         | 
| 74 | 
            +
                    """Process a video and return motion analysis results for each frame pair."""
         | 
| 75 | 
            +
                    cap = cv2.VideoCapture(video_path)
         | 
| 76 | 
            +
                    if not cap.isOpened():
         | 
| 77 | 
            +
                        raise ValueError(f"Could not open video: {video_path}")
         | 
| 78 | 
            +
                    
         | 
| 79 | 
            +
                    results = []
         | 
| 80 | 
            +
                    ret, prev_frame = cap.read()
         | 
| 81 | 
            +
                    if not ret:
         | 
| 82 | 
            +
                        raise ValueError("Could not read first frame")
         | 
| 83 | 
            +
                    
         | 
| 84 | 
            +
                    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
         | 
| 85 | 
            +
                    pbar = tqdm(total=total_frames-1, desc="Processing video")
         | 
| 86 | 
            +
                    
         | 
| 87 | 
            +
                    while True:
         | 
| 88 | 
            +
                        ret, curr_frame = cap.read()
         | 
| 89 | 
            +
                        if not ret:
         | 
| 90 | 
            +
                            break
         | 
| 91 | 
            +
                            
         | 
| 92 | 
            +
                        flow = self.compute_optical_flow(prev_frame, curr_frame)
         | 
| 93 | 
            +
                        labels, stats = self.analyze_motion_regions(flow)
         | 
| 94 | 
            +
                        
         | 
| 95 | 
            +
                        if output_path:
         | 
| 96 | 
            +
                            # Visualize results
         | 
| 97 | 
            +
                            vis_frame = curr_frame.copy()
         | 
| 98 | 
            +
                            for i, stat in stats.items():
         | 
| 99 | 
            +
                                if not stat['is_static']:
         | 
| 100 | 
            +
                                    mask = (labels == i).astype(np.uint8) * 255
         | 
| 101 | 
            +
                                    print("mask:",mask.shape)
         | 
| 102 | 
            +
                                    print("vis_frame:",vis_frame.shape)
         | 
| 103 | 
            +
                                    mask = np.expand_dims(mask, axis=-1).repeat(3, axis=-1)
         | 
| 104 | 
            +
                                    print("mask:",mask.shape)
         | 
| 105 | 
            +
                                    
         | 
| 106 | 
            +
                                    vis_frame[mask > 0] = cv2.addWeighted(vis_frame[mask > 0], 0.7, 255, 0.3, 0)
         | 
| 107 | 
            +
                            
         | 
| 108 | 
            +
                            cv2.imwrite(f"{output_path}/frame_{len(results):04d}.jpg", vis_frame)
         | 
| 109 | 
            +
                        
         | 
| 110 | 
            +
                        results.append((labels, stats))
         | 
| 111 | 
            +
                        prev_frame = curr_frame
         | 
| 112 | 
            +
                        pbar.update(1)
         | 
| 113 | 
            +
                    
         | 
| 114 | 
            +
                    cap.release()
         | 
| 115 | 
            +
                    pbar.close()
         | 
| 116 | 
            +
                    return results
         | 
| 117 | 
            +
             | 
| 118 | 
            +
            def main():
         | 
| 119 | 
            +
                parser = argparse.ArgumentParser(description='Analyze motion regions in a video using RAFT optical flow')
         | 
| 120 | 
            +
                parser.add_argument('--video', type=str, required=True, help='Path to input video')
         | 
| 121 | 
            +
                parser.add_argument('--output', type=str, help='Path to output directory for visualization')
         | 
| 122 | 
            +
                parser.add_argument('--clusters', type=int, default=3, help='Number of motion clusters')
         | 
| 123 | 
            +
                args = parser.parse_args()
         | 
| 124 | 
            +
                
         | 
| 125 | 
            +
                analyzer = OpticalFlowAnalyzer()
         | 
| 126 | 
            +
                results = analyzer.process_video(args.video, args.output)
         | 
| 127 | 
            +
                
         | 
| 128 | 
            +
                # Print summary statistics
         | 
| 129 | 
            +
                print("\nMotion Analysis Summary:")
         | 
| 130 | 
            +
                for i, (_, stats) in enumerate(results):
         | 
| 131 | 
            +
                    print(f"\nFrame {i+1}:")
         | 
| 132 | 
            +
                    for cluster_id, stat in stats.items():
         | 
| 133 | 
            +
                        motion_type = "Static" if stat['is_static'] else "Moving"
         | 
| 134 | 
            +
                        print(f"  Cluster {cluster_id} ({motion_type}):")
         | 
| 135 | 
            +
                        print(f"    Mean magnitude: {stat['mean_magnitude']:.4f}")
         | 
| 136 | 
            +
                        print(f"    Pixel count: {stat['pixel_count']}")
         | 
| 137 | 
            +
             | 
| 138 | 
            +
            if __name__ == "__main__":
         | 
| 139 | 
            +
                main()
         | 
| 140 | 
            +
             | 
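For reference, the analyzer can also be driven directly from Python instead of the CLI entry point above. The snippet below is a hypothetical usage sketch: the module path util.optical_flow, the placeholder input.mp4, and the flow_vis directory are assumptions, and the output directory must exist (or be created) because process_video writes its visualizations with cv2.imwrite.

    # Hypothetical usage sketch for util/optical_flow.py; paths are placeholders.
    import os
    from util.optical_flow import OpticalFlowAnalyzer

    os.makedirs("flow_vis", exist_ok=True)
    analyzer = OpticalFlowAnalyzer()
    results = analyzer.process_video("input.mp4", output_path="flow_vis", num_clusters=3)
    for pair_idx, (labels, stats) in enumerate(results):
        moving = [cid for cid, s in stats.items() if not s["is_static"]]
        print(f"frame pair {pair_idx}: moving clusters {moving}")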
    	
        util/stylesheets.py
    ADDED
    
The diff for this file is too large to render. See the raw diff.
    	
        util/training_util.py
    ADDED
    
@@ -0,0 +1,317 @@
from typing import Union
import torch
import random
import numpy as np
import cv2
import os


def create_random_mask(batch_size, num_frames, height, width, device, dtype, shape_type=None):
    """
    Create random masks for sketch frames.

    Args:
        batch_size: Batch size
        num_frames: Number of frames to mask
        height, width: Image dimensions
        device: Device for tensor
        dtype: Data type for tensor
        shape_type: Type of shape for masking ('square', 'circle', 'random'). If None, one is randomly selected per frame.

    Returns:
        Mask tensor of shape [b, 1, num_frames, height, width] where 0 indicates areas to mask (inverse of the drawn shapes).
    """
    # Initialize with ones (unmasked)
    masks = torch.ones(batch_size, 1, num_frames, height, width, device=device, dtype=dtype)

    for b in range(batch_size):
        for f in range(num_frames):
            # Randomly select a shape type per frame if not specified
            cur_shape_type = shape_type if shape_type is not None else random.choice(['square', 'circle', 'random'])

            # Create numpy mask for easier shape drawing
            mask = np.zeros((height, width), dtype=np.float32)

            if cur_shape_type == 'square':
                # Random squares
                num_squares = random.randint(1, 5)
                for _ in range(num_squares):
                    # Random square size (proportional to image dimensions)
                    max_size = min(height, width)
                    size = random.randint(max_size // 4, max_size)

                    # Random position
                    x = random.randint(0, width - size)
                    y = random.randint(0, height - size)

                    # Draw square
                    mask[y:y+size, x:x+size] = 1.0

            elif cur_shape_type == 'circle':
                # Random circles
                num_circles = random.randint(1, 5)
                for _ in range(num_circles):
                    # Random radius (proportional to image dimensions)
                    max_radius = min(height, width) // 2
                    radius = random.randint(max_radius // 4, max_radius)

                    # Random center
                    center_x = random.randint(radius, width - radius)
                    center_y = random.randint(radius, height - radius)

                    # Draw circle
                    cv2.circle(mask, (center_x, center_y), radius, 1.0, -1)

            elif cur_shape_type == 'random':
                # Create a connected random polygon with cv2
                num_points = random.randint(5, 16)
                points = []

                # Generate random points
                for _ in range(num_points):
                    x = random.randint(0, width - 1)
                    y = random.randint(0, height - 1)
                    points.append([x, y])

                # Convert to numpy array for cv2
                points = np.array(points, dtype=np.int32)

                # Draw filled polygon
                cv2.fillPoly(mask, [points], 1.0)

            # Convert numpy mask to tensor and subtract from ones (invert the mask)
            masks[b, 0, f] = 1.0 - torch.from_numpy(mask).to(device=device, dtype=dtype)

    return masks


@torch.no_grad()
def extract_img_to_sketch(_sketch_model, _img, model_name="random"):
    """
    Extract a sketch from an image with the sketch pool. Returns a sketch in [-1, 1].
    """
    orig_shape = (_img.shape[-2], _img.shape[-1])
    with torch.amp.autocast(dtype=torch.float32, device_type="cuda"):
        reshaped_img = torch.nn.functional.interpolate(_img, (2048, 2048))
        sketch = _sketch_model(reshaped_img, model_name=model_name)
        sketch = torch.nn.functional.interpolate(sketch, orig_shape)
    if sketch.shape[1] == 1:
        sketch = sketch.repeat(1, 3, 1, 1)
    return sketch


def video_to_frame_and_sketch(
    sketch_model,
    original_video,
    max_num_preserved_sketch_frames=2,
    max_num_preserved_image_frames=1,
    min_num_preserved_sketch_frames=2,
    min_num_preserved_image_frames=1,
    model_name=None,
    detach_image_and_sketch=False,
    equally_spaced_preserve_sketch=False,
    apply_sketch_mask=False,
    sketch_mask_ratio=0.2,
    sketch_mask_shape=None,
    no_first_sketch: Union[bool, float] = False,
    video_clip_names=None,
    is_flux_sketch_available=None,
    is_evaluation=False,
):
    """
    Build the conditional inputs (preserved keyframe images and keyframe sketches) for a training video.

    Args:
        sketch_model: torch.nn.Module, a sketch pool for extracting sketches from images
        original_video: torch.Tensor, shape=(batch_size, num_channels, num_frames, height, width)
        max_num_preserved_sketch_frames: int, maximum number of preserved sketch frames
        max_num_preserved_image_frames: int, maximum number of preserved image frames
        min_num_preserved_sketch_frames: int, minimum number of preserved sketch frames
        min_num_preserved_image_frames: int, minimum number of preserved image frames
        model_name: str, name of the sketch model. If None, randomly select from ["lineart", "lineart_anime", "anime2sketch"]. Default: None.
        detach_image_and_sketch: bool, whether to return the sketch condition in separate tensors instead of merging it into the image condition. Default: False.
        equally_spaced_preserve_sketch: bool, whether to preserve sketches at equally spaced intervals. Default: False.
        apply_sketch_mask: bool, whether to apply random masking to sketch frames. Default: False.
        sketch_mask_ratio: float, probability of masking the preserved sketch frames (0-1). Default: 0.2.
        sketch_mask_shape: str, shape type for masking ('square', 'circle', 'random'). If None, randomly selected. Default: None.
        no_first_sketch: bool or float, drop the first-frame sketch; a float is treated as the probability of dropping it. Default: False.
        video_clip_names: list of clip names used to locate pre-computed FLUX sketches. Default: None.
        is_flux_sketch_available: list of bools indicating whether pre-computed FLUX sketches exist for the clip. Default: None.
        is_evaluation: bool, disables random sketch masking and always uses FLUX sketches when available. Default: False.
    Returns:
        masked_condition_video: torch.Tensor, shape=(batch_size, num_channels, num_frames, height, width); preserved image (and, if not detached, sketch) frames, all other frames zero
        preserved_condition_mask: torch.Tensor, shape=(batch_size, num_frames); 1.0 for preserved image frames, -1.0 for preserved sketch frames when not detached
        masked_condition_video_sketch: torch.Tensor or None, sketch counterpart of masked_condition_video when detach_image_and_sketch=True
        preserved_condition_mask_sketch: torch.Tensor or None, sketch counterpart of preserved_condition_mask when detach_image_and_sketch=True
        full_sketch_frames: torch.Tensor, shape=(batch_size, num_channels, num_frames, height, width)
        sketch_local_mask: torch.Tensor, shape=(batch_size, 1, num_frames, height, width), or None if apply_sketch_mask=False
        cur_model_name: str, name of the sketch model actually used
    """
    video_shape = original_video.shape
    video_dtype = original_video.dtype
    video_device = original_video.device

    if min_num_preserved_sketch_frames is None or min_num_preserved_sketch_frames < 2:
        min_num_preserved_sketch_frames = 2  # Minimum num: 2 (the first and the last)
    num_preserved_sketch_frames = random.randint(min_num_preserved_sketch_frames, max_num_preserved_sketch_frames)
    num_preserved_sketch_frames = min(num_preserved_sketch_frames, video_shape[2])

    # Clips with pre-computed FLUX sketches only provide the first and last sketch frames
    if video_clip_names is not None and is_flux_sketch_available is not None:
        if is_flux_sketch_available[0]:
            num_preserved_sketch_frames = 2

    if isinstance(no_first_sketch, float):
        no_first_sketch = random.random() < no_first_sketch

    if equally_spaced_preserve_sketch:
        preserved_sketch_indices = torch.linspace(0, video_shape[2] - 1, num_preserved_sketch_frames).long().tolist()
        if no_first_sketch:
            preserved_sketch_indices = preserved_sketch_indices[1:]
    else:
        # Always include the first and last frames (unless the first sketch is dropped)
        if no_first_sketch:
            preserved_sketch_indices = [video_shape[2] - 1]
        else:
            preserved_sketch_indices = [0, video_shape[2] - 1]
        # If we need more frames than just first and last
        if num_preserved_sketch_frames > 2 and video_shape[2] > 4:
            # Candidate set excludes the first, the last, and their adjacent frames
            candidates = set(range(2, video_shape[2] - 2))

            # Determine how many additional frames to select
            additional_frames_needed = min(num_preserved_sketch_frames - 2, len(candidates))

            # Keep selecting frames until we have enough or run out of candidates
            additional_indices = []
            while len(additional_indices) < additional_frames_needed and candidates:
                # Select a random candidate
                idx = random.choice(list(candidates))
                additional_indices.append(idx)

                # Remove the selected index and its neighbors from the candidates
                candidates.remove(idx)
                candidates.discard(idx - 1)
                candidates.discard(idx + 1)

            preserved_sketch_indices.extend(additional_indices)
            preserved_sketch_indices.sort()

    # The indices to preserve have been determined.
    # The code below relies only on these indices, not on the number of preserved frames.
    preserved_image_indices = [0]
    if max_num_preserved_image_frames is not None and max_num_preserved_image_frames > 1:
        max_num_preserved_image_frames -= 1
        if min_num_preserved_image_frames is None or min_num_preserved_image_frames < 1:
            min_num_preserved_image_frames = 1
        min_num_preserved_image_frames -= 1
        other_indices = torch.tensor([i for i in range(video_shape[2]) if i not in preserved_sketch_indices])
        max_num_preserved_image_frames = min(max_num_preserved_image_frames, len(other_indices))
        min_num_preserved_image_frames = min(min_num_preserved_image_frames, max_num_preserved_image_frames)
        num_preserved_image_frames = random.randint(min_num_preserved_image_frames, max_num_preserved_image_frames)
        other_indices = other_indices[torch.randperm(len(other_indices))]
        if num_preserved_image_frames > 0:
            preserved_image_indices.extend(other_indices[:num_preserved_image_frames].tolist())

    preserved_condition_mask = torch.zeros(size=(video_shape[0], video_shape[2]), dtype=video_dtype, device=video_device)  # [b, t]
    masked_condition_video = torch.zeros_like(original_video)   # [b, c, t, h, w]
    full_sketch_frames = torch.zeros_like(original_video)  # [b, c, t, h, w]

    if detach_image_and_sketch:
        preserved_condition_mask_sketch = torch.zeros_like(preserved_condition_mask)
        masked_condition_video_sketch = torch.zeros_like(masked_condition_video)
        if 0 not in preserved_sketch_indices and not no_first_sketch:
            preserved_sketch_indices.append(0)
    else:
        preserved_condition_mask_sketch = None
        masked_condition_video_sketch = None

    for _idx in preserved_image_indices:
        preserved_condition_mask[:, _idx] = 1.0
        masked_condition_video[:, :, _idx, :, :] = original_video[:, :, _idx, :, :]

    # Set up sketch_local_mask if masking is applied
    sketch_local_mask = None

    if apply_sketch_mask:
        # Create a full-sized mask initialized to all ones (unmasked)
        sketch_local_mask = torch.ones(
            video_shape[0], video_shape[2], video_shape[3], video_shape[4],
            device=video_device,
            dtype=video_dtype
        ).unsqueeze(1)  # Add channel dimension to get [b, 1, t, h, w]

        if not is_evaluation and random.random() < sketch_mask_ratio:
            # For preserved frames, apply random masking
            for i, frame_idx in enumerate(preserved_sketch_indices):
                if i == 0:
                    # The first preserved sketch frame is never masked
                    continue
                # Create masks only for preserved frames
                frame_masks = create_random_mask(
                    batch_size=video_shape[0],
                    num_frames=1,  # Just one frame at a time
                    height=video_shape[3],
                    width=video_shape[4],
                    device=video_device,
                    dtype=video_dtype,
                    shape_type=sketch_mask_shape
                )

                # Set the mask for this preserved frame
                sketch_local_mask[:, :, frame_idx:frame_idx+1, :, :] = frame_masks

    # Produce sketches for the preserved frames.
    # Sketches are either 1) computed with the sketch pool or 2) loaded from the FLUX sketch directory.
    if is_flux_sketch_available is not None and is_flux_sketch_available[0]:
        should_use_flux_sketch = random.random() < 0.75 if not is_evaluation else True
    else:
        should_use_flux_sketch = False

    if should_use_flux_sketch:
        cur_model_name = "flux"
    elif model_name is None:
        cur_model_name = random.choice(["lineart", "lineart_anime", "anime2sketch"])
    else:
        cur_model_name = model_name

    for _idx in preserved_sketch_indices:
        sketch_frame = None
        if should_use_flux_sketch:
            # Load the pre-computed FLUX sketch
            sketch_path = f"/group/40005/gzhiwang/iclora/linearts/{video_clip_names[0]}/{_idx}.lineart.png"
            print(f"Loading flux sketch from {sketch_path}...")
            if os.path.exists(sketch_path):
                sketch_frame = cv2.imread(sketch_path)
                sketch_frame = cv2.cvtColor(sketch_frame, cv2.COLOR_BGR2RGB)
                # Resize to the video resolution
                sketch_frame = cv2.resize(sketch_frame, (video_shape[4], video_shape[3]))
                sketch_frame = torch.from_numpy(sketch_frame).to(video_device, dtype=video_dtype)
                # Normalize to [-1, 1]
                sketch_frame = sketch_frame / 255.0 * 2.0 - 1.0
                sketch_frame = sketch_frame.permute(2, 0, 1)
                sketch_frame = sketch_frame.unsqueeze(0)
            else:
                print(f"FLUX Sketch path {sketch_path} does not exist. Falling back to sketch pool.")
        if sketch_frame is None:
            # Compute the sketch with the sketch pool
            sketch_frame = extract_img_to_sketch(
                    sketch_model, original_video[:, :, _idx, :, :].float(),
                    model_name=cur_model_name).to(video_device, dtype=video_dtype)
        # Convert the white background (from the sketch pool or FLUX sketch files) to a black background (for training)
        sketch_frame = -torch.clip(sketch_frame, -1, 1)
        full_sketch_frames[:, :, _idx, :, :] = sketch_frame

    if len(preserved_sketch_indices) > 0:
        _mask_to_add = preserved_condition_mask_sketch if detach_image_and_sketch else preserved_condition_mask
        _video_to_add = masked_condition_video_sketch if detach_image_and_sketch else masked_condition_video
        if not detach_image_and_sketch:
            # The first-frame sketch is not merged into the image condition
            preserved_sketch_indices = preserved_sketch_indices[1:]

        # Apply masking to sketch frames if required
        if apply_sketch_mask and sketch_local_mask is not None:
            # sketch_local_mask: [b, 1, t, h, w]
            for _idx in preserved_sketch_indices:
                _mask_to_add[:, _idx] = 1.0 if detach_image_and_sketch else -1.0
                _video_to_add[:, :, _idx, :, :] = torch.where(sketch_local_mask[:, 0:1, _idx, :, :] == 0, -1.0, full_sketch_frames[:, :, _idx, :, :])
        else:
            for _idx in preserved_sketch_indices:
                _mask_to_add[:, _idx] = 1.0 if detach_image_and_sketch else -1.0
                _video_to_add[:, :, _idx, :, :] = full_sketch_frames[:, :, _idx, :, :]

    return masked_condition_video, preserved_condition_mask, masked_condition_video_sketch, preserved_condition_mask_sketch, full_sketch_frames, sketch_local_mask, cur_model_name
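As a rough illustration of how these helpers fit together at training time, the sketch below runs video_to_frame_and_sketch on a random clip with a stand-in sketch network. The real pipeline plugs in the repository's sketch pool, so the stub class, tensor sizes, and printed fields here are assumptions; extract_img_to_sketch also expects a CUDA-capable environment because of its autocast call.

    # Hypothetical usage sketch; _GraySketchStub stands in for the repo's sketch pool model.
    import torch
    from util.training_util import create_random_mask, video_to_frame_and_sketch

    class _GraySketchStub(torch.nn.Module):
        def forward(self, img, model_name=None):
            # Fake "sketch": average the RGB channels, staying in [-1, 1].
            return img.mean(dim=1, keepdim=True)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    video = torch.rand(1, 3, 16, 64, 64, device=device) * 2 - 1  # [b, c, t, h, w] in [-1, 1]
    (cond_video, cond_mask, cond_video_sketch, cond_mask_sketch,
     sketches, sketch_mask, used_model) = video_to_frame_and_sketch(
        _GraySketchStub().to(device), video,
        max_num_preserved_sketch_frames=3,
        apply_sketch_mask=True,
    )
    print(cond_video.shape, used_model, cond_mask[0].tolist())

    # create_random_mask can also be used on its own; zeros mark the masked pixels.
    masks = create_random_mask(1, 4, 64, 64, device=video.device, dtype=video.dtype)
    print(masks.shape)  # torch.Size([1, 1, 4, 64, 64])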