alex committed on
Commit e37991a · 1 Parent(s): 2c8ec61

progress bar fixed

Files changed (3)
  1. app.py +15 -7
  2. humo/generate.py +9 -3
  3. humo/generate_1_7B.py +326 -46
app.py CHANGED
@@ -5,7 +5,7 @@ import os
import subprocess
import uuid
import shutil
-
+ from tqdm import tqdm


from huggingface_hub import snapshot_download, list_repo_files, hf_hub_download
@@ -93,7 +93,6 @@ config = load_config(
)
runner = create_object(config)

-
os.environ.setdefault("TORCHINDUCTOR_CACHE_DIR", f"{os.getcwd()}/torchinductor_space") # or another writable path

def restore_inductor_cache_from_hub(repo_id: str, filename: str = "torch_compile_cache.zip",
@@ -110,7 +109,7 @@ def restore_inductor_cache_from_hub(repo_id: str, filename: str = "torch_compile
# restore_inductor_cache_from_hub("alexnasa/humo-compiled")


- def get_duration(prompt_text, steps, image_file, audio_file_path, max_duration, session_id):
+ def get_duration(prompt_text, steps, image_file, audio_file_path, max_duration, session_id, progress):

    return calculate_required_time(steps, max_duration)

@@ -124,6 +123,15 @@ def calculate_required_time(steps, max_duration):
        70: 13,
        95: 21,
    }
+
+     # Humo 1.7
+     # max_duration_duration_mapping = {
+     #     20: 2,
+     #     45: 2,
+     #     70: 5,
+     #     95: 6,
+     # }
+
    each_step_s = max_duration_duration_mapping[max_duration]
    duration_s = (each_step_s * steps) + warmup_s

@@ -143,7 +151,7 @@ def update_required_time(steps, max_duration):
    return get_required_time_string(steps, max_duration)


- def generate_scene(prompt_text, steps, image_paths, audio_file_path, max_duration = 3, session_id = None):
+ def generate_scene(prompt_text, steps, image_paths, audio_file_path, max_duration = 3, session_id = None, progress=gr.Progress(),):

    prompt_text_check = (prompt_text or "").strip()
    if not prompt_text_check:
@@ -152,7 +160,7 @@ def generate_scene(prompt_text, steps, image_paths, audio_file_path, max_duratio
    if not audio_file_path and not image_paths:
        raise gr.Error("Please provide a reference image or a lipsync audio.")

-     return run_pipeline(prompt_text, steps, image_paths, audio_file_path, max_duration, session_id)
+     return run_pipeline(prompt_text, steps, image_paths, audio_file_path, max_duration, session_id, progress)

def upload_inductor_cache_to_hub(
    repo_id: str,
@@ -206,7 +214,7 @@ def upload_inductor_cache_to_hub(


@spaces.GPU(duration=get_duration)
- def run_pipeline(prompt_text, steps, image_paths, audio_file_path, max_duration = 3, session_id = None):
+ def run_pipeline(prompt_text, steps, image_paths, audio_file_path, max_duration = 3, session_id = None, progress=gr.Progress(),):

    if session_id is None:
        session_id = uuid.uuid4().hex
@@ -267,7 +275,6 @@ def run_pipeline(prompt_text, steps, image_paths, audio_file_path, max_duration
    width, height = 832, 480


-     # Run inference
    runner.inference_loop(
        prompt_text,
        img_paths,
@@ -280,6 +287,7 @@ def run_pipeline(prompt_text, steps, image_paths, audio_file_path, max_duration
        steps,
        frames = int(max_duration),
        tea_cache_l1_thresh = 0.0,
+         progress_bar_cmd=progress
    )

    # Return resulting video path
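
Note: the app.py change wires a Gradio progress tracker through the call chain. `generate_scene` and `run_pipeline` now take `progress=gr.Progress()`, and the tracker is handed to the runner as `progress_bar_cmd`. A minimal standalone sketch of the mechanism (a toy example, not this Space's code; the function and component names are illustrative):

```python
# Minimal sketch: Gradio injects a live tracker for a `progress=gr.Progress()`
# default argument, and progress.tqdm() mirrors tqdm while updating the UI.
import time
import gradio as gr

def slow_task(steps, progress=gr.Progress()):
    for _ in progress.tqdm(range(int(steps)), desc="denoising"):
        time.sleep(0.1)  # stand-in for one diffusion step
    return f"ran {int(steps)} steps"

demo = gr.Interface(fn=slow_task, inputs=gr.Number(value=10), outputs="text")

if __name__ == "__main__":
    demo.queue().launch()
```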
humo/generate.py CHANGED
@@ -680,6 +680,7 @@ class Generator():
                  n_prompt="",
                  seed=-1,
                  tea_cache_l1_thresh = 0.0,
+                   progress_bar_cmd = None,
                  device = get_device(),
                  ):

@@ -796,8 +797,11 @@ class Generator():
        arg_tia = {'seq_len': seq_len, 'audio': audio_emb, 'y': y_c, 'context': context, "tea_cache": TeaCache(sampling_steps, rel_l1_thresh=tea_cache_l1_thresh, model_id=tea_cache_model_id) if tea_cache_l1_thresh is not None and tea_cache_l1_thresh > 0 else None}

        torch.cuda.empty_cache()
+
+         total_steps = len(timesteps)
+
        # self.dit.to(device=get_device())
-         for _, t in enumerate(tqdm(timesteps)):
+         for i, t in progress_bar_cmd.tqdm(enumerate(timesteps), desc=f"/{total_steps} Steps"):
            timestep = [t]
            timestep = torch.stack(timestep)

@@ -823,6 +827,7 @@ class Generator():
            del timestep
            torch.cuda.empty_cache()

+
        x0 = latents
        x0 = [x0_[:,:-latents_ref[0].shape[1]] for x0_ in x0]

@@ -848,7 +853,7 @@ class Generator():
        return videos[0] # if get_local_rank() == 0 else None


-     def inference_loop(self, prompt, ref_img_path, audio_path, output_dir, filename, inference_mode = "TIA", width = 832, height = 480, steps=50, frames = 97, tea_cache_l1_thresh = 0.0, seed = 0):
+     def inference_loop(self, prompt, ref_img_path, audio_path, output_dir, filename, inference_mode = "TIA", width = 832, height = 480, steps=50, frames = 97, tea_cache_l1_thresh = 0.0, progress_bar_cmd = None, seed = 0):

        video = self.inference(
            prompt,
@@ -861,7 +866,8 @@ class Generator():
            sampling_steps=steps,
            inference_mode = inference_mode,
            tea_cache_l1_thresh = tea_cache_l1_thresh,
-             seed=seed
+             seed=seed,
+             progress_bar_cmd = progress_bar_cmd
        )

        torch.cuda.empty_cache()
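
Note: the rewritten sampling loop calls `progress_bar_cmd.tqdm(...)` directly, so the `None` default added to `inference()` only works when a tracker is actually passed in (as app.py now does); calling it without one would raise `AttributeError`. A possible None-safe guard, sketched here as an illustration rather than as part of this commit:

```python
# Hedged sketch: fall back to plain tqdm when no Gradio tracker is supplied.
# `progress_bar_cmd` mirrors the parameter introduced in this commit.
from tqdm import tqdm

def iter_steps(timesteps, progress_bar_cmd=None, desc="Steps"):
    if progress_bar_cmd is not None:  # e.g. a gr.Progress() tracker
        return progress_bar_cmd.tqdm(enumerate(timesteps), desc=desc)
    return enumerate(tqdm(timesteps, desc=desc))  # CLI fallback
```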
humo/generate_1_7B.py CHANGED
@@ -18,6 +18,7 @@ import gc
import random
import sys
import mediapy
+ import numpy as np
import torch
import torch.distributed as dist
from omegaconf import DictConfig, ListConfig, OmegaConf
@@ -59,7 +60,15 @@ import torch.cuda.amp as amp
from humo.models.utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
from humo.utils.audio_processor_whisper import AudioProcessor
from humo.utils.wav2vec import linear_interpolation_fps
+ from torchao.quantization import quantize_

+ import torch._dynamo as dynamo
+ dynamo.config.capture_scalar_outputs = True
+ torch.set_float32_matmul_precision("high")
+
+ import torch
+ import torch.nn as nn
+ import transformer_engine.pytorch as te

image_transform = Compose([
    ToTensor(),
@@ -96,14 +105,130 @@ def clever_format(nums, format="%.2f"):
    return clever_nums


+
+ # --- put near your imports ---
+ import torch
+ import torch.nn as nn
+ import contextlib
+ import transformer_engine.pytorch as te
+
+ # FP8 autocast compatibility for different TE versions
+ try:
+     # Preferred modern API
+     from transformer_engine.pytorch import fp8_autocast
+     try:
+         # Newer TE: use recipe-based API
+         from transformer_engine.common.recipe import DelayedScaling, Format
+         def make_fp8_ctx(enabled: bool = True):
+             if not enabled:
+                 return contextlib.nullcontext()
+             fp8_recipe = DelayedScaling(fp8_format=Format.E4M3)  # E4M3 format
+             return fp8_autocast(enabled=True, fp8_recipe=fp8_recipe)
+     except Exception:
+         # Very old variant that might still accept fp8_format directly
+         def make_fp8_ctx(enabled: bool = True):
+             # If TE doesn't have FP8Format, just no-op
+             if not hasattr(te, "FP8Format"):
+                 return contextlib.nullcontext()
+             return te.fp8_autocast(enabled=enabled, fp8_format=te.FP8Format.E4M3)
+ except Exception:
+     # TE not present or totally incompatible — no-op
+     def make_fp8_ctx(enabled: bool = True):
+         return contextlib.nullcontext()
+
+
+ # TE sometimes exposes Linear at different paths; this normalizes it.
+ try:
+     TELinear = te.Linear
+ except AttributeError:  # very old layouts
+     from transformer_engine.pytorch.modules.linear import Linear as TELinear  # type: ignore
+
+ # --- near imports ---
+ import torch
+ import torch.nn as nn
+ import transformer_engine.pytorch as te
+
+ try:
+     TELinear = te.Linear
+ except AttributeError:
+     from transformer_engine.pytorch.modules.linear import Linear as TELinear  # type: ignore
+
+ import torch
+ import torch.nn as nn
+ import transformer_engine.pytorch as te
+
+ try:
+     TELinear = te.Linear
+ except AttributeError:
+     from transformer_engine.pytorch.modules.linear import Linear as TELinear  # type: ignore
+
+ def _default_te_allow(fullname: str, lin: nn.Linear) -> bool:
+     """
+     Allow TE only where it's shape-safe & beneficial.
+     Skip small/special layers (time/timestep/pos embeds, heads).
+     Enforce multiples of 16 for in/out features (FP8 kernel friendly).
+     Also skip very small projections likely to see M=1.
+     """
+     blocked_keywords = (
+         "time_embedding", "timestep", "time_embed",
+         "time_projection", "pos_embedding", "pos_embed",
+         "to_logits", "logits", "final_proj", "proj_out", "output_projection",
+     )
+     if any(k in fullname for k in blocked_keywords):
+         return False
+
+     # TE FP8 kernels like K, N divisible by 16
+     if lin.in_features % 16 != 0 or lin.out_features % 16 != 0:
+         return False
+
+     # Heuristic: avoid tiny layers; keeps attention/MLP, skips small MLPs
+     if lin.in_features < 512 or lin.out_features < 512:
+         return False
+
+     # Whitelist: only convert inside transformer blocks if you know their prefix
+     # This further reduces risk of catching special heads elsewhere.
+     allowed_context = ("blocks", "layers", "transformer", "attn", "mlp", "ffn")
+     if not any(tok in fullname for tok in allowed_context):
+         return False
+
+     return True
+
+ @torch.no_grad()
+ def convert_linears_to_te_fp8(module: nn.Module, allow_pred=_default_te_allow, _prefix=""):
+     for name, child in list(module.named_children()):
+         full = f"{_prefix}.{name}" if _prefix else name
+         convert_linears_to_te_fp8(child, allow_pred, full)
+
+         if isinstance(child, nn.Linear):
+             if allow_pred is not None and not allow_pred(full, child):
+                 continue
+
+             te_lin = TELinear(
+                 in_features=child.in_features,
+                 out_features=child.out_features,
+                 bias=(child.bias is not None),
+                 params_dtype=torch.bfloat16,
+             ).to(child.weight.device)
+
+             te_lin.weight.copy_(child.weight.to(te_lin.weight.dtype))
+             if child.bias is not None:
+                 te_lin.bias.copy_(child.bias.to(te_lin.bias.dtype))
+
+             setattr(module, name, te_lin)
+     return module
+
class Generator():
    def __init__(self, config: DictConfig):
        self.config = config.copy()
        OmegaConf.set_readonly(self.config, True)
        self.logger = get_logger(self.__class__.__name__)
-         self.configure_models()

        # init_torch(cudnn_benchmark=False)
+         self.configure_models()
+
+     def entrypoint(self):
+
+         self.inference_loop()

    def get_fsdp_sharding_config(self, sharding_strategy, device_mesh_config):
        device_mesh = None
@@ -115,43 +240,63 @@ class Generator():
            device_mesh = init_device_mesh("cuda", tuple(device_mesh_config))
        return device_mesh, fsdp_strategy

+
    def configure_models(self):
-         self.configure_dit_model(device="cpu")
-         self.configure_vae_model()
+         self.configure_dit_model(device="cuda")
+
+         self.dit.eval().to("cuda")
+         convert_linears_to_te_fp8(self.dit)
+
+         self.dit = torch.compile(self.dit, )
+
+
+         self.configure_vae_model(device="cuda")
        if self.config.generation.get('extract_audio_feat', False):
            self.configure_wav2vec(device="cpu")
-         self.configure_text_model(device="cpu")
+         self.configure_text_model(device="cuda")
+
+         # # Initialize fsdp.
+         # self.configure_dit_fsdp_model()
+         # self.configure_text_fsdp_model()
+
+         # quantize_(self.text_encoder, Int8WeightOnlyConfig())
+         # quantize_(self.dit, Float8DynamicActivationFloat8WeightConfig())

-         # Initialize fsdp.
-         self.configure_dit_fsdp_model()
-         self.configure_text_fsdp_model()

    def configure_dit_model(self, device=get_device()):

        init_unified_parallel(self.config.dit.sp_size)
        self.sp_size = get_unified_parallel_world_size()
-
-         # Create dit model.
+
+         # Create DiT model on meta, then mark dtype as bfloat16 (no real allocation yet).
        init_device = "meta"
        with torch.device(init_device):
            self.dit = create_object(self.config.dit.model)
+         self.dit = self.dit.to(dtype=torch.bfloat16)  # or: self.dit.bfloat16()
        self.logger.info(f"Load DiT model on {init_device}.")
        self.dit.eval().requires_grad_(False)

        # Load dit checkpoint.
        path = self.config.dit.checkpoint_dir
+
+         def _cast_state_dict_to_bf16(state):
+             for k, v in state.items():
+                 if isinstance(v, torch.Tensor) and v.is_floating_point():
+                     state[k] = v.to(dtype=torch.bfloat16, copy=False)
+             return state
+
        if path.endswith(".pth"):
-             state = torch.load(path, map_location=device, mmap=True)
+             # Load to CPU first; we’ll move the model later.
+             state = torch.load(path, map_location="cpu", mmap=True)
+             state = _cast_state_dict_to_bf16(state)
            missing_keys, unexpected_keys = self.dit.load_state_dict(state, strict=False, assign=True)
            self.logger.info(
-                 f"dit loaded from {path}. "
-                 f"Missing keys: {len(missing_keys)}, "
-                 f"Unexpected keys: {len(unexpected_keys)}"
+                 f"dit loaded from {path}. Missing keys: {len(missing_keys)}, Unexpected keys: {len(unexpected_keys)}"
            )
        else:
            from safetensors.torch import load_file
            import json
-             def load_custom_sharded_weights(model_dir, base_name, device=device):
+             def load_custom_sharded_weights(model_dir, base_name):
                index_path = f"{model_dir}/{base_name}.safetensors.index.json"
                with open(index_path, "r") as f:
                    index = json.load(f)
@@ -160,23 +305,28 @@ class Generator():
                state_dict = {}
                for shard_file in shard_files:
                    shard_path = f"{model_dir}/{shard_file}"
-                     shard_state = load_file(shard_path)
-                     shard_state = {k: v.to(device) for k, v in shard_state.items()}
+                     # Load on CPU, then cast to bf16; we’ll move the whole module later.
+                     shard_state = load_file(shard_path, device="cpu")
+                     shard_state = {k: (v.to(dtype=torch.bfloat16, copy=False) if v.is_floating_point() else v)
+                                    for k, v in shard_state.items()}
                    state_dict.update(shard_state)
                return state_dict
-             state = load_custom_sharded_weights(path, 'humo', device)
+
+             state = load_custom_sharded_weights(path, 'humo')
            self.dit.load_state_dict(state, strict=False, assign=True)
-
+
        self.dit = meta_non_persistent_buffer_init_fn(self.dit)
-         if device in [get_device(), "cuda"]:
-             self.dit.to(get_device())
+
+         target_device = get_device() if device in [get_device(), "cuda"] else device
+         self.dit.to(target_device)  # dtype already bf16

        # Print model size.
        params = sum(p.numel() for p in self.dit.parameters())
        self.logger.info(
            f"[RANK:{get_global_rank()}] DiT Parameters: {clever_format(params, '%.3f')}"
        )
-
+
+
    def configure_vae_model(self, device=get_device()):
        self.vae_stride = self.config.vae.vae_stride
        self.vae = WanVAE(
@@ -216,15 +366,93 @@ class Generator():


    def configure_dit_fsdp_model(self):
-         self.dit.to(get_device())
-
-         return
+         from humo.models.wan_modules.model_humo import WanAttentionBlock
+
+         dit_blocks = (WanAttentionBlock,)
+
+         # Init model_shard_cpu_group for saving checkpoint with sharded state_dict.
+         init_model_shard_cpu_group(
+             self.config.dit.fsdp.sharding_strategy,
+             self.config.dit.fsdp.get("device_mesh", None),
+         )
+
+         # Assert that dit has wrappable blocks.
+         assert any(isinstance(m, dit_blocks) for m in self.dit.modules())
+
+         # Define wrap policy on all dit blocks.
+         def custom_auto_wrap_policy(module, recurse, *args, **kwargs):
+             return recurse or isinstance(module, dit_blocks)
+
+         # Configure FSDP settings.
+         device_mesh, fsdp_strategy = self.get_fsdp_sharding_config(
+             self.config.dit.fsdp.sharding_strategy,
+             self.config.dit.fsdp.get("device_mesh", None),
+         )
+         settings = dict(
+             auto_wrap_policy=custom_auto_wrap_policy,
+             sharding_strategy=fsdp_strategy,
+             backward_prefetch=BackwardPrefetch.BACKWARD_PRE,
+             device_id=get_local_rank(),
+             use_orig_params=False,
+             sync_module_states=True,
+             forward_prefetch=True,
+             limit_all_gathers=False,  # False for ZERO2.
+             mixed_precision=MixedPrecision(
+                 param_dtype=torch.bfloat16,
+                 reduce_dtype=torch.float32,
+                 buffer_dtype=torch.float32,
+             ),
+             device_mesh=device_mesh,
+             param_init_fn=meta_param_init_fn,
+         )
+
+         # Apply FSDP.
+         self.dit = FullyShardedDataParallel(self.dit, **settings)
+         # self.dit.to(get_device())


    def configure_text_fsdp_model(self):
-         self.text_encoder.to(get_device())
-
-         return
+         # If FSDP is not enabled, put text_encoder to GPU and return.
+         if not self.config.text.fsdp.enabled:
+             self.text_encoder.to(get_device())
+             return
+
+         # from transformers.models.t5.modeling_t5 import T5Block
+         from humo.models.wan_modules.t5 import T5SelfAttention
+
+         text_blocks = (torch.nn.Embedding, T5SelfAttention)
+         # text_blocks_names = ("QWenBlock", "QWenModel")  # QWen cannot be imported. Use str.
+
+         def custom_auto_wrap_policy(module, recurse, *args, **kwargs):
+             return (
+                 recurse
+                 or isinstance(module, text_blocks)
+             )
+
+         # Apply FSDP.
+         text_encoder_dtype = getattr(torch, self.config.text.dtype)
+         device_mesh, fsdp_strategy = self.get_fsdp_sharding_config(
+             self.config.text.fsdp.sharding_strategy,
+             self.config.text.fsdp.get("device_mesh", None),
+         )
+         self.text_encoder = FullyShardedDataParallel(
+             module=self.text_encoder,
+             auto_wrap_policy=custom_auto_wrap_policy,
+             sharding_strategy=fsdp_strategy,
+             backward_prefetch=BackwardPrefetch.BACKWARD_PRE,
+             device_id=get_local_rank(),
+             use_orig_params=False,
+             sync_module_states=False,
+             forward_prefetch=True,
+             limit_all_gathers=True,
+             mixed_precision=MixedPrecision(
+                 param_dtype=text_encoder_dtype,
+                 reduce_dtype=text_encoder_dtype,
+                 buffer_dtype=text_encoder_dtype,
+             ),
+             device_mesh=device_mesh,
+         )
+         self.text_encoder.to(get_device()).requires_grad_(False)


    def load_image_latent_ref_id(self, path: str, size, device):
@@ -390,7 +618,6 @@ class Generator():
            neg

        return noise_pred
-

    @torch.no_grad()
    def inference(self,
@@ -401,20 +628,22 @@ class Generator():
                  frame_num=81,
                  shift=5.0,
                  sample_solver='unipc',
+                   inference_mode='TIA',
                  sampling_steps=50,
                  n_prompt="",
                  seed=-1,
-                   offload_model=True,
+                   tea_cache_l1_thresh = 0.0,
                  device = get_device(),
                  ):

-         self.vae.model.to(device=device)
+         # self.vae.model.to(device=device)
        if img_path is not None:
            latents_ref = self.load_image_latent_ref_id(img_path, size, device)
        else:
            latents_ref = [torch.zeros(16, 1, size[1]//8, size[0]//8).to(device)]

-         self.vae.model.to(device="cpu")
+         # self.vae.model.to(device="cpu")
+
        latents_ref_neg = [torch.zeros_like(latent_ref) for latent_ref in latents_ref]

        # audio
@@ -456,10 +685,10 @@ class Generator():
        seed_g = torch.Generator(device=device)
        seed_g.manual_seed(seed)

-         self.text_encoder.model.to(device)
+         # self.text_encoder.model.to(device)
        context = self.text_encoder([input_prompt], device)
        context_null = self.text_encoder([n_prompt], device)
-         self.text_encoder.model.cpu()
+         # self.text_encoder.model.cpu()

        noise = [
            torch.randn(
@@ -477,10 +706,9 @@ class Generator():
                yield

        no_sync = getattr(self.dit, 'no_sync', noop_no_sync)
-         # step_change = self.config.generation.step_change # 980

        # evaluation mode
-         with amp.autocast(dtype=torch.bfloat16), torch.no_grad(), no_sync():
+         with make_fp8_ctx(True), torch.autocast('cuda', dtype=torch.bfloat16), torch.no_grad(), no_sync():

            if sample_solver == 'unipc':
                sample_scheduler = FlowUniPCMultistepScheduler(
@@ -500,7 +728,7 @@ class Generator():
            arg_null = {'context': context_null, 'seq_len': seq_len, 'audio': audio_emb_neg}

            torch.cuda.empty_cache()
-             self.dit.to(device=get_device())
+
            for _, t in enumerate(tqdm(timesteps)):
                timestep = [t]
                timestep = torch.stack(timestep)
@@ -527,12 +755,13 @@ class Generator():
            x0 = [x0_[:,:-latents_ref[0].shape[1]] for x0_ in x0]

            # if offload_model:
-             self.dit.cpu()
+             # self.dit.cpu()
+
            torch.cuda.empty_cache()
            # if get_local_rank() == 0:
-             self.vae.model.to(device=device)
+             # self.vae.model.to(device=device)
            videos = self.vae.decode(x0)
-             self.vae.model.to(device="cpu")
+             # self.vae.model.to(device="cpu")

            del noise, latents, noise_pred
            del audio_emb, audio_emb_neg, latents_ref, latents_ref_neg, context, context_null
@@ -547,8 +776,7 @@ class Generator():
        return videos[0] # if get_local_rank() == 0 else None


-     def inference_loop(self, prompt, ref_img_path, audio_path, output_dir, filename, width = 832, height = 480, steps=50, frames = 97, seed = 0):
-         print(f'ref_img_path:{ref_img_path}')
+     def inference_loop(self, prompt, ref_img_path, audio_path, output_dir, filename, inference_mode = "TIA", width = 832, height = 480, steps=50, frames = 97, tea_cache_l1_thresh = 0.0, seed = 0):

        video = self.inference(
            prompt,
@@ -559,14 +787,14 @@ class Generator():
            shift=self.config.diffusion.timesteps.sampling.shift,
            sample_solver='unipc',
            sampling_steps=steps,
-             seed=seed,
-             offload_model=False,
+             inference_mode = inference_mode,
+             tea_cache_l1_thresh = tea_cache_l1_thresh,
+             seed=seed
        )

        torch.cuda.empty_cache()
        gc.collect()

-
        # Save samples.
        if get_sequence_parallel_rank() == 0:
            pathname = self.save_sample(
@@ -580,7 +808,6 @@ class Generator():
        del video, prompt
        torch.cuda.empty_cache()
        gc.collect()
-


    def save_sample(self, *, sample: torch.Tensor, audio_path: str, output_dir: str, filename: str):
@@ -619,4 +846,57 @@ class Generator():
            raise NotImplementedError
        assert isinstance(pos_prompts, ListConfig)

-         return pos_prompts
+         return pos_prompts
+
+ class TeaCache:
+     def __init__(self, num_inference_steps, rel_l1_thresh, model_id):
+         self.num_inference_steps = num_inference_steps
+         self.step = 0
+         self.accumulated_rel_l1_distance = 0
+         self.previous_modulated_input = None
+         self.rel_l1_thresh = rel_l1_thresh
+         self.previous_residual = None
+         self.previous_hidden_states = None
+
+         self.coefficients_dict = {
+             "Wan2.1-T2V-1.3B": [-5.21862437e+04, 9.23041404e+03, -5.28275948e+02, 1.36987616e+01, -4.99875664e-02],
+             "Wan2.1-T2V-14B": [-3.03318725e+05, 4.90537029e+04, -2.65530556e+03, 5.87365115e+01, -3.15583525e-01],
+             "Wan2.1-I2V-14B-480P": [2.57151496e+05, -3.54229917e+04, 1.40286849e+03, -1.35890334e+01, 1.32517977e-01],
+             "Wan2.1-I2V-14B-720P": [ 8.10705460e+03, 2.13393892e+03, -3.72934672e+02, 1.66203073e+01, -4.17769401e-02],
+         }
+         if model_id not in self.coefficients_dict:
+             supported_model_ids = ", ".join([i for i in self.coefficients_dict])
+             raise ValueError(f"{model_id} is not a supported TeaCache model id. Please choose a valid model id in ({supported_model_ids}).")
+         self.coefficients = self.coefficients_dict[model_id]
+
+     def check(self, dit, x, t_mod):
+         modulated_inp = t_mod.clone()
+         if self.step == 0 or self.step == self.num_inference_steps - 1:
+             should_calc = True
+             self.accumulated_rel_l1_distance = 0
+         else:
+             coefficients = self.coefficients
+             rescale_func = np.poly1d(coefficients)
+             self.accumulated_rel_l1_distance += rescale_func(((modulated_inp-self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean()).cpu().item())
+             if self.accumulated_rel_l1_distance < self.rel_l1_thresh:
+                 should_calc = False
+             else:
+                 should_calc = True
+                 self.accumulated_rel_l1_distance = 0
+         self.previous_modulated_input = modulated_inp
+         self.step += 1
+         if self.step == self.num_inference_steps:
+             self.step = 0
+         if should_calc:
+             self.previous_hidden_states = x.clone()
+         return not should_calc
+
+     def store(self, hidden_states):
+         if self.previous_hidden_states is None:
+             return
+         self.previous_residual = hidden_states - self.previous_hidden_states
+         self.previous_hidden_states = None
+
+     def update(self, hidden_states):
+         hidden_states = hidden_states + self.previous_residual
+         return hidden_states
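
Note: for reference, a hedged usage sketch of the helpers added in this file. `convert_linears_to_te_fp8` swaps eligible `nn.Linear` layers for Transformer Engine linears, and `make_fp8_ctx` wraps the forward pass in FP8 autocast. It assumes `transformer_engine` is installed and an FP8-capable GPU is available; the toy module and layer names are illustrative only:

```python
# Toy module: "attn" is large and inside an allowed context, so it gets
# converted; "proj_out" is blocked by _default_te_allow and stays nn.Linear.
import torch
import torch.nn as nn

class ToyBlock(nn.Module):
    def __init__(self, dim=1024):
        super().__init__()
        self.attn = nn.Linear(dim, dim)
        self.proj_out = nn.Linear(dim, 16)

    def forward(self, x):
        return self.proj_out(self.attn(x))

model = ToyBlock().to("cuda", dtype=torch.bfloat16).eval()
convert_linears_to_te_fp8(model)  # swaps only the eligible Linear

x = torch.randn(32, 1024, device="cuda", dtype=torch.bfloat16)
with make_fp8_ctx(True), torch.no_grad():
    y = model(x)  # eligible matmuls run through TE FP8 kernels
print(y.shape)
```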