EasyAnimate

Running

EasyAnimate / easyanimate /vae /ldm /models /cogvideox_enc_dec.py

bubbliiiing

Update V5

f62c8b9 about 1 year ago

12.3 kB

	# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team.
	# All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	from typing import Optional, Tuple

	import numpy as np
	import torch
	import torch.nn as nn
	from diffusers.models.autoencoders.autoencoder_kl_cogvideox import (
	CogVideoXCausalConv3d, CogVideoXDownBlock3D, CogVideoXMidBlock3D,
	CogVideoXSafeConv3d, CogVideoXSpatialNorm3D, CogVideoXUpBlock3D)
	from diffusers.utils import logging

	logger = logging.get_logger(__name__) # pylint: disable=invalid-name


	class CogVideoXEncoder3D(nn.Module):
	r"""
	The `CogVideoXEncoder3D` layer of a variational autoencoder that encodes its input into a latent representation.

	Args:
	in_channels (`int`, optional, defaults to 3):
	The number of input channels.
	out_channels (`int`, optional, defaults to 3):
	The number of output channels.
	down_block_types (`Tuple[str, ...]`, optional, defaults to `("DownEncoderBlock2D",)`):
	The types of down blocks to use. See `~diffusers.models.unet_2d_blocks.get_down_block` for available
	options.
	block_out_channels (`Tuple[int, ...]`, optional, defaults to `(64,)`):
	The number of output channels for each block.
	act_fn (`str`, optional, defaults to `"silu"`):
	The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
	layers_per_block (`int`, optional, defaults to 2):
	The number of layers per block.
	norm_num_groups (`int`, optional, defaults to 32):
	The number of groups for normalization.
	"""

	_supports_gradient_checkpointing = True

	def __init__(
	self,
	in_channels: int = 3,
	out_channels: int = 16,
	down_block_types: Tuple[str, ...] = (
	"CogVideoXDownBlock3D",
	"CogVideoXDownBlock3D",
	"CogVideoXDownBlock3D",
	"CogVideoXDownBlock3D",
	),
	block_out_channels: Tuple[int, ...] = (128, 256, 256, 512),
	layers_per_block: int = 3,
	act_fn: str = "silu",
	norm_eps: float = 1e-6,
	norm_num_groups: int = 32,
	dropout: float = 0.0,
	pad_mode: str = "first",
	temporal_compression_ratio: float = 4,
	):
	super().__init__()

	# log2 of temporal_compress_times
	temporal_compress_level = int(np.log2(temporal_compression_ratio))

	self.conv_in = CogVideoXCausalConv3d(in_channels, block_out_channels[0], kernel_size=3, pad_mode=pad_mode)
	self.down_blocks = nn.ModuleList([])

	# down blocks
	output_channel = block_out_channels[0]
	for i, down_block_type in enumerate(down_block_types):
	input_channel = output_channel
	output_channel = block_out_channels[i]
	is_final_block = i == len(block_out_channels) - 1
	compress_time = i < temporal_compress_level

	if down_block_type == "CogVideoXDownBlock3D":
	down_block = CogVideoXDownBlock3D(
	in_channels=input_channel,
	out_channels=output_channel,
	temb_channels=0,
	dropout=dropout,
	num_layers=layers_per_block,
	resnet_eps=norm_eps,
	resnet_act_fn=act_fn,
	resnet_groups=norm_num_groups,
	add_downsample=not is_final_block,
	compress_time=compress_time,
	)
	else:
	raise ValueError("Invalid `down_block_type` encountered. Must be `CogVideoXDownBlock3D`")

	self.down_blocks.append(down_block)

	# mid block
	self.mid_block = CogVideoXMidBlock3D(
	in_channels=block_out_channels[-1],
	temb_channels=0,
	dropout=dropout,
	num_layers=2,
	resnet_eps=norm_eps,
	resnet_act_fn=act_fn,
	resnet_groups=norm_num_groups,
	pad_mode=pad_mode,
	)

	self.norm_out = nn.GroupNorm(norm_num_groups, block_out_channels[-1], eps=1e-6)
	self.conv_act = nn.SiLU()
	self.conv_out = CogVideoXCausalConv3d(
	block_out_channels[-1], 2 * out_channels, kernel_size=3, pad_mode=pad_mode
	)

	self.gradient_checkpointing = False

	def _clear_fake_context_parallel_cache(self):
	for name, module in self.named_modules():
	if isinstance(module, CogVideoXCausalConv3d):
	logger.debug(f"Clearing fake Context Parallel cache for layer: {name}")
	module._clear_fake_context_parallel_cache()

	def forward(self, sample: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
	r"""The forward method of the `CogVideoXEncoder3D` class."""
	hidden_states = self.conv_in(sample)

	if self.training and self.gradient_checkpointing:

	def create_custom_forward(module):
	def custom_forward(*inputs):
	return module(*inputs)

	return custom_forward

	# 1. Down
	for down_block in self.down_blocks:
	hidden_states = torch.utils.checkpoint.checkpoint(
	create_custom_forward(down_block), hidden_states, temb, None
	)

	# 2. Mid
	hidden_states = torch.utils.checkpoint.checkpoint(
	create_custom_forward(self.mid_block), hidden_states, temb, None
	)
	else:
	# 1. Down
	for down_block in self.down_blocks:
	hidden_states = down_block(hidden_states, temb, None)

	# 2. Mid
	hidden_states = self.mid_block(hidden_states, temb, None)

	# 3. Post-process
	hidden_states = self.norm_out(hidden_states)
	hidden_states = self.conv_act(hidden_states)
	hidden_states = self.conv_out(hidden_states)
	return hidden_states


	class CogVideoXDecoder3D(nn.Module):
	r"""
	The `CogVideoXDecoder3D` layer of a variational autoencoder that decodes its latent representation into an output
	sample.

	Args:
	in_channels (`int`, optional, defaults to 3):
	The number of input channels.
	out_channels (`int`, optional, defaults to 3):
	The number of output channels.
	up_block_types (`Tuple[str, ...]`, optional, defaults to `("UpDecoderBlock2D",)`):
	The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options.
	block_out_channels (`Tuple[int, ...]`, optional, defaults to `(64,)`):
	The number of output channels for each block.
	act_fn (`str`, optional, defaults to `"silu"`):
	The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
	layers_per_block (`int`, optional, defaults to 2):
	The number of layers per block.
	norm_num_groups (`int`, optional, defaults to 32):
	The number of groups for normalization.
	"""

	_supports_gradient_checkpointing = True

	def __init__(
	self,
	in_channels: int = 16,
	out_channels: int = 3,
	up_block_types: Tuple[str, ...] = (
	"CogVideoXUpBlock3D",
	"CogVideoXUpBlock3D",
	"CogVideoXUpBlock3D",
	"CogVideoXUpBlock3D",
	),
	block_out_channels: Tuple[int, ...] = (128, 256, 256, 512),
	layers_per_block: int = 3,
	act_fn: str = "silu",
	norm_eps: float = 1e-6,
	norm_num_groups: int = 32,
	dropout: float = 0.0,
	pad_mode: str = "first",
	temporal_compression_ratio: float = 4,
	):
	super().__init__()

	reversed_block_out_channels = list(reversed(block_out_channels))

	self.conv_in = CogVideoXCausalConv3d(
	in_channels, reversed_block_out_channels[0], kernel_size=3, pad_mode=pad_mode
	)

	# mid block
	self.mid_block = CogVideoXMidBlock3D(
	in_channels=reversed_block_out_channels[0],
	temb_channels=0,
	num_layers=2,
	resnet_eps=norm_eps,
	resnet_act_fn=act_fn,
	resnet_groups=norm_num_groups,
	spatial_norm_dim=in_channels,
	pad_mode=pad_mode,
	)

	# up blocks
	self.up_blocks = nn.ModuleList([])

	output_channel = reversed_block_out_channels[0]
	temporal_compress_level = int(np.log2(temporal_compression_ratio))

	for i, up_block_type in enumerate(up_block_types):
	prev_output_channel = output_channel
	output_channel = reversed_block_out_channels[i]
	is_final_block = i == len(block_out_channels) - 1
	compress_time = i < temporal_compress_level

	if up_block_type == "CogVideoXUpBlock3D":
	up_block = CogVideoXUpBlock3D(
	in_channels=prev_output_channel,
	out_channels=output_channel,
	temb_channels=0,
	dropout=dropout,
	num_layers=layers_per_block + 1,
	resnet_eps=norm_eps,
	resnet_act_fn=act_fn,
	resnet_groups=norm_num_groups,
	spatial_norm_dim=in_channels,
	add_upsample=not is_final_block,
	compress_time=compress_time,
	pad_mode=pad_mode,
	)
	prev_output_channel = output_channel
	else:
	raise ValueError("Invalid `up_block_type` encountered. Must be `CogVideoXUpBlock3D`")

	self.up_blocks.append(up_block)

	self.norm_out = CogVideoXSpatialNorm3D(reversed_block_out_channels[-1], in_channels, groups=norm_num_groups)
	self.conv_act = nn.SiLU()
	self.conv_out = CogVideoXCausalConv3d(
	reversed_block_out_channels[-1], out_channels, kernel_size=3, pad_mode=pad_mode
	)

	self.gradient_checkpointing = False

	def _clear_fake_context_parallel_cache(self):
	for name, module in self.named_modules():
	if isinstance(module, CogVideoXCausalConv3d):
	logger.debug(f"Clearing fake Context Parallel cache for layer: {name}")
	module._clear_fake_context_parallel_cache()

	def forward(self, sample: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
	r"""The forward method of the `CogVideoXDecoder3D` class."""
	hidden_states = self.conv_in(sample)

	if self.training and self.gradient_checkpointing:

	def create_custom_forward(module):
	def custom_forward(*inputs):
	return module(*inputs)

	return custom_forward

	# 1. Mid
	hidden_states = torch.utils.checkpoint.checkpoint(
	create_custom_forward(self.mid_block), hidden_states, temb, sample
	)

	# 2. Up
	for up_block in self.up_blocks:
	hidden_states = torch.utils.checkpoint.checkpoint(
	create_custom_forward(up_block), hidden_states, temb, sample
	)
	else:
	# 1. Mid
	hidden_states = self.mid_block(hidden_states, temb, sample)

	# 2. Up
	for up_block in self.up_blocks:
	hidden_states = up_block(hidden_states, temb, sample)

	# 3. Post-process
	hidden_states = self.norm_out(hidden_states, sample)
	hidden_states = self.conv_act(hidden_states)
	hidden_states = self.conv_out(hidden_states)
	return hidden_states