import torch
import torch.amp as amp
import torch.nn as nn
from einops import rearrange

from .dit import flash_attention
class DeRAAttention(nn.Module):

    def __init__(self,
                 dim,
                 num_heads,
                 window_size=(-1, -1),
                 mode="spatial"):
        assert dim % num_heads == 0
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.window_size = window_size

        # projections
        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)
        self.o = nn.Linear(dim, dim)

        self.visualize_attention = False

        # Bind the RoPE variant that matches the attention mode.
        if mode == 'spatial':
            self.rope_apply = self.rope_apply_spatial
        elif mode == 'temporal':
            self.rope_apply = self.rope_apply_temporal
        elif mode == 'spatial_temporal':
            self.rope_apply = self.rope_apply_spatial_temporal
        else:
            raise ValueError("Invalid mode: {}".format(mode))
    def rope_apply_spatial(self, x, grid_size, freqs, sequence_cond_compressed_indices=None):
        # x: [batch, seq, heads, head_dim]; c is the complex half-dimension.
        batch, _, n, c = x.size(0), x.size(1), x.size(2), x.size(3) // 2

        # Split the rotary table into (t, h, w) frequency bands.
        freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)

        assert len(grid_size) == 2, "grid_size must be [h, w]"
        h, w = grid_size[0], grid_size[1]
        seq_len = h * w

        # Interpret channel pairs as complex numbers and rotate them.
        x_i = torch.view_as_complex(x[:, :seq_len].to(torch.float64).reshape(
            batch, seq_len, n, -1, 2))
        freqs_i = torch.cat([
            freqs[1][:h].view(1, h, 1, -1).expand(1, h, w, -1),
            freqs[2][:w].view(1, 1, w, -1).expand(1, h, w, -1)
        ], dim=-1).reshape(seq_len, 1, -1).unsqueeze(0).repeat(batch, 1, 1, 1)
        # Pad the unused temporal band with ones (identity rotation); the pad
        # width must be c - 2 * (c // 3) so the channel count matches x_i.
        freqs_i = torch.cat(
            [freqs_i.new_ones(batch, seq_len, 1, c - 2 * (c // 3)), freqs_i],
            dim=3)
        x_i = torch.view_as_real(x_i * freqs_i).flatten(3)

        return x_i.float()
    def rope_apply_temporal(self, x, grid_size, freqs, sequence_cond_compressed_indices=None):
        batch, seq_len_actual, n, c = x.size(0), x.size(1), x.size(2), x.size(3) // 2
        freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)

        assert len(grid_size) == 1, "grid_size must be [t]"
        seq_len = grid_size[0]

        x_i = torch.view_as_complex(
            x[:, :seq_len].to(torch.float64).reshape(batch, seq_len, n, -1, 2))
        freqs_i = freqs[0][:seq_len].view(seq_len, 1, -1).unsqueeze(0).repeat(batch, 1, 1, 1)
        # Pad the unused spatial bands with ones (identity rotation).
        freqs_i = torch.cat(
            [freqs_i, freqs_i.new_ones(batch, seq_len, 1, 2 * (c // 3))], dim=3)
        x_i = torch.view_as_real(x_i * freqs_i).flatten(3)

        # Conditioning tokens appended after the main sequence reuse the same
        # rotary table, either verbatim or gathered at compressed indices.
        if seq_len_actual > seq_len:
            sequence_cond_seq_length = seq_len_actual - seq_len
            if sequence_cond_seq_length == seq_len:
                x_i_sequence_cond = torch.view_as_complex(
                    x[:, seq_len:].to(torch.float64).reshape(
                        batch, sequence_cond_seq_length, n, -1, 2))
                x_i_sequence_cond = torch.view_as_real(
                    x_i_sequence_cond * freqs_i).flatten(3)
            else:
                sequence_cond_compressed_index = sequence_cond_compressed_indices[0]
                sequence_cond_t_length = len(sequence_cond_compressed_index)
                assert sequence_cond_t_length == sequence_cond_seq_length, \
                    "`sequence_cond_t_length` must equal `sequence_cond_seq_length`"
                x_i_sequence_cond = torch.view_as_complex(
                    x[:, seq_len:].to(torch.float64).reshape(
                        batch, sequence_cond_seq_length, n, -1, 2))
                freqs_i_sequence_cond = freqs[0][sequence_cond_compressed_index].view(
                    sequence_cond_t_length, 1, -1).unsqueeze(0).repeat(batch, 1, 1, 1)
                freqs_i_sequence_cond = torch.cat(
                    [freqs_i_sequence_cond,
                     freqs_i_sequence_cond.new_ones(batch, sequence_cond_t_length, 1, 2 * (c // 3))],
                    dim=3)
                x_i_sequence_cond = torch.view_as_real(
                    x_i_sequence_cond * freqs_i_sequence_cond).flatten(3)
            x_i = torch.cat([x_i, x_i_sequence_cond], dim=1)

        return x_i.float()
    def rope_apply_spatial_temporal(self, x, grid_sizes, freqs, sequence_cond_compressed_indices=None):
        batch, seq_len_actual, n, c = x.size(0), x.size(1), x.size(2), x.size(3) // 2
        freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)

        assert len(grid_sizes) == 3, "grid_sizes must be [f, h, w]"
        f, h, w = grid_sizes[0], grid_sizes[1], grid_sizes[2]
        seq_len = f * h * w

        x_i = torch.view_as_complex(x[:, :seq_len].to(torch.float64).reshape(
            batch, seq_len, n, -1, 2))
        # Broadcast the (t, h, w) bands over the full 3D grid.
        freqs_i = torch.cat([
            freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
            freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
            freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
        ], dim=-1).reshape(seq_len, 1, -1)
        x_i = torch.view_as_real(x_i * freqs_i).flatten(3)

        if seq_len_actual > seq_len:
            sequence_cond_seq_length = seq_len_actual - seq_len
            if sequence_cond_seq_length == seq_len:
                x_i_sequence_cond = torch.view_as_complex(
                    x[:, seq_len:].to(torch.float64).reshape(
                        batch, sequence_cond_seq_length, n, -1, 2))
                x_i_sequence_cond = torch.view_as_real(
                    x_i_sequence_cond * freqs_i).flatten(3)
            else:
                sequence_cond_compressed_index = sequence_cond_compressed_indices[0]
                sequence_cond_t_length = len(sequence_cond_compressed_index)
                assert sequence_cond_t_length * h * w == sequence_cond_seq_length, \
                    "`sequence_cond_t_length * h * w` must equal `sequence_cond_seq_length`"
                x_i_sequence_cond = torch.view_as_complex(
                    x[:, seq_len:].to(torch.float64).reshape(
                        batch, sequence_cond_seq_length, n, -1, 2))
                # Temporal frequencies come from the compressed indices; the
                # spatial bands are shared with the main sequence.
                freqs_i_sequence_cond = torch.cat([
                    freqs[0][sequence_cond_compressed_index].view(
                        sequence_cond_t_length, 1, 1, -1).expand(sequence_cond_t_length, h, w, -1),
                    freqs[1][:h].view(1, h, 1, -1).expand(sequence_cond_t_length, h, w, -1),
                    freqs[2][:w].view(1, 1, w, -1).expand(sequence_cond_t_length, h, w, -1)
                ], dim=-1).reshape(sequence_cond_seq_length, 1, -1)
                x_i_sequence_cond = torch.view_as_real(
                    x_i_sequence_cond * freqs_i_sequence_cond).flatten(3)
            x_i = torch.cat([x_i, x_i_sequence_cond], dim=1)

        return x_i.float()
    def forward(self, x, seq_lens, grid_size, freqs, sequence_cond_compressed_indices):
        b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim

        # Project to per-head queries, keys, and values.
        def qkv_fn(x):
            q = self.q(x).view(b, s, n, d)
            k = self.k(x).view(b, s, n, d)
            v = self.v(x).view(b, s, n, d)
            return q, k, v

        q, k, v = qkv_fn(x)

        q_rope = self.rope_apply(q, grid_size, freqs, sequence_cond_compressed_indices)
        k_rope = self.rope_apply(k, grid_size, freqs, sequence_cond_compressed_indices)

        if self.visualize_attention:
            with torch.no_grad():
                # CPU tensor of shape [S, S]
                self._last_attn_maps = self._compute_attention_for_visualization(q_rope, k_rope)
                self._last_grid_sizes = grid_size
                self._last_seq_lens = seq_lens

        x = flash_attention(
            q=q_rope,
            k=k_rope,
            v=v,
            k_lens=None,
            window_size=self.window_size)

        # Merge heads and project out.
        x = x.flatten(2)
        x = self.o(x)
        return x
class DeRA(nn.Module):

    def __init__(self, dim, rank, use_spatial=True, use_temporal=True):
        super().__init__()
        self.dim = dim
        self.rank = rank
        self.use_spatial = use_spatial
        self.use_temporal = use_temporal

        if not use_spatial and not use_temporal:
            self.attention_mode = "none"
        elif use_spatial and use_temporal:
            self.attention_mode = "spatial_temporal"
        elif use_spatial:
            self.attention_mode = "spatial"
        else:
            self.attention_mode = "temporal"

        # Low-rank down/up projection; the up projection is zero-initialized,
        # so the module contributes nothing at the start of training.
        self.spatial_down_proj = nn.Linear(self.dim, rank, bias=False)
        self.spatial_up_proj = nn.Linear(rank, self.dim, bias=False)
        self.spatial_up_proj.weight.data.zero_()

        if self.attention_mode != "none":
            self.spatial_attn = DeRAAttention(dim=rank, num_heads=4, window_size=(-1, -1),
                                              mode=self.attention_mode)
        else:
            self.spatial_attn = None
    def forward(self, x, seq_lens, grid_sizes, freqs, sequence_cond_compressed_indices):
        _, actual_seq, _ = x.shape
        if isinstance(grid_sizes, torch.Tensor):
            grid_sizes = tuple(grid_sizes[0].tolist())
        if len(grid_sizes) != 3:
            raise ValueError("`grid_sizes` must contain temporal, height, and width dimensions")
        _, orig_h, orig_w = grid_sizes
        # The sequence may exceed t*h*w when conditioning frames are appended.
        actual_t = actual_seq // (orig_h * orig_w)

        x_low = self.spatial_down_proj(x)

        if self.attention_mode == "spatial":
            # Fold time into the batch so attention runs within each frame.
            x_low_spatial = rearrange(x_low, 'b (t h w) r -> (b t) (h w) r', t=actual_t, h=orig_h, w=orig_w)
            x_low_spatial = self.spatial_attn(x_low_spatial, seq_lens, grid_sizes[1:], freqs, sequence_cond_compressed_indices)
            x_low = rearrange(x_low_spatial, '(b t) (h w) r -> b (t h w) r', t=actual_t, h=orig_h, w=orig_w)
        elif self.attention_mode == "temporal":
            # Fold space into the batch so attention runs along each pixel's track.
            x_low_temporal = rearrange(x_low, 'b (t h w) r -> (b h w) t r', t=actual_t, h=orig_h, w=orig_w)
            x_low_temporal = self.spatial_attn(x_low_temporal, seq_lens, grid_sizes[:1], freqs, sequence_cond_compressed_indices)
            x_low = rearrange(x_low_temporal, '(b h w) t r -> b (t h w) r', t=actual_t, h=orig_h, w=orig_w)
        elif self.attention_mode == "spatial_temporal":
            x_low = self.spatial_attn(x_low, seq_lens, grid_sizes, freqs, sequence_cond_compressed_indices)

        x_out = self.spatial_up_proj(x_low)
        return x_out
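
# ---------------------------------------------------------------------------
# Minimal smoke-test sketch (not part of the original module). The rotary
# table built here is an assumption inferred from how the rope_apply_*
# methods above index `freqs`: a complex tensor of shape (max_len, c), split
# along dim=1 into [c - 2*(c//3), c//3, c//3] bands for (t, h, w). All sizes
# below are hypothetical, and running it requires the repo's flash_attention
# kernel imported from .dit.
if __name__ == "__main__":
    dim, rank = 1536, 128
    t, h, w, b = 4, 8, 8, 1
    head_dim = rank // 4      # DeRAAttention is built with num_heads=4
    c = head_dim // 2         # complex half-dim used by rope_apply_*

    # Standard RoPE-style table; any complex values of the right shape would
    # exercise the code path equally well.
    max_len = 64              # assumed upper bound on t, h, and w
    theta = 10000.0 ** (-torch.arange(c, dtype=torch.float64) / c)
    angles = torch.outer(torch.arange(max_len, dtype=torch.float64), theta)
    freqs = torch.polar(torch.ones_like(angles), angles)  # (max_len, c), complex128

    model = DeRA(dim=dim, rank=rank, use_spatial=True, use_temporal=True)
    x = torch.randn(b, t * h * w, dim)
    out = model(x, seq_lens=None, grid_sizes=(t, h, w), freqs=freqs,
                sequence_cond_compressed_indices=None)
    # The up projection is zero-initialized, so the output starts at zero.
    print(out.shape)  # torch.Size([1, 256, 1536])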