import torch
import torch.amp as amp
import torch.nn as nn
from einops import rearrange

from .dit import flash_attention
class DeRAAttention(nn.Module):

    def __init__(self,
                 dim,
                 num_heads,
                 window_size=(-1, -1),
                 mode="spatial"):
        assert dim % num_heads == 0
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.window_size = window_size

        # projections
        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)
        self.o = nn.Linear(dim, dim)

        self.visualize_attention = False

        # Bind the RoPE variant that matches the attention mode.
        if mode == 'spatial':
            self.rope_apply = self.rope_apply_spatial
        elif mode == 'temporal':
            self.rope_apply = self.rope_apply_temporal
        elif mode == 'spatial_temporal':
            self.rope_apply = self.rope_apply_spatial_temporal
        else:
            raise ValueError("Invalid mode: {}".format(mode))
    def rope_apply_spatial(self, x, grid_size, freqs, sequence_cond_compressed_indices=None):
        # x: [batch, seq, heads, head_dim]; c is the complex half-dimension.
        batch, _, n, c = x.size(0), x.size(1), x.size(2), x.size(3) // 2

        # Split the rotary table into (t, h, w) frequency bands.
        freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)

        assert len(grid_size) == 2, "grid_size must be [h, w]"
        h, w = grid_size[0], grid_size[1]
        seq_len = h * w

        # Interpret channel pairs as complex numbers and rotate them.
        x_i = torch.view_as_complex(x[:, :seq_len].to(torch.float64).reshape(
            batch, seq_len, n, -1, 2))
        freqs_i = torch.cat([
            freqs[1][:h].view(1, h, 1, -1).expand(1, h, w, -1),
            freqs[2][:w].view(1, 1, w, -1).expand(1, h, w, -1)
        ], dim=-1).reshape(seq_len, 1, -1).unsqueeze(0).repeat(batch, 1, 1, 1)
        # Pad the unused temporal band with ones (identity rotation); the pad
        # width must be c - 2 * (c // 3) so the channel count matches x_i.
        freqs_i = torch.cat(
            [freqs_i.new_ones(batch, seq_len, 1, c - 2 * (c // 3)), freqs_i],
            dim=3)
        x_i = torch.view_as_real(x_i * freqs_i).flatten(3)

        return x_i.float()
    def rope_apply_temporal(self, x, grid_size, freqs, sequence_cond_compressed_indices=None):
        batch, seq_len_actual, n, c = x.size(0), x.size(1), x.size(2), x.size(3) // 2
        freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)

        assert len(grid_size) == 1, "grid_size must be [t]"
        seq_len = grid_size[0]

        x_i = torch.view_as_complex(
            x[:, :seq_len].to(torch.float64).reshape(batch, seq_len, n, -1, 2))
        freqs_i = freqs[0][:seq_len].view(seq_len, 1, -1).unsqueeze(0).repeat(batch, 1, 1, 1)
        # Pad the unused spatial bands with ones (identity rotation).
        freqs_i = torch.cat(
            [freqs_i, freqs_i.new_ones(batch, seq_len, 1, 2 * (c // 3))], dim=3)
        x_i = torch.view_as_real(x_i * freqs_i).flatten(3)

        # Conditioning tokens appended after the main sequence reuse the same
        # rotary table, either verbatim or gathered at compressed indices.
        if seq_len_actual > seq_len:
            sequence_cond_seq_length = seq_len_actual - seq_len
            if sequence_cond_seq_length == seq_len:
                x_i_sequence_cond = torch.view_as_complex(
                    x[:, seq_len:].to(torch.float64).reshape(
                        batch, sequence_cond_seq_length, n, -1, 2))
                x_i_sequence_cond = torch.view_as_real(
                    x_i_sequence_cond * freqs_i).flatten(3)
            else:
                sequence_cond_compressed_index = sequence_cond_compressed_indices[0]
                sequence_cond_t_length = len(sequence_cond_compressed_index)
                assert sequence_cond_t_length == sequence_cond_seq_length, \
                    "`sequence_cond_t_length` must equal `sequence_cond_seq_length`"
                x_i_sequence_cond = torch.view_as_complex(
                    x[:, seq_len:].to(torch.float64).reshape(
                        batch, sequence_cond_seq_length, n, -1, 2))
                freqs_i_sequence_cond = freqs[0][sequence_cond_compressed_index].view(
                    sequence_cond_t_length, 1, -1).unsqueeze(0).repeat(batch, 1, 1, 1)
                freqs_i_sequence_cond = torch.cat(
                    [freqs_i_sequence_cond,
                     freqs_i_sequence_cond.new_ones(batch, sequence_cond_t_length, 1, 2 * (c // 3))],
                    dim=3)
                x_i_sequence_cond = torch.view_as_real(
                    x_i_sequence_cond * freqs_i_sequence_cond).flatten(3)
            x_i = torch.cat([x_i, x_i_sequence_cond], dim=1)

        return x_i.float()
    def rope_apply_spatial_temporal(self, x, grid_sizes, freqs, sequence_cond_compressed_indices=None):
        batch, seq_len_actual, n, c = x.size(0), x.size(1), x.size(2), x.size(3) // 2
        freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)

        assert len(grid_sizes) == 3, "grid_sizes must be [f, h, w]"
        f, h, w = grid_sizes[0], grid_sizes[1], grid_sizes[2]
        seq_len = f * h * w

        x_i = torch.view_as_complex(x[:, :seq_len].to(torch.float64).reshape(
            batch, seq_len, n, -1, 2))
        # Broadcast the (t, h, w) bands over the full 3D grid.
        freqs_i = torch.cat([
            freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
            freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
            freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
        ], dim=-1).reshape(seq_len, 1, -1)
        x_i = torch.view_as_real(x_i * freqs_i).flatten(3)

        if seq_len_actual > seq_len:
            sequence_cond_seq_length = seq_len_actual - seq_len
            if sequence_cond_seq_length == seq_len:
                x_i_sequence_cond = torch.view_as_complex(
                    x[:, seq_len:].to(torch.float64).reshape(
                        batch, sequence_cond_seq_length, n, -1, 2))
                x_i_sequence_cond = torch.view_as_real(
                    x_i_sequence_cond * freqs_i).flatten(3)
            else:
                sequence_cond_compressed_index = sequence_cond_compressed_indices[0]
                sequence_cond_t_length = len(sequence_cond_compressed_index)
                assert sequence_cond_t_length * h * w == sequence_cond_seq_length, \
                    "`sequence_cond_t_length * h * w` must equal `sequence_cond_seq_length`"
                x_i_sequence_cond = torch.view_as_complex(
                    x[:, seq_len:].to(torch.float64).reshape(
                        batch, sequence_cond_seq_length, n, -1, 2))
                # Temporal frequencies come from the compressed indices; the
                # spatial bands are shared with the main sequence.
                freqs_i_sequence_cond = torch.cat([
                    freqs[0][sequence_cond_compressed_index].view(
                        sequence_cond_t_length, 1, 1, -1).expand(sequence_cond_t_length, h, w, -1),
                    freqs[1][:h].view(1, h, 1, -1).expand(sequence_cond_t_length, h, w, -1),
                    freqs[2][:w].view(1, 1, w, -1).expand(sequence_cond_t_length, h, w, -1)
                ], dim=-1).reshape(sequence_cond_seq_length, 1, -1)
                x_i_sequence_cond = torch.view_as_real(
                    x_i_sequence_cond * freqs_i_sequence_cond).flatten(3)
            x_i = torch.cat([x_i, x_i_sequence_cond], dim=1)

        return x_i.float()
    def forward(self, x, seq_lens, grid_size, freqs, sequence_cond_compressed_indices):
        b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim

        # Project to per-head queries, keys, and values.
        def qkv_fn(x):
            q = self.q(x).view(b, s, n, d)
            k = self.k(x).view(b, s, n, d)
            v = self.v(x).view(b, s, n, d)
            return q, k, v

        q, k, v = qkv_fn(x)

        q_rope = self.rope_apply(q, grid_size, freqs, sequence_cond_compressed_indices)
        k_rope = self.rope_apply(k, grid_size, freqs, sequence_cond_compressed_indices)

        if self.visualize_attention:
            with torch.no_grad():
                # CPU tensor of shape [S, S]
                self._last_attn_maps = self._compute_attention_for_visualization(q_rope, k_rope)
                self._last_grid_sizes = grid_size
                self._last_seq_lens = seq_lens

        x = flash_attention(
            q=q_rope,
            k=k_rope,
            v=v,
            k_lens=None,
            window_size=self.window_size)

        # Merge heads and project out.
        x = x.flatten(2)
        x = self.o(x)
        return x
class DeRA(nn.Module):

    def __init__(self, dim, rank, use_spatial=True, use_temporal=True):
        super().__init__()
        self.dim = dim
        self.rank = rank
        self.use_spatial = use_spatial
        self.use_temporal = use_temporal

        if not use_spatial and not use_temporal:
            self.attention_mode = "none"
        elif use_spatial and use_temporal:
            self.attention_mode = "spatial_temporal"
        elif use_spatial:
            self.attention_mode = "spatial"
        else:
            self.attention_mode = "temporal"

        # Low-rank down/up projection; the up projection is zero-initialized,
        # so the module contributes nothing at the start of training.
        self.spatial_down_proj = nn.Linear(self.dim, rank, bias=False)
        self.spatial_up_proj = nn.Linear(rank, self.dim, bias=False)
        self.spatial_up_proj.weight.data.zero_()

        if self.attention_mode != "none":
            self.spatial_attn = DeRAAttention(dim=rank, num_heads=4, window_size=(-1, -1),
                                              mode=self.attention_mode)
        else:
            self.spatial_attn = None
    def forward(self, x, seq_lens, grid_sizes, freqs, sequence_cond_compressed_indices):
        _, actual_seq, _ = x.shape
        if isinstance(grid_sizes, torch.Tensor):
            grid_sizes = tuple(grid_sizes[0].tolist())
        if len(grid_sizes) != 3:
            raise ValueError("`grid_sizes` must contain temporal, height, and width dimensions")
        _, orig_h, orig_w = grid_sizes
        # The sequence may exceed t*h*w when conditioning frames are appended.
        actual_t = actual_seq // (orig_h * orig_w)

        x_low = self.spatial_down_proj(x)

        if self.attention_mode == "spatial":
            # Fold time into the batch so attention runs within each frame.
            x_low_spatial = rearrange(x_low, 'b (t h w) r -> (b t) (h w) r', t=actual_t, h=orig_h, w=orig_w)
            x_low_spatial = self.spatial_attn(x_low_spatial, seq_lens, grid_sizes[1:], freqs, sequence_cond_compressed_indices)
            x_low = rearrange(x_low_spatial, '(b t) (h w) r -> b (t h w) r', t=actual_t, h=orig_h, w=orig_w)
        elif self.attention_mode == "temporal":
            # Fold space into the batch so attention runs along each pixel's track.
            x_low_temporal = rearrange(x_low, 'b (t h w) r -> (b h w) t r', t=actual_t, h=orig_h, w=orig_w)
            x_low_temporal = self.spatial_attn(x_low_temporal, seq_lens, grid_sizes[:1], freqs, sequence_cond_compressed_indices)
            x_low = rearrange(x_low_temporal, '(b h w) t r -> b (t h w) r', t=actual_t, h=orig_h, w=orig_w)
        elif self.attention_mode == "spatial_temporal":
            x_low = self.spatial_attn(x_low, seq_lens, grid_sizes, freqs, sequence_cond_compressed_indices)

        x_out = self.spatial_up_proj(x_low)
        return x_out
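
# ---------------------------------------------------------------------------
# Minimal smoke-test sketch (not part of the original module). The rotary
# table built here is an assumption inferred from how the rope_apply_*
# methods above index `freqs`: a complex tensor of shape (max_len, c), split
# along dim=1 into [c - 2*(c//3), c//3, c//3] bands for (t, h, w). All sizes
# below are hypothetical, and running it requires the repo's flash_attention
# kernel imported from .dit.
if __name__ == "__main__":
    dim, rank = 1536, 128
    t, h, w, b = 4, 8, 8, 1
    head_dim = rank // 4      # DeRAAttention is built with num_heads=4
    c = head_dim // 2         # complex half-dim used by rope_apply_*

    # Standard RoPE-style table; any complex values of the right shape would
    # exercise the code path equally well.
    max_len = 64              # assumed upper bound on t, h, and w
    theta = 10000.0 ** (-torch.arange(c, dtype=torch.float64) / c)
    angles = torch.outer(torch.arange(max_len, dtype=torch.float64), theta)
    freqs = torch.polar(torch.ones_like(angles), angles)  # (max_len, c), complex128

    model = DeRA(dim=dim, rank=rank, use_spatial=True, use_temporal=True)
    x = torch.randn(b, t * h * w, dim)
    out = model(x, seq_lens=None, grid_sizes=(t, h, w), freqs=freqs,
                sequence_cond_compressed_indices=None)
    # The up projection is zero-initialized, so the output starts at zero.
    print(out.shape)  # torch.Size([1, 256, 1536])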