| """Transformer layer. | |
| Modified from timm (https://github.com/huggingface/pytorch-image-models) and | |
| mmdetection (https://github.com/open-mmlab/mmdetection). | |
| """ | |
| from __future__ import annotations | |
| import copy | |
| import torch | |
| from torch import Tensor, nn | |
| from .attention import Attention | |
| from .drop import DropPath | |
| from .mlp import TransformerBlockMLP | |
| from .util import build_activation_layer | |


def inverse_sigmoid(x: Tensor, eps: float = 1e-5) -> Tensor:
    """Inverse function of sigmoid (logit).

    Args:
        x (Tensor): The tensor to invert, with values in [0, 1].
        eps (float): Epsilon used for clamping to avoid numerical overflow.
            Defaults to 1e-5.

    Returns:
        Tensor: log(x / (1 - x)) applied elementwise, with the same shape as
            the input.
    """
    x = x.clamp(min=0, max=1)
    x1 = x.clamp(min=eps)
    x2 = (1 - x).clamp(min=eps)
    return torch.log(x1 / x2)
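

# A minimal usage sketch (illustrative, not part of the original module):
# inverse_sigmoid is the logit function, so applying sigmoid recovers the
# input up to the eps clamping.
#
#     >>> x = torch.tensor([0.1, 0.5, 0.9])
#     >>> torch.sigmoid(inverse_sigmoid(x))
#     tensor([0.1000, 0.5000, 0.9000])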


def get_clones(module: nn.Module, num: int) -> nn.ModuleList:
    """Create ``num`` identical copies of ``module``."""
    return nn.ModuleList([copy.deepcopy(module) for _ in range(num)])
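

# A minimal usage sketch (illustrative, not part of the original module):
# each clone gets independent weights, e.g. for stacked decoder layers.
#
#     >>> layers = get_clones(nn.Linear(256, 256), num=4)
#     >>> len(layers)
#     4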


class LayerScale(nn.Module):
    """Layer scaler."""

    def __init__(
        self,
        dim: int,
        inplace: bool = False,
        data_format: str = "channels_last",
        init_values: float = 1e-5,
    ):
        """Init layer scaler.

        Args:
            dim (int): Input tensor's dimension.
            inplace (bool): Whether to perform the operation in-place.
                Default: False.
            data_format (str): The input data format, either 'channels_last'
                or 'channels_first', representing (B, N, C) and (B, C, H, W)
                format data respectively. Default: channels_last.
            init_values (float, optional): Initial value for layer scale.
                Defaults to 1e-5.
        """
        super().__init__()
        assert data_format in {
            "channels_last",
            "channels_first",
        }, "data_format could only be channels_last or channels_first."
        self.inplace = inplace
        self.data_format = data_format
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass."""
        if self.data_format == "channels_first":
            # Reshape gamma to (1, C, 1, ..., 1) to broadcast over spatial dims.
            shape = (1, -1, *(1 for _ in range(x.dim() - 2)))
        else:
            # Reshape gamma to (1, ..., 1, C) to broadcast over leading dims.
            shape = (*(1 for _ in range(x.dim() - 1)), -1)
        if self.inplace:
            return x.mul_(self.gamma.view(*shape))
        return x * self.gamma.view(*shape)
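

# A minimal usage sketch (illustrative, not part of the original module):
# scale a (B, N, C) token tensor channel-wise with the learnable gamma.
#
#     >>> ls = LayerScale(dim=256, init_values=1e-5)
#     >>> tokens = torch.randn(2, 196, 256)
#     >>> ls(tokens).shape
#     torch.Size([2, 196, 256])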


class TransformerBlock(nn.Module):
    """Transformer block for Vision Transformer."""

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values: float | None = None,
        drop_path: float = 0.0,
        act_layer: nn.Module = nn.GELU(),
        norm_layer: Callable[[int], nn.Module] | None = None,
    ):
        """Init transformer block.

        Args:
            dim (int): Input tensor's dimension.
            num_heads (int): Number of attention heads.
            mlp_ratio (float, optional): Ratio of MLP hidden dim to embedding
                dim. Defaults to 4.0.
            qkv_bias (bool, optional): Whether to add bias to qkv. Defaults
                to False.
            drop (float, optional): Dropout rate for the projection and MLP.
                Defaults to 0.0.
            attn_drop (float, optional): Dropout rate for attention. Defaults
                to 0.0.
            init_values (float | None, optional): Initial value for layer
                scale. Defaults to None.
            drop_path (float, optional): Drop rate for drop path. Defaults
                to 0.0.
            act_layer (nn.Module, optional): Activation layer. Defaults to
                nn.GELU.
            norm_layer (Callable[[int], nn.Module] | None, optional): Factory
                for the normalization layer. If None, use nn.LayerNorm.
        """
        super().__init__()
        self.norm1 = (
            norm_layer(dim) if norm_layer else nn.LayerNorm(dim, eps=1e-6)
        )
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        self.ls1 = (
            LayerScale(dim, init_values=init_values)
            if init_values
            else nn.Identity()
        )
        self.drop_path1 = (
            DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        )
        self.norm2 = (
            norm_layer(dim) if norm_layer else nn.LayerNorm(dim, eps=1e-6)
        )
        self.mlp = TransformerBlockMLP(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=drop,
        )
        self.ls2 = (
            LayerScale(dim, init_values=init_values)
            if init_values
            else nn.Identity()
        )
        self.drop_path2 = (
            DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        )

    def __call__(self, data: torch.Tensor) -> torch.Tensor:
        """Forward pass.

        Args:
            data (torch.Tensor): Input tensor of shape (B, N, dim).

        Returns:
            torch.Tensor: Output tensor of shape (B, N, dim).
        """
        return self._call_impl(data)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass."""
        x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x))))
        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
        return x
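

# A minimal usage sketch (illustrative, not part of the original module):
# process a batch of ViT tokens with a single pre-norm block. Attention and
# TransformerBlockMLP are this package's own layers.
#
#     >>> block = TransformerBlock(dim=256, num_heads=8, qkv_bias=True)
#     >>> tokens = torch.randn(2, 197, 256)  # (B, N, dim)
#     >>> block(tokens).shape
#     torch.Size([2, 197, 256])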


class FFN(nn.Module):
    """Implements feed-forward networks (FFNs) with identity connection."""

    def __init__(
        self,
        embed_dims: int = 256,
        feedforward_channels: int = 1024,
        num_fcs: int = 2,
        dropout: float = 0.0,
        activation: str = "ReLU",
        inplace: bool = True,
        dropout_layer: nn.Module | None = None,
        add_identity: bool = True,
        layer_scale_init_value: float = 0.0,
    ) -> None:
        """Init FFN.

        Args:
            embed_dims (int): The feature dimension. Defaults: 256.
            feedforward_channels (int): The hidden dimension of FFNs.
                Defaults: 1024.
            num_fcs (int): The number of fully-connected layers in FFNs.
                Must be at least 2. Defaults: 2.
            dropout (float): The dropout rate of FFNs. Defaults: 0.0.
            activation (str): The activation function of FFNs. Defaults:
                "ReLU".
            inplace (bool): Whether to set inplace for the activation.
                Defaults: True.
            dropout_layer (nn.Module | None, optional): The dropout_layer
                used when adding the shortcut. Defaults to None. If None,
                Identity is used.
            add_identity (bool, optional): Whether to add the identity
                connection. Default: True.
            layer_scale_init_value (float): Initial value of scale factor in
                LayerScale. Default: 0.0.
        """
        super().__init__()
        assert num_fcs >= 2, "num_fcs should be no less than 2."
        self.embed_dims = embed_dims

        layers: list[nn.Module] = []
        in_channels = embed_dims
        for _ in range(num_fcs - 1):
            layers.append(
                nn.Sequential(
                    nn.Linear(in_channels, feedforward_channels),
                    build_activation_layer(activation, inplace),
                    nn.Dropout(dropout),
                )
            )
            in_channels = feedforward_channels
        layers.append(nn.Linear(feedforward_channels, embed_dims))
        layers.append(nn.Dropout(dropout))
        self.layers = nn.Sequential(*layers)

        self.dropout_layer = dropout_layer or nn.Identity()
        self.add_identity = add_identity

        self.layer_scale_init_value = layer_scale_init_value
        if self.layer_scale_init_value > 0:
            self.gamma2 = LayerScale(
                embed_dims, init_values=self.layer_scale_init_value
            )

    def forward(self, x: Tensor, identity: Tensor | None = None) -> Tensor:
        """Forward function for FFN.

        Adds the identity (x by default) to the FFN output when add_identity
        is True.
        """
        out = self.layers(x)
        if self.layer_scale_init_value > 0:
            out = self.gamma2(out)
        if self.add_identity:
            identity = x if identity is None else identity
            return identity + self.dropout_layer(out)
        return self.dropout_layer(out)
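

# A minimal usage sketch (illustrative, not part of the original module):
# with add_identity=True (the default) the module returns x + FFN(x), so the
# output shape matches the input.
#
#     >>> ffn = FFN(embed_dims=256, feedforward_channels=1024)
#     >>> tokens = torch.randn(2, 100, 256)  # (B, N, embed_dims)
#     >>> ffn(tokens).shape
#     torch.Size([2, 100, 256])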