VelocityLM / src /model /transformer.py
dixisouls's picture
Initial Commit
27b9282
raw
history blame
10.6 kB
"""State-of-the-art Transformer model implementation."""
import math
from typing import Optional, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from dataclasses import dataclass
import warnings
warnings.filterwarnings("ignore")
from .layers import RMSNorm, RotaryEmbedding, SwiGLU
@dataclass
class ModelOutput:
    """Model output container.

    Plain dataclass used as the return value of
    ``TransformerForCausalLM.forward``. Every field defaults to ``None`` so a
    producer only has to fill in what it actually computed.
    """

    # Language-modeling loss; stays None when the caller passes no labels.
    loss: Optional[torch.Tensor] = None
    # Output of the LM head applied to the final hidden states.
    logits: Optional[torch.Tensor] = None
    # NOTE(review): annotated as a tuple of tensors, but
    # TransformerForCausalLM.forward stores the single final hidden-state
    # tensor here -- confirm which contract downstream code expects.
    hidden_states: Optional[Tuple[torch.Tensor]] = None
    # TransformerForCausalLM.forward always sets this to None.
    attentions: Optional[Tuple[torch.Tensor]] = None
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with rotary position embeddings (RoPE).

    The module itself does not build a causal mask: causality comes from the
    additive ``attention_mask`` the caller passes in, which is summed onto the
    raw attention scores before the softmax. Only the plain matmul/softmax
    attention path is implemented.
    """

    def __init__(self, config):
        """Build the projections, dropout and rotary embedding from ``config``.

        Reads ``hidden_size``, ``num_attention_heads``, ``attention_dropout``,
        ``max_position_embeddings`` and ``rope_theta`` from ``config``.

        Raises:
            AssertionError: if ``hidden_size`` is not divisible by
                ``num_attention_heads``.
        """
        super().__init__()
        hidden_size = config.hidden_size
        num_heads = config.num_attention_heads
        assert hidden_size % num_heads == 0

        self.num_attention_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.hidden_size = hidden_size

        # Bias-free square projections, registered in q, k, v, o order so the
        # parameter ordering (and random initialisation order) is stable.
        for proj_name in ("q_proj", "k_proj", "v_proj", "o_proj"):
            setattr(self, proj_name, nn.Linear(hidden_size, hidden_size, bias=False))

        self.attention_dropout = nn.Dropout(config.attention_dropout)
        self.rotary_emb = RotaryEmbedding(
            self.head_dim,
            max_position_embeddings=config.max_position_embeddings,
            base=config.rope_theta,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
        """Run self-attention over ``hidden_states``.

        Args:
            hidden_states: 3-D input unpacked as ``(batch, seq_len, hidden)``.
            attention_mask: Optional additive mask added to the attention
                scores (must broadcast against
                ``(batch, heads, seq_len, seq_len)``).
            position_ids: Forwarded unchanged to
                ``rotary_emb.apply_rotary_pos_emb``.
            past_key_value: Accepted for interface compatibility; not used.
            use_cache: Accepted for interface compatibility; not used.

        Returns:
            ``(attn_output, None)`` where ``attn_output`` has shape
            ``(batch, seq_len, hidden_size)``. The second element is always
            ``None`` because no key/value cache is produced.
        """
        batch, seq_len, _ = hidden_states.size()
        per_head_shape = (batch, seq_len, self.num_attention_heads, self.head_dim)

        # Project, then split into heads: (batch, heads, seq_len, head_dim).
        query, key, value = (
            proj(hidden_states).view(per_head_shape).transpose(1, 2)
            for proj in (self.q_proj, self.k_proj, self.v_proj)
        )

        # Rotary embeddings are applied to queries and keys only.
        cos, sin = self.rotary_emb(value, seq_len=seq_len)
        query, key = self.rotary_emb.apply_rotary_pos_emb(
            query, key, cos, sin, position_ids
        )

        # Scaled dot-product scores, plus the caller-supplied additive mask.
        scores = torch.matmul(query, key.transpose(2, 3)) / math.sqrt(self.head_dim)
        if attention_mask is not None:
            scores = scores + attention_mask

        # Softmax is evaluated in float32 and cast back to the query dtype.
        probs = F.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
        probs = self.attention_dropout(probs)

        # Merge heads back into the hidden dimension.
        context = torch.matmul(probs, value).transpose(1, 2).contiguous()
        context = context.reshape(batch, seq_len, self.hidden_size)

        return self.o_proj(context), None
class TransformerBlock(nn.Module):
    """Pre-norm transformer layer: RMSNorm -> attention, RMSNorm -> SwiGLU MLP.

    Each sub-layer output passes through the same dropout module and is then
    added back onto its input (residual connection).
    """

    def __init__(self, config):
        """Create the attention, MLP, norm and dropout sub-modules.

        Reads ``hidden_size``, ``intermediate_size``, ``hidden_act``,
        ``layer_norm_eps`` and ``hidden_dropout`` from ``config`` (plus
        whatever ``CausalSelfAttention`` reads).
        """
        super().__init__()
        width = config.hidden_size
        norm_eps = config.layer_norm_eps

        self.hidden_size = width
        self.self_attn = CausalSelfAttention(config)
        self.mlp = SwiGLU(
            hidden_size=width,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
        )
        self.input_layernorm = RMSNorm(width, eps=norm_eps)
        self.post_attention_layernorm = RMSNorm(width, eps=norm_eps)
        self.hidden_dropout = nn.Dropout(config.hidden_dropout)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
        """Apply the attention and MLP sub-layers with residual connections.

        Args:
            hidden_states: Input activations for this layer.
            attention_mask: Forwarded to the attention module.
            position_ids: Forwarded to the attention module.
            past_key_value: Forwarded to the attention module.
            use_cache: Forwarded to the attention module.

        Returns:
            ``(hidden_states, present_key_value)`` where the second element is
            whatever the attention module returned as its second output.
        """
        # --- attention sub-layer -------------------------------------------
        attn_out, present_key_value = self.self_attn(
            hidden_states=self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            use_cache=use_cache,
        )
        after_attn = hidden_states + self.hidden_dropout(attn_out)

        # --- feed-forward sub-layer ----------------------------------------
        mlp_out = self.mlp(self.post_attention_layernorm(after_attn))
        after_mlp = after_attn + self.hidden_dropout(mlp_out)

        return after_mlp, present_key_value
class TransformerModel(nn.Module):
    """Decoder-only transformer backbone.

    Token embedding, followed by ``config.num_hidden_layers`` transformer
    blocks and a final RMSNorm. ``forward`` returns the normalised hidden
    states of the last layer.
    """

    def __init__(self, config):
        """Build the embedding, layer stack and final norm, then init weights.

        Reads ``vocab_size``, ``hidden_size``, ``num_hidden_layers``,
        ``layer_norm_eps`` and ``initializer_range`` from ``config``.
        """
        super().__init__()
        self.config = config
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = nn.ModuleList(
            [TransformerBlock(config) for _ in range(config.num_hidden_layers)]
        )
        self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Toggled from outside (see TransformerForCausalLM) to trade compute
        # for activation memory during training.
        self.gradient_checkpointing = False

        # Initialize weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise ``nn.Linear`` / ``nn.Embedding`` weights from N(0, std).

        ``std`` is ``config.initializer_range``. Linear biases, when present,
        are zeroed. Any other module type is left untouched.
        """
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return
        torch.nn.init.normal_(
            module.weight, mean=0.0, std=self.config.initializer_range
        )
        if isinstance(module, nn.Linear) and module.bias is not None:
            torch.nn.init.zeros_(module.bias)

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module with ``value``."""
        self.embed_tokens = value

    @staticmethod
    def _build_attention_mask(padding_mask, batch_size, seq_length, device):
        """Return the additive ``[batch, 1, seq, seq]`` attention mask.

        The causal part holds ``-inf`` strictly above the diagonal. When a
        ``[batch, seq]`` padding mask is given (non-zero / ``True`` = keep),
        padded key positions additionally receive ``-10000.0``.
        """
        causal_mask = torch.triu(
            torch.full((seq_length, seq_length), float("-inf"), device=device),
            diagonal=1,
        )[None, None, :, :].expand(batch_size, -1, -1, -1)

        if padding_mask is None:
            return causal_mask

        # Cast before the arithmetic: `1.0 - mask` raises on bool tensors, and
        # integer masks need to become floating point anyway.
        keep = padding_mask[:, None, None, :].to(causal_mask.dtype)

        # Padded keys get a large finite penalty rather than -inf, so a query
        # row whose visible keys are all padding still has a finite entry and
        # the softmax stays NaN-free.
        return (1.0 - keep) * -10000.0 + causal_mask

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> torch.Tensor:
        """Encode ``input_ids`` and return the final normalised hidden states.

        Args:
            input_ids: ``[batch, seq]`` token ids.
            attention_mask: Optional ``[batch, seq]`` padding mask where
                1 / ``True`` marks real tokens and 0 / ``False`` marks
                padding. Boolean, integer and floating masks are accepted.
            position_ids: Optional position indices; defaults to
                ``arange(seq)`` with a leading broadcast dimension.
            past_key_values: Accepted for interface compatibility; not used.
            use_cache: Accepted for interface compatibility; not used.
            output_attentions: Accepted for interface compatibility; not used.
            output_hidden_states: Accepted for interface compatibility; not
                used.
            return_dict: Accepted for interface compatibility; not used.

        Returns:
            The output of the final norm applied to the last layer's hidden
            states.
        """
        batch_size, seq_length = input_ids.shape

        # Embed tokens
        hidden_states = self.embed_tokens(input_ids)

        # Create position IDs
        if position_ids is None:
            position_ids = torch.arange(
                seq_length, dtype=torch.long, device=input_ids.device
            ).unsqueeze(0)

        # Combined causal + padding mask, shared by every layer.
        attention_mask = self._build_attention_mask(
            attention_mask, batch_size, seq_length, input_ids.device
        )

        use_checkpointing = self.gradient_checkpointing and self.training
        if use_checkpointing:
            # Import the submodule explicitly instead of relying on
            # `torch.utils.checkpoint` having been loaded as a side effect of
            # `import torch`.
            from torch.utils.checkpoint import checkpoint

        # Forward through layers
        for layer in self.layers:
            if use_checkpointing:
                # checkpoint() forwards positional args only, in the layer's
                # signature order: mask, positions, past_key_value, use_cache.
                hidden_states, _ = checkpoint(
                    layer,
                    hidden_states,
                    attention_mask,
                    position_ids,
                    None,
                    False,
                    use_reentrant=False,
                )
            else:
                hidden_states, _ = layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=None,
                    use_cache=False,
                )

        hidden_states = self.norm(hidden_states)
        return hidden_states
class TransformerForCausalLM(nn.Module):
"""Transformer model with language modeling head."""
def __init__(self, config):
super().__init__()
self.model = TransformerModel(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# Tie weights
self.lm_head.weight = self.model.embed_tokens.weight
def forward(
self,
input_ids: torch.LongTensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
) -> ModelOutput:
hidden_states = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
logits = self.lm_head(hidden_states)
loss = None
if labels is not None:
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss()
loss = loss_fct(
shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1)
)
return ModelOutput(
loss=loss,
logits=logits,
hidden_states=hidden_states,
attentions=None,
)
def gradient_checkpointing_enable(self):
self.model.gradient_checkpointing = True