Spaces:

dixisouls
/

VelocityLM

Sleeping

App Files Files Community

VelocityLM / src /model /transformer.py

dixisouls

Initial Commit

27b9282 4 months ago

raw

history blame

10.6 kB

	"""State-of-the-art Transformer model implementation."""

	import math
	from typing import Optional, Tuple
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from torch.nn import CrossEntropyLoss
	from dataclasses import dataclass

	import warnings
	warnings.filterwarnings("ignore")

	from .layers import RMSNorm, RotaryEmbedding, SwiGLU


	@dataclass
	class ModelOutput:
	    """Container for the results of a language-model forward pass.

	    Every field defaults to ``None`` so a producer only has to fill in what it
	    actually computed. Field order is part of the interface (positional
	    construction), so it must not be changed.
	    """

	    # Scalar training loss; left as None when no labels were supplied.
	    loss: Optional[torch.Tensor] = None
	    # Vocabulary scores produced by the language-modeling head.
	    logits: Optional[torch.Tensor] = None
	    # Backbone output. TransformerForCausalLM.forward stores a single tensor
	    # here (not a per-layer tuple), so the annotation is a plain tensor.
	    hidden_states: Optional[torch.Tensor] = None
	    # Per-layer attention maps; nothing in this file populates it yet.
	    attentions: Optional[Tuple[torch.Tensor, ...]] = None


	class CausalSelfAttention(nn.Module):
	    """Multi-head self-attention with rotary position embeddings (RoPE).

	    Attention is computed explicitly (scaled dot product, additive mask,
	    softmax, dropout). When the caller passes no mask, a causal mask is built
	    internally so that a position can never attend to later positions.
	    """

	    def __init__(self, config):
	        """Build the projections, dropout and rotary embedding.

	        Args:
	            config: Object exposing ``hidden_size``, ``num_attention_heads``,
	                ``attention_dropout``, ``max_position_embeddings`` and
	                ``rope_theta``.

	        Raises:
	            AssertionError: If ``hidden_size`` is not a multiple of
	                ``num_attention_heads``.
	        """
	        super().__init__()
	        assert config.hidden_size % config.num_attention_heads == 0, (
	            "hidden_size must be divisible by num_attention_heads"
	        )

	        self.num_attention_heads = config.num_attention_heads
	        self.head_dim = config.hidden_size // config.num_attention_heads
	        self.hidden_size = config.hidden_size

	        self.q_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
	        self.k_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
	        self.v_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
	        self.o_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)

	        self.attention_dropout = nn.Dropout(config.attention_dropout)
	        self.rotary_emb = RotaryEmbedding(
	            self.head_dim,
	            max_position_embeddings=config.max_position_embeddings,
	            base=config.rope_theta,
	        )

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.Tensor] = None,
	        past_key_value: Optional[Tuple[torch.Tensor]] = None,
	        use_cache: bool = False,
	    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
	        """Apply self-attention to ``hidden_states``.

	        Args:
	            hidden_states: Tensor of shape ``(batch, seq_len, hidden_size)``.
	            attention_mask: Additive mask broadcastable to
	                ``(batch, heads, seq_len, seq_len)``; it is summed onto the
	                raw scores, so blocked positions should hold a large negative
	                value. When ``None`` a causal mask is generated here.
	            position_ids: Forwarded unchanged to the rotary embedding.
	            past_key_value: Accepted for interface compatibility; not used.
	            use_cache: Accepted for interface compatibility; not used.

	        Returns:
	            ``(attn_output, None)`` where ``attn_output`` has the same shape
	            as ``hidden_states``. The second slot is reserved for a key/value
	            cache, which this implementation never produces.
	        """
	        bsz, q_len, _ = hidden_states.size()
	        head_shape = (bsz, q_len, self.num_attention_heads, self.head_dim)

	        # Project, then split into heads: (bsz, heads, q_len, head_dim).
	        q = self.q_proj(hidden_states).view(head_shape).transpose(1, 2)
	        k = self.k_proj(hidden_states).view(head_shape).transpose(1, 2)
	        v = self.v_proj(hidden_states).view(head_shape).transpose(1, 2)

	        # Rotate queries and keys; values carry no positional signal.
	        cos, sin = self.rotary_emb(v, seq_len=q_len)
	        q, k = self.rotary_emb.apply_rotary_pos_emb(q, k, cos, sin, position_ids)

	        # Explicit scaled dot-product attention.
	        attn_weights = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(self.head_dim)

	        if attention_mask is None:
	            # Honour the "causal" contract even when the caller supplies no
	            # mask: -inf strictly above the diagonal hides future positions.
	            attention_mask = torch.triu(
	                torch.full(
	                    (q_len, q_len),
	                    float("-inf"),
	                    dtype=attn_weights.dtype,
	                    device=attn_weights.device,
	                ),
	                diagonal=1,
	            )
	        attn_weights = attn_weights + attention_mask

	        # Softmax in float32 for numerical stability, then back to the
	        # working dtype.
	        attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
	        attn_weights = self.attention_dropout(attn_weights)

	        # Merge the heads back into (bsz, q_len, hidden_size).
	        attn_output = torch.matmul(attn_weights, v)
	        attn_output = attn_output.transpose(1, 2).contiguous()
	        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
	        attn_output = self.o_proj(attn_output)

	        return attn_output, None


	class TransformerBlock(nn.Module):
	    """One decoder layer: self-attention followed by a SwiGLU feed-forward.

	    Each sub-layer is wrapped the same way: RMSNorm on the way in, dropout on
	    the way out, and the un-normalised input added back as a residual.
	    """

	    def __init__(self, config):
	        """Create the sub-modules from ``config``.

	        Args:
	            config: Object exposing ``hidden_size``, ``intermediate_size``,
	                ``hidden_act``, ``layer_norm_eps`` and ``hidden_dropout``,
	                plus whatever ``CausalSelfAttention`` reads.
	        """
	        super().__init__()
	        width = config.hidden_size
	        eps = config.layer_norm_eps

	        self.hidden_size = width
	        self.self_attn = CausalSelfAttention(config)
	        self.mlp = SwiGLU(
	            hidden_size=width,
	            intermediate_size=config.intermediate_size,
	            hidden_act=config.hidden_act,
	        )
	        self.input_layernorm = RMSNorm(width, eps=eps)
	        self.post_attention_layernorm = RMSNorm(width, eps=eps)
	        self.hidden_dropout = nn.Dropout(config.hidden_dropout)

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.Tensor] = None,
	        past_key_value: Optional[Tuple[torch.Tensor]] = None,
	        use_cache: bool = False,
	    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
	        """Run the attention and feed-forward sub-layers.

	        Args:
	            hidden_states: Input activations for this layer.
	            attention_mask: Passed through to the attention module.
	            position_ids: Passed through to the attention module.
	            past_key_value: Passed through to the attention module.
	            use_cache: Passed through to the attention module.

	        Returns:
	            ``(hidden_states, present_key_value)`` where the second item is
	            whatever the attention module returned in its second slot.
	        """
	        # Attention sub-layer.
	        attn_out, present_key_value = self.self_attn(
	            hidden_states=self.input_layernorm(hidden_states),
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            past_key_value=past_key_value,
	            use_cache=use_cache,
	        )
	        hidden_states = hidden_states + self.hidden_dropout(attn_out)

	        # Feed-forward sub-layer.
	        mlp_out = self.mlp(self.post_attention_layernorm(hidden_states))
	        hidden_states = hidden_states + self.hidden_dropout(mlp_out)

	        return hidden_states, present_key_value


	class TransformerModel(nn.Module):
	    """Decoder-only transformer backbone (embeddings, layers, final norm)."""

	    def __init__(self, config):
	        """Build the embedding table, the layer stack and the final norm.

	        Args:
	            config: Object exposing ``vocab_size``, ``hidden_size``,
	                ``num_hidden_layers``, ``layer_norm_eps`` and
	                ``initializer_range``, plus whatever ``TransformerBlock``
	                reads.
	        """
	        super().__init__()
	        self.config = config
	        self.vocab_size = config.vocab_size
	        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
	        self.layers = nn.ModuleList(
	            [TransformerBlock(config) for _ in range(config.num_hidden_layers)]
	        )
	        self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_eps)
	        self.gradient_checkpointing = False

	        # Re-initialise every Linear/Embedding created above.
	        self.apply(self._init_weights)

	    def _init_weights(self, module):
	        """Draw Linear/Embedding weights from N(0, initializer_range^2).

	        Linear biases, when present, are zeroed. Other module types are left
	        untouched.
	        """
	        if isinstance(module, (nn.Linear, nn.Embedding)):
	            torch.nn.init.normal_(
	                module.weight, mean=0.0, std=self.config.initializer_range
	            )
	            if isinstance(module, nn.Linear) and module.bias is not None:
	                torch.nn.init.zeros_(module.bias)

	    def get_input_embeddings(self):
	        """Return the token embedding module."""
	        return self.embed_tokens

	    def set_input_embeddings(self, value):
	        """Replace the token embedding module with ``value``."""
	        self.embed_tokens = value

	    def forward(
	        self,
	        input_ids: torch.LongTensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
	        use_cache: Optional[bool] = None,
	        output_attentions: bool = False,
	        output_hidden_states: bool = False,
	        return_dict: bool = True,
	    ) -> torch.Tensor:
	        """Encode ``input_ids`` into final hidden states.

	        Args:
	            input_ids: Token ids of shape ``(batch, seq_len)``.
	            attention_mask: Optional padding mask of shape
	                ``(batch, seq_len)`` with 1/True for real tokens and 0/False
	                for padding. Boolean, integer and float masks are accepted.
	            position_ids: Optional position indices; defaults to
	                ``0..seq_len-1`` with a leading broadcast dimension.
	            past_key_values: Accepted for interface compatibility; not used.
	            use_cache: Accepted for interface compatibility; not used.
	            output_attentions: Accepted for interface compatibility; not used.
	            output_hidden_states: Accepted for interface compatibility; not
	                used.
	            return_dict: Accepted for interface compatibility; not used.

	        Returns:
	            The output of the final norm applied to the last layer's hidden
	            states.
	        """
	        batch_size, seq_length = input_ids.shape
	        device = input_ids.device

	        hidden_states = self.embed_tokens(input_ids)

	        if position_ids is None:
	            position_ids = torch.arange(
	                seq_length, dtype=torch.long, device=device
	            ).unsqueeze(0)

	        # Additive causal mask, [1, 1, seq_len, seq_len]: -inf strictly above
	        # the diagonal, 0 elsewhere.
	        causal_mask = torch.triu(
	            torch.full((seq_length, seq_length), float("-inf"), device=device),
	            diagonal=1,
	        )[None, None, :, :]

	        if attention_mask is not None:
	            # Padding mask [batch, seq_len] -> [batch, 1, 1, seq_len]. The
	            # cast lets bool/int masks through the arithmetic below (``1.0 -
	            # bool_tensor`` raises in PyTorch).
	            padding_mask = attention_mask[:, None, None, :].to(causal_mask.dtype)
	            # Kept tokens contribute 0, padded tokens a large finite penalty
	            # (-10000, not -inf).
	            padding_mask = (1.0 - padding_mask) * -10000.0
	            attention_mask = padding_mask + causal_mask.expand(batch_size, -1, -1, -1)
	        else:
	            attention_mask = causal_mask.expand(batch_size, -1, -1, -1)

	        use_checkpointing = self.gradient_checkpointing and self.training
	        if use_checkpointing:
	            # Imported explicitly: ``import torch`` alone does not guarantee
	            # that the ``torch.utils.checkpoint`` submodule is loaded.
	            from torch.utils.checkpoint import checkpoint

	        for layer in self.layers:
	            if use_checkpointing:
	                # Positional order mirrors the layer's forward signature:
	                # (hidden_states, attention_mask, position_ids,
	                #  past_key_value, use_cache).
	                hidden_states, _ = checkpoint(
	                    layer,
	                    hidden_states,
	                    attention_mask,
	                    position_ids,
	                    None,
	                    False,
	                    use_reentrant=False,
	                )
	            else:
	                hidden_states, _ = layer(
	                    hidden_states,
	                    attention_mask=attention_mask,
	                    position_ids=position_ids,
	                    past_key_value=None,
	                    use_cache=False,
	                )

	        return self.norm(hidden_states)


	class TransformerForCausalLM(nn.Module):
	    """Transformer backbone topped with a weight-tied vocabulary projection."""

	    def __init__(self, config):
	        """Create the backbone and the output head.

	        Args:
	            config: Object exposing ``hidden_size`` and ``vocab_size``, plus
	                whatever ``TransformerModel`` reads.
	        """
	        super().__init__()
	        self.model = TransformerModel(config)
	        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

	        # Share one parameter between the input embedding and the output
	        # projection.
	        self.lm_head.weight = self.model.embed_tokens.weight

	    def forward(
	        self,
	        input_ids: torch.LongTensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        labels: Optional[torch.LongTensor] = None,
	        use_cache: Optional[bool] = None,
	        output_attentions: bool = False,
	        output_hidden_states: bool = False,
	        return_dict: bool = True,
	    ) -> ModelOutput:
	        """Compute logits and, when ``labels`` is given, the next-token loss.

	        Args:
	            input_ids: Token ids handed to the backbone.
	            attention_mask: Forwarded to the backbone.
	            position_ids: Forwarded to the backbone.
	            labels: Optional target ids. Position ``t`` of the logits is
	                scored against position ``t + 1`` of ``labels``.
	            use_cache: Forwarded to the backbone.
	            output_attentions: Forwarded to the backbone.
	            output_hidden_states: Forwarded to the backbone.
	            return_dict: Forwarded to the backbone.

	        Returns:
	            A ``ModelOutput`` carrying ``loss`` (``None`` without labels),
	            ``logits``, the backbone output as ``hidden_states`` and
	            ``attentions=None``.
	        """
	        backbone_kwargs = dict(
	            input_ids=input_ids,
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            use_cache=use_cache,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=return_dict,
	        )
	        hidden_states = self.model(**backbone_kwargs)
	        logits = self.lm_head(hidden_states)

	        loss = None
	        if labels is not None:
	            # Drop the last prediction and the first label so that each
	            # remaining logit row lines up with the token that follows it.
	            vocab = logits.size(-1)
	            predictions = logits[..., :-1, :].reshape(-1, vocab)
	            targets = labels[..., 1:].reshape(-1)
	            loss = F.cross_entropy(predictions, targets)

	        return ModelOutput(
	            loss=loss,
	            logits=logits,
	            hidden_states=hidden_states,
	            attentions=None,
	        )

	    def gradient_checkpointing_enable(self):
	        """Switch on gradient checkpointing in the backbone."""
	        self.model.gradient_checkpointing = True