Commit 058d800 (parent: 6c59240), committed by syx

update modeling file

modeling_bamboo.py  CHANGED  (+49 -46)
@@ -1,5 +1,6 @@
 # coding=utf-8
 # Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 SJTU-IPADS AI and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
 # and OPT implementations in this library. It has been modified from its
@@ -72,11 +73,11 @@ def _get_unpad_data(attention_mask):
     )


-# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral
-class MistralRMSNorm(nn.Module):
+# Copied from transformers.models.mistral.modeling_mistral.MistralRMSNorm with Mistral->Bamboo
+class BambooRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
-        MistralRMSNorm is equivalent to T5LayerNorm
+        BambooRMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -91,8 +92,9 @@ class MistralRMSNorm(nn.Module):


 # copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral
+# copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Bamboo
 # TODO @Arthur no longer copied from LLama after static cache
-class MistralRotaryEmbedding(nn.Module):
+class BambooRotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()

@@ -166,7 +168,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     return q_embed, k_embed


-class MistralMLP(nn.Module):
+class BambooMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.config = config
@@ -194,7 +196,8 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


-class MistralAttention(nn.Module):
+# Copied from transformers.models.mistral.modeling_mistral.MistralAttention
+class BambooAttention(nn.Module):
     """
     Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
     and "Generating Long Sequences with Sparse Transformers".
@@ -231,7 +234,7 @@ class MistralAttention(nn.Module):
         self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

-        self.rotary_emb = MistralRotaryEmbedding(
+        self.rotary_emb = BambooRotaryEmbedding(
             self.head_dim,
             max_position_embeddings=self.max_position_embeddings,
             base=self.rope_theta,
@@ -322,9 +325,9 @@ class MistralAttention(nn.Module):
         return attn_output, attn_weights, past_key_value


-class MistralFlashAttention2(MistralAttention):
+class BambooFlashAttention2(BambooAttention):
     """
-    Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays
+    BAMBOO flash attention module. This module inherits from `BambooAttention` as the weights of the module stays
     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
     flash attention and deal with padding tokens in case the input contains any of them.
     """
@@ -618,14 +621,14 @@ class MistralFlashAttention2(MistralAttention):

 # copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral
 # TODO @Arthur no longer copied from LLama after static cache
-class MistralSdpaAttention(MistralAttention):
+class BambooSdpaAttention(BambooAttention):
     """
-    Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
-    `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
+    Bamboo attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `BambooAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
     SDPA API.
     """

-    # Adapted from MistralAttention.forward
+    # Adapted from BambooAttention.forward
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -638,7 +641,7 @@ class MistralSdpaAttention(MistralAttention):
         if output_attentions:
             # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
             logger.warning_once(
-                "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                "BambooModel is using BambooSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
             )
             return super().forward(
@@ -705,23 +708,23 @@ class MistralSdpaAttention(MistralAttention):
         return attn_output, None, past_key_value


-MISTRAL_ATTENTION_CLASSES = {
-    "eager": MistralAttention,
-    "flash_attention_2": MistralFlashAttention2,
-    "sdpa": MistralSdpaAttention,
+BAMBOO_ATTENTION_CLASSES = {
+    "eager": BambooAttention,
+    "flash_attention_2": BambooFlashAttention2,
+    "sdpa": BambooSdpaAttention,
 }


-class MistralDecoderLayer(nn.Module):
+class BambooDecoderLayer(nn.Module):
     def __init__(self, config: BambooConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size

-        self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+        self.self_attn = BAMBOO_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)

-        self.mlp = MistralMLP(config)
-        self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.mlp = BambooMLP(config)
+        self.input_layernorm = BambooRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = BambooRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

     def forward(
         self,
@@ -783,7 +786,7 @@ class MistralDecoderLayer(nn.Module):
         return outputs


-MISTRAL_START_DOCSTRING = r"""
+BAMBOO_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)
@@ -801,14 +804,14 @@ MISTRAL_START_DOCSTRING = r"""


 @add_start_docstrings(
-    "The bare Mistral Model outputting raw hidden-states without any specific head on top.",
-    MISTRAL_START_DOCSTRING,
+    "The bare Bamboo Model outputting raw hidden-states without any specific head on top.",
+    BAMBOO_START_DOCSTRING,
 )
-class MistralPreTrainedModel(PreTrainedModel):
+class BambooPreTrainedModel(PreTrainedModel):
     config_class = BambooConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["MistralDecoderLayer"]
+    _no_split_modules = ["BambooDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = True
@@ -826,7 +829,7 @@ class MistralPreTrainedModel(PreTrainedModel):
                 module.weight.data[module.padding_idx].zero_()


-MISTRAL_INPUTS_DOCSTRING = r"""
+BAMBOO_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -897,12 +900,12 @@ MISTRAL_INPUTS_DOCSTRING = r"""


 @add_start_docstrings(
-    "The bare Mistral Model outputting raw hidden-states without any specific head on top.",
-    MISTRAL_START_DOCSTRING,
+    "The bare Bamboo Model outputting raw hidden-states without any specific head on top.",
+    BAMBOO_START_DOCSTRING,
 )
-class MistralModel(MistralPreTrainedModel):
+class BambooModel(BambooPreTrainedModel):
     """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`]
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`BambooDecoderLayer`]

     Args:
         config: BambooConfig
@@ -915,10 +918,10 @@ class MistralModel(MistralPreTrainedModel):

         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         self.layers = nn.ModuleList(
-            [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+            [BambooDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
         self._attn_implementation = config._attn_implementation
-        self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm = BambooRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -930,7 +933,7 @@ class MistralModel(MistralPreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value

-    @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(BAMBOO_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -993,7 +996,7 @@ class MistralModel(MistralPreTrainedModel):
             if is_padding_right:
                 raise ValueError(
                     "You are attempting to perform batched generation with padding_side='right'"
-                    " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to "
+                    " this may lead to unexpected behaviour for Flash Attention version of Bamboo. Make sure to "
                     " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
                 )

@@ -1078,12 +1081,12 @@ class MistralModel(MistralPreTrainedModel):
         )


-class BambooForCausalLM(MistralPreTrainedModel):
+class BambooForCausalLM(BambooPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
         super().__init__(config)
-        self.model = MistralModel(config)
+        self.model = BambooModel(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

@@ -1108,7 +1111,7 @@ class BambooForCausalLM(MistralPreTrainedModel):
     def get_decoder(self):
         return self.model

-    @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(BAMBOO_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -1266,9 +1269,9 @@ class BambooForCausalLM(MistralPreTrainedModel):

 @add_start_docstrings(
     """
-    The Mistral Model transformer with a sequence classification head on top (linear layer).
+    The Bamboo Model transformer with a sequence classification head on top (linear layer).

-    [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    [`BambooForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.

     Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -1277,14 +1280,14 @@ class BambooForCausalLM(MistralPreTrainedModel):
     padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
     each row of the batch).
     """,
-    MISTRAL_START_DOCSTRING,
+    BAMBOO_START_DOCSTRING,
 )
 # Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL
-class MistralForSequenceClassification(MistralPreTrainedModel):
+class BambooForSequenceClassification(BambooPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.model = MistralModel(config)
+        self.model = BambooModel(config)
         self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

         # Initialize weights and apply final processing
@@ -1296,7 +1299,7 @@ class MistralForSequenceClassification(MistralPreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value

-    @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(BAMBOO_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,