Fix precision error
modeling_chatglm.py  +4 -20  CHANGED

@@ -3,9 +3,7 @@
 import math
 import copy
 import warnings
-import re
 import sys
-import functools
 import torch
 import torch.utils.checkpoint
 import torch.nn.functional as F

@@ -177,14 +175,13 @@ def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
 
 
 class RMSNorm(torch.nn.Module):
-    def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None,
+    def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype))
         self.eps = eps
-        self.quantized = quantized
 
     def forward(self, hidden_states: torch.Tensor):
-        if
+        if hidden_states == torch.bfloat16:
             norm_x = torch.mean(hidden_states * hidden_states, dim=-1, keepdim=True)
             x_normed = hidden_states * torch.rsqrt(norm_x + self.eps)
             return self.weight * x_normed
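
Note on the RMSNorm hunk above: the commit title points at a precision error, and RMS statistics taken directly in float16/bfloat16 are a common source of one, since squaring and averaging activations in half precision discards mantissa bits. A widely used remedy, shown here only as a minimal sketch and not as the code shipped in modeling_chatglm.py, is to accumulate the mean of squares in float32 and cast back to the activation dtype. The class name RMSNormFP32Stats and the torch.ones weight initialization are assumptions made so the example runs standalone.

import torch


class RMSNormFP32Stats(torch.nn.Module):
    """Illustrative RMSNorm that computes its statistics in float32.

    A sketch of the general precision-safe pattern; not the implementation
    committed in modeling_chatglm.py.
    """

    def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs):
        super().__init__()
        # torch.ones instead of torch.empty so the sketch runs without a checkpoint.
        self.weight = torch.nn.Parameter(torch.ones(normalized_shape, device=device, dtype=dtype))
        self.eps = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        input_dtype = hidden_states.dtype
        # Mean of squares accumulated in float32 to avoid bf16/fp16 rounding error.
        norm_x = hidden_states.float().pow(2).mean(dim=-1, keepdim=True)
        x_normed = hidden_states.float() * torch.rsqrt(norm_x + self.eps)
        # Cast back to the activation dtype before applying the learned scale.
        return self.weight * x_normed.to(input_dtype)


x = torch.randn(2, 4, 8, dtype=torch.bfloat16)
norm = RMSNormFP32Stats(8, dtype=torch.bfloat16)
print(norm(x).dtype)  # torch.bfloat16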

@@ -521,14 +518,7 @@ class GLMBlock(torch.nn.Module):
 
         self.fp32_residual_connection = config.fp32_residual_connection
 
-        if config.rmsnorm:
-            if config.quantization_bit != 0:
-                LayerNormFunc = functools.partial(RMSNorm, quantized=True)
-            else:
-                LayerNormFunc = RMSNorm
-        else:
-            LayerNormFunc = LayerNorm
-
+        LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
         # Layernorm on the input data.
         self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
                                              dtype=config.torch_dtype)
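
The hunk above (and the matching one in GLMTransformer below) collapses the nested norm-class selection into a single conditional expression, which is also what makes the functools import removable. For reference, functools.partial in the deleted lines only pre-binds a constructor keyword so the result can still be instantiated like a class; the sketch below illustrates both patterns with toy stand-ins (Norm and OtherNorm are hypothetical helpers, not the file's RMSNorm and LayerNorm).

import functools


class Norm:
    """Toy stand-in used only to illustrate the two construction patterns."""

    def __init__(self, size, eps=1e-5, quantized=False):
        self.size, self.eps, self.quantized = size, eps, quantized


class OtherNorm(Norm):
    """Toy stand-in for the alternative norm class."""


# Removed pattern: partial pre-binds quantized=True, then behaves like a class.
QuantizedNorm = functools.partial(Norm, quantized=True)
assert QuantizedNorm(8).quantized is True

# Pattern kept by the commit: pick the class with a single conditional expression.
rmsnorm = True
NormFunc = Norm if rmsnorm else OtherNorm
assert isinstance(NormFunc(8), Norm) and NormFunc(8).quantized is False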

@@ -606,13 +596,7 @@ class GLMTransformer(torch.nn.Module):
         self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)])
 
         if self.post_layer_norm:
-            if config.rmsnorm:
-                if config.quantization_bit != 0:
-                    LayerNormFunc = functools.partial(RMSNorm, quantized=True)
-                else:
-                    LayerNormFunc = RMSNorm
-            else:
-                LayerNormFunc = LayerNorm
+            LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
             # Final layer norm before output.
             self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
                                                  dtype=config.torch_dtype)
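
Finally, a hedged sketch of how the simplified selection is used at construction time. The SimpleNamespace config is hypothetical and carries only the fields the diff references (rmsnorm, hidden_size, layernorm_epsilon, torch_dtype); with rmsnorm=False the expression falls back to torch.nn.LayerNorm, which takes the same eps/device/dtype keywords as in the diff, and the RMSNorm name is a placeholder for the class defined in modeling_chatglm.py.

from types import SimpleNamespace

import torch
from torch.nn import LayerNorm

# Hypothetical config object with only the fields referenced in the diff.
config = SimpleNamespace(rmsnorm=False, hidden_size=8,
                         layernorm_epsilon=1e-5, torch_dtype=torch.float32)

# Placeholder name so the conditional expression below mirrors the diff verbatim;
# the real RMSNorm lives in modeling_chatglm.py and is never instantiated here
# because config.rmsnorm is False.
RMSNorm = None

LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon,
                                device=None, dtype=config.torch_dtype)

x = torch.randn(2, 3, config.hidden_size, dtype=config.torch_dtype)
print(final_layernorm(x).shape)  # torch.Size([2, 3, 8])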