Upload 2 files
- dependencies.py +0 -0
- main_code.py +253 -253
dependencies.py
CHANGED
The diff for this file is too large to render. See raw diff
main_code.py
CHANGED
|
@@ -8,14 +8,14 @@
|
|
| 8 |
# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
|
| 9 |
#
|
| 10 |
#
|
| 11 |
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 12 |
# you may not use this file except in compliance with the License.
|
| 13 |
# You may obtain a copy of the License at
|
| 14 |
#
|
| 15 |
# http://www.apache.org/licenses/LICENSE-2.0
|
| 16 |
#
|
| 17 |
# Unless required by applicable law or agreed to in writing, software
|
| 18 |
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 19 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 20 |
# See the License for the specific language governing permissions and
|
| 21 |
# limitations under the License.
|
|
@@ -56,25 +56,25 @@ logger = logging.get_logger(__name__)
|
|
| 56 |
|
| 57 |
@dataclass
|
| 58 |
@auto_docstring(
|
| 59 |
-
custom_intro="""
|
| 60 |
Base class for Gemma3n outputs, with hidden states and attentions.
|
| 61 |
-
"""
|
| 62 |
)
|
| 63 |
class Gemma3nModelOutputWithPast(BaseModelOutputWithPast):
|
| 64 |
-
r"""
|
| 65 |
-
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
| 66 |
-
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
| 67 |
-
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
| 68 |
|
| 69 |
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
| 70 |
-
`past_key_values` input) to speed up sequential decoding.
|
| 71 |
-
image_hidden_states (`torch.FloatTensor`, *optional*):
|
| 72 |
-
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
| 73 |
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
| 74 |
-
audio_hidden_states (`torch.FloatTensor`, *optional*):
|
| 75 |
-
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
| 76 |
audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
|
| 77 |
-
"""
|
| 78 |
|
| 79 |
image_hidden_states: Optional[torch.FloatTensor] = None
|
| 80 |
|
|
@@ -83,29 +83,29 @@ class Gemma3nModelOutputWithPast(BaseModelOutputWithPast):
|
|
| 83 |
|
| 84 |
@dataclass
|
| 85 |
@auto_docstring(
|
| 86 |
-
custom_intro="""
|
| 87 |
Base class for Gemma3n causal language model (or autoregressive) outputs.
|
| 88 |
-
"""
|
| 89 |
)
|
| 90 |
class Gemma3nCausalLMOutputWithPast(ModelOutput):
|
| 91 |
-
r"""
|
| 92 |
-
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
| 93 |
Language modeling loss (for next-token prediction).
|
| 94 |
-
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
|
| 95 |
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
| 96 |
-
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
| 97 |
-
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
| 98 |
-
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
| 99 |
|
| 100 |
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
| 101 |
-
`past_key_values` input) to speed up sequential decoding.
|
| 102 |
-
image_hidden_states (`torch.FloatTensor`, *optional*):
|
| 103 |
-
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
| 104 |
image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
|
| 105 |
-
audio_hidden_states (`torch.FloatTensor`, *optional*):
|
| 106 |
-
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
| 107 |
audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
|
| 108 |
-
"""
|
| 109 |
|
| 110 |
loss: Optional[torch.FloatTensor] = None
|
| 111 |
logits: Optional[torch.FloatTensor] = None
|
|
@@ -126,7 +126,7 @@ class Gemma3nRMSNorm(nn.Module):
|
|
| 126 |
if self.with_scale:
|
| 127 |
self.weight = nn.Parameter(torch.ones(dim))
|
| 128 |
else:
|
| 129 |
-
self.register_buffer("weight", torch.tensor(1.0), persistent=False)
|
| 130 |
|
| 131 |
def _norm(self, x):
|
| 132 |
return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
|
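Note: the `_norm` in this hunk divides by the root mean square over the last dimension. A minimal, self-contained sketch of that computation (the function name and the float32 upcast pattern here are illustrative, not the repo's exact class):

```python
import torch

def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Upcast to float32 for the statistics, then cast back to the input dtype,
    # mirroring the _norm / type_as pattern shown above.
    x_f = x.float()
    normed = x_f / torch.sqrt(x_f.pow(2).mean(-1, keepdim=True) + eps)
    return (normed * weight.float()).type_as(x)

x = torch.randn(2, 4, 8)
print(rms_norm(x, torch.ones(8)).shape)  # torch.Size([2, 4, 8])
```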
|
@@ -138,7 +138,7 @@ class Gemma3nRMSNorm(nn.Module):
|
|
| 138 |
return output.type_as(x)
|
| 139 |
|
| 140 |
def extra_repr(self):
|
| 141 |
-
return f"{tuple(self.weight.shape)}, eps={self.eps}"
|
| 142 |
|
| 143 |
|
| 144 |
# ==== Audio Encoder ====
|
|
@@ -163,7 +163,7 @@ class Gemma3nAudioRelativePositionEmbedding(nn.Module):
|
|
| 163 |
log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / max(num_timescales - 1, 1)
|
| 164 |
inv_timescales = min_timescale * torch.exp(torch.arange(num_timescales) * -log_timescale_increment)
|
| 165 |
self.register_buffer(
|
| 166 |
-
"inv_timescales",
|
| 167 |
inv_timescales.float().unsqueeze(0).unsqueeze(0),
|
| 168 |
persistent=False,
|
| 169 |
)
|
|
@@ -184,7 +184,7 @@ class Gemma3nAudioRelativePositionEmbedding(nn.Module):
|
|
| 184 |
key_context_size: int,
|
| 185 |
max_span_plus_1: int,
|
| 186 |
) -> torch.Tensor:
|
| 187 |
-
"""Performs the relative shift.
|
| 188 |
|
| 189 |
Args:
|
| 190 |
term_bd_before_shift: Tensor of shape [B, N, U, W, F_span]. batch_size
|
|
@@ -193,7 +193,7 @@ class Gemma3nAudioRelativePositionEmbedding(nn.Module):
|
|
| 193 |
|
| 194 |
Returns:
|
| 195 |
Tensor of shape [B, N, U, W, C].
|
| 196 |
-
"""
|
| 197 |
# term_bd_before_shift shape: [B, N, U, W, F_span]
|
| 198 |
# Target shape after shift: [B, N, U, W, C]
|
| 199 |
|
|
@@ -209,7 +209,7 @@ class Gemma3nAudioRelativePositionEmbedding(nn.Module):
|
|
| 209 |
term_bd_padded = nn.functional.pad(term_bd_before_shift, padding_tuple)
|
| 210 |
# Shape after pad: [B, N, U, W, C+1]
|
| 211 |
|
| 212 |
-
# Reshape for slicing (emulating JAX's behavior)
|
| 213 |
# [B, N, U, W * (C+1)]
|
| 214 |
term_bd_reshaped = term_bd_padded.reshape(
|
| 215 |
(
|
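Note: the pad/reshape in this hunk is the usual pad-flatten-reslice trick for turning query-position logits into relatively shifted ones. A toy sketch with illustrative shapes (not the model's real padding amounts):

```python
import torch
import torch.nn.functional as F

B, N, U, W, F_span = 1, 1, 1, 3, 5            # illustrative sizes; target C = F_span - 1
term_bd = torch.randn(B, N, U, W, F_span)

padded = F.pad(term_bd, (0, 1))               # [B, N, U, W, F_span + 1]
flat = padded.reshape(B, N, U, W * (F_span + 1))
sliced = flat[..., : W * F_span].reshape(B, N, U, W, F_span)
shifted = sliced[..., 1:]                     # rows are now relatively shifted
print(shifted.shape)                          # torch.Size([1, 1, 1, 3, 4])
```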
|
@@ -271,7 +271,7 @@ class Gemma3nAudioRelativePositionEmbedding(nn.Module):
|
|
| 271 |
term_ac = torch.matmul(queries_p, keys_p_t) # [B, N, U, W, C]
|
| 272 |
|
| 273 |
# term_bd: Query-Position interaction
|
| 274 |
-
# Original einsum: term_bd_unshifed = torch.einsum('buwnh,fnh->bnuwf', queries, sin_emb)
|
| 275 |
# queries shape: [B, U, W, N, H]
|
| 276 |
# sin_emb shape: [F, N, H]
|
| 277 |
# Target output shape: [B, N, U, W, F]
|
|
@@ -338,7 +338,7 @@ class Gemma3nAudioAttention(nn.Module):
|
|
| 338 |
|
| 339 |
q_scale = self.head_dim**-0.5
|
| 340 |
r_softplus_0 = 1.0 / torch.nn.functional.softplus(torch.tensor(0.0))
|
| 341 |
-
self.register_buffer("q_scale", (q_scale * r_softplus_0).clone().detach(), persistent=False)
|
| 342 |
|
| 343 |
lower_causal_mask = torch.tril(
|
| 344 |
torch.ones((self.context_size, self.chunk_size), dtype=torch.bool),
|
|
@@ -350,10 +350,10 @@ class Gemma3nAudioAttention(nn.Module):
|
|
| 350 |
)
|
| 351 |
local_causal_valid_mask = torch.ones((self.chunk_size, self.context_size), dtype=torch.bool)
|
| 352 |
local_causal_valid_mask = local_causal_valid_mask * lower_causal_mask * upper_causal_mask
|
| 353 |
-
self.register_buffer("local_causal_valid_mask", local_causal_valid_mask, persistent=False)
|
| 354 |
|
| 355 |
self.register_buffer(
|
| 356 |
-
"softcap",
|
| 357 |
torch.tensor(self.attention_logits_soft_cap).float(),
|
| 358 |
persistent=False,
|
| 359 |
)
|
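Note: the `softcap` buffer registered here is used to soft-cap the attention logits before masking and softmax. A hedged sketch of tanh soft-capping (the cap value is illustrative):

```python
import torch

def soft_cap_logits(logits: torch.Tensor, cap: float = 50.0) -> torch.Tensor:
    # Smoothly squash logits into (-cap, cap) rather than hard-clipping them.
    return cap * torch.tanh(logits / cap)

logits = 100.0 * torch.randn(2, 4, 16)
print(soft_cap_logits(logits).abs().max() < 50.0)  # tensor(True)
```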
|
@@ -366,7 +366,7 @@ class Gemma3nAudioAttention(nn.Module):
|
|
| 366 |
return x
|
| 367 |
|
| 368 |
def _convert_to_block(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
| 369 |
-
"""Turns a sequence to non overlapping blocks.
|
| 370 |
|
| 371 |
Args:
|
| 372 |
hidden_states: a tensor of [batch, time, ...].
|
|
@@ -375,7 +375,7 @@ class Gemma3nAudioAttention(nn.Module):
|
|
| 375 |
A tensor of [batch, num_blocks, block_size, ...], with necessary
|
| 376 |
paddings,
|
| 377 |
where output[:, i, ...] are x[:, i*block_size:(i+1)*block_size, ...].
|
| 378 |
-
"""
|
| 379 |
shape = hidden_states.shape
|
| 380 |
b, t = shape[:2]
|
| 381 |
num_blocks = (t + self.chunk_size - 1) // self.chunk_size
|
|
@@ -388,7 +388,7 @@ class Gemma3nAudioAttention(nn.Module):
|
|
| 388 |
return hidden_states
|
| 389 |
|
| 390 |
def _extract_block_context(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
| 391 |
-
"""Extracts temporal context for every block.
|
| 392 |
|
| 393 |
Args:
|
| 394 |
hidden_states: a tensor of [batch, time, ...].
|
|
@@ -400,11 +400,11 @@ class Gemma3nAudioAttention(nn.Module):
|
|
| 400 |
and output[:, i, ...] are x[:, start-left_context:end+right_context,
|
| 401 |
...],
|
| 402 |
start = i * block_size, end = (i + 1) * block_size.
|
| 403 |
-
"""
|
| 404 |
pad_left = self.max_past_horizon
|
| 405 |
-
# The JAX equivalent padding for signal.frame with pad_mode='valid' is
|
| 406 |
# (left_context, right_context + block_size - 1) on the time dimension.
|
| 407 |
-
# PyTorch's _pad_dim1 applies padding symmetrically if only one value is given,
|
| 408 |
# or (pad_dim_start, pad_dim_end) if two are given.
|
| 409 |
# Our _pad_dim1(x, pad_left, pad_right) pads dim -2 (time for [B,T,N,H])
|
| 410 |
# or dim 1 (time for [B,T]).
|
|
@@ -424,7 +424,7 @@ class Gemma3nAudioAttention(nn.Module):
|
|
| 424 |
|
| 425 |
# If x was [B, T_padded], x_unfolded is [B, num_blocks, frame_len]
|
| 426 |
# If x was [B, T_padded, N, H], x_unfolded is [B, num_blocks, N, H, frame_len]
|
| 427 |
-
# We want to match JAX's typical output for such operations which might be
|
| 428 |
# [B, num_blocks, frame_len, N, H] if N, H are present.
|
| 429 |
# The relative_position_embedding expects keys as [B, U, C, N, H].
|
| 430 |
# If x_unfolded is [B, U, N, H, C(frame_len)], we need to move C.
|
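Note: `_extract_block_context` frames the padded time axis into overlapping windows, and `Tensor.unfold` does the core work. A toy 1-D sketch with illustrative chunk/context sizes:

```python
import torch
import torch.nn.functional as F

x = torch.arange(10.0).unsqueeze(0)             # [B=1, T=10]
chunk, left, right = 4, 2, 1                     # illustrative sizes
frame_len = left + chunk + right

x_padded = F.pad(x, (left, right + chunk - 1))   # pad the time dimension
blocks = x_padded.unfold(dimension=1, size=frame_len, step=chunk)
print(blocks.shape)    # torch.Size([1, 3, 7]) -> [B, num_blocks, context]
print(blocks[0, 0])    # first block: left-context zeros, then t = 0..4
```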
|
@@ -436,7 +436,7 @@ class Gemma3nAudioAttention(nn.Module):
|
|
| 436 |
return x_unfolded.contiguous()
|
| 437 |
|
| 438 |
def forward(self, hidden_states: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor:
|
| 439 |
-
# sl.Dense uses jax.numpy.einsum("...a,abcd->...bcd") and jax.numpy.select()
|
| 440 |
qkv_shape = (*hidden_states.shape[:-1], self.num_heads, self.head_dim)
|
| 441 |
query_states = self.q_proj(hidden_states).reshape(qkv_shape).contiguous()
|
| 442 |
key_states = self.k_proj(hidden_states).reshape(qkv_shape).contiguous()
|
|
@@ -472,7 +472,7 @@ class Gemma3nAudioAttention(nn.Module):
|
|
| 472 |
extracted_valid_mask_blocks = extracted_valid_mask_blocks.reshape(
|
| 473 |
batch_size, num_query_blocks, self.context_size
|
| 474 |
)
|
| 475 |
-
# After potential reshape, ensure it's [B, U, C] if it was from a [B,T] mask.
|
| 476 |
# This assertion might be too strict if _extract_block_context handles higher-rank inputs differently,
|
| 477 |
# but for the mask case, this should hold.
|
| 478 |
if extracted_valid_mask_blocks.shape != (
|
|
@@ -481,9 +481,9 @@ class Gemma3nAudioAttention(nn.Module):
|
|
| 481 |
self.context_size,
|
| 482 |
):
|
| 483 |
raise ValueError(
|
| 484 |
-
"Shape of extracted_valid_mask_blocks"
|
| 485 |
-
f" {extracted_valid_mask_blocks.shape} is not ({batch_size}
|
| 486 |
-
f" {num_query_blocks}, {self.context_size}) after potential reshape
|
| 487 |
)
|
| 488 |
|
| 489 |
# 3. Expand dimensions for broadcasting with logits and causal mask.
|
|
@@ -518,7 +518,7 @@ class Gemma3nAudioAttention(nn.Module):
|
|
| 518 |
logits = torch.where(final_condition_for_where, logits, torch.finfo(logits.dtype).min)
|
| 519 |
probabilities = torch.nn.functional.softmax(logits, dim=-1, dtype=torch.float32).to(dtype=value_blocks.dtype)
|
| 520 |
|
| 521 |
-
# context_vectors is adapted from jax.numpy.einsum("BNuwc,BucNH->BuwNH", ...)
|
| 522 |
b_dim, n_dim, u_dim, w_dim, c_dim = probabilities.shape
|
| 523 |
h_dim = value_blocks.shape[-1]
|
| 524 |
prob_bun = probabilities.permute(0, 2, 1, 3, 4).reshape(-1, w_dim, c_dim)
|
|
@@ -539,21 +539,21 @@ class Gemma3nAudioAttention(nn.Module):
|
|
| 539 |
|
| 540 |
|
| 541 |
class Gemma3nAudioCumulativeGroupNorm(nn.Module):
|
| 542 |
-
"""Applies Group Normalization cumulatively over the time dimension.
|
| 543 |
|
| 544 |
This layer normalizes the input by calculating the mean and variance
|
| 545 |
cumulatively over the time dimension (dim 1). The statistics are computed
|
| 546 |
-
over all feature dimensions (specified by `feature_dims` and `num_channels`)
|
| 547 |
-
for elements marked as valid by the optional `mask`.
|
| 548 |
|
| 549 |
-
If a `mask` is provided (True for valid, False for invalid),
|
| 550 |
invalid time steps do not contribute to the statistics calculation, and
|
| 551 |
their corresponding output values are zeroed out.
|
| 552 |
|
| 553 |
Scale and bias, if enabled, are applied per-channel (last dimension).
|
| 554 |
-
This behavior is similar to JAX's
|
| 555 |
-
and
|
| 556 |
-
"""
|
| 557 |
|
| 558 |
def __init__(
|
| 559 |
self,
|
|
@@ -574,19 +574,19 @@ class Gemma3nAudioCumulativeGroupNorm(nn.Module):
|
|
| 574 |
self.reduction_axes = tuple(range(2, 2 + len(self.feature_dims) + 1))
|
| 575 |
|
| 576 |
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
| 577 |
-
"""Applies cumulative group norm, optionally using a mask.
|
| 578 |
|
| 579 |
Args:
|
| 580 |
hidden_states: Input tensor, shape [B, T, *feature_dims, C].
|
| 581 |
|
| 582 |
Returns:
|
| 583 |
Normalized tensor with the same shape as x.
|
| 584 |
-
"""
|
| 585 |
expected_input_suffix = self.feature_dims + (self.num_channels,)
|
| 586 |
if hidden_states.shape[2:] != expected_input_suffix:
|
| 587 |
raise ValueError(
|
| 588 |
-
f"Input tensor shape suffix {hidden_states.shape[2:]} does not match expected"
|
| 589 |
-
f" suffix (feature_dims + num_channels) {expected_input_suffix}"
|
| 590 |
)
|
| 591 |
|
| 592 |
input_dtype = hidden_states.dtype
|
|
@@ -594,7 +594,7 @@ class Gemma3nAudioCumulativeGroupNorm(nn.Module):
|
|
| 594 |
calc_dtype = torch.float32
|
| 595 |
x_calc = hidden_states.to(calc_dtype)
|
| 596 |
|
| 597 |
-
# Prepare a broadcastable mask (`mask_calc`).
|
| 598 |
# If no mask is provided, treat all elements as valid
|
| 599 |
# (mask_calc is all ones).
|
| 600 |
# Otherwise, expand the [B, T] mask to [B, T, 1, ..., 1] for broadcasting.
|
|
@@ -607,7 +607,7 @@ class Gemma3nAudioCumulativeGroupNorm(nn.Module):
|
|
| 607 |
cum_sum_values = torch.cumsum(sum_values_at_t, dim=1)
|
| 608 |
|
| 609 |
# 3. Count of valid elements in the normalization group at each time step.
|
| 610 |
-
# (A "group" here consists of all features at a given Batch, Time).
|
| 611 |
elements_in_group_at_t = torch.sum(mask_calc, dim=self.reduction_axes, keepdim=True)
|
| 612 |
# 4. Cumulative count of valid elements over time.
|
| 613 |
cum_count_elements = torch.cumsum(elements_in_group_at_t, dim=1)
|
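Note: the statistics here are accumulated with `cumsum` over masked per-step sums and counts. A condensed sketch for a [B, T, C] input (function and shapes are illustrative; the real layer also reduces over extra feature dims):

```python
import torch

def cumulative_mean_var(x: torch.Tensor, mask: torch.Tensor):
    # x: [B, T, C]; mask: [B, T] with 1.0 for valid steps.
    m = mask[..., None].to(x.dtype)
    cum_sum = torch.cumsum((x * m).sum(-1, keepdim=True), dim=1)
    cum_cnt = torch.cumsum(m.sum(-1, keepdim=True), dim=1).clamp(min=1.0)
    cum_mean = cum_sum / cum_cnt
    sq_diff = ((x - cum_mean) ** 2 * m).sum(-1, keepdim=True)
    cum_var = torch.cumsum(sq_diff, dim=1) / cum_cnt
    return cum_mean, cum_var

mean, var = cumulative_mean_var(torch.randn(2, 5, 8), torch.ones(2, 5))
print(mean.shape, var.shape)  # torch.Size([2, 5, 1]) torch.Size([2, 5, 1])
```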
|
@@ -648,11 +648,11 @@ class Gemma3nAudioCumulativeGroupNorm(nn.Module):
|
|
| 648 |
|
| 649 |
|
| 650 |
class Gemma3nAudioSSCPConvBlock(nn.Module):
|
| 651 |
-
"""A single convolution block for the SubSampleConvProjection.
|
| 652 |
|
| 653 |
This block consists of a 2D convolution, followed by CumulativeGroupNorm,
|
| 654 |
and a ReLU activation. It handles manual padding for the convolution.
|
| 655 |
-
"""
|
| 656 |
|
| 657 |
def __init__(
|
| 658 |
self,
|
|
@@ -665,7 +665,7 @@ class Gemma3nAudioSSCPConvBlock(nn.Module):
|
|
| 665 |
self.config = config
|
| 666 |
self.manual_padding = manual_padding
|
| 667 |
|
| 668 |
-
# in_channels is 1 for the first block, or C_out from previous block's conv
|
| 669 |
in_channels = 1 if idx == 0 else self.config.sscp_conv_channel_size[idx - 1]
|
| 670 |
out_channels = self.config.sscp_conv_channel_size[idx]
|
| 671 |
kernel_h, kernel_w = self.config.sscp_conv_kernel_size[idx]
|
|
@@ -701,7 +701,7 @@ class Gemma3nAudioSSCPConvBlock(nn.Module):
|
|
| 701 |
# Input audio_encodings is [B, C_in, T_in, F_in] (e.g., C_in=1)
|
| 702 |
# manual_padding is (pad_F_left, pad_F_right, pad_T_top, pad_T_bottom)
|
| 703 |
# F.pad applies to last two dims: F_in then T_in
|
| 704 |
-
audio_encodings_padded = F.pad(audio_encodings, self.manual_padding, mode="constant", value=0.0)
|
| 705 |
# Expected padded shape for F_in, k_w=3, pad_F=(1,1) -> F_padded = F_in+2
|
| 706 |
# Expected padded shape for T_in, k_h=3, pad_T=(0,2) -> T_padded = T_in+2
|
| 707 |
audio_encodings_conv = self.conv(audio_encodings_padded)
|
|
@@ -728,7 +728,7 @@ class Gemma3nAudioSubSampleConvProjection(nn.Module):
|
|
| 728 |
stride_h, stride_w = config.sscp_conv_stride_size[i]
|
| 729 |
|
| 730 |
# Padding for Time (Height for Conv2d) - REVERSE_CAUSAL like
|
| 731 |
-
# JAX 'reverse_causal' padding is (0, kernel_size - 1)
|
| 732 |
pad_t_top = 0
|
| 733 |
pad_t_bottom = kernel_h - 1
|
| 734 |
|
|
@@ -736,7 +736,7 @@ class Gemma3nAudioSubSampleConvProjection(nn.Module):
|
|
| 736 |
# Based on JAX effective padding (1,1) for F_in=10, K_w=3, S_w=2
|
| 737 |
# and the successful test configuration.
|
| 738 |
# If kernel/stride/input_freq for frequency changes, this might need re-evaluation
|
| 739 |
-
# to match generic JAX 'SAME' behavior if it differs.
|
| 740 |
pad_f_left = 1
|
| 741 |
pad_f_right = 1
|
| 742 |
|
|
@@ -792,7 +792,7 @@ class Gemma3nAudioConformerAttention(nn.Module):
|
|
| 792 |
super().__init__()
|
| 793 |
self.config = config
|
| 794 |
self.post_in_features = self.config.hidden_size
|
| 795 |
-
self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False)
|
| 796 |
self.pre_attn_norm = Gemma3nRMSNorm(self.config.hidden_size)
|
| 797 |
self.attn = Gemma3nAudioAttention(config)
|
| 798 |
self.post = nn.Linear(self.post_in_features, self.config.hidden_size, bias=False)
|
|
@@ -820,7 +820,7 @@ class Gemma3nAudioConformerFeedForward(nn.Module):
|
|
| 820 |
super().__init__()
|
| 821 |
self.config = config
|
| 822 |
|
| 823 |
-
self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False)
|
| 824 |
|
| 825 |
self.pre_layer_norm = Gemma3nRMSNorm(self.config.hidden_size)
|
| 826 |
self.ffw_layer_1 = nn.Linear(self.config.hidden_size, self.config.hidden_size * 4, bias=False)
|
|
@@ -856,7 +856,7 @@ class Gemma3nAudioConformerLightConv1d(nn.Module):
|
|
| 856 |
groups=self.config.hidden_size, # Depthwise
|
| 857 |
bias=False,
|
| 858 |
)
|
| 859 |
-
self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False)
|
| 860 |
self.conv_norm = Gemma3nRMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
|
| 861 |
self.linear_end = nn.Linear(self.config.hidden_size, self.config.hidden_size, bias=False)
|
| 862 |
|
|
@@ -892,7 +892,7 @@ class Gemma3nAudioConformerBlock(nn.Module):
|
|
| 892 |
self.attention = Gemma3nAudioConformerAttention(self.config)
|
| 893 |
self.lconv1d = Gemma3nAudioConformerLightConv1d(self.config)
|
| 894 |
self.ffw_layer_end = Gemma3nAudioConformerFeedForward(self.config)
|
| 895 |
-
self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False)
|
| 896 |
self.norm = Gemma3nRMSNorm(self.config.hidden_size)
|
| 897 |
|
| 898 |
def forward(self, audio_encodings: torch.Tensor, audio_mel_mask: torch.BoolTensor) -> torch.Tensor:
|
|
@@ -911,11 +911,11 @@ class Gemma3nAudioConformerBlock(nn.Module):
|
|
| 911 |
|
| 912 |
|
| 913 |
class Gemma3nAudioEncoder(PreTrainedModel):
|
| 914 |
-
"""An audio encoder based on the [Universal Speech Model](https://arxiv.org/abs/2303.01037) architecture
|
| 915 |
|
| 916 |
config_class = Gemma3nAudioConfig
|
| 917 |
|
| 918 |
-
main_input_name = "audio_mel"
|
| 919 |
|
| 920 |
def __init__(self, config: Gemma3nAudioConfig):
|
| 921 |
super().__init__(config)
|
|
@@ -929,7 +929,7 @@ class Gemma3nAudioEncoder(PreTrainedModel):
|
|
| 929 |
def forward(
|
| 930 |
self, audio_mel: torch.Tensor, audio_mel_mask: torch.BoolTensor
|
| 931 |
) -> tuple[torch.Tensor, torch.BoolTensor]:
|
| 932 |
-
"""Encodes a batch of MELs.
|
| 933 |
|
| 934 |
Args:
|
| 935 |
audio_mel: a torch.Tensor of shape [batch, num_frames, num_channels,
|
|
@@ -937,10 +937,10 @@ class Gemma3nAudioEncoder(PreTrainedModel):
|
|
| 937 |
|
| 938 |
Returns:
|
| 939 |
audio_encodings: a torch.Tensor of shape
|
| 940 |
-
|
| 941 |
-
self.config.audio_config.hidden_size]
|
| 942 |
audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames].
|
| 943 |
-
"""
|
| 944 |
audio_encodings = self.subsample_conv_projection(audio_mel) # audio_encodings: [B, T_sub, D]
|
| 945 |
|
| 946 |
# Subsample the input audio_mel_mask to match the time dimension of audio_encodings (T_sub)
|
|
@@ -983,20 +983,20 @@ class Gemma3nAudioEncoder(PreTrainedModel):
|
|
| 983 |
|
| 984 |
|
| 985 |
class Gemma3nTextScaledWordEmbedding(nn.Embedding):
|
| 986 |
-
"""
|
| 987 |
-
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
|
| 988 |
-
"""
|
| 989 |
|
| 990 |
def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: float = 1.0):
|
| 991 |
super().__init__(num_embeddings, embedding_dim, padding_idx)
|
| 992 |
-
self.register_buffer("embed_scale", torch.tensor(embed_scale), persistent=False)
|
| 993 |
|
| 994 |
def forward(self, input_ids: torch.Tensor):
|
| 995 |
return super().forward(input_ids) * self.embed_scale.to(self.weight.dtype)
|
| 996 |
|
| 997 |
|
| 998 |
class Gemma3nTextLaurelBlock(nn.Module):
|
| 999 |
-
"""Learned Augmented Residual Layer"""
|
| 1000 |
|
| 1001 |
def __init__(self, config: Gemma3nTextConfig):
|
| 1002 |
super().__init__()
|
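Note: the scaled word embedding in this hunk just multiplies standard embeddings by a constant stored in a non-persistent buffer. A minimal sketch (the sqrt(hidden_size) value is the usual convention and is only an example here):

```python
import torch
from torch import nn

class ScaledEmbedding(nn.Embedding):
    def __init__(self, num_embeddings: int, embedding_dim: int, embed_scale: float = 1.0):
        super().__init__(num_embeddings, embedding_dim)
        # Non-persistent: a constant, so it is not written to checkpoints.
        self.register_buffer("embed_scale", torch.tensor(embed_scale), persistent=False)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        return super().forward(input_ids) * self.embed_scale.to(self.weight.dtype)

emb = ScaledEmbedding(100, 16, embed_scale=16 ** 0.5)
print(emb(torch.tensor([[1, 2, 3]])).shape)  # torch.Size([1, 3, 16])
```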
|
@@ -1052,16 +1052,16 @@ class Gemma3nTextMLP(nn.Module):
|
|
| 1052 |
|
| 1053 |
|
| 1054 |
class Gemma3nTextAltUp(nn.Module):
|
| 1055 |
-
"""Alternating Updates (AltUp)
|
| 1056 |
|
| 1057 |
-
The AltUp module wraps transformer layers. The `predict` step modifies the
|
| 1058 |
-
input to the transformer layer, and the `correct` step propagates the output
|
| 1059 |
of the transformer layer to the sparsely updated dimensions.
|
| 1060 |
|
| 1061 |
See more in the research paper:
|
| 1062 |
|
| 1063 |
https://proceedings.neurips.cc/paper_files/paper/2023/file/f2059277ac6ce66e7e5543001afa8bb5-Paper-Conference.pdf
|
| 1064 |
-
"""
|
| 1065 |
|
| 1066 |
def __init__(self, config: Gemma3nTextConfig):
|
| 1067 |
super().__init__()
|
|
@@ -1071,7 +1071,7 @@ class Gemma3nTextAltUp(nn.Module):
|
|
| 1071 |
self.prediction_coefs = nn.Linear(self.config.altup_num_inputs, self.config.altup_num_inputs**2, bias=False)
|
| 1072 |
self.modality_router = nn.Linear(self.config.hidden_size, self.config.altup_num_inputs, bias=False)
|
| 1073 |
self.router_norm = Gemma3nRMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
|
| 1074 |
-
self.register_buffer("router_input_scale", torch.tensor(self.config.hidden_size**-1.0), persistent=False)
|
| 1075 |
|
| 1076 |
def compute_router_modalities(self, x: torch.Tensor) -> torch.Tensor:
|
| 1077 |
router_inputs = self.router_norm(x) * self.router_input_scale
|
|
@@ -1079,15 +1079,15 @@ class Gemma3nTextAltUp(nn.Module):
|
|
| 1079 |
return torch.tanh(routed.float()).type_as(x)
|
| 1080 |
|
| 1081 |
def predict(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
| 1082 |
-
"""Predicts the output of a layer using a trainable map.
|
| 1083 |
|
| 1084 |
Args:
|
| 1085 |
-
hidden_states: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
|
| 1086 |
-
stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.
|
| 1087 |
|
| 1088 |
Returns:
|
| 1089 |
-
A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` containing the predictions.
|
| 1090 |
-
"""
|
| 1091 |
modalities = self.compute_router_modalities(hidden_states[self.config.altup_active_idx])
|
| 1092 |
|
| 1093 |
if self.training and self.config.altup_coef_clip is not None:
|
|
@@ -1107,17 +1107,17 @@ class Gemma3nTextAltUp(nn.Module):
|
|
| 1107 |
return predictions.contiguous().type_as(hidden_states)
|
| 1108 |
|
| 1109 |
def correct(self, predictions: torch.Tensor, activated: torch.Tensor) -> torch.Tensor:
|
| 1110 |
-
"""Corrects the predictions relative to the
|
| 1111 |
|
| 1112 |
Args:
|
| 1113 |
-
predictions: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
|
| 1114 |
-
stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.
|
| 1115 |
-
activated: A 3D tensor of shape `[batch_size, num_tokens, hidden_size]` containing the activated hidden states.
|
| 1116 |
|
| 1117 |
Returns:
|
| 1118 |
-
A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` correcting the
|
| 1119 |
predictions relative to the activated input embeddings.
|
| 1120 |
-
"""
|
| 1121 |
modalities = self.compute_router_modalities(activated)
|
| 1122 |
innovation = activated - predictions[self.config.altup_active_idx] # (batch, num_tokens, hidden_size)
|
| 1123 |
innovation = innovation.repeat(self.config.altup_num_inputs, 1, 1, 1) # Repeat on dim0 to match predictions
|
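Note: `correct` turns router modalities into one scalar coefficient per AltUp input and adds the scaled innovation back onto each prediction. A much-simplified sketch of that broadcasting pattern (all names and sizes illustrative):

```python
import torch
from torch import nn

num_inputs, batch, tokens, hidden = 4, 2, 5, 8
correction_coefs = nn.Linear(num_inputs, num_inputs, bias=False)

predictions = torch.randn(num_inputs, batch, tokens, hidden)
activated = torch.randn(batch, tokens, hidden)              # output of the wrapped layer
modalities = torch.tanh(torch.randn(batch, tokens, num_inputs))

innovation = (activated - predictions[0]).repeat(num_inputs, 1, 1, 1)
all_coefs = correction_coefs(modalities) + 1.0              # [batch, tokens, num_inputs]
all_coefs = all_coefs.permute(2, 0, 1).unsqueeze(-1)        # [num_inputs, batch, tokens, 1]

corrected = predictions + innovation * all_coefs
print(corrected.shape)  # torch.Size([4, 2, 5, 8])
```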
|
@@ -1125,7 +1125,7 @@ class Gemma3nTextAltUp(nn.Module):
|
|
| 1125 |
if self.config.altup_coef_clip is not None:
|
| 1126 |
self.correction_coefs.weight.data.clamp_(-self.config.altup_coef_clip, self.config.altup_coef_clip)
|
| 1127 |
|
| 1128 |
-
# all_coefs adapted from jax.numpy.einsum("...p,pi->...i", ...)
|
| 1129 |
# Permute to (altup_num_inputs, batch_size, num_tokens) as the last dim is a scalar applied to each altup input
|
| 1130 |
# and expand on dim1 for broadcastability
|
| 1131 |
all_coefs: torch.Tensor = self.correction_coefs(modalities) + 1.0
|
|
@@ -1136,26 +1136,26 @@ class Gemma3nTextAltUp(nn.Module):
|
|
| 1136 |
return corrected.contiguous().type_as(activated)
|
| 1137 |
|
| 1138 |
def forward(self, corrected: torch.Tensor) -> torch.Tensor:
|
| 1139 |
-
"""
|
| 1140 |
-
This is only defined as the `forward` so that accelerate hooks can move correctly `correct_output_scale`
|
| 1141 |
(which is a nn.Parameter, not a Module) between devices when offloading. It is otherwise only used in
|
| 1142 |
-
`scale_corrected_output`
|
| 1143 |
-
"""
|
| 1144 |
return (corrected.type_as(self.correct_output_scale) * self.correct_output_scale).type_as(corrected)
|
| 1145 |
|
| 1146 |
def scale_corrected_output(self, corrected: torch.Tensor) -> torch.Tensor:
|
| 1147 |
-
"""Scales the provided 3D tensor of shape [batch_size, num_tokens, hidden_size]
|
| 1148 |
return self.forward(corrected)
|
| 1149 |
|
| 1150 |
|
| 1151 |
class Gemma3nTextRotaryEmbedding(nn.Module):
|
| 1152 |
def __init__(self, config: Gemma3nTextConfig, device=None):
|
| 1153 |
super().__init__()
|
| 1154 |
-
# BC: "rope_type" was originally "type"
|
| 1155 |
-
if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
|
| 1156 |
-
self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
|
| 1157 |
else:
|
| 1158 |
-
self.rope_type = "default"
|
| 1159 |
self.max_seq_len_cached = config.max_position_embeddings
|
| 1160 |
self.original_max_seq_len = config.max_position_embeddings
|
| 1161 |
|
|
@@ -1163,7 +1163,7 @@ class Gemma3nTextRotaryEmbedding(nn.Module):
|
|
| 1163 |
self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
|
| 1164 |
|
| 1165 |
inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
|
| 1166 |
-
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
| 1167 |
self.original_inv_freq = self.inv_freq
|
| 1168 |
|
| 1169 |
@torch.no_grad()
|
|
@@ -1172,7 +1172,7 @@ class Gemma3nTextRotaryEmbedding(nn.Module):
|
|
| 1172 |
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
|
| 1173 |
position_ids_expanded = position_ids[:, None, :].float()
|
| 1174 |
|
| 1175 |
-
device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
|
| 1176 |
with torch.autocast(device_type=device_type, enabled=False): # Force float32
|
| 1177 |
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
|
| 1178 |
emb = torch.cat((freqs, freqs), dim=-1)
|
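Note: the rotary forward computes the cos/sin tables in float32 with autocast disabled. A condensed, self-contained sketch (base frequency and sizes are illustrative):

```python
import torch

dim, base = 8, 10_000.0
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))  # [dim // 2]
position_ids = torch.arange(6)[None, :].float()                     # [B=1, T=6]

with torch.autocast(device_type="cpu", enabled=False):              # force float32
    freqs = (inv_freq[None, :, None] @ position_ids[:, None, :]).transpose(1, 2)
    emb = torch.cat((freqs, freqs), dim=-1)
    cos, sin = emb.cos(), emb.sin()

print(cos.shape, sin.shape)  # torch.Size([1, 6, 8]) torch.Size([1, 6, 8])
```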
|
@@ -1183,17 +1183,17 @@ class Gemma3nTextRotaryEmbedding(nn.Module):
|
|
| 1183 |
|
| 1184 |
|
| 1185 |
def rotate_half(x):
|
| 1186 |
-
"""Rotates half the hidden dims of the input
|
| 1187 |
x1 = x[..., : x.shape[-1] // 2]
|
| 1188 |
x2 = x[..., x.shape[-1] // 2 :]
|
| 1189 |
return torch.cat((-x2, x1), dim=-1)
|
| 1190 |
|
| 1191 |
|
| 1192 |
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
|
| 1193 |
-
"""
|
| 1194 |
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
|
| 1195 |
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
|
| 1196 |
-
"""
|
| 1197 |
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
|
| 1198 |
if n_rep == 1:
|
| 1199 |
return hidden_states
|
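Note: both helpers in this hunk are small tensor utilities; this standalone sketch shows what each one does on toy inputs:

```python
import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # (x1, x2) -> (-x2, x1) on the last dimension.
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    # Equivalent to torch.repeat_interleave(hidden_states, n_rep, dim=1), used for GQA.
    b, kv_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    expanded = hidden_states[:, :, None, :, :].expand(b, kv_heads, n_rep, slen, head_dim)
    return expanded.reshape(b, kv_heads * n_rep, slen, head_dim)

print(rotate_half(torch.arange(4.0)))               # tensor([-2., -3.,  0.,  1.])
print(repeat_kv(torch.randn(2, 2, 5, 4), 3).shape)  # torch.Size([2, 6, 5, 4])
```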
|
@@ -1243,38 +1243,38 @@ def apply_rotary_pos_emb(
|
|
| 1243 |
position_ids: Optional[torch.Tensor] = None,
|
| 1244 |
unsqueeze_dim: int = 1,
|
| 1245 |
):
|
| 1246 |
-
"""Applies Rotary Position Embedding to the query and key tensors.
|
| 1247 |
|
| 1248 |
Args:
|
| 1249 |
-
x (`torch.Tensor`): The tensor to embed.
|
| 1250 |
-
cos (`torch.Tensor`): The cosine part of the rotary embedding.
|
| 1251 |
-
sin (`torch.Tensor`): The sine part of the rotary embedding.
|
| 1252 |
-
position_ids (`torch.Tensor`, *optional*):
|
| 1253 |
Deprecated and unused.
|
| 1254 |
-
unsqueeze_dim (`int`, *optional*, defaults to 1):
|
| 1255 |
-
The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
|
| 1256 |
sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
|
| 1257 |
that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
|
| 1258 |
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
|
| 1259 |
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
|
| 1260 |
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
|
| 1261 |
Returns:
|
| 1262 |
-
`tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
|
| 1263 |
-
"""
|
| 1264 |
cos = cos.unsqueeze(unsqueeze_dim)
|
| 1265 |
sin = sin.unsqueeze(unsqueeze_dim)
|
| 1266 |
return (x * cos) + (rotate_half(x) * sin)
|
| 1267 |
|
| 1268 |
|
| 1269 |
class Gemma3nTextAttention(nn.Module):
|
| 1270 |
-
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
| 1271 |
|
| 1272 |
def __init__(self, config: Gemma3nTextConfig, layer_idx: int):
|
| 1273 |
super().__init__()
|
| 1274 |
-
self.is_sliding = config.layer_types[layer_idx] == "sliding_attention"
|
| 1275 |
self.config = config
|
| 1276 |
self.layer_idx = layer_idx
|
| 1277 |
-
self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
|
| 1278 |
self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
|
| 1279 |
self.attention_dropout = self.config.attention_dropout
|
| 1280 |
self.is_causal = True
|
|
@@ -1356,15 +1356,15 @@ class Gemma3nTextAttention(nn.Module):
|
|
| 1356 |
if past_key_value is not None:
|
| 1357 |
# sin and cos are specific to RoPE models; cache_position needed for the static cache
|
| 1358 |
cache_kwargs = {
|
| 1359 |
-
"sin": sin,
|
| 1360 |
-
"cos": cos,
|
| 1361 |
-
"cache_position": cache_position,
|
| 1362 |
-
"sliding_window": self.sliding_window,
|
| 1363 |
}
|
| 1364 |
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
|
| 1365 |
|
| 1366 |
attention_interface: Callable = eager_attention_forward
|
| 1367 |
-
if self.config._attn_implementation != "eager":
|
| 1368 |
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
| 1369 |
|
| 1370 |
attn_output, attn_weights = attention_interface(
|
|
@@ -1407,7 +1407,7 @@ class Gemma3nTextDecoderLayer(GradientCheckpointingLayer):
|
|
| 1407 |
self.per_layer_projection = nn.Linear(self.hidden_size_per_layer_input, self.hidden_size, bias=False)
|
| 1408 |
self.post_per_layer_input_norm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps)
|
| 1409 |
|
| 1410 |
-
@deprecate_kwarg("last_cache_position", version
|
| 1411 |
def forward(
|
| 1412 |
self,
|
| 1413 |
hidden_states: torch.Tensor,
|
|
@@ -1460,12 +1460,12 @@ class Gemma3nTextDecoderLayer(GradientCheckpointingLayer):
|
|
| 1460 |
if self.config.altup_correct_scale:
|
| 1461 |
first_prediction = self.altup.scale_corrected_output(first_prediction)
|
| 1462 |
|
| 1463 |
-
# per_layer_input_gate adapted from jax.numpy.einsum("btd,dp->btp", ...)
|
| 1464 |
first_prediction = self.per_layer_input_gate(first_prediction)
|
| 1465 |
first_prediction = self.act_fn(first_prediction)
|
| 1466 |
first_prediction = torch.multiply(first_prediction, per_layer_input)
|
| 1467 |
|
| 1468 |
-
# per_layer_projection adapted from jax.numpy.einsum("btp,pd->btd", ...)
|
| 1469 |
first_prediction = self.per_layer_projection(first_prediction)
|
| 1470 |
first_prediction = self.post_per_layer_input_norm(first_prediction)
|
| 1471 |
corrected_predictions[1:] += first_prediction
|
|
@@ -1481,10 +1481,10 @@ class Gemma3nTextDecoderLayer(GradientCheckpointingLayer):
|
|
| 1481 |
@auto_docstring
|
| 1482 |
class Gemma3nPreTrainedModel(PreTrainedModel):
|
| 1483 |
config_class = Gemma3nConfig
|
| 1484 |
-
base_model_prefix = ""
|
| 1485 |
supports_gradient_checkpointing = True
|
| 1486 |
-
_no_split_modules = ["Gemma3nTextDecoderLayer"]
|
| 1487 |
-
_skip_keys_device_placement = ["past_key_values"]
|
| 1488 |
_supports_flash_attn_3 = True
|
| 1489 |
_supports_flash_attn_2 = True
|
| 1490 |
_supports_sdpa = True
|
|
@@ -1495,9 +1495,9 @@ class Gemma3nPreTrainedModel(PreTrainedModel):
|
|
| 1495 |
_supports_attention_backend = True
|
| 1496 |
|
| 1497 |
def _init_weights(self, module):
|
| 1498 |
-
# important: this ported version of Gemma2 isn't meant for training from scratch - only
|
| 1499 |
# inference and fine-tuning - so the proper init weights code has been removed
|
| 1500 |
-
std = getattr(self.config, "initializer_range", self.config.get_text_config().initializer_range)
|
| 1501 |
|
| 1502 |
if isinstance(module, (nn.Linear, nn.Conv1d, nn.Conv2d)):
|
| 1503 |
module.weight.data.normal_(mean=0.0, std=std)
|
|
@@ -1518,7 +1518,7 @@ class Gemma3nPreTrainedModel(PreTrainedModel):
|
|
| 1518 |
module.correct_output_scale.data.zero_()
|
| 1519 |
|
| 1520 |
|
| 1521 |
-
@auto_docstring(custom_intro
|
| 1522 |
class Gemma3nTextModel(Gemma3nPreTrainedModel):
|
| 1523 |
config_class = Gemma3nTextConfig
|
| 1524 |
|
|
@@ -1544,7 +1544,7 @@ class Gemma3nTextModel(Gemma3nPreTrainedModel):
|
|
| 1544 |
# defaults should hold values for global RoPE.
|
| 1545 |
config = copy.deepcopy(config)
|
| 1546 |
config.rope_theta = config.rope_local_base_freq
|
| 1547 |
-
config.rope_scaling = {"rope_type": "default"}
|
| 1548 |
self.rotary_emb_local = Gemma3nTextRotaryEmbedding(config=config)
|
| 1549 |
|
| 1550 |
self.hidden_size = config.hidden_size
|
|
@@ -1573,8 +1573,8 @@ class Gemma3nTextModel(Gemma3nPreTrainedModel):
|
|
| 1573 |
[nn.Linear(self.hidden_size, self.hidden_size, bias=False) for _ in range(1, self.config.altup_num_inputs)]
|
| 1574 |
)
|
| 1575 |
|
| 1576 |
-
self.register_buffer("per_layer_projection_scale", torch.tensor(self.hidden_size**-0.5), persistent=False)
|
| 1577 |
-
self.register_buffer("per_layer_input_scale", torch.rsqrt(torch.tensor(2.0)), persistent=False)
|
| 1578 |
|
| 1579 |
# Initialize weights and apply final processing
|
| 1580 |
self.post_init()
|
|
@@ -1601,10 +1601,10 @@ class Gemma3nTextModel(Gemma3nPreTrainedModel):
|
|
| 1601 |
cache_position: Optional[torch.LongTensor] = None,
|
| 1602 |
**flash_attn_kwargs: Unpack[FlashAttentionKwargs],
|
| 1603 |
) -> BaseModelOutputWithPast:
|
| 1604 |
-
r"""
|
| 1605 |
per_layer_inputs (torch.Tensor, *optional*, defaults to None):
|
| 1606 |
Pre-computed per-layer embeddings. If None, they are derived from input_ids if provided.
|
| 1607 |
-
"""
|
| 1608 |
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 1609 |
output_hidden_states = (
|
| 1610 |
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
|
@@ -1612,11 +1612,11 @@ class Gemma3nTextModel(Gemma3nPreTrainedModel):
|
|
| 1612 |
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
| 1613 |
|
| 1614 |
if (input_ids is None) ^ (inputs_embeds is not None):
|
| 1615 |
-
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
|
| 1616 |
|
| 1617 |
if self.gradient_checkpointing and self.training and use_cache:
|
| 1618 |
logger.warning_once(
|
| 1619 |
-
"
|
| 1620 |
)
|
| 1621 |
use_cache = False
|
| 1622 |
|
|
@@ -1640,20 +1640,20 @@ class Gemma3nTextModel(Gemma3nPreTrainedModel):
|
|
| 1640 |
if position_ids is None:
|
| 1641 |
position_ids = cache_position.unsqueeze(0)
|
| 1642 |
|
| 1643 |
-
# It may already have been prepared by e.g. `generate`
|
| 1644 |
if not isinstance(causal_mask_mapping := attention_mask, dict):
|
| 1645 |
# Prepare mask arguments
|
| 1646 |
mask_kwargs = {
|
| 1647 |
-
"config": self.config,
|
| 1648 |
-
"input_embeds": inputs_embeds,
|
| 1649 |
-
"attention_mask": attention_mask,
|
| 1650 |
-
"cache_position": cache_position,
|
| 1651 |
-
"past_key_values": past_key_values,
|
| 1652 |
}
|
| 1653 |
# Create the masks
|
| 1654 |
causal_mask_mapping = {
|
| 1655 |
-
"full_attention": create_causal_mask(**mask_kwargs),
|
| 1656 |
-
"sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
|
| 1657 |
}
|
| 1658 |
|
| 1659 |
# embed positions
|
|
@@ -1669,7 +1669,7 @@ class Gemma3nTextModel(Gemma3nPreTrainedModel):
|
|
| 1669 |
|
| 1670 |
temp_hidden_states = [hidden_states_0]
|
| 1671 |
for i in range(1, self.config.altup_num_inputs):
|
| 1672 |
-
# altup_proj adapted from jax.numpy.einsum("btp,pd->btd", ...)
|
| 1673 |
altup_proj = self.altup_projections[i - 1](hidden_states_0)
|
| 1674 |
current_hidden_state = altup_proj.to(dtype=hidden_states_0.dtype, device=target_magnitude.device)
|
| 1675 |
new_magnitude = torch.mean(current_hidden_state**2, dim=-1, keepdim=True)
|
|
@@ -1717,7 +1717,7 @@ class Gemma3nTextModel(Gemma3nPreTrainedModel):
|
|
| 1717 |
target_magnitude = torch.mean(hidden_states[0] ** 2, dim=-1, keepdim=True) ** 0.5
|
| 1718 |
temp_hidden_states = [hidden_states[0]]
|
| 1719 |
for i in range(1, self.config.altup_num_inputs):
|
| 1720 |
-
# altup_unembed_projections adapted from jax.numpy.einsum("btp,pd->btd", ...)
|
| 1721 |
altup_unemb_proj: torch.Tensor = self.altup_unembed_projections[i - 1](hidden_states[i])
|
| 1722 |
current_hidden_state = altup_unemb_proj.to(dtype=hidden_states_0.dtype, device=target_magnitude.device)
|
| 1723 |
new_magnitude = torch.mean(current_hidden_state**2, dim=-1, keepdim=True)
|
|
@@ -1771,14 +1771,14 @@ class Gemma3nTextModel(Gemma3nPreTrainedModel):
|
|
| 1771 |
)
|
| 1772 |
|
| 1773 |
|
| 1774 |
-
@auto_docstring(custom_intro
|
| 1775 |
class Gemma3nForCausalLM(Gemma3nPreTrainedModel, GenerationMixin):
|
| 1776 |
-
_tied_weights_keys = ["lm_head.weight"]
|
| 1777 |
-
_tp_plan = {"lm_head": "colwise_rep"}
|
| 1778 |
-
_pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
|
| 1779 |
config_class = Gemma3nTextConfig
|
| 1780 |
-
base_model_prefix = "model"
|
| 1781 |
-
_checkpoint_conversion_mapping = {"model.language_model": "model"}
|
| 1782 |
|
| 1783 |
def __init__(self, config: Gemma3nTextConfig):
|
| 1784 |
super().__init__(config)
|
|
@@ -1824,33 +1824,33 @@ class Gemma3nForCausalLM(Gemma3nPreTrainedModel, GenerationMixin):
|
|
| 1824 |
logits_to_keep: Union[int, torch.Tensor] = 0,
|
| 1825 |
**loss_kwargs,
|
| 1826 |
) -> CausalLMOutputWithPast:
|
| 1827 |
-
r"""
|
| 1828 |
-
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
| 1829 |
-
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
| 1830 |
-
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
| 1831 |
-
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
|
| 1832 |
|
| 1833 |
Example:
|
| 1834 |
|
| 1835 |
-
```python
|
| 1836 |
>>> from transformers import AutoTokenizer, Gemma3nForCausalLM
|
| 1837 |
|
| 1838 |
-
>>> model = Gemma3nForCausalLM.from_pretrained("google/gemma-2-9b")
|
| 1839 |
-
>>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
|
| 1840 |
|
| 1841 |
-
>>> prompt = "What is your favorite condiment
|
| 1842 |
-
>>> inputs = tokenizer(prompt, return_tensors="pt")
|
| 1843 |
|
| 1844 |
>>> # Generate
|
| 1845 |
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
|
| 1846 |
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
| 1847 |
-
"What is your favorite condiment
|
| 1848 |
-
```"""
|
| 1849 |
|
| 1850 |
-
if self.training and self.config._attn_implementation != "eager":
|
| 1851 |
logger.warning_once(
|
| 1852 |
-
"It is strongly recommended to train Gemma3n models with the
|
| 1853 |
-
f"instead of
|
| 1854 |
)
|
| 1855 |
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 1856 |
output_hidden_states = (
|
|
@@ -1893,7 +1893,7 @@ class Gemma3nForCausalLM(Gemma3nPreTrainedModel, GenerationMixin):
|
|
| 1893 |
|
| 1894 |
|
| 1895 |
class Gemma3nMultimodalEmbedder(nn.Module):
|
| 1896 |
-
"""Embeds token ids or soft tokens for multimodal content into language model space
|
| 1897 |
|
| 1898 |
def __init__(
|
| 1899 |
self,
|
|
@@ -1919,18 +1919,18 @@ class Gemma3nMultimodalEmbedder(nn.Module):
|
|
| 1919 |
input_ids: Optional[torch.LongTensor] = None,
|
| 1920 |
inputs_embeds: Optional[torch.Tensor] = None,
|
| 1921 |
) -> torch.Tensor:
|
| 1922 |
-
"""Embeds token ids or soft tokens for multimodal content into language model space.
|
| 1923 |
|
| 1924 |
Args:
|
| 1925 |
input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range
|
| 1926 |
-
`[vocab_offset, vocab_offset + vocab_size)`.
|
| 1927 |
inputs_embeds: A torch.Tensor containing the soft tokens to embed.
|
| 1928 |
|
| 1929 |
Returns:
|
| 1930 |
-
A torch.Tensor of embeddings with shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
|
| 1931 |
-
"""
|
| 1932 |
if (input_ids is None) ^ (inputs_embeds is not None):
|
| 1933 |
-
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
|
| 1934 |
|
| 1935 |
if inputs_embeds is not None:
|
| 1936 |
emb_norm = self.soft_embedding_norm(inputs_embeds)
|
|
@@ -1943,14 +1943,14 @@ class Gemma3nMultimodalEmbedder(nn.Module):
|
|
| 1943 |
|
| 1944 |
|
| 1945 |
@auto_docstring(
|
| 1946 |
-
custom_intro="""
|
| 1947 |
The base Gemma 3n model comprising a vision backbone, an audio backbone, and a language model without a
|
| 1948 |
language modeling head.
|
| 1949 |
-
"""
|
| 1950 |
)
|
| 1951 |
class Gemma3nModel(Gemma3nPreTrainedModel):
|
| 1952 |
_checkpoint_conversion_mapping = {}
|
| 1953 |
-
# we are filtering the logits/labels so we shouldn't divide the loss based on num_items_in_batch
|
| 1954 |
accepts_loss_kwargs = False
|
| 1955 |
|
| 1956 |
def __init__(self, config: Gemma3nConfig):
|
|
@@ -1981,16 +1981,16 @@ class Gemma3nModel(Gemma3nPreTrainedModel):
|
|
| 1981 |
return self.language_model
|
| 1982 |
|
| 1983 |
def get_image_features(self, pixel_values: torch.Tensor) -> torch.Tensor:
|
| 1984 |
-
"""
|
| 1985 |
Projects the last hidden state from the vision model into language model space.
|
| 1986 |
|
| 1987 |
Args:
|
| 1988 |
-
pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
|
| 1989 |
The tensors corresponding to the input images.
|
| 1990 |
|
| 1991 |
Returns:
|
| 1992 |
-
image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
|
| 1993 |
-
"""
|
| 1994 |
vision_outputs = self.vision_tower(
|
| 1995 |
pixel_values=pixel_values, do_pooling=False, return_dict=True
|
| 1996 |
).last_hidden_state
|
|
@@ -2024,36 +2024,36 @@ class Gemma3nModel(Gemma3nPreTrainedModel):
|
|
| 2024 |
output_hidden_states: Optional[bool] = None,
|
| 2025 |
**lm_kwargs,
|
| 2026 |
) -> Gemma3nCausalLMOutputWithPast:
|
| 2027 |
-
r"""
|
| 2028 |
-
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
| 2029 |
-
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
| 2030 |
-
config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
| 2031 |
-
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
|
| 2032 |
|
| 2033 |
Example:
|
| 2034 |
|
| 2035 |
-
```python
|
| 2036 |
>>> from PIL import Image
|
| 2037 |
>>> import requests
|
| 2038 |
>>> from transformers import AutoProcessor, Gemma3nForConditionalGeneration
|
| 2039 |
|
| 2040 |
-
>>> model = Gemma3nForConditionalGeneration.from_pretrained("google/gemma3n2-3b-mix-224")
|
| 2041 |
-
>>> processor = AutoProcessor.from_pretrained("google/gemma3n2-3b-mix-224")
|
| 2042 |
|
| 2043 |
-
>>> prompt = "Where is the cat standing
|
| 2044 |
-
>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
|
| 2045 |
>>> image = Image.open(requests.get(url, stream=True).raw)
|
| 2046 |
|
| 2047 |
-
>>> inputs = processor(images=image, text=prompt, return_tensors="pt")
|
| 2048 |
|
| 2049 |
>>> # Generate
|
| 2050 |
>>> generate_ids = model.generate(**inputs,)
|
| 2051 |
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
| 2052 |
-
"Where is the cat standing?\nsnow"
|
| 2053 |
-
```
|
| 2054 |
-
"""
|
| 2055 |
if (input_ids is None) ^ (inputs_embeds is not None):
|
| 2056 |
-
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
|
| 2057 |
|
| 2058 |
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 2059 |
output_hidden_states = (
|
|
@@ -2103,9 +2103,9 @@ class Gemma3nModel(Gemma3nPreTrainedModel):
|
|
| 2103 |
if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
|
| 2104 |
image_tokens_in_text = (special_image_mask).sum(dim=1).sum(dim=0)[0]
|
| 2105 |
raise ValueError(
|
| 2106 |
-
f"Number of images does not match number of special image tokens in the input text. "
|
| 2107 |
-
f"Got {image_tokens_in_text} image tokens in the text and "
|
| 2108 |
-
f"{image_features.shape[0] * image_features.shape[1]} tokens from image embeddings
|
| 2109 |
)
|
| 2110 |
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
| 2111 |
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
|
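Note: `masked_scatter` fills the masked token positions, in order, from the flattened image features. A toy sketch of that merge:

```python
import torch

hidden = 4
inputs_embeds = torch.zeros(1, 6, hidden)
special_image_mask = torch.tensor([[0, 1, 1, 0, 1, 0]], dtype=torch.bool)
special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds)

image_features = torch.ones(3, hidden)  # one row per image placeholder token
merged = inputs_embeds.masked_scatter(special_image_mask, image_features)
print(merged[0, :, 0])  # tensor([0., 1., 1., 0., 1., 0.])
```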
|
@@ -2140,9 +2140,9 @@ class Gemma3nModel(Gemma3nPreTrainedModel):
|
|
| 2140 |
if not is_torchdynamo_compiling() and inputs_embeds[special_audio_mask].numel() != audio_features.numel():
|
| 2141 |
audio_tokens_in_text = (special_audio_mask).sum(dim=1).sum(dim=0)[0]
|
| 2142 |
raise ValueError(
|
| 2143 |
-
f"Number of audio input features does not match number of special audio tokens in the input text. "
|
| 2144 |
-
f"Got {audio_tokens_in_text} audio tokens in the text and "
|
| 2145 |
-
f"{audio_features.shape[0] * audio_features.shape[1]} tokens from audio embeddings
|
| 2146 |
)
|
| 2147 |
audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
| 2148 |
inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_features)
|
|
@@ -2174,32 +2174,32 @@ class Gemma3nModel(Gemma3nPreTrainedModel):
|
|
| 2174 |
def get_audio_features(
|
| 2175 |
self, input_features: torch.Tensor, input_features_mask: torch.Tensor
|
| 2176 |
) -> tuple[torch.Tensor, torch.Tensor]:
|
| 2177 |
-
"""
|
| 2178 |
Projects the last hidden state from the audio encoder into language model space.
|
| 2179 |
|
| 2180 |
Args:
|
| 2181 |
-
input_features (
|
| 2182 |
The tensors corresponding to the input audio.
|
| 2183 |
-
input_features (
|
| 2184 |
The attention mask for the input audio.
|
| 2185 |
|
| 2186 |
Returns:
|
| 2187 |
-
audio_features (
|
| 2188 |
-
"""
|
| 2189 |
audio_outputs, audio_mask = self.audio_tower(input_features, input_features_mask)
|
| 2190 |
return self.embed_audio(inputs_embeds=audio_outputs), audio_mask
|
| 2191 |
|
| 2192 |
|
| 2193 |
@auto_docstring(
|
| 2194 |
-
custom_intro="""
|
| 2195 |
The base Gemma 3n model comprising a vision backbone, an audio backbone, a language model, and a language modeling
|
| 2196 |
head.
|
| 2197 |
-
"""
|
| 2198 |
)
|
| 2199 |
class Gemma3nForConditionalGeneration(Gemma3nPreTrainedModel, GenerationMixin):
|
| 2200 |
_checkpoint_conversion_mapping = {}
|
| 2201 |
-
_tied_weights_keys = ["lm_head.weight"]
|
| 2202 |
-
base_model_prefix = "model"
|
| 2203 |
|
| 2204 |
def __init__(self, config: Gemma3nConfig):
|
| 2205 |
super().__init__(config)
|
|
@@ -2239,7 +2239,7 @@ class Gemma3nForConditionalGeneration(Gemma3nPreTrainedModel, GenerationMixin):
|
|
| 2239 |
|
| 2240 |
@property
|
| 2241 |
def multi_modal_projector(self):
|
| 2242 |
-
raise AttributeError("Use embed_vision instead of multi_modal_projector
|
| 2243 |
|
| 2244 |
@can_return_tuple
|
| 2245 |
@auto_docstring
|
|
@@ -2262,38 +2262,38 @@ class Gemma3nForConditionalGeneration(Gemma3nPreTrainedModel, GenerationMixin):
|
|
| 2262 |
logits_to_keep: Union[int, torch.Tensor] = 0,
|
| 2263 |
**lm_kwargs,
|
| 2264 |
) -> Gemma3nCausalLMOutputWithPast:
|
| 2265 |
-
r"""
|
| 2266 |
input_features (torch.Tensor, *optional*, defaults to None):
|
| 2267 |
The audio inputs to be encoded.
|
| 2268 |
input_features_mask (torch.Tensor, *optional*, defaults to None):
|
| 2269 |
The attention mask for the input audio.
|
| 2270 |
-
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
| 2271 |
-
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
| 2272 |
-
config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
|
| 2273 |
ignored (masked), the loss is only computed for the tokens with labels in
|
| 2274 |
-
`[0, ..., config.text_config.vocab_size]`.
|
| 2275 |
|
| 2276 |
Example:
|
| 2277 |
|
| 2278 |
-
```python
|
| 2279 |
>>> from PIL import Image
|
| 2280 |
>>> import requests
|
| 2281 |
>>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration
|
| 2282 |
|
| 2283 |
-
>>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
|
| 2284 |
-
>>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")
|
| 2285 |
|
| 2286 |
>>> messages = [
|
| 2287 |
... {
|
| 2288 |
-
... "role": "system",
|
| 2289 |
-
... "content": [
|
| 2290 |
-
... {"type": "text", "text": "You are a helpful assistant
|
| 2291 |
... ]
|
| 2292 |
... },
|
| 2293 |
... {
|
| 2294 |
-
... "role": "user", "content": [
|
| 2295 |
-
... {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
|
| 2296 |
-
... {"type": "text", "text": "Where is the cat standing
|
| 2297 |
... ]
|
| 2298 |
... },
|
| 2299 |
... ]
|
|
@@ -2302,15 +2302,15 @@ class Gemma3nForConditionalGeneration(Gemma3nPreTrainedModel, GenerationMixin):
|
|
| 2302 |
... messages,
|
| 2303 |
... tokenizer=True,
|
| 2304 |
... return_dict=True,
|
| 2305 |
-
... return_tensors="pt",
|
| 2306 |
... add_generation_prompt=True
|
| 2307 |
... )
|
| 2308 |
>>> # Generate
|
| 2309 |
>>> generate_ids = model.generate(**inputs)
|
| 2310 |
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
| 2311 |
-
"user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
|
| 2312 |
-
```
|
| 2313 |
-
"""
|
| 2314 |
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 2315 |
output_hidden_states = (
|
| 2316 |
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
|
@@ -2393,7 +2393,7 @@ class Gemma3nForConditionalGeneration(Gemma3nPreTrainedModel, GenerationMixin):
|
|
| 2393 |
labels=None,
|
| 2394 |
**kwargs,
|
| 2395 |
):
|
| 2396 |
-
# Overwritten -- custom
|
| 2397 |
model_inputs = super().prepare_inputs_for_generation(
|
| 2398 |
input_ids,
|
| 2399 |
past_key_values=past_key_values,
|
|
@@ -2407,13 +2407,13 @@ class Gemma3nForConditionalGeneration(Gemma3nPreTrainedModel, GenerationMixin):
|
|
| 2407 |
**kwargs,
|
| 2408 |
)
|
| 2409 |
|
| 2410 |
-
# If we're in cached decoding stage, multimodal inputs should be None because input ids do not contain special
|
| 2411 |
# tokens anymore. Otherwise multimodal inputs should be passed to model.
|
| 2412 |
# NOTE: use_cache=False always needs pixel_values, input_features, and input_features_mask
|
| 2413 |
if cache_position[0] == 0:
|
| 2414 |
-
model_inputs["pixel_values"] = pixel_values
|
| 2415 |
-
model_inputs["input_features"] = input_features
|
| 2416 |
-
model_inputs["input_features_mask"] = input_features_mask
|
| 2417 |
|
| 2418 |
return model_inputs
|
| 2419 |
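Note: `prepare_inputs_for_generation` above only attaches the multimodal tensors on the prefill step, since cached decoding steps contain no placeholder tokens. A tiny sketch of the same check (hypothetical helper, not the repo's API):

```python
import torch

def gate_multimodal_inputs(model_inputs: dict, cache_position: torch.Tensor, **multimodal) -> dict:
    # Only the first (prefill) forward pass sees the image/audio tensors.
    if cache_position[0] == 0:
        model_inputs.update(multimodal)
    return model_inputs

print(gate_multimodal_inputs({}, torch.tensor([0]), pixel_values="px"))  # {'pixel_values': 'px'}
print(gate_multimodal_inputs({}, torch.tensor([7]), pixel_values="px"))  # {}
```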
|
|
@@ -2423,10 +2423,10 @@ class Gemma3nForConditionalGeneration(Gemma3nPreTrainedModel, GenerationMixin):
|
|
| 2423 |
|
| 2424 |
|
| 2425 |
__all__ = [
|
| 2426 |
-
"Gemma3nAudioEncoder",
|
| 2427 |
-
"Gemma3nForCausalLM",
|
| 2428 |
-
"Gemma3nForConditionalGeneration",
|
| 2429 |
-
"Gemma3nModel",
|
| 2430 |
-
"Gemma3nPreTrainedModel",
|
| 2431 |
-
"Gemma3nTextModel",
|
| 2432 |
]
|
|
|
|
| 8 |
# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
|
| 9 |
#
|
| 10 |
#
|
| 11 |
+
# Licensed under the Apache License, Version 2.0 (the \"License\");
|
| 12 |
# you may not use this file except in compliance with the License.
|
| 13 |
# You may obtain a copy of the License at
|
| 14 |
#
|
| 15 |
# http://www.apache.org/licenses/LICENSE-2.0
|
| 16 |
#
|
| 17 |
# Unless required by applicable law or agreed to in writing, software
|
| 18 |
+
# distributed under the License is distributed on an \"AS IS\" BASIS,
|
| 19 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 20 |
# See the License for the specific language governing permissions and
|
| 21 |
# limitations under the License.
|
|
|
|
| 56 |
|
| 57 |
@dataclass
|
| 58 |
@auto_docstring(
|
| 59 |
+
custom_intro=\"\"\"
|
| 60 |
Base class for Gemma3n outputs, with hidden states and attentions.
|
| 61 |
+
\"\"\"
|
| 62 |
)
|
| 63 |
class Gemma3nModelOutputWithPast(BaseModelOutputWithPast):
|
| 64 |
+
r\"\"\"
|
| 65 |
+
past_key_values (\`tuple(tuple(torch.FloatTensor))\`, *optional*, returned when \`use_cache=True\` is passed or when \`config.use_cache=True\`):
|
| 66 |
+
Tuple of \`tuple(torch.FloatTensor)\` of length \`config.n_layers\`, with each tuple having 2 tensors of shape
|
| 67 |
+
\`(batch_size, num_heads, sequence_length, embed_size_per_head)\`)
|
| 68 |
|
| 69 |
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
| 70 |
+
\`past_key_values\` input) to speed up sequential decoding.
|
| 71 |
+
image_hidden_states (\`torch.FloatTensor\`, *optional*):
|
| 72 |
+
A \`torch.FloatTensor\` of size \`(batch_size, num_images, sequence_length, hidden_size)\`.
|
| 73 |
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
| 74 |
+
audio_hidden_states (\`torch.FloatTensor\`, *optional*):
|
| 75 |
+
A \`torch.FloatTensor\` of size \`(batch_size, num_images, sequence_length, hidden_size)\`.
|
| 76 |
audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
|
| 77 |
+
\"\"\"
|
| 78 |
|
| 79 |
image_hidden_states: Optional[torch.FloatTensor] = None
|
| 80 |
|
|
|
|
| 83 |
|
| 84 |
@dataclass
|
| 85 |
@auto_docstring(
|
| 86 |
+
custom_intro=\"\"\"
|
| 87 |
Base class for Gemma3n causal language model (or autoregressive) outputs.
|
| 88 |
+
\"\"\"
|
| 89 |
)
|
| 90 |
class Gemma3nCausalLMOutputWithPast(ModelOutput):
|
| 91 |
+
r\"\"\"
|
| 92 |
+
loss (\`torch.FloatTensor\` of shape \`(1,)\`, *optional*, returned when \`labels\` is provided):
|
| 93 |
Language modeling loss (for next-token prediction).
|
| 94 |
+
logits (\`torch.FloatTensor\` of shape \`(batch_size, sequence_length, config.text_config.vocab_size)\`):
|
| 95 |
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
| 96 |
+
past_key_values (\`tuple(tuple(torch.FloatTensor))\`, *optional*, returned when \`use_cache=True\` is passed or when \`config.use_cache=True\`):
|
| 97 |
+
Tuple of \`tuple(torch.FloatTensor)\` of length \`config.n_layers\`, with each tuple having 2 tensors of shape
|
| 98 |
+
\`(batch_size, num_heads, sequence_length, embed_size_per_head)\`)
|
| 99 |
|
| 100 |
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
| 101 |
+
\`past_key_values\` input) to speed up sequential decoding.
|
| 102 |
+
image_hidden_states (\`torch.FloatTensor\`, *optional*):
|
| 103 |
+
A \`torch.FloatTensor\` of size \`(batch_size, num_images, sequence_length, hidden_size)\`.
|
| 104 |
image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
|
| 105 |
+
audio_hidden_states (\`torch.FloatTensor\`, *optional*):
|
| 106 |
+
A \`torch.FloatTensor\` of size \`(batch_size, num_images, sequence_length, hidden_size)\`.
|
| 107 |
audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
|
| 108 |
+
\"\"\"
|
| 109 |
|
| 110 |
loss: Optional[torch.FloatTensor] = None
|
| 111 |
logits: Optional[torch.FloatTensor] = None
|
|
|
|
| 126 |
if self.with_scale:
|
| 127 |
self.weight = nn.Parameter(torch.ones(dim))
|
| 128 |
else:
|
| 129 |
+
self.register_buffer(\"weight\", torch.tensor(1.0), persistent=False)
|
| 130 |
|
| 131 |
def _norm(self, x):
|
| 132 |
return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
|
|
|
|
| 138 |
return output.type_as(x)
|
| 139 |
|
| 140 |
def extra_repr(self):
|
| 141 |
+
return f\"{tuple(self.weight.shape)}, eps={self.eps}\"
|
| 142 |
|
| 143 |
|
| 144 |
# ==== Audio Encoder ====
|
|
|
|
| 163 |
log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / max(num_timescales - 1, 1)
|
| 164 |
inv_timescales = min_timescale * torch.exp(torch.arange(num_timescales) * -log_timescale_increment)
|
| 165 |
self.register_buffer(
|
| 166 |
+
\"inv_timescales\",
|
| 167 |
inv_timescales.float().unsqueeze(0).unsqueeze(0),
|
| 168 |
persistent=False,
|
| 169 |
)
|
|
|
|
| 184 |
key_context_size: int,
|
| 185 |
max_span_plus_1: int,
|
| 186 |
) -> torch.Tensor:
|
| 187 |
+
\"\"\"Performs the relative shift.
|
| 188 |
|
| 189 |
Args:
|
| 190 |
term_bd_before_shift: Tensor of shape [B, N, U, W, F_span]. batch_size
|
|
|
|
| 193 |
|
| 194 |
Returns:
|
| 195 |
Tensor of shape [B, N, U, W, C].
|
| 196 |
+
\"\"\"
|
| 197 |
# term_bd_before_shift shape: [B, N, U, W, F_span]
|
| 198 |
# Target shape after shift: [B, N, U, W, C]
|
| 199 |
|
|
|
|
| 209 |
term_bd_padded = nn.functional.pad(term_bd_before_shift, padding_tuple)
|
| 210 |
# Shape after pad: [B, N, U, W, C+1]
|
| 211 |
|
| 212 |
+
# Reshape for slicing (emulating JAX\'s behavior)
|
| 213 |
# [B, N, U, W * (C+1)]
|
| 214 |
term_bd_reshaped = term_bd_padded.reshape(
|
| 215 |
(
|
|
|
|
| 271 |
term_ac = torch.matmul(queries_p, keys_p_t) # [B, N, U, W, C]
|
| 272 |
|
| 273 |
# term_bd: Query-Position interaction
|
| 274 |
+
# Original einsum: term_bd_unshifed = torch.einsum('buwnh,fnh->bnuwf', queries, sin_emb)
|
| 275 |
# queries shape: [B, U, W, N, H]
|
| 276 |
# sin_emb shape: [F, N, H]
|
| 277 |
# Target output shape: [B, N, U, W, F]
|
|
|
|
| 338 |
|
| 339 |
q_scale = self.head_dim**-0.5
|
| 340 |
r_softplus_0 = 1.0 / torch.nn.functional.softplus(torch.tensor(0.0))
|
| 341 |
+
self.register_buffer("q_scale", (q_scale * r_softplus_0).clone().detach(), persistent=False)
|
| 342 |
|
| 343 |
lower_causal_mask = torch.tril(
|
| 344 |
torch.ones((self.context_size, self.chunk_size), dtype=torch.bool),
|
|
|
|
| 350 |
)
|
| 351 |
local_causal_valid_mask = torch.ones((self.chunk_size, self.context_size), dtype=torch.bool)
|
| 352 |
local_causal_valid_mask = local_causal_valid_mask * lower_causal_mask * upper_causal_mask
|
| 353 |
+
self.register_buffer("local_causal_valid_mask", local_causal_valid_mask, persistent=False)
|
| 354 |
|
| 355 |
self.register_buffer(
|
| 356 |
+
"softcap",
|
| 357 |
torch.tensor(self.attention_logits_soft_cap).float(),
|
| 358 |
persistent=False,
|
| 359 |
)
|
|
|
|
| 366 |
return x
|
| 367 |
|
| 368 |
def _convert_to_block(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
| 369 |
+
"""Turns a sequence into non-overlapping blocks.
|
| 370 |
|
| 371 |
Args:
|
| 372 |
hidden_states: a tensor of [batch, time, ...].
|
|
|
|
| 375 |
A tensor of [batch, num_blocks, block_size, ...], with necessary
|
| 376 |
paddings,
|
| 377 |
where output[:, i, ...] are x[:, i*block_size:(i+1)*block_size, ...].
|
| 378 |
+
"""
|
| 379 |
shape = hidden_states.shape
|
| 380 |
b, t = shape[:2]
|
| 381 |
num_blocks = (t + self.chunk_size - 1) // self.chunk_size
|
|
|
|
| 388 |
return hidden_states
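# Illustrative sketch (not part of the original file) of the blocking idea implemented by
# `_convert_to_block` above: pad the time axis up to a multiple of chunk_size, then reshape
# so that block i holds x[:, i*chunk_size:(i+1)*chunk_size].
import torch

chunk_size = 4
x = torch.arange(10).float().unsqueeze(0)                 # [batch=1, time=10]
num_blocks = (x.shape[1] + chunk_size - 1) // chunk_size  # 3 blocks
pad = num_blocks * chunk_size - x.shape[1]                # 2 padding frames
blocks = torch.nn.functional.pad(x, (0, pad)).reshape(1, num_blocks, chunk_size)
print(blocks.shape)                                       # torch.Size([1, 3, 4])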
|
| 389 |
|
| 390 |
def _extract_block_context(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
| 391 |
+
"""Extracts temporal context for every block.
|
| 392 |
|
| 393 |
Args:
|
| 394 |
hidden_states: a tensor of [batch, time, ...].
|
|
|
|
| 400 |
and output[:, i, ...] are x[:, start-left_context:end+right_context,
|
| 401 |
...],
|
| 402 |
start = i * block_size, end = (i + 1) * block_size.
|
| 403 |
+
"""
|
| 404 |
pad_left = self.max_past_horizon
|
| 405 |
+
# The JAX equivalent padding for signal.frame with pad_mode='valid' is
|
| 406 |
# (left_context, right_context + block_size - 1) on the time dimension.
|
| 407 |
+
# PyTorch's _pad_dim1 applies padding symmetrically if only one value is given,
|
| 408 |
# or (pad_dim_start, pad_dim_end) if two are given.
|
| 409 |
# Our _pad_dim1(x, pad_left, pad_right) pads dim -2 (time for [B,T,N,H])
|
| 410 |
# or dim 1 (time for [B,T]).
|
|
|
|
| 424 |
|
| 425 |
# If x was [B, T_padded], x_unfolded is [B, num_blocks, frame_len]
|
| 426 |
# If x was [B, T_padded, N, H], x_unfolded is [B, num_blocks, N, H, frame_len]
|
| 427 |
+
# We want to match JAX's typical output for such operations which might be
|
| 428 |
# [B, num_blocks, frame_len, N, H] if N, H are present.
|
| 429 |
# The relative_position_embedding expects keys as [B, U, C, N, H].
|
| 430 |
# If x_unfolded is [B, U, N, H, C(frame_len)], we need to move C.
|
|
|
|
| 436 |
return x_unfolded.contiguous()
|
| 437 |
|
| 438 |
def forward(self, hidden_states: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor:
|
| 439 |
+
# sl.Dense uses jax.numpy.einsum("...a,abcd->...bcd") and jax.numpy.select()
|
| 440 |
qkv_shape = (*hidden_states.shape[:-1], self.num_heads, self.head_dim)
|
| 441 |
query_states = self.q_proj(hidden_states).reshape(qkv_shape).contiguous()
|
| 442 |
key_states = self.k_proj(hidden_states).reshape(qkv_shape).contiguous()
|
|
|
|
| 472 |
extracted_valid_mask_blocks = extracted_valid_mask_blocks.reshape(
|
| 473 |
batch_size, num_query_blocks, self.context_size
|
| 474 |
)
|
| 475 |
+
# After potential reshape, ensure it's [B, U, C] if it was from a [B,T] mask.
|
| 476 |
# This assertion might be too strict if _extract_block_context handles higher-rank inputs differently,
|
| 477 |
# but for the mask case, this should hold.
|
| 478 |
if extracted_valid_mask_blocks.shape != (
|
|
|
|
| 481 |
self.context_size,
|
| 482 |
):
|
| 483 |
raise ValueError(
|
| 484 |
+
"Shape of extracted_valid_mask_blocks"
|
| 485 |
+
f" {extracted_valid_mask_blocks.shape} is not ({batch_size},"
|
| 486 |
+
f" {num_query_blocks}, {self.context_size}) after potential reshape."
|
| 487 |
)
|
| 488 |
|
| 489 |
# 3. Expand dimensions for broadcasting with logits and causal mask.
|
|
|
|
| 518 |
logits = torch.where(final_condition_for_where, logits, torch.finfo(logits.dtype).min)
|
| 519 |
probabilities = torch.nn.functional.softmax(logits, dim=-1, dtype=torch.float32).to(dtype=value_blocks.dtype)
|
| 520 |
|
| 521 |
+
# context_vectors is adapted from jax.numpy.einsum("BNuwc,BucNH->BuwNH", ...)
|
| 522 |
b_dim, n_dim, u_dim, w_dim, c_dim = probabilities.shape
|
| 523 |
h_dim = value_blocks.shape[-1]
|
| 524 |
prob_bun = probabilities.permute(0, 2, 1, 3, 4).reshape(-1, w_dim, c_dim)
|
|
|
|
| 539 |
|
| 540 |
|
| 541 |
class Gemma3nAudioCumulativeGroupNorm(nn.Module):
|
| 542 |
+
"""Applies Group Normalization cumulatively over the time dimension.
|
| 543 |
|
| 544 |
This layer normalizes the input by calculating the mean and variance
|
| 545 |
cumulatively over the time dimension (dim 1). The statistics are computed
|
| 546 |
+
over all feature dimensions (specified by `feature_dims` and `num_channels`)
|
| 547 |
+
for elements marked as valid by the optional `mask`.
|
| 548 |
|
| 549 |
+
If a `mask` is provided (True for valid, False for invalid/padded),
|
| 550 |
invalid time steps do not contribute to the statistics calculation, and
|
| 551 |
their corresponding output values are zeroed out.
|
| 552 |
|
| 553 |
Scale and bias, if enabled, are applied per-channel (last dimension).
|
| 554 |
+
This behavior is similar to JAX's `GroupNormalization` with `num_groups=1`
|
| 555 |
+
and `cumulative=True`.
|
| 556 |
+
"""
|
| 557 |
|
| 558 |
def __init__(
|
| 559 |
self,
|
|
|
|
| 574 |
self.reduction_axes = tuple(range(2, 2 + len(self.feature_dims) + 1))
|
| 575 |
|
| 576 |
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
| 577 |
+
"""Applies cumulative group norm, optionally using a mask.
|
| 578 |
|
| 579 |
Args:
|
| 580 |
hidden_states: Input tensor, shape [B, T, *feature_dims, C].
|
| 581 |
|
| 582 |
Returns:
|
| 583 |
Normalized tensor with the same shape as x.
|
| 584 |
+
"""
|
| 585 |
expected_input_suffix = self.feature_dims + (self.num_channels,)
|
| 586 |
if hidden_states.shape[2:] != expected_input_suffix:
|
| 587 |
raise ValueError(
|
| 588 |
+
f"Input tensor shape suffix {hidden_states.shape[2:]} does not match expected"
|
| 589 |
+
f" suffix (feature_dims + num_channels) {expected_input_suffix}"
|
| 590 |
)
|
| 591 |
|
| 592 |
input_dtype = hidden_states.dtype
|
|
|
|
| 594 |
calc_dtype = torch.float32
|
| 595 |
x_calc = hidden_states.to(calc_dtype)
|
| 596 |
|
| 597 |
+
# Prepare a broadcastable mask (`mask_calc`).
|
| 598 |
# If no mask is provided, treat all elements as valid
|
| 599 |
# (mask_calc is all ones).
|
| 600 |
# Otherwise, expand the [B, T] mask to [B, T, 1, ..., 1] for broadcasting.
|
|
|
|
| 607 |
cum_sum_values = torch.cumsum(sum_values_at_t, dim=1)
|
| 608 |
|
| 609 |
# 3. Count of valid elements in the normalization group at each time step.
|
| 610 |
+
# (A "group" here consists of all features at a given Batch, Time).
|
| 611 |
elements_in_group_at_t = torch.sum(mask_calc, dim=self.reduction_axes, keepdim=True)
|
| 612 |
# 4. Cumulative count of valid elements over time.
|
| 613 |
cum_count_elements = torch.cumsum(elements_in_group_at_t, dim=1)
|
|
|
|
| 648 |
|
| 649 |
|
| 650 |
class Gemma3nAudioSSCPConvBlock(nn.Module):
|
| 651 |
+
"""A single convolution block for the SubSampleConvProjection.
|
| 652 |
|
| 653 |
This block consists of a 2D convolution, followed by CumulativeGroupNorm,
|
| 654 |
and a ReLU activation. It handles manual padding for the convolution.
|
| 655 |
+
"""
|
| 656 |
|
| 657 |
def __init__(
|
| 658 |
self,
|
|
|
|
| 665 |
self.config = config
|
| 666 |
self.manual_padding = manual_padding
|
| 667 |
|
| 668 |
+
# in_channels is 1 for the first block, or C_out from previous block's conv
|
| 669 |
in_channels = 1 if idx == 0 else self.config.sscp_conv_channel_size[idx - 1]
|
| 670 |
out_channels = self.config.sscp_conv_channel_size[idx]
|
| 671 |
kernel_h, kernel_w = self.config.sscp_conv_kernel_size[idx]
|
|
|
|
| 701 |
# Input audio_encodings is [B, C_in, T_in, F_in] (e.g., C_in=1)
|
| 702 |
# manual_padding is (pad_F_left, pad_F_right, pad_T_top, pad_T_bottom)
|
| 703 |
# F.pad applies to last two dims: F_in then T_in
|
| 704 |
+
audio_encodings_padded = F.pad(audio_encodings, self.manual_padding, mode="constant", value=0.0)
|
| 705 |
# Expected padded shape for F_in, k_w=3, pad_F=(1,1) -> F_padded = F_in+2
|
| 706 |
# Expected padded shape for T_in, k_h=3, pad_T=(0,2) -> T_padded = T_in+2
|
| 707 |
audio_encodings_conv = self.conv(audio_encodings_padded)
|
|
|
|
| 728 |
stride_h, stride_w = config.sscp_conv_stride_size[i]
|
| 729 |
|
| 730 |
# Padding for Time (Height for Conv2d) - REVERSE_CAUSAL like
|
| 731 |
+
# JAX 'reverse_causal' padding is (0, kernel_size - 1)
|
| 732 |
pad_t_top = 0
|
| 733 |
pad_t_bottom = kernel_h - 1
|
| 734 |
|
|
|
|
| 736 |
# Based on JAX effective padding (1,1) for F_in=10, K_w=3, S_w=2
|
| 737 |
# and the successful test configuration.
|
| 738 |
# If kernel/stride/input_freq for frequency changes, this might need re-evaluation
|
| 739 |
+
# to match generic JAX 'SAME' behavior if it differs.
|
| 740 |
pad_f_left = 1
|
| 741 |
pad_f_right = 1
|
| 742 |
|
|
|
|
| 792 |
super().__init__()
|
| 793 |
self.config = config
|
| 794 |
self.post_in_features = self.config.hidden_size
|
| 795 |
+
self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False)
|
| 796 |
self.pre_attn_norm = Gemma3nRMSNorm(self.config.hidden_size)
|
| 797 |
self.attn = Gemma3nAudioAttention(config)
|
| 798 |
self.post = nn.Linear(self.post_in_features, self.config.hidden_size, bias=False)
|
|
|
|
| 820 |
super().__init__()
|
| 821 |
self.config = config
|
| 822 |
|
| 823 |
+
self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False)
|
| 824 |
|
| 825 |
self.pre_layer_norm = Gemma3nRMSNorm(self.config.hidden_size)
|
| 826 |
self.ffw_layer_1 = nn.Linear(self.config.hidden_size, self.config.hidden_size * 4, bias=False)
|
|
|
|
| 856 |
groups=self.config.hidden_size, # Depthwise
|
| 857 |
bias=False,
|
| 858 |
)
|
| 859 |
+
self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False)
|
| 860 |
self.conv_norm = Gemma3nRMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
|
| 861 |
self.linear_end = nn.Linear(self.config.hidden_size, self.config.hidden_size, bias=False)
|
| 862 |
|
|
|
|
| 892 |
self.attention = Gemma3nAudioConformerAttention(self.config)
|
| 893 |
self.lconv1d = Gemma3nAudioConformerLightConv1d(self.config)
|
| 894 |
self.ffw_layer_end = Gemma3nAudioConformerFeedForward(self.config)
|
| 895 |
+
self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False)
|
| 896 |
self.norm = Gemma3nRMSNorm(self.config.hidden_size)
|
| 897 |
|
| 898 |
def forward(self, audio_encodings: torch.Tensor, audio_mel_mask: torch.BoolTensor) -> torch.Tensor:
|
|
|
|
| 911 |
|
| 912 |
|
| 913 |
class Gemma3nAudioEncoder(PreTrainedModel):
|
| 914 |
+
"""An audio encoder based on the [Universal Speech Model](https://arxiv.org/abs/2303.01037) architecture."""
|
| 915 |
|
| 916 |
config_class = Gemma3nAudioConfig
|
| 917 |
|
| 918 |
+
main_input_name = "audio_mel"
|
| 919 |
|
| 920 |
def __init__(self, config: Gemma3nAudioConfig):
|
| 921 |
super().__init__(config)
|
|
|
|
| 929 |
def forward(
|
| 930 |
self, audio_mel: torch.Tensor, audio_mel_mask: torch.BoolTensor
|
| 931 |
) -> tuple[torch.Tensor, torch.BoolTensor]:
|
| 932 |
+
"""Encodes a batch of MELs.
|
| 933 |
|
| 934 |
Args:
|
| 935 |
audio_mel: a torch.Tensor of shape [batch, num_frames, num_channels,
|
|
|
|
| 937 |
|
| 938 |
Returns:
|
| 939 |
audio_encodings: a torch.Tensor of shape
|
| 940 |
+
`[batch_size, self.config.audio_soft_tokens_per_image,
|
| 941 |
+
self.config.audio_config.hidden_size]`
|
| 942 |
audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames].
|
| 943 |
+
"""
|
| 944 |
audio_encodings = self.subsample_conv_projection(audio_mel) # audio_encodings: [B, T_sub, D]
|
| 945 |
|
| 946 |
# Subsample the input audio_mel_mask to match the time dimension of audio_encodings (T_sub)
|
|
|
|
| 983 |
|
| 984 |
|
| 985 |
class Gemma3nTextScaledWordEmbedding(nn.Embedding):
|
| 986 |
+
"""
|
| 987 |
+
This module overrides nn.Embedding's forward by multiplying the output with the embedding scale.
|
| 988 |
+
"""
|
| 989 |
|
| 990 |
def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: float = 1.0):
|
| 991 |
super().__init__(num_embeddings, embedding_dim, padding_idx)
|
| 992 |
+
self.register_buffer("embed_scale", torch.tensor(embed_scale), persistent=False)
|
| 993 |
|
| 994 |
def forward(self, input_ids: torch.Tensor):
|
| 995 |
return super().forward(input_ids) * self.embed_scale.to(self.weight.dtype)
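# Illustrative sketch (not part of the original file) of the scaled-embedding pattern above:
# a plain nn.Embedding whose output is multiplied by a fixed scale. The hidden_size**0.5
# value used here is only an assumed example; the real scale is supplied by the caller.
import torch
import torch.nn as nn

hidden_size = 8
embedding = nn.Embedding(16, hidden_size, padding_idx=0)
embed_scale = hidden_size**0.5
tokens = torch.tensor([[1, 2, 3]])
print((embedding(tokens) * embed_scale).shape)            # torch.Size([1, 3, 8])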
|
| 996 |
|
| 997 |
|
| 998 |
class Gemma3nTextLaurelBlock(nn.Module):
|
| 999 |
+
"""Learned Augmented Residual Layer"""
|
| 1000 |
|
| 1001 |
def __init__(self, config: Gemma3nTextConfig):
|
| 1002 |
super().__init__()
|
|
|
|
| 1052 |
|
| 1053 |
|
| 1054 |
class Gemma3nTextAltUp(nn.Module):
|
| 1055 |
+
"""Alternating Updates (AltUp)
|
| 1056 |
|
| 1057 |
+
The AltUp module wraps transformer layers. The `predict` step modifies the
|
| 1058 |
+
input to the transformer layer, and the `correct` step propagates the output
|
| 1059 |
of the transformer layer to the sparsely updated dimensions.
|
| 1060 |
|
| 1061 |
See more in the research paper:
|
| 1062 |
|
| 1063 |
https://proceedings.neurips.cc/paper_files/paper/2023/file/f2059277ac6ce66e7e5543001afa8bb5-Paper-Conference.pdf
|
| 1064 |
+
"""
|
| 1065 |
|
| 1066 |
def __init__(self, config: Gemma3nTextConfig):
|
| 1067 |
super().__init__()
|
|
|
|
| 1071 |
self.prediction_coefs = nn.Linear(self.config.altup_num_inputs, self.config.altup_num_inputs**2, bias=False)
|
| 1072 |
self.modality_router = nn.Linear(self.config.hidden_size, self.config.altup_num_inputs, bias=False)
|
| 1073 |
self.router_norm = Gemma3nRMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
|
| 1074 |
+
self.register_buffer("router_input_scale", torch.tensor(self.config.hidden_size**-1.0), persistent=False)
|
| 1075 |
|
| 1076 |
def compute_router_modalities(self, x: torch.Tensor) -> torch.Tensor:
|
| 1077 |
router_inputs = self.router_norm(x) * self.router_input_scale
|
|
|
|
| 1079 |
return torch.tanh(routed.float()).type_as(x)
|
| 1080 |
|
| 1081 |
def predict(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
| 1082 |
+
"""Predicts the output of a layer using a trainable map.
|
| 1083 |
|
| 1084 |
Args:
|
| 1085 |
+
hidden_states: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
|
| 1086 |
+
stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.
|
| 1087 |
|
| 1088 |
Returns:
|
| 1089 |
+
A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` containing the predictions.
|
| 1090 |
+
"""
|
| 1091 |
modalities = self.compute_router_modalities(hidden_states[self.config.altup_active_idx])
|
| 1092 |
|
| 1093 |
if self.training and self.config.altup_coef_clip is not None:
|
|
|
|
| 1107 |
return predictions.contiguous().type_as(hidden_states)
|
| 1108 |
|
| 1109 |
def correct(self, predictions: torch.Tensor, activated: torch.Tensor) -> torch.Tensor:
|
| 1110 |
+
"""Corrects the predictions relative to the activated input embeddings.
|
| 1111 |
|
| 1112 |
Args:
|
| 1113 |
+
predictions: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
|
| 1114 |
+
stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.
|
| 1115 |
+
activated: A 3D tensor of shape `[batch_size, num_tokens, hidden_size]` containing the activated inputs.
|
| 1116 |
|
| 1117 |
Returns:
|
| 1118 |
+
A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` correcting the original
|
| 1119 |
predictions relative to the activated input embeddings.
|
| 1120 |
+
"""
|
| 1121 |
modalities = self.compute_router_modalities(activated)
|
| 1122 |
innovation = activated - predictions[self.config.altup_active_idx] # (batch, num_tokens, hidden_size)
|
| 1123 |
innovation = innovation.repeat(self.config.altup_num_inputs, 1, 1, 1) # Repeat on dim0 to match predictions
|
|
|
|
| 1125 |
if self.config.altup_coef_clip is not None:
|
| 1126 |
self.correction_coefs.weight.data.clamp_(-self.config.altup_coef_clip, self.config.altup_coef_clip)
|
| 1127 |
|
| 1128 |
+
# all_coefs adapted from jax.numpy.einsum("...p,pi->...i", ...)
|
| 1129 |
# Permute to (altup_num_inputs, batch_size, num_tokens) as the last dim is a scalar applied to each altup input
|
| 1130 |
# and expand on dim1 for broadcastability
|
| 1131 |
all_coefs: torch.Tensor = self.correction_coefs(modalities) + 1.0
|
|
|
|
| 1136 |
return corrected.contiguous().type_as(activated)
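# Toy numeric sketch (not part of the original file) of the update performed by `correct`
# above: each AltUp stream adds a per-stream coefficient times the innovation between the
# activated output and the active prediction. `coefs` stands in for
# `self.correction_coefs(modalities) + 1.0`; all shapes are illustrative.
import torch

num_inputs, batch, tokens, dim = 4, 1, 3, 8
active_idx = 0
predictions = torch.randn(num_inputs, batch, tokens, dim)
activated = torch.randn(batch, tokens, dim)

innovation = activated - predictions[active_idx]          # [B, T, D]
coefs = torch.rand(batch, tokens, num_inputs) + 1.0       # [B, T, num_inputs]
corrected = predictions + coefs.permute(2, 0, 1).unsqueeze(-1) * innovation
print(corrected.shape)                                    # torch.Size([4, 1, 3, 8])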
|
| 1137 |
|
| 1138 |
def forward(self, corrected: torch.Tensor) -> torch.Tensor:
|
| 1139 |
+
"""
|
| 1140 |
+
This is only defined as the `forward` so that accelerate hooks can correctly move `correct_output_scale`
|
| 1141 |
(which is a nn.Parameter, not a Module) between devices when offloading. It is otherwise only used in
|
| 1142 |
+
`scale_corrected_output`
|
| 1143 |
+
"""
|
| 1144 |
return (corrected.type_as(self.correct_output_scale) * self.correct_output_scale).type_as(corrected)
|
| 1145 |
|
| 1146 |
def scale_corrected_output(self, corrected: torch.Tensor) -> torch.Tensor:
|
| 1147 |
+
"""Scales the provided 3D tensor of shape [batch_size, num_tokens, hidden_size]."""
|
| 1148 |
return self.forward(corrected)
|
| 1149 |
|
| 1150 |
|
| 1151 |
class Gemma3nTextRotaryEmbedding(nn.Module):
|
| 1152 |
def __init__(self, config: Gemma3nTextConfig, device=None):
|
| 1153 |
super().__init__()
|
| 1154 |
+
# BC: "rope_type" was originally "type"
|
| 1155 |
+
if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
|
| 1156 |
+
self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
|
| 1157 |
else:
|
| 1158 |
+
self.rope_type = "default"
|
| 1159 |
self.max_seq_len_cached = config.max_position_embeddings
|
| 1160 |
self.original_max_seq_len = config.max_position_embeddings
|
| 1161 |
|
|
|
|
| 1163 |
self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
|
| 1164 |
|
| 1165 |
inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
|
| 1166 |
+
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
| 1167 |
self.original_inv_freq = self.inv_freq
|
| 1168 |
|
| 1169 |
@torch.no_grad()
|
|
|
|
| 1172 |
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
|
| 1173 |
position_ids_expanded = position_ids[:, None, :].float()
|
| 1174 |
|
| 1175 |
+
device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
|
| 1176 |
with torch.autocast(device_type=device_type, enabled=False): # Force float32
|
| 1177 |
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
|
| 1178 |
emb = torch.cat((freqs, freqs), dim=-1)
|
|
|
|
| 1183 |
|
| 1184 |
|
| 1185 |
def rotate_half(x):
|
| 1186 |
+
"""Rotates half the hidden dims of the input."""
|
| 1187 |
x1 = x[..., : x.shape[-1] // 2]
|
| 1188 |
x2 = x[..., x.shape[-1] // 2 :]
|
| 1189 |
return torch.cat((-x2, x1), dim=-1)
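# Quick check (not part of the original file) of rotate_half on a tiny vector: the second
# half is negated and moved in front of the first half.
import torch

x = torch.tensor([1.0, 2.0, 3.0, 4.0])
x1, x2 = x[: x.shape[-1] // 2], x[x.shape[-1] // 2 :]
print(torch.cat((-x2, x1), dim=-1))                       # tensor([-3., -4.,  1.,  2.])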
|
| 1190 |
|
| 1191 |
|
| 1192 |
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
|
| 1193 |
+
"""
|
| 1194 |
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
|
| 1195 |
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
|
| 1196 |
+
"""
|
| 1197 |
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
|
| 1198 |
if n_rep == 1:
|
| 1199 |
return hidden_states
|
|
|
|
| 1243 |
position_ids: Optional[torch.Tensor] = None,
|
| 1244 |
unsqueeze_dim: int = 1,
|
| 1245 |
):
|
| 1246 |
+
"""Applies Rotary Position Embedding to the query and key tensors.
|
| 1247 |
|
| 1248 |
Args:
|
| 1249 |
+
x (`torch.Tensor`): The tensor to embed.
|
| 1250 |
+
cos (`torch.Tensor`): The cosine part of the rotary embedding.
|
| 1251 |
+
sin (`torch.Tensor`): The sine part of the rotary embedding.
|
| 1252 |
+
position_ids (`torch.Tensor`, *optional*):
|
| 1253 |
Deprecated and unused.
|
| 1254 |
+
unsqueeze_dim (`int`, *optional*, defaults to 1):
|
| 1255 |
+
The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
|
| 1256 |
sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
|
| 1257 |
that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
|
| 1258 |
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
|
| 1259 |
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
|
| 1260 |
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
|
| 1261 |
Returns:
|
| 1262 |
+
`tuple(torch.Tensor)` comprising the query and key tensors rotated using the Rotary Position Embedding.
|
| 1263 |
+
"""
|
| 1264 |
cos = cos.unsqueeze(unsqueeze_dim)
|
| 1265 |
sin = sin.unsqueeze(unsqueeze_dim)
|
| 1266 |
return (x * cos) + (rotate_half(x) * sin)
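# Hedged usage sketch (not part of the original file): applying rotary cos/sin tables to a
# tensor shaped [batch, heads, seq, head_dim] with unsqueeze_dim=1, as apply_rotary_pos_emb
# does above. With cos=1 and sin=0 the rotation is the identity, which the check confirms.
import torch

def _rotate_half(t):
    t1, t2 = t[..., : t.shape[-1] // 2], t[..., t.shape[-1] // 2 :]
    return torch.cat((-t2, t1), dim=-1)

batch, heads, seq, head_dim = 1, 2, 3, 4
q = torch.randn(batch, heads, seq, head_dim)
cos = torch.ones(batch, seq, head_dim)
sin = torch.zeros(batch, seq, head_dim)
q_rot = (q * cos.unsqueeze(1)) + (_rotate_half(q) * sin.unsqueeze(1))
print(torch.allclose(q_rot, q))                           # True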
|
| 1267 |
|
| 1268 |
|
| 1269 |
class Gemma3nTextAttention(nn.Module):
|
| 1270 |
+
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
| 1271 |
|
| 1272 |
def __init__(self, config: Gemma3nTextConfig, layer_idx: int):
|
| 1273 |
super().__init__()
|
| 1274 |
+
self.is_sliding = config.layer_types[layer_idx] == "sliding_attention"
|
| 1275 |
self.config = config
|
| 1276 |
self.layer_idx = layer_idx
|
| 1277 |
+
self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
|
| 1278 |
self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
|
| 1279 |
self.attention_dropout = self.config.attention_dropout
|
| 1280 |
self.is_causal = True
|
|
|
|
| 1356 |
if past_key_value is not None:
|
| 1357 |
# sin and cos are specific to RoPE models; cache_position needed for the static cache
|
| 1358 |
cache_kwargs = {
|
| 1359 |
+
"sin": sin,
|
| 1360 |
+
"cos": cos,
|
| 1361 |
+
"cache_position": cache_position,
|
| 1362 |
+
"sliding_window": self.sliding_window,
|
| 1363 |
}
|
| 1364 |
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
|
| 1365 |
|
| 1366 |
attention_interface: Callable = eager_attention_forward
|
| 1367 |
+
if self.config._attn_implementation != "eager":
|
| 1368 |
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
| 1369 |
|
| 1370 |
attn_output, attn_weights = attention_interface(
|
|
|
|
| 1407 |
self.per_layer_projection = nn.Linear(self.hidden_size_per_layer_input, self.hidden_size, bias=False)
|
| 1408 |
self.post_per_layer_input_norm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps)
|
| 1409 |
|
| 1410 |
+
@deprecate_kwarg("last_cache_position", version="4.53.0")
|
| 1411 |
def forward(
|
| 1412 |
self,
|
| 1413 |
hidden_states: torch.Tensor,
|
|
|
|
| 1460 |
if self.config.altup_correct_scale:
|
| 1461 |
first_prediction = self.altup.scale_corrected_output(first_prediction)
|
| 1462 |
|
| 1463 |
+
# per_layer_input_gate adapted from jax.numpy.einsum("btd,dp->btp", ...)
|
| 1464 |
first_prediction = self.per_layer_input_gate(first_prediction)
|
| 1465 |
first_prediction = self.act_fn(first_prediction)
|
| 1466 |
first_prediction = torch.multiply(first_prediction, per_layer_input)
|
| 1467 |
|
| 1468 |
+
# per_layer_projection adapted from jax.numpy.einsum("btp,pd->btd", ...)
|
| 1469 |
first_prediction = self.per_layer_projection(first_prediction)
|
| 1470 |
first_prediction = self.post_per_layer_input_norm(first_prediction)
|
| 1471 |
corrected_predictions[1:] += first_prediction
|
|
|
|
| 1481 |
@auto_docstring
|
| 1482 |
class Gemma3nPreTrainedModel(PreTrainedModel):
|
| 1483 |
config_class = Gemma3nConfig
|
| 1484 |
+
base_model_prefix = ""
|
| 1485 |
supports_gradient_checkpointing = True
|
| 1486 |
+
_no_split_modules = ["Gemma3nTextDecoderLayer"]
|
| 1487 |
+
_skip_keys_device_placement = ["past_key_values"]
|
| 1488 |
_supports_flash_attn_3 = True
|
| 1489 |
_supports_flash_attn_2 = True
|
| 1490 |
_supports_sdpa = True
|
|
|
|
| 1495 |
_supports_attention_backend = True
|
| 1496 |
|
| 1497 |
def _init_weights(self, module):
|
| 1498 |
+
# important: this ported version of Gemma2 isn't meant for training from scratch - only
|
| 1499 |
# inference and fine-tuning - so the proper init weights code has been removed
|
| 1500 |
+
std = getattr(self.config, "initializer_range", self.config.get_text_config().initializer_range)
|
| 1501 |
|
| 1502 |
if isinstance(module, (nn.Linear, nn.Conv1d, nn.Conv2d)):
|
| 1503 |
module.weight.data.normal_(mean=0.0, std=std)
|
|
|
|
| 1518 |
module.correct_output_scale.data.zero_()
|
| 1519 |
|
| 1520 |
|
| 1521 |
+
@auto_docstring(custom_intro="The base Gemma 3n language model without a language modeling head.")
|
| 1522 |
class Gemma3nTextModel(Gemma3nPreTrainedModel):
|
| 1523 |
config_class = Gemma3nTextConfig
|
| 1524 |
|
|
|
|
| 1544 |
# defaults should hold values for global RoPE.
|
| 1545 |
config = copy.deepcopy(config)
|
| 1546 |
config.rope_theta = config.rope_local_base_freq
|
| 1547 |
+
config.rope_scaling = {"rope_type": "default"}
|
| 1548 |
self.rotary_emb_local = Gemma3nTextRotaryEmbedding(config=config)
|
| 1549 |
|
| 1550 |
self.hidden_size = config.hidden_size
|
|
|
|
| 1573 |
[nn.Linear(self.hidden_size, self.hidden_size, bias=False) for _ in range(1, self.config.altup_num_inputs)]
|
| 1574 |
)
|
| 1575 |
|
| 1576 |
+
self.register_buffer("per_layer_projection_scale", torch.tensor(self.hidden_size**-0.5), persistent=False)
|
| 1577 |
+
self.register_buffer("per_layer_input_scale", torch.rsqrt(torch.tensor(2.0)), persistent=False)
|
| 1578 |
|
| 1579 |
# Initialize weights and apply final processing
|
| 1580 |
self.post_init()
|
|
|
|
| 1601 |
cache_position: Optional[torch.LongTensor] = None,
|
| 1602 |
**flash_attn_kwargs: Unpack[FlashAttentionKwargs],
|
| 1603 |
) -> BaseModelOutputWithPast:
|
| 1604 |
+
r"""
|
| 1605 |
per_layer_inputs (torch.Tensor, *optional*, defaults to None):
|
| 1606 |
Pre-computed per-layer embeddings. If None, they are derived from input_ids if provided.
|
| 1607 |
+
"""
|
| 1608 |
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 1609 |
output_hidden_states = (
|
| 1610 |
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
|
|
|
| 1612 |
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
| 1613 |
|
| 1614 |
if (input_ids is None) ^ (inputs_embeds is not None):
|
| 1615 |
+
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
|
| 1616 |
|
| 1617 |
if self.gradient_checkpointing and self.training and use_cache:
|
| 1618 |
logger.warning_once(
|
| 1619 |
+
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
|
| 1620 |
)
|
| 1621 |
use_cache = False
|
| 1622 |
|
|
|
|
| 1640 |
if position_ids is None:
|
| 1641 |
position_ids = cache_position.unsqueeze(0)
|
| 1642 |
|
| 1643 |
+
# It may already have been prepared by e.g. `generate`
|
| 1644 |
if not isinstance(causal_mask_mapping := attention_mask, dict):
|
| 1645 |
# Prepare mask arguments
|
| 1646 |
mask_kwargs = {
|
| 1647 |
+
"config": self.config,
|
| 1648 |
+
"input_embeds": inputs_embeds,
|
| 1649 |
+
"attention_mask": attention_mask,
|
| 1650 |
+
"cache_position": cache_position,
|
| 1651 |
+
"past_key_values": past_key_values,
|
| 1652 |
}
|
| 1653 |
# Create the masks
|
| 1654 |
causal_mask_mapping = {
|
| 1655 |
+
"full_attention": create_causal_mask(**mask_kwargs),
|
| 1656 |
+
"sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
|
| 1657 |
}
|
| 1658 |
|
| 1659 |
# embed positions
|
|
|
|
| 1669 |
|
| 1670 |
temp_hidden_states = [hidden_states_0]
|
| 1671 |
for i in range(1, self.config.altup_num_inputs):
|
| 1672 |
+
# altup_proj adapted from jax.numpy.einsum("btp,pd->btd", ...)
|
| 1673 |
altup_proj = self.altup_projections[i - 1](hidden_states_0)
|
| 1674 |
current_hidden_state = altup_proj.to(dtype=hidden_states_0.dtype, device=target_magnitude.device)
|
| 1675 |
new_magnitude = torch.mean(current_hidden_state**2, dim=-1, keepdim=True)
|
|
|
|
| 1717 |
target_magnitude = torch.mean(hidden_states[0] ** 2, dim=-1, keepdim=True) ** 0.5
|
| 1718 |
temp_hidden_states = [hidden_states[0]]
|
| 1719 |
for i in range(1, self.config.altup_num_inputs):
|
| 1720 |
+
# altup_unembed_projections adapted from jax.numpy.einsum("btp,pd->btd", ...)
|
| 1721 |
altup_unemb_proj: torch.Tensor = self.altup_unembed_projections[i - 1](hidden_states[i])
|
| 1722 |
current_hidden_state = altup_unemb_proj.to(dtype=hidden_states_0.dtype, device=target_magnitude.device)
|
| 1723 |
new_magnitude = torch.mean(current_hidden_state**2, dim=-1, keepdim=True)
|
|
|
|
| 1771 |
)
|
| 1772 |
|
| 1773 |
|
| 1774 |
+
@auto_docstring(custom_intro="The base Gemma 3n language model with a language modeling head.")
|
| 1775 |
class Gemma3nForCausalLM(Gemma3nPreTrainedModel, GenerationMixin):
|
| 1776 |
+
_tied_weights_keys = ["lm_head.weight"]
|
| 1777 |
+
_tp_plan = {"lm_head": "colwise_rep"}
|
| 1778 |
+
_pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
|
| 1779 |
config_class = Gemma3nTextConfig
|
| 1780 |
+
base_model_prefix = "model"
|
| 1781 |
+
_checkpoint_conversion_mapping = {"model.language_model": "model"}
|
| 1782 |
|
| 1783 |
def __init__(self, config: Gemma3nTextConfig):
|
| 1784 |
super().__init__(config)
|
|
|
|
| 1824 |
logits_to_keep: Union[int, torch.Tensor] = 0,
|
| 1825 |
**loss_kwargs,
|
| 1826 |
) -> CausalLMOutputWithPast:
|
| 1827 |
+
r"""
|
| 1828 |
+
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
| 1829 |
+
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
| 1830 |
+
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
| 1831 |
+
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
|
| 1832 |
|
| 1833 |
Example:
|
| 1834 |
|
| 1835 |
+
```python
|
| 1836 |
>>> from transformers import AutoTokenizer, Gemma3nForCausalLM
|
| 1837 |
|
| 1838 |
+
>>> model = Gemma3nForCausalLM.from_pretrained("google/gemma-2-9b")
|
| 1839 |
+
>>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
|
| 1840 |
|
| 1841 |
+
>>> prompt = "What is your favorite condiment?"
|
| 1842 |
+
>>> inputs = tokenizer(prompt, return_tensors="pt")
|
| 1843 |
|
| 1844 |
>>> # Generate
|
| 1845 |
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
|
| 1846 |
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
| 1847 |
+
"What is your favorite condiment?"
|
| 1848 |
+
```"""
|
| 1849 |
|
| 1850 |
+
if self.training and self.config._attn_implementation != "eager":
|
| 1851 |
logger.warning_once(
|
| 1852 |
+
"It is strongly recommended to train Gemma3n models with the `eager` attention implementation "
|
| 1853 |
+
f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
|
| 1854 |
)
|
| 1855 |
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 1856 |
output_hidden_states = (
|
|
|
|
| 1893 |
|
| 1894 |
|
| 1895 |
class Gemma3nMultimodalEmbedder(nn.Module):
|
| 1896 |
+
"""Embeds token ids or soft tokens for multimodal content into language model space."""
|
| 1897 |
|
| 1898 |
def __init__(
|
| 1899 |
self,
|
|
|
|
| 1919 |
input_ids: Optional[torch.LongTensor] = None,
|
| 1920 |
inputs_embeds: Optional[torch.Tensor] = None,
|
| 1921 |
) -> torch.Tensor:
|
| 1922 |
+
"""Embeds token ids or soft tokens for multimodal content into language model space.
|
| 1923 |
|
| 1924 |
Args:
|
| 1925 |
input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range
|
| 1926 |
+
`[vocab_offset, vocab_offset + vocab_size)`.
|
| 1927 |
inputs_embeds: A torch.Tensor containing the soft tokens to embed.
|
| 1928 |
|
| 1929 |
Returns:
|
| 1930 |
+
A torch.Tensor of embeddings with shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
|
| 1931 |
+
"""
|
| 1932 |
if (input_ids is None) ^ (inputs_embeds is not None):
|
| 1933 |
+
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
|
| 1934 |
|
| 1935 |
if inputs_embeds is not None:
|
| 1936 |
emb_norm = self.soft_embedding_norm(inputs_embeds)
|
|
|
|
| 1943 |
|
| 1944 |
|
| 1945 |
@auto_docstring(
|
| 1946 |
+
custom_intro="""
|
| 1947 |
The base Gemma 3n model comprising a vision backbone, an audio backbone, and a language model without a
|
| 1948 |
language modeling head.
|
| 1949 |
+
"""
|
| 1950 |
)
|
| 1951 |
class Gemma3nModel(Gemma3nPreTrainedModel):
|
| 1952 |
_checkpoint_conversion_mapping = {}
|
| 1953 |
+
# we are filtering the logits/labels so we shouldn't divide the loss based on num_items_in_batch
|
| 1954 |
accepts_loss_kwargs = False
|
| 1955 |
|
| 1956 |
def __init__(self, config: Gemma3nConfig):
|
|
|
|
| 1981 |
return self.language_model
|
| 1982 |
|
| 1983 |
def get_image_features(self, pixel_values: torch.Tensor) -> torch.Tensor:
|
| 1984 |
+
"""
|
| 1985 |
Projects the last hidden state from the vision model into language model space.
|
| 1986 |
|
| 1987 |
Args:
|
| 1988 |
+
pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
|
| 1989 |
The tensors corresponding to the input images.
|
| 1990 |
|
| 1991 |
Returns:
|
| 1992 |
+
image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
|
| 1993 |
+
"""
|
| 1994 |
vision_outputs = self.vision_tower(
|
| 1995 |
pixel_values=pixel_values, do_pooling=False, return_dict=True
|
| 1996 |
).last_hidden_state
|
|
|
|
| 2024 |
output_hidden_states: Optional[bool] = None,
|
| 2025 |
**lm_kwargs,
|
| 2026 |
) -> Gemma3nCausalLMOutputWithPast:
|
| 2027 |
+
r"""
|
| 2028 |
+
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
| 2029 |
+
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
| 2030 |
+
config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
| 2031 |
+
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
|
| 2032 |
|
| 2033 |
Example:
|
| 2034 |
|
| 2035 |
+
```python
|
| 2036 |
>>> from PIL import Image
|
| 2037 |
>>> import requests
|
| 2038 |
>>> from transformers import AutoProcessor, Gemma3nForConditionalGeneration
|
| 2039 |
|
| 2040 |
+
>>> model = Gemma3nForConditionalGeneration.from_pretrained("google/gemma3n2-3b-mix-224")
|
| 2041 |
+
>>> processor = AutoProcessor.from_pretrained("google/gemma3n2-3b-mix-224")
|
| 2042 |
|
| 2043 |
+
>>> prompt = "Where is the cat standing?"
|
| 2044 |
+
>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
|
| 2045 |
>>> image = Image.open(requests.get(url, stream=True).raw)
|
| 2046 |
|
| 2047 |
+
>>> inputs = processor(images=image, text=prompt, return_tensors="pt")
|
| 2048 |
|
| 2049 |
>>> # Generate
|
| 2050 |
>>> generate_ids = model.generate(**inputs,)
|
| 2051 |
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
| 2052 |
+
"Where is the cat standing?\nsnow"
|
| 2053 |
+
```
|
| 2054 |
+
"""
|
| 2055 |
if (input_ids is None) ^ (inputs_embeds is not None):
|
| 2056 |
+
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
|
| 2057 |
|
| 2058 |
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 2059 |
output_hidden_states = (
|
|
|
|
| 2103 |
if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
|
| 2104 |
image_tokens_in_text = (special_image_mask).sum(dim=1).sum(dim=0)[0]
|
| 2105 |
raise ValueError(
|
| 2106 |
+
f"Number of images does not match number of special image tokens in the input text. "
|
| 2107 |
+
f"Got {image_tokens_in_text} image tokens in the text and "
|
| 2108 |
+
f"{image_features.shape[0] * image_features.shape[1]} tokens from image embeddings."
|
| 2109 |
)
|
| 2110 |
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
| 2111 |
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
|
|
|
|
| 2140 |
if not is_torchdynamo_compiling() and inputs_embeds[special_audio_mask].numel() != audio_features.numel():
|
| 2141 |
audio_tokens_in_text = (special_audio_mask).sum(dim=1).sum(dim=0)[0]
|
| 2142 |
raise ValueError(
|
| 2143 |
+
f"Number of audio input features does not match number of special audio tokens in the input text. "
|
| 2144 |
+
f"Got {audio_tokens_in_text} audio tokens in the text and "
|
| 2145 |
+
f"{audio_features.shape[0] * audio_features.shape[1]} tokens from audio embeddings."
|
| 2146 |
)
|
| 2147 |
audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
| 2148 |
inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_features)
|
|
|
|
| 2174 |
def get_audio_features(
|
| 2175 |
self, input_features: torch.Tensor, input_features_mask: torch.Tensor
|
| 2176 |
) -> tuple[torch.Tensor, torch.Tensor]:
|
| 2177 |
+
"""
|
| 2178 |
Projects the last hidden state from the audio encoder into language model space.
|
| 2179 |
|
| 2180 |
Args:
|
| 2181 |
+
input_features (`torch.FloatTensor` of shape `(num_images, seq_length, num_features)`):
|
| 2182 |
The tensors corresponding to the input audio.
|
| 2183 |
+
input_features (\`torch.FloatTensor]\` of shape \`(num_images, seq_length)\`):
|
| 2184 |
The attention mask for the input audio.
|
| 2185 |
|
| 2186 |
Returns:
|
| 2187 |
+
audio_features (`torch.Tensor`): Audio feature tensor of shape `(num_images, audio_length, embed_dim)`.
|
| 2188 |
+
"""
|
| 2189 |
audio_outputs, audio_mask = self.audio_tower(input_features, input_features_mask)
|
| 2190 |
return self.embed_audio(inputs_embeds=audio_outputs), audio_mask
|
| 2191 |
|
| 2192 |
|
| 2193 |
@auto_docstring(
|
| 2194 |
+
custom_intro="""
|
| 2195 |
The base Gemma 3n model comprising a vision backbone, an audio backbone, a language model, and a language modeling
|
| 2196 |
head.
|
| 2197 |
+
"""
|
| 2198 |
)
|
| 2199 |
class Gemma3nForConditionalGeneration(Gemma3nPreTrainedModel, GenerationMixin):
|
| 2200 |
_checkpoint_conversion_mapping = {}
|
| 2201 |
+
_tied_weights_keys = ["lm_head.weight"]
|
| 2202 |
+
base_model_prefix = "model"
|
| 2203 |
|
| 2204 |
def __init__(self, config: Gemma3nConfig):
|
| 2205 |
super().__init__(config)
|
|
|
|
| 2239 |
|
| 2240 |
@property
|
| 2241 |
def multi_modal_projector(self):
|
| 2242 |
+
raise AttributeError("Use embed_vision instead of multi_modal_projector.")
|
| 2243 |
|
| 2244 |
@can_return_tuple
|
| 2245 |
@auto_docstring
|
|
|
|
| 2262 |
logits_to_keep: Union[int, torch.Tensor] = 0,
|
| 2263 |
**lm_kwargs,
|
| 2264 |
) -> Gemma3nCausalLMOutputWithPast:
|
| 2265 |
+
r"""
|
| 2266 |
input_features (torch.Tensor, *optional*, defaults to None):
|
| 2267 |
The audio inputs to be encoded.
|
| 2268 |
input_features_mask (torch.Tensor, *optional*, defaults to None):
|
| 2269 |
The attention mask for the input audio.
|
| 2270 |
+
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
| 2271 |
+
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
| 2272 |
+
config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
|
| 2273 |
ignored (masked), the loss is only computed for the tokens with labels in
|
| 2274 |
+
`[0, ..., config.text_config.vocab_size]`.
|
| 2275 |
|
| 2276 |
Example:
|
| 2277 |
|
| 2278 |
+
```python
|
| 2279 |
>>> from PIL import Image
|
| 2280 |
>>> import requests
|
| 2281 |
>>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration
|
| 2282 |
|
| 2283 |
+
>>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
|
| 2284 |
+
>>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")
|
| 2285 |
|
| 2286 |
>>> messages = [
|
| 2287 |
... {
|
| 2288 |
+
...         "role": "system",
|
| 2289 |
+
...         "content": [
|
| 2290 |
+
...             {"type": "text", "text": "You are a helpful assistant."}
|
| 2291 |
... ]
|
| 2292 |
... },
|
| 2293 |
... {
|
| 2294 |
+
...         "role": "user", "content": [
|
| 2295 |
+
...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
|
| 2296 |
+
...             {"type": "text", "text": "Where is the cat standing?"},
|
| 2297 |
... ]
|
| 2298 |
... },
|
| 2299 |
... ]
|
|
|
|
| 2302 |
... messages,
|
| 2303 |
...     tokenize=True,
|
| 2304 |
... return_dict=True,
|
| 2305 |
+
...     return_tensors="pt",
|
| 2306 |
... add_generation_prompt=True
|
| 2307 |
... )
|
| 2308 |
>>> # Generate
|
| 2309 |
>>> generate_ids = model.generate(**inputs)
|
| 2310 |
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
| 2311 |
+
"user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
|
| 2312 |
+
```
|
| 2313 |
+
"""
|
| 2314 |
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 2315 |
output_hidden_states = (
|
| 2316 |
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
|
|
|
| 2393 |
labels=None,
|
| 2394 |
**kwargs,
|
| 2395 |
):
|
| 2396 |
+
# Overwritten -- custom `position_ids` and `pixel_values` handling
|
| 2397 |
model_inputs = super().prepare_inputs_for_generation(
|
| 2398 |
input_ids,
|
| 2399 |
past_key_values=past_key_values,
|
|
|
|
| 2407 |
**kwargs,
|
| 2408 |
)
|
| 2409 |
|
| 2410 |
+
# If we're in cached decoding stage, multimodal inputs should be None because input ids do not contain special
|
| 2411 |
# tokens anymore. Otherwise multimodal inputs should be passed to model.
|
| 2412 |
# NOTE: use_cache=False always needs pixel_values, input_features, and input_features_mask
|
| 2413 |
if cache_position[0] == 0:
|
| 2414 |
+
model_inputs["pixel_values"] = pixel_values
|
| 2415 |
+
model_inputs["input_features"] = input_features
|
| 2416 |
+
model_inputs["input_features_mask"] = input_features_mask
|
| 2417 |
|
| 2418 |
return model_inputs
|
| 2419 |
|
|
|
|
| 2423 |
|
| 2424 |
|
| 2425 |
__all__ = [
|
| 2426 |
+
"Gemma3nAudioEncoder",
|
| 2427 |
+
"Gemma3nForCausalLM",
|
| 2428 |
+
"Gemma3nForConditionalGeneration",
|
| 2429 |
+
"Gemma3nModel",
|
| 2430 |
+
"Gemma3nPreTrainedModel",
|
| 2431 |
+
"Gemma3nTextModel",
|
| 2432 |
]
|