Migrate Decoder (Gemma3/Deepseek/Llama4) and utils to NNX

hsuan-lun-chiang · ecnal-cienet · commit 485776c4f56f · 2026-03-16T21:11:56.000Z
diff --git a/src/maxtext/configs/base.yml b/src/maxtext/configs/base.yml
@@ -1086,8 +1086,8 @@ position_id_per_seconds: 25
 subslice_shape: ""
 
 # NNX
-enable_nnx: false
-pure_nnx_decoder: false
+enable_nnx: True
+pure_nnx_decoder: True
 
 ################################## Qwen3-Next Specific Configs ##################################
 # Kernel size for the 1D convolution in the Gated Delta Net
diff --git a/src/maxtext/layers/attentions.py b/src/maxtext/layers/attentions.py
@@ -533,14 +533,14 @@ def __init__(
     elif self.is_qwen3_next:
       self.query_norm = Qwen3NextRMSNorm(
           num_features=self.config.head_dim,
-          eps=self.config.normalization_layer_epsilon,
+          epsilon=self.config.normalization_layer_epsilon,
           dtype=self.config.dtype,
           weight_dtype=self.config.weight_dtype,
           rngs=self.rngs,
       )
       self.key_norm = Qwen3NextRMSNorm(
           num_features=self.config.head_dim,
-          eps=self.config.normalization_layer_epsilon,
+          epsilon=self.config.normalization_layer_epsilon,
           dtype=self.config.dtype,
           weight_dtype=self.config.weight_dtype,
           rngs=self.rngs,
diff --git a/src/maxtext/layers/nnx_decoders.py b/src/maxtext/layers/nnx_decoders.py
@@ -1,4 +1,4 @@
-# Copyright 2026 Google LLC
+# Copyright 2023–2026 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -829,10 +829,19 @@ def _find_next_boundary(self, current_idx, end_idx, engram_indices):
   def _apply_single_engram_layer(self, y, current_idx, layer_stack, *args, **kwargs):
     """Applies a single, unscanned Engram layer by dynamically slicing the NNX state."""
     graphdef, state = nnx.split(layer_stack)
+    params, rest = state.split(nnx.Param, ...)
+    scan_axis = self.config.param_scan_axis
+
+    # Helper to generate N-dimensional basic slices (e.g., x[:, idx, :])
+    def _extract_slice(x, idx, axis):
+      slices = tuple(idx if i == axis else slice(None) for i in range(x.ndim))
+      return x[slices]
 
-    # Slice the parameters for the current index (assuming scan axis is 0)
-    sliced_state = jax.tree.map(lambda x: x[current_idx], state)
-    single_layer = nnx.merge(graphdef, sliced_state)
+    # Slice using native indexing instead of jnp.take
+    sliced_params = jax.tree.map(lambda x: _extract_slice(x, current_idx, scan_axis), params)
+    sliced_rest = jax.tree.map(lambda x: _extract_slice(x, current_idx, 0), rest)
+
+    single_layer = nnx.merge(graphdef, sliced_params, sliced_rest)
 
     # Run the single layer
     out = single_layer(
@@ -841,37 +850,57 @@ def _apply_single_engram_layer(self, y, current_idx, layer_stack, *args, **kwarg
     y = out[0] if isinstance(out, tuple) else out
 
     # Re-merge the updated state back into the specific slice of the stack
-    new_single_state = nnx.state(single_layer)
-    updated_state = jax.tree.map(
+    new_state = nnx.state(single_layer)
+    new_params, new_rest = new_state.split(nnx.Param, ...)
+
+    updated_params = jax.tree.map(
+        lambda s, new_s: jax.lax.dynamic_update_slice_in_dim(
+            s, jnp.expand_dims(new_s, axis=scan_axis), current_idx, axis=scan_axis
+        ),
+        params,
+        new_params,
+    )
+    updated_rest = jax.tree.map(
         lambda s, new_s: jax.lax.dynamic_update_slice_in_dim(s, jnp.expand_dims(new_s, axis=0), current_idx, axis=0),
-        state,
-        new_single_state,
+        rest,
+        new_rest,
     )
-    nnx.update(layer_stack, updated_state)
 
+    nnx.update(layer_stack, updated_params, updated_rest)
     return y
 
   def _apply_scanned_chunk(self, y, current_idx, next_boundary, layer_stack, *args, **kwargs):
     """Applies a contiguous chunk of layers using scan over a state slice."""
     scan_length = next_boundary - current_idx
     if scan_length > 0:
       graphdef, state = nnx.split(layer_stack)
+      params, rest = state.split(nnx.Param, ...)
+      scan_axis = self.config.param_scan_axis
 
-      # Slice the chunk state
-      chunk_state = jax.tree.map(lambda x: jax.lax.dynamic_slice_in_dim(x, current_idx, scan_length, axis=0), state)
-      chunk_stack = nnx.merge(graphdef, chunk_state)
+      # Slice the chunk state along the correct axes
+      chunk_params = jax.tree.map(
+          lambda x: jax.lax.dynamic_slice_in_dim(x, current_idx, scan_length, axis=scan_axis), params
+      )
+      chunk_rest = jax.tree.map(lambda x: jax.lax.dynamic_slice_in_dim(x, current_idx, scan_length, axis=0), rest)
+      chunk_stack = nnx.merge(graphdef, chunk_params, chunk_rest)
 
       # Apply sequentially
       y, chunk_stack = self._apply_layers_sequentially(
           chunk_stack, y, *args, length=scan_length, **kwargs.get("layer_kwargs", {})
       )
 
       # Update the original stack state
-      new_chunk_state = nnx.state(chunk_stack)
-      updated_state = jax.tree.map(
-          lambda s, new_s: jax.lax.dynamic_update_slice_in_dim(s, new_s, current_idx, axis=0), state, new_chunk_state
+      new_state = nnx.state(chunk_stack)
+      new_params, new_rest = new_state.split(nnx.Param, ...)
+
+      updated_params = jax.tree.map(
+          lambda s, new_s: jax.lax.dynamic_update_slice_in_dim(s, new_s, current_idx, axis=scan_axis), params, new_params
       )
-      nnx.update(layer_stack, updated_state)
+      updated_rest = jax.tree.map(
+          lambda s, new_s: jax.lax.dynamic_update_slice_in_dim(s, new_s, current_idx, axis=0), rest, new_rest
+      )
+
+      nnx.update(layer_stack, updated_params, updated_rest)
 
     return y
 
diff --git a/src/maxtext/layers/normalizations.py b/src/maxtext/layers/normalizations.py
@@ -102,7 +102,17 @@ def __call__(self, x: jnp.ndarray, out_sharding: NamedSharding | None = None) ->
     return y_flat.reshape(input_shape)
 
 
-def Qwen3NextRMSNorm(num_features: int, eps: float, dtype: DType, weight_dtype: DType, *, rngs: nnx.Rngs):
+def Qwen3NextRMSNorm(
+    num_features: int,
+    epsilon: float = 1e-6,
+    dtype: DType = jnp.float32,
+    weight_dtype: DType = jnp.float32,
+    shard_mode: ShardMode = ShardMode.AUTO,
+    kernel_axes: tuple[None | str, ...] = (),
+    parameter_memory_host_offload: bool = False,
+    *,
+    rngs: nnx.Rngs,
+):
   """
   Used for input and post attention layernorms
   in Qwen3NextDecoderLayer.
@@ -115,10 +125,13 @@ def Qwen3NextRMSNorm(num_features: int, eps: float, dtype: DType, weight_dtype:
   return nnx.data(
       RMSNorm(
           num_features=num_features,
-          epsilon=eps,
+          epsilon=epsilon,
           dtype=dtype,
           weight_dtype=weight_dtype,
+          shard_mode=shard_mode,
+          kernel_axes=kernel_axes,
           scale_init=linen_initializers.zeros,
+          parameter_memory_host_offload=parameter_memory_host_offload,
           scale_offset=1.0,
           rngs=rngs,
       )
diff --git a/src/maxtext/models/gpt_oss.py b/src/maxtext/models/gpt_oss.py
@@ -28,6 +28,7 @@
 from maxtext.common.common_types import AttentionType, Config
 from maxtext.layers import attentions
 from maxtext.layers import initializers
+from maxtext.layers import linears
 from maxtext.layers import moe
 from maxtext.layers import nnx_wrappers
 from maxtext.layers import quantizations
@@ -130,6 +131,8 @@ def __init__(
         rngs=rngs,
     )
 
+    self.dropout = linears.Dropout(rate=config.dropout_rate, broadcast_dims=(-2,), rngs=rngs)
+
   def __call__(
       self,
       inputs,
@@ -181,7 +184,7 @@ def __call__(
     mlp_lnx = nn.with_logical_constraint(mlp_lnx, ("activation_batch", "activation_norm_length", "activation_embed"))
 
     layer_output = mlp_lnx + intermediate_inputs
-    layer_output = nn.Dropout(rate=cfg.dropout_rate, broadcast_dims=(-2,))(layer_output, deterministic=deterministic)
+    layer_output = self.dropout(layer_output, deterministic=deterministic)
 
     layer_output = nn.with_logical_constraint(
         layer_output,
diff --git a/src/maxtext/models/llama2.py b/src/maxtext/models/llama2.py
@@ -70,6 +70,7 @@ def __init__(
         shard_mode=config.shard_mode,
         kernel_axes=("norm",),
         epsilon=config.normalization_layer_epsilon,
+        parameter_memory_host_offload=config.parameter_memory_host_offload,
         rngs=rngs,
     )
 
diff --git a/src/maxtext/models/olmo3.py b/src/maxtext/models/olmo3.py
@@ -29,6 +29,7 @@
 from maxtext.common.common_types import AttentionType, Config
 from maxtext.layers import attentions
 from maxtext.layers import initializers
+from maxtext.layers import linears
 from maxtext.layers import nnx_wrappers
 from maxtext.layers import quantizations
 from maxtext.layers.attentions import Attention
@@ -140,6 +141,8 @@ def __init__(
         rngs=rngs,
     )
 
+    self.dropout = linears.Dropout(rate=config.dropout_rate, broadcast_dims=(-2,), rngs=rngs)
+
   def __call__(
       self,
       inputs,
@@ -193,7 +196,7 @@ def __call__(
     mlp_lnx = nn.with_logical_constraint(mlp_lnx, ("activation_batch", "activation_norm_length", "activation_embed"))
 
     layer_output = mlp_lnx + intermediate_inputs
-    layer_output = nn.Dropout(rate=cfg.dropout_rate, broadcast_dims=(-2,))(layer_output, deterministic=deterministic)
+    layer_output = self.dropout(layer_output, deterministic=deterministic)
 
     layer_output = nn.with_logical_constraint(
         layer_output,
diff --git a/src/maxtext/models/qwen3.py b/src/maxtext/models/qwen3.py
@@ -962,7 +962,7 @@ def __init__(
     # First LayerNorm, applied before the attention block.
     self.input_layernorm = Qwen3NextRMSNorm(
         num_features=cfg.emb_dim,
-        eps=cfg.normalization_layer_epsilon,
+        epsilon=cfg.normalization_layer_epsilon,
         dtype=cfg.dtype,
         weight_dtype=cfg.weight_dtype,
         rngs=rngs,
@@ -987,7 +987,7 @@ def __init__(
     # Second LayerNorm, applied before the MoE block.
     self.post_attention_layernorm = Qwen3NextRMSNorm(
         num_features=cfg.emb_dim,
-        eps=cfg.normalization_layer_epsilon,
+        epsilon=cfg.normalization_layer_epsilon,
         dtype=cfg.dtype,
         weight_dtype=cfg.weight_dtype,
         rngs=rngs,
diff --git a/tests/unit/models_test.py b/tests/unit/models_test.py
diff --git a/tests/unit/multi_token_prediction_test.py b/tests/unit/multi_token_prediction_test.py
diff --git a/tests/unit/nnx_decoder_test.py b/tests/unit/nnx_decoder_test.py
diff --git a/tests/unit/train_compile_test.py b/tests/unit/train_compile_test.py

Original file line number	Diff line number	Diff line change
`@@ -70,6 +70,7 @@ def __init__(`
`70`	`70`	`shard_mode=config.shard_mode,`
`71`	`71`	`kernel_axes=("norm",),`
`72`	`72`	`epsilon=config.normalization_layer_epsilon,`
	`73`	`+ parameter_memory_host_offload=config.parameter_memory_host_offload,`
`73`	`74`	`rngs=rngs,`
`74`	`75`	`)`
`75`	`76`