Commit f26e160

Migrate Decoder (Gemma3/Deepseek/Llama4) and utils to NNX

1 parent: e67d913
6 files changed: 285 additions & 47 deletions

src/MaxText/layers/gemma3.py
Lines changed: 0 additions & 2 deletions

@@ -91,7 +91,6 @@ def __init__(
 
     batch_size, seq_len = max_utils.get_batch_seq_len_for_mode(config, model_mode)
     dummy_inputs_shape = (batch_size, seq_len, config.emb_dim)
-
     self.pre_self_attention_norm = RMSNorm(
         num_features=config.emb_dim,
         dtype=config.dtype,
@@ -198,7 +197,6 @@ def __call__(
       inputs = inputs[0]
     inputs = nn.with_logical_constraint(inputs, self.activation_axis_names)
     inputs = checkpoint_name(inputs, "decoder_layer_input")
-
     lnx = self.pre_self_attention_norm(inputs)
     lnx = nn.with_logical_constraint(lnx, self.activation_axis_names)
 
src/MaxText/layers/nnx_decoders.py
Lines changed: 73 additions & 29 deletions

@@ -29,18 +29,16 @@
 from flax import nnx
 from flax.nnx import wrappers as nnx_wrappers
 
-from MaxText.configs.types import PositionalEmbedding
 from MaxText.common_types import DecoderBlockType, ShardMode, Config, EP_AS_CONTEXT
 from MaxText.common_types import MODEL_MODE_TRAIN, MODEL_MODE_PREFILL, MODEL_MODE_AUTOREGRESSIVE
 from MaxText.sharding import create_sharding
 from MaxText.layers import linears
 from MaxText.layers import initializers
 from MaxText.layers import quantizations
-from MaxText import multimodal_utils
 from MaxText import sharding
 from MaxText.layers.attentions import Attention
 from MaxText.layers.normalizations import RMSNorm
-from MaxText.layers.embeddings import Embed, attend_on_embedding
+from MaxText.layers.embeddings import Embed, attend_on_embedding, PositionalEmbedding
 from MaxText.layers.quantizations import AqtQuantization as Quant
 from MaxText.layers import (
     deepseek,
@@ -61,6 +59,7 @@
 from maxtext.inference import page_manager
 from maxtext.utils import max_logging
 from maxtext.utils import maxtext_utils
+from maxtext.multimodal import utils as mm_utils
 
 # ------------------------------------------------------------------------------
 # The network: Decoder Definitions
@@ -284,19 +283,28 @@ def __init__(
         attention_pattern_length = len(gemma3.GEMMA3_ATTENTION_PATTERN)
         scan_length = config.num_decoder_layers // attention_pattern_length
         num_remaining_layers = config.num_decoder_layers % attention_pattern_length
+        layer_kwargs = {"num_of_layers": attention_pattern_length}
+
         rem_layer_kwargs = {"num_of_layers": num_remaining_layers}
 
         RemattedGemma3Block = gemma3.Gemma3ScannableBlock
 
         if scan_length > 0:
-          self.layers = self._create_scanned_layers(RemattedGemma3Block, length=scan_length, rngs=rngs)
+          self.layers = self._create_scanned_layers(RemattedGemma3Block, length=scan_length, rngs=rngs, **layer_kwargs)
           self.layers_remainder = RemattedGemma3Block(
               config=self.config, mesh=mesh, quant=self.quant, model_mode=self.model_mode, **rem_layer_kwargs, rngs=rngs
           ) # pytype: disable=wrong-keyword-args
       else:
         layer_cls = decoder_block_classes[0]
-        num_layers = config.num_decoder_layers
-        self.layers = self._create_scanned_layers(layer_cls, length=num_layers, rngs=rngs)
+        num_layers = int(config.num_decoder_layers / config.inhomogeneous_layer_cycle_interval)
+        layer_kwargs = {}
+        if config.decoder_block == DecoderBlockType.LLAMA4:
+          layer_kwargs = {
+              "nope_layer_interval": self.config.nope_layer_interval,
+              "interleave_moe_layer_step": self.config.interleave_moe_layer_step,
+          }
+
+        self.layers = self._create_scanned_layers(layer_cls, length=num_layers, rngs=rngs, **layer_kwargs)
     else:
       self.layers = nnx.List([])
       if self.is_deepseek:
@@ -309,6 +317,32 @@ def __init__(
         for i in range(config.num_decoder_layers):
           self._create_and_register_layer(layer_cls, rngs, "layers", i)
 
+      self.layers = nnx.List([])
+
+      if self.is_deepseek:
+        dense_cls, moe_cls = decoder_block_classes
+        for i in range(config.first_num_dense_layers):
+          self._create_and_register_layer(dense_cls, rngs, "dense_layer", i)
+        for i in range(config.num_decoder_layers - config.first_num_dense_layers):
+          self._create_and_register_layer(moe_cls, rngs, "moe_layer", i)
+      else:
+        layer_cls = decoder_block_classes[0]
+
+        for i in range(config.num_decoder_layers):
+          layer_kwargs = {}
+          if config.decoder_block == DecoderBlockType.GEMMA3:
+            layer_kwargs = {"attention_type": gemma3.get_attention_type(layer_id=i)}
+          elif config.decoder_block == DecoderBlockType.LLAMA4:
+            layer_kwargs = {
+                "is_nope_layer": llama4.determine_is_nope_layer(i, self.config.nope_layer_interval),
+                "is_moe_layer": llama4.determine_is_moe_layer(i, self.config.interleave_moe_layer_step),
+            }
+          elif config.decoder_block == DecoderBlockType.QWEN3_NEXT:
+            layer_kwargs = {"layer_idx": i}
+          elif config.decoder_block == DecoderBlockType.GPT_OSS:
+            layer_kwargs = {"attention_type": gpt_oss.get_attention_type(layer_id=i)}
+          self._create_and_register_layer(layer_cls, rngs, "layers", i, **layer_kwargs)
+
   def _create_and_register_layer(self, layer_cls, rngs, base_name, i):
     attr_name = f"{base_name}_{i}"
     layer = self._create_single_layer(layer_cls, rngs)
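
Note on the unscanned branch above: each layer index gets block-specific constructor kwargs. The snippet below is a minimal, self-contained sketch of that dispatch pattern; the BlockType enum and make_layer_kwargs helper are hypothetical stand-ins for MaxText's DecoderBlockType and layer constructors, and the interleaving arithmetic is illustrative only.

from enum import Enum

class BlockType(Enum):
  # Simplified stand-in for MaxText's DecoderBlockType (illustrative only).
  GEMMA3 = "gemma3"
  LLAMA4 = "llama4"
  QWEN3_NEXT = "qwen3_next"
  GPT_OSS = "gpt_oss"

def make_layer_kwargs(block, layer_id, nope_layer_interval=4, interleave_moe_layer_step=2):
  """Return per-layer constructor kwargs keyed on the decoder block type (sketch only)."""
  if block is BlockType.LLAMA4:
    # Hypothetical interleaving rule: every Nth layer is a NoPE / MoE layer.
    return {
        "is_nope_layer": (layer_id + 1) % nope_layer_interval == 0,
        "is_moe_layer": (layer_id + 1) % interleave_moe_layer_step == 0,
    }
  if block is BlockType.QWEN3_NEXT:
    return {"layer_idx": layer_id}
  # GEMMA3 / GPT_OSS derive an attention type from layer_id; elided here.
  return {}

# Example: constructor kwargs for the first four Llama4-style layers.
print([make_layer_kwargs(BlockType.LLAMA4, i) for i in range(4)])
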
@@ -366,7 +400,6 @@ def _apply_layers_sequentially(self, layers, x_in, *args, length: int, **kwargs)
 
     layer_cls = layers.__class__ # Access the underlying class
     sig = inspect.signature(layer_cls.__call__)
-
     # Filter kwargs to only include keys that exist in the layer's signature
     valid_kwargs = {k: v for k, v in kwargs.items() if k in sig.parameters or "kwargs" in sig.parameters}
 
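The valid_kwargs filter above keeps only the keyword arguments that the layer's __call__ actually declares (or can absorb through a catch-all parameter). Below is a self-contained sketch of the same idea with a toy layer class; it checks inspect.Parameter.VAR_KEYWORD instead of looking for a parameter literally named kwargs, which is a slightly stricter variant of the check in the diff.

import inspect

class ToyLayer:
  def __call__(self, x, *, deterministic=True, bidirectional_mask=None):
    return x

def filter_call_kwargs(layer_cls, kwargs):
  """Drop kwargs that layer_cls.__call__ does not accept, unless it takes **kwargs."""
  sig = inspect.signature(layer_cls.__call__)
  has_var_kwargs = any(p.kind is inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values())
  return {k: v for k, v in kwargs.items() if k in sig.parameters or has_var_kwargs}

candidate = {"deterministic": False, "page_state": None, "bidirectional_mask": "mask"}
print(filter_call_kwargs(ToyLayer, candidate))
# -> {'deterministic': False, 'bidirectional_mask': 'mask'}
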
@@ -584,7 +617,7 @@ def _apply_embedding(
         "llama4-17b-128e",
         "qwen3-omni-30b-a3b",
     ]:
-      y = multimodal_utils.merge_mm_embeddings(
+      y = mm_utils.merge_mm_embeddings(
          text_embeddings=y,
          multimodal_embeddings=image_embeddings,
          mask=bidirectional_mask,
@@ -596,7 +629,7 @@ def _apply_embedding(
 
    if audio_embeddings is not None and cfg.use_audio:
      if cfg.model_name in ["qwen3-omni-30b-a3b"]:
-        y = multimodal_utils.merge_mm_embeddings(
+        y = mm_utils.merge_mm_embeddings(
            text_embeddings=y,
            multimodal_embeddings=audio_embeddings,
            mask=audio_masks,
@@ -609,7 +642,7 @@ def _apply_embedding(
    y = y.astype(cfg.dtype)
 
    if cfg.use_untrainable_positional_embedding:
-      y = self.positional_embedding(y, decoder_positions)
+      y += self.positional_embedding(y, decoder_positions)
 
    if cfg.trainable_position_size > 0 and self.position_embedder:
      y += self.position_embedder(decoder_positions.astype("int32"), model_mode=model_mode)
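
The change above switches the untrainable positional embedding from overwriting the activations to being added onto them, matching the additive convention of the trainable position_embedder on the following line. A hedged sketch of an additive fixed sin/cos table in jax.numpy (toy shapes and the textbook formula; not MaxText's PositionalEmbedding module):

import jax.numpy as jnp

def sinusoidal_table(positions, emb_dim):
  """Fixed sin/cos table: positions [B, T] -> embeddings [B, T, emb_dim]."""
  half = emb_dim // 2
  freqs = jnp.exp(-jnp.log(10000.0) * jnp.arange(half) / half)  # [half]
  angles = positions[..., None] * freqs                         # [B, T, half]
  return jnp.concatenate([jnp.sin(angles), jnp.cos(angles)], axis=-1)

batch, seq, emb_dim = 2, 8, 16
y = jnp.zeros((batch, seq, emb_dim))
decoder_positions = jnp.broadcast_to(jnp.arange(seq), (batch, seq))

# Additive application, mirroring `y += self.positional_embedding(...)` above.
y = y + sinusoidal_table(decoder_positions, emb_dim)
print(y.shape)  # (2, 8, 16)
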
@@ -625,7 +658,7 @@ def apply_output_head(self, shared_embedding, y, deterministic, model_mode):
    else:
      norm_out_sharding = None
 
-    y = self.decoder_norm(y, norm_out_sharding)
+    y = self.decoder_norm(y, out_sharding=norm_out_sharding)
    y = self.dropout(y, deterministic=deterministic) # NNX call
 
    if model_mode in (MODEL_MODE_PREFILL, MODEL_MODE_AUTOREGRESSIVE):
@@ -693,19 +726,18 @@ def __call__(
          audio_masks,
      )
    layer_args = (decoder_segment_ids, decoder_positions, deterministic, model_mode)
-
-    layer_kwargs = {
-        "previous_chunk": previous_chunk,
-        "page_state": page_state,
-        "slot": slot,
-        "attention_metadata": attention_metadata,
-    }
-
+
+    layer_kwargs = {}
    if cfg.decoder_block == DecoderBlockType.GEMMA3:
      layer_kwargs["bidirectional_mask"] = bidirectional_mask
 
    if cfg.scan_layers:
      if self.is_deepseek:
+        layer_kwargs = {
+            "previous_chunk": previous_chunk,
+            "page_state": page_state,
+            "slot": slot,
+        }
        y, self.dense_layers = self._apply_layers_sequentially(
            self.dense_layers, y, *layer_args, length=cfg.first_num_dense_layers, **layer_kwargs
        )
@@ -733,8 +765,24 @@ def __call__(
    else:
      for i, layer in enumerate(self.layers):
        kv_cache = kv_caches[i] if kv_caches is not None else None
+
+        layer_call_kwargs = {}
+        if cfg.decoder_block == DecoderBlockType.GEMMA3:
+          layer_call_kwargs = {"bidirectional_mask": bidirectional_mask}
 
-        out = layer(y, *layer_args, kv_cache=kv_cache, **layer_kwargs)
+        out = layer(
+            y,
+            decoder_segment_ids,
+            decoder_positions,
+            deterministic,
+            model_mode,
+            previous_chunk=previous_chunk,
+            page_state=page_state,
+            slot=slot,
+            kv_cache=kv_cache,
+            attention_metadata=attention_metadata,
+            **layer_call_kwargs
+        )
 
        if isinstance(out, tuple):
          y, kv_cache_out = out
@@ -775,17 +823,12 @@ def _apply_gemma3_scanned_blocks(
    attention_pattern_length = len(gemma3.GEMMA3_ATTENTION_PATTERN)
    scan_length = cfg.num_decoder_layers // attention_pattern_length
 
-    layer_call_kwargs = {"bidirectional_mask": bidirectional_mask}
+    layer_args = (decoder_segment_ids, decoder_positions, deterministic, model_mode)
+    layer_kwargs = {"bidirectional_mask": bidirectional_mask}
 
    # Apply the main scan over the full blocks
    if scan_length > 0:
-      broadcast_args = (
-          decoder_segment_ids,
-          decoder_positions,
-          deterministic,
-          model_mode,
-      )
-      y, _ = self.layers(y, *broadcast_args, **layer_call_kwargs)
+      y, self.layers = self._apply_layers_sequentially(self.layers, y, *layer_args, length=scan_length, **layer_kwargs)
 
    # Apply any remaining layers that did not fit into a full scanned block
    num_remaining_layers = cfg.num_decoder_layers % attention_pattern_length
@@ -800,8 +843,9 @@
          previous_chunk=previous_chunk,
          page_state=page_state,
          slot=slot,
-          **layer_call_kwargs,
+          **layer_kwargs,
      )
+
    return y
 

src/MaxText/utils/ckpt_conversion/to_maxtext.py
Lines changed: 13 additions & 5 deletions

@@ -385,14 +385,22 @@ def _build_single_axis_stacked_tensor(
     The final, assembled NumPy array for the MaxText parameter.
   """
   tensors_to_stack = []
+  # Heuristic to determine if we are stacking layers or experts.
+  # If the number of items to stack equals the number of layers, it's a standard
+  # scanned layer, and we use the configured param_scan_axis. Otherwise, it's
+  # an unscanned MoE layer, and we stack along the expert axis (0).
+  """
+  axis_to_stack = config.param_scan_axis if len(hf_source_keys) == config.base_num_decoder_layers else 0
+  """
 
-  if config.scan_layers:
-    # If it's a standard scanned layer, we use the configured param_scan_axis.
-    axis_to_stack = config.param_scan_axis
+  # Workaround to load the HF model due to mismatched tensor ordering
+  if len(hf_source_keys) == config.base_num_decoder_layers:
+    if getattr(config, "enable_nnx", False):
+      axis_to_stack = 0
+    else:
+      axis_to_stack = config.param_scan_axis
   else:
-    # Otherwise, if an unscanned MoE layer, and we stack along the expert axis (0).
     axis_to_stack = 0
-
   # The hook function needs the shape of an individual slice, not the full stacked tensor.
   # We calculate it by removing the stacking dimension from the final target shape.
   mt_slice_shape_list = list(target_shape)
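
A small numpy sketch of the two stacking layouts the branch above chooses between: per-layer slices stacked along param_scan_axis for the legacy scanned-layer layout, versus stacking along axis 0 on the NNX path or for unscanned MoE experts (toy shapes; stack_slices is a hypothetical helper, not part of the conversion script):

import numpy as np

def stack_slices(slices, axis_to_stack):
  """Stack per-layer (or per-expert) source tensors into one target parameter."""
  return np.stack(slices, axis=axis_to_stack)

num_layers, emb, mlp = 4, 8, 32
layer_slices = [np.zeros((emb, mlp)) for _ in range(num_layers)]

# Legacy scanned-layer layout: stack along param_scan_axis (1 by default).
print(stack_slices(layer_slices, axis_to_stack=1).shape)  # (8, 4, 32)

# NNX path, or unscanned MoE experts: stack along axis 0.
print(stack_slices(layer_slices, axis_to_stack=0).shape)  # (4, 8, 32)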

src/maxtext/configs/base.yml
Lines changed: 3 additions & 3 deletions

@@ -315,7 +315,7 @@ attention_out: 'remat'
 
 optimizer_memory_host_offload: False
 parameter_memory_host_offload: False
-scan_layers: True # We recommend setting this to false when using pipeline parallelism, instead scanning the PP iterations.
+scan_layers: False # We recommend setting this to false when using pipeline parallelism, instead scanning the PP iterations.
 param_scan_axis: 1
 
 # The attention parameter dictates the specific algorithm/methodology used to compute the attention scores
@@ -1049,8 +1049,8 @@ position_id_per_seconds: 25
 subslice_shape: ""
 
 # NNX
-enable_nnx: false
-pure_nnx_decoder: false
+enable_nnx: True
+pure_nnx_decoder: True
 
 ################################## Qwen3-Next Specific Configs ##################################
 # Kernel size for the 1D convolution in the Gated Delta Net
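
For context on the scan_layers default flipped above: with scanning enabled the decoder layers live in one stacked parameter that is traversed with jax.lax.scan, while the unscanned path loops over per-layer modules in Python. The toy contrast below is a conceptual sketch of that difference only, not MaxText's implementation.

import jax
import jax.numpy as jnp

num_layers, dim = 4, 8
stacked_w = jnp.stack([jnp.eye(dim) * (i + 1) for i in range(num_layers)])  # [L, D, D]
x = jnp.ones((dim,))

def layer(carry, w):
  # One toy "decoder layer": a matrix multiply.
  return w @ carry, None

# scan_layers=True style: a single lax.scan over the stacked weights.
y_scanned, _ = jax.lax.scan(layer, x, stacked_w)

# scan_layers=False style: an explicit Python loop over individual layers.
y_loop = x
for w in stacked_w:
  y_loop = w @ y_loop

print(jnp.allclose(y_scanned, y_loop))  # True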

0 commit comments