Merge pull request #3670 from AI-Hypercomputer:parambole/502806272

Google-ML-Automation · Google-ML-Automation · commit 21c4433e7be8 · 2026-04-16T13:01:08.000-07:00
PiperOrigin-RevId: 900869830
diff --git a/src/maxtext/layers/multi_token_prediction.py b/src/maxtext/layers/multi_token_prediction.py
@@ -79,7 +79,6 @@ def __init__(
     self.layer_number = layer_number
     self.transformer_layer_module = transformer_layer_module
     self.rngs = rngs
-    k = layer_number
     cfg = self.config
 
     self.embedding_norm = RMSNorm(
@@ -112,7 +111,6 @@ def __init__(
         config=cfg,
         mesh=mesh,
         model_mode=MODEL_MODE_TRAIN,
-        name=f"mtp_{k}_transformer_layer",
         rngs=rngs,
     )
 
diff --git a/src/maxtext/layers/nnx_wrappers.py b/src/maxtext/layers/nnx_wrappers.py
@@ -627,5 +627,6 @@ def __init_subclass__(cls, **kwargs):
   ToLinenPartial.__qualname__ = class_name
 
   ToLinenPartial.__init__ = __init__
+  ToLinenPartial.module_class = base_nnx_class
 
   return ToLinenPartial
diff --git a/src/maxtext/models/models.py b/src/maxtext/models/models.py
@@ -93,14 +93,17 @@ def setup(self):
     # If MTP is enabled via config, set up the MTP block.
     if self.config.mtp_num_layers > 0:
       # Get the list of layer blueprints for the current model.
-      layer_types = self.decoder.get_decoder_layers()
       # For MTP, we use the DecoderLayer blueprint to ensure architectural consistency.
       # By convention, this is the last layer in the list.
-      mtp_layer = layer_types[-1]
+      layer_types = self.decoder.get_decoder_layers()
+      mtp_layer_linen = layer_types[-1]
+      # UNWRAP: The MTP block is pure NNX. If the decoder returned a Linen wrapper, use its
+      # `module_class` (the native NNX class) to preserve parameter tracing/scoping.
+      mtp_layer_nnx = getattr(mtp_layer_linen, "module_class", mtp_layer_linen)
       self.mtp_block = multi_token_prediction_block_as_linen(
           config=self.config,
           mesh=self.mesh,
-          transformer_layer_module=mtp_layer,
+          transformer_layer_module=mtp_layer_nnx,
           decoder=self.decoder,
           rngs=self.make_rng("mtp_block"),
       )