Skip to content

Commit 5725403

Browse files
Migrate Decoder (Gemma3/Deepseek/Llama4) and utils to NNX
1 parent 56334f6 commit 5725403

8 files changed

Lines changed: 421 additions & 80 deletions

File tree

src/maxtext/checkpoint_conversion/to_maxtext.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -384,14 +384,22 @@ def _build_single_axis_stacked_tensor(
384384
The final, assembled NumPy array for the MaxText parameter.
385385
"""
386386
tensors_to_stack = []
387+
# Heuristic to determine if we are stacking layers or experts.
388+
# If the number of items to stack equals the number of layers, it's a standard
389+
# scanned layer, and we use the configured param_scan_axis. Otherwise, it's
390+
# an unscanned MoE layer, and we stack along the expert axis (0).
391+
"""
392+
axis_to_stack = config.param_scan_axis if len(hf_source_keys) == config.base_num_decoder_layers else 0
393+
"""
387394

388-
if config.scan_layers:
389-
# If it's a standard scanned layer, we use the configured param_scan_axis.
390-
axis_to_stack = config.param_scan_axis
395+
# Workaround to load the HF model due to mismatched tensor ordering
396+
if len(hf_source_keys) == config.base_num_decoder_layers:
397+
if getattr(config, "enable_nnx", False):
398+
axis_to_stack = 0
399+
else:
400+
axis_to_stack = config.param_scan_axis
391401
else:
392-
# Otherwise, it's an unscanned MoE layer, and we stack along the expert axis (0).
393402
axis_to_stack = 0
394-
395403
# The hook function needs the shape of an individual slice, not the full stacked tensor.
396404
# We calculate it by removing the stacking dimension from the final target shape.
397405
mt_slice_shape_list = list(target_shape)

src/maxtext/configs/base.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -706,7 +706,7 @@ autoregressive_decode_assert: ""
706706

707707
# For nsys profiler, pass the training command to nsys command
708708
# e.g. nsys profile -s none --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop {training command}
709-
profiler: "" # Supported profiler: '', xplane, nsys
709+
profiler: "xplane" # Supported profiler: '', xplane, nsys
710710
# If set to true, upload all profiler results from all hosts. Otherwise, only upload the profiler result from the first host.
711711
upload_all_profiler_results: False
712712
# Skip first n steps for profiling, to omit things like compilation and to give
@@ -1060,8 +1060,8 @@ position_id_per_seconds: 25
10601060
subslice_shape: ""
10611061

10621062
# NNX
1063-
enable_nnx: false
1064-
pure_nnx_decoder: false
1063+
enable_nnx: True
1064+
pure_nnx_decoder: True
10651065

10661066
################################## Qwen3-Next Specific Configs ##################################
10671067
# Kernel size for the 1D convolution in the Gated Delta Net

src/maxtext/layers/multi_token_prediction.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,6 @@ def __init__(
136136
model_mode=MODEL_MODE_TRAIN,
137137
)
138138

139-
140139
@property
141140
def embedding_norm(self):
142141
return getattr(self, f"mtp_{self.layer_number}_embedding_norm")
@@ -169,7 +168,6 @@ def transformer_layer(self):
169168
def transformer_layer(self, module):
170169
setattr(self, f"mtp_{self.layer_number}_transformer_layer", module)
171170

172-
173171
def __call__(
174172
self,
175173
prev_hidden_state: jnp.ndarray,

0 commit comments

Comments (0)