NNX: fix checkpointing in the training loop

ecnal-cienet · ecnal-cienet · commit bd6f2800a15b · 2026-03-19T19:39:44.000Z
- Convert nnx.State to pure dict for checkpoint saving
- Restore pure dict back to nnx.State after loading
diff --git a/src/maxtext/common/checkpointing.py b/src/maxtext/common/checkpointing.py
@@ -590,8 +590,13 @@ def map_to_pspec(data):
         )
         ocp.type_handlers.register_type_handler(jax.Array, array_handler, override=True)
 
-      restore_args = jax.tree_util.tree_map(map_to_pspec, abstract_unboxed_pre_state)
-      checkpoint_args = ocp.args.PyTreeRestore(item=abstract_unboxed_pre_state, restore_args=restore_args)
+      # Convert nnx.State to pure dict to match how checkpoints are saved for NNX
+      restore_target = abstract_unboxed_pre_state
+      if isinstance(abstract_unboxed_pre_state, nnx.State):
+        restore_target = abstract_unboxed_pre_state.to_pure_dict()
+
+      restore_args = jax.tree_util.tree_map(map_to_pspec, restore_target)
+      checkpoint_args = ocp.args.PyTreeRestore(item=restore_target, restore_args=restore_args)
 
       match (checkpoint_manager, dataset_type, data_iterator):
         # Case 1: Matches if 'checkpoint_manager' is an instance of either EmergencyCheckpointManager
@@ -717,15 +722,35 @@ def save_params_to_path(checkpoint_dir, params, use_ocdbt=True, use_zarr3=True):
   print(f"Quantized params checkpoint saved at: {checkpoint_dir}")
 
 
-def maybe_save_checkpoint(checkpoint_manager, state, config, data_iterator, step=None):
-  """Save checkpoint if checkpointing is enabled."""
+def maybe_save_checkpoint(checkpoint_manager, state, config, data_iterator, step=None, force=False):
+  """Save checkpoint if checkpointing is enabled.
+
+  Args:
+    checkpoint_manager: The checkpoint manager.
+    state: The training state to save.
+    config: The config object.
+    data_iterator: The data iterator.
+    step: The step number. If None, the last completed step (stored step - 1) is read from state (NNX or Linen).
+    force: If True, force save the checkpoint regardless of checkpoint_period.
+  """
   if checkpoint_manager is None:
     return
 
   # Determine the effective step for saving a checkpoint.
   # If 'step' is not provided, this call is for a potential final checkpoint
   # and use the last completed step from the state.
-  actual_step = (int(state.step) - 1) if step is None else int(step)
+  if step is not None:
+    actual_step = int(step)
+  else:
+    if config.pure_nnx:
+      actual_step = int(state.optimizer.step) - 1
+    else:
+      # Linen TrainState has .step attribute
+      actual_step = int(state.step) - 1
+
+  if config.pure_nnx:
+    # Convert nnx.State to dict.
+    state = state.to_pure_dict()
 
   # Determine if a checkpoint save should be forced, overriding the usual `config.checkpoint_period` logic.
   # This occurs if this function was called:
diff --git a/src/maxtext/utils/maxtext_utils.py b/src/maxtext/utils/maxtext_utils.py
@@ -1168,7 +1168,10 @@ def setup_initial_state(
         # The update of data_iterator state happens in place, no need to assign explicitly
         state = restored["items"]
 
-      # TODO: For NNX, convert the pure dict to nnx.State.
+      # For NNX, convert the pure dict to nnx.State using the abstract state as template
+      if config.pure_nnx:
+        nnx.replace_by_pure_dict(unboxed_abstract_state, state)
+        state = unboxed_abstract_state
     else:
       init_state_partial = init_state_fn
       init_state_partial.__name__ = "initialize_state"