Fix DiLoCo-related unit tests

hsuan-lun-chiang · ecnal-cienet · commit 5c07324306e7 · 2026-05-21T18:12:03.000Z
diff --git a/src/maxtext/trainers/diloco/diloco.py b/src/maxtext/trainers/diloco/diloco.py
@@ -35,6 +35,7 @@
 import optax
 
 from maxtext.configs import pyconfig
+from maxtext.layers.train_state_nnx import TrainStateNNX
 
 Batch = Any
 Params = PyTree
@@ -157,8 +158,10 @@ def add_diloco_dim(x):
   # For NNX, model params (Param variables only) live under abstract_state.model;
   # for Linen under abstract_state.params.
   if config.pure_nnx:
-    model_params = abstract_state.model.filter(nnx.Param)
-    model_params_sharding = state_mesh_shardings.model.filter(nnx.Param)
+    _, model_params, _ = nnx.split(abstract_state.model, nnx.Param, ...)
+    model_params = model_params.to_pure_dict()
+    _, model_params_sharding, _ = nnx.split(state_mesh_shardings.model, nnx.Param, ...)
+    model_params_sharding = model_params_sharding.to_pure_dict()
   else:
     model_params = abstract_state.params
     model_params_sharding = state_mesh_shardings.params
@@ -262,9 +265,11 @@ def synchronize(state):
     # state (since last synchronization).
     broadcast_outer_params = drjax.broadcast(state.params, mesh=mesh)
     # For NNX, model Param vars live under inner_state.model; for Linen under inner_state.params.
-    inner_model_params = (
-        nnx.filter_state(state.inner_state.model, nnx.Param) if config.pure_nnx else state.inner_state.params
-    )
+    if config.pure_nnx:
+      _, inner_model_params, _ = nnx.split(state.inner_state.model, nnx.Param, ...)
+      inner_model_params = inner_model_params.to_pure_dict()
+    else:
+      inner_model_params = state.inner_state.params
     model_delta = jax.tree.map(lambda x, y: y - x, inner_model_params, broadcast_outer_params)
     # Treat the average delta as the outer optimizer's gradient and apply to
     # the global (outer) model params.
@@ -277,15 +282,25 @@ def synchronize(state):
     if config.pure_nnx:
       # For NNX: merge new Param vars back with the non-Param model vars (e.g. RNG state).
       def replace_nnx_model_params(s, new_params):
-        non_param_model = nnx.filter_state(s.model, nnx.Not(nnx.Param))
-        new_model = nnx.merge_state(non_param_model, new_params)
-        # Assign via __setitem__ so nested States are stored as plain dicts (matching
-        # nnx.state()'s pytree structure). The dict-literal constructor keeps them as
-        # State objects, which makes jax.lax.cond see mismatched pytree structures.
-        result = type(s)({})
-        result["model"] = new_model
-        result["optimizer"] = s["optimizer"]
-        return result
+        # `s` is either a mapping-like state (has .keys) or an object exposing .model/.optimizer.
+        is_mapping = hasattr(s, "keys")
+        s_model = s["model"] if is_mapping else s.model
+        # Swap in new_params for the Param state; keep s_model's graphdef and non-Param state.
+        graphdef, _, non_param_state = nnx.split(s_model, nnx.Param, ...)
+        new_model = nnx.merge(graphdef, new_params, non_param_state)
+        # Convert the merged model back to the container kind that s_model had.
+        if isinstance(s_model, nnx.State):
+          new_model = nnx.state(new_model)
+        elif isinstance(s_model, dict):
+          new_model = nnx.to_pure_dict(new_model)
+        if not is_mapping:
+          return TrainStateNNX(new_model, s.optimizer)
+        # Overwrite the first len(new_model_leaves) leaves of s and reuse the rest unchanged.
+        # NOTE(review): relies on model leaves preceding all others in s's flatten order -- confirm.
+        leaves, treedef = jax.tree_util.tree_flatten(s)
+        new_model_leaves, _ = jax.tree_util.tree_flatten(new_model)
+        new_leaves = new_model_leaves + leaves[len(new_model_leaves) :]
+        return jax.tree_util.tree_unflatten(treedef, new_leaves)
 
       new_inner_state = drjax.map_fn(
           lambda s: replace_nnx_model_params(s, new_outer_params),
diff --git a/src/maxtext/utils/train_utils.py b/src/maxtext/utils/train_utils.py
@@ -295,12 +295,13 @@ def create_train_state_fn():
         state, outer_opt_state_sharding = diloco.build_diloco_state(config, lambda: state, mesh=mesh)
 
         # create state_mesh_shardings for the DilocoState
+        step_mesh = state_mesh_shardings.optimizer.step.mesh if config.pure_nnx else state_mesh_shardings.step.mesh
         inner_state_shardings = diloco.add_diloco_to_sharding(state_mesh_shardings)
         state_mesh_shardings = diloco.DiLoCoTrainState(
             inner_state_shardings,
-            state_mesh_shardings.params,
+            state_mesh_shardings_params,
             outer_opt_state_sharding,
-            jax.sharding.NamedSharding(mesh=state_mesh_shardings.step.mesh, spec=jax.sharding.PartitionSpec()),
+            jax.sharding.NamedSharding(mesh=step_mesh, spec=jax.sharding.PartitionSpec()),
         )
 
     # TODO(aireenmei, hengtaoguo): support sharding in vit for multimodal
diff --git a/tests/integration/diloco_test.py b/tests/integration/diloco_test.py
@@ -30,6 +30,7 @@
 import pytest
 
 from maxtext.configs.pyconfig import initialize_pydantic
+from maxtext.layers.train_state_nnx import TrainStateNNX
 from maxtext.trainers.pre_train.train_compile import main as train_compile_main
 from maxtext.trainers.diloco import diloco
 from tests.utils.test_helpers import get_test_config_path
@@ -86,51 +87,49 @@ def test_diloco_training_simulation_with_mesh(self):
       graphdef, params = nnx.split(model)
 
       if test_config.pure_nnx:
-        from maxtext.layers.train_state_nnx import TrainStateNNX
         optimizer = nnx.Optimizer(model, tx, wrt=nnx.Param)
-        
-        # We must split the state so drjax can broadcast it (drjax needs pure dicts/pytrees)
-        _, initial_test_state_dict = nnx.split(TrainStateNNX(model, optimizer))
-        # But wait, diloco_test_state expects a TrainStateNNX instance if pure_nnx is True!
+        # diloco_test_state expects a TrainStateNNX instance when pure_nnx is True.
         initial_test_state = TrainStateNNX(model, optimizer)
 
         # For NNX, train_step needs to take the TrainStateNNX and mutate it
+
         def _test_train_step(state, batch, prng_key: diloco.PRNGKey):
           del prng_key
+
           def loss_fn(model, batch):
             inputs, labels = batch
             logits = jax.vmap(model)(inputs)
             residual = logits - labels
             return jnp.mean(jnp.square(residual))
-          
+
           loss, grads = nnx.value_and_grad(loss_fn)(state.model, batch)
           state.optimizer.update(state.model, grads)
-          state.optimizer.step.value += 1
           return state, loss
 
       else:
+
         def nnx_apply_fn(params, inputs):
           model_replica = nnx.merge(graphdef, params)
           return model_replica(inputs)
-  
+
         # 2. Vmap this new wrapper function
         vmapped_apply = jax.vmap(nnx_apply_fn, in_axes=(None, 0))
-  
+
         def _test_train_step(state: train_state.TrainState, batch, prng_key: diloco.PRNGKey):
           """A simple MSE loss train step to enable numerics testing."""
           del prng_key
-  
+
           def loss_fn(params, batch):
             inputs, labels = batch
             logits = vmapped_apply(params, inputs)
             residual = logits - labels
             sq_residual = jnp.square(residual)
             msq_residual = jnp.mean(sq_residual)
             return msq_residual
-  
+
           loss, grad = jax.value_and_grad(loss_fn)(state.params, batch)
           return state.apply_gradients(grads=grad), loss
-  
+
         initial_test_state = train_state.TrainState.create(
             apply_fn=vmapped_apply,
             params=params,
@@ -141,12 +140,11 @@ def loss_fn(params, batch):
       chex.assert_equal(diloco_test_state.step, 0)
       if test_config.pure_nnx:
         _, params_pure, _ = nnx.split(initial_test_state.model, nnx.Param, ...)
-        
+
         # diloco_test_state.params might contain nnx.Variables instead of pure arrays.
         # We need to unwrap them if they do.
         diloco_params_pure = jax.tree_util.tree_map(
-            lambda x: x.value if hasattr(x, 'value') else x,
-            diloco_test_state.params
+            lambda x: x.value if hasattr(x, "value") else x, diloco_test_state.params
         )
         chex.assert_trees_all_equal(diloco_params_pure, params_pure.to_pure_dict())
       else:
@@ -200,12 +198,11 @@ def loss_fn(params, batch):
       # Assert no updates to the global model yet (no synchronization)
       if test_config.pure_nnx:
         _, params_pure, _ = nnx.split(initial_test_state.model, nnx.Param, ...)
-        
+
         # diloco_test_state.params might contain nnx.Variables instead of pure arrays.
         # We need to unwrap them if they do.
         diloco_params_pure = jax.tree_util.tree_map(
-            lambda x: x.value if hasattr(x, 'value') else x,
-            diloco_test_state.params
+            lambda x: x.value if hasattr(x, "value") else x, diloco_test_state.params
         )
         chex.assert_trees_all_equal(diloco_params_pure, params_pure.to_pure_dict())
       else:
@@ -241,12 +238,11 @@ def loss_fn(params, batch):
       # Assert no updates to the global model yet (no synchronization)
       if test_config.pure_nnx:
         _, params_pure, _ = nnx.split(initial_test_state.model, nnx.Param, ...)
-        
+
         # diloco_test_state.params might contain nnx.Variables instead of pure arrays.
         # We need to unwrap them if they do.
         diloco_params_pure = jax.tree_util.tree_map(
-            lambda x: x.value if hasattr(x, 'value') else x,
-            diloco_test_state.params
+            lambda x: x.value if hasattr(x, "value") else x, diloco_test_state.params
         )
         chex.assert_trees_all_equal(diloco_params_pure, params_pure.to_pure_dict())
       else:
@@ -285,14 +281,31 @@ def loss_fn(params, batch):
       chex.assert_trees_all_close(loss, 0.4481)
       # Assert that inner and outer parameters are all equal now that
       # synchronization has happened.
-      chex.assert_trees_all_equal(
-          diloco_test_state.params,
-          jax.tree.map(lambda arr: arr[0, ...], diloco_test_state.inner_state.params),
-      )
-      chex.assert_trees_all_equal(
-          diloco_test_state.params,
-          jax.tree.map(lambda arr: arr[1, ...], diloco_test_state.inner_state.params),
-      )
+      if test_config.pure_nnx:
+        _, inner_params, _ = nnx.split(diloco_test_state.inner_state.model, nnx.Param, ...)
+        inner_params_pure = jax.tree_util.tree_map(
+            lambda x: x.value if hasattr(x, "value") else x, inner_params.to_pure_dict()
+        )
+        diloco_params_pure_3 = jax.tree_util.tree_map(
+            lambda x: x.value if hasattr(x, "value") else x, diloco_test_state.params
+        )
+        chex.assert_trees_all_equal(
+            diloco_params_pure_3,
+            jax.tree.map(lambda arr: arr[0, ...], inner_params_pure),
+        )
+        chex.assert_trees_all_equal(
+            diloco_params_pure_3,
+            jax.tree.map(lambda arr: arr[1, ...], inner_params_pure),
+        )
+      else:
+        chex.assert_trees_all_equal(
+            diloco_test_state.params,
+            jax.tree.map(lambda arr: arr[0, ...], diloco_test_state.inner_state.params),
+        )
+        chex.assert_trees_all_equal(
+            diloco_test_state.params,
+            jax.tree.map(lambda arr: arr[1, ...], diloco_test_state.inner_state.params),
+        )
 
       # Run the fourth step (no synchronization).
       # Replica 0:
diff --git a/tests/unit/max_utils_test.py b/tests/unit/max_utils_test.py
@@ -206,14 +206,14 @@ def test_unscan_train_state_params(self):
 
     # Check that the original state is unchanged.
 
-    if hasattr(state, 'model'):
+    if hasattr(state, "model"):
       _, params_state, _ = nnx.split(state.model, nnx.Param, ...)
       state_decoder_params = params_state.to_pure_dict()["decoder"]
       self.assertIn("layers", state_decoder_params)
     else:
       self.assertIn("layers", state.params["params"]["decoder"])
 
-    if hasattr(state, 'model'):
+    if hasattr(state, "model"):
       self.assertNotIn("layers_0", state_decoder_params)
     else:
       self.assertNotIn("layers_0", state.params["params"]["decoder"])