fix(nnx): support Zero-1 input shardings on NNX flat state

ecnal-cienet · ecnal-cienet · commit 4f4c0b0cf955 · 2026-06-01T20:42:05.000Z
Under shard_optimizer_over_data (Zero-1), train_compile builds the AOT train-step input shardings by calling state_mesh_shardings.replace(params=params_shardings). .replace is a TrainState (flax.struct) method; with PR#11's NNX defaults, state_mesh_shardings is a flat nnx.State, so the call fails with 'No attribute replace in State'. Add sharding.build_zero1_input_state_mesh_shardings, which overlays the Param leaves of params_shardings onto state_mesh_shardings.model on the NNX path and keeps the existing .replace behavior for Linen, and route both train_compile call sites through it. Fixes test_zero1_optimizer_sharding.
diff --git a/src/maxtext/trainers/pre_train/train_compile.py b/src/maxtext/trainers/pre_train/train_compile.py
@@ -61,12 +61,9 @@
 def validate_config(config):
   """Validates the config is is setup correctly to compile, returning a useful error message if not."""
   assert config.compile_topology != "", (
-      "You must pass your desired target hardware in compile_topology, e.g."
-      " compile_topology=v5e-256"
+      "You must pass your desired target hardware in compile_topology, e.g." " compile_topology=v5e-256"
   )
-  assert (
-      config.compile_topology_num_slices > 0
-  ), "You must set compile_topology_num_slices to a positive integer"
+  assert config.compile_topology_num_slices > 0, "You must set compile_topology_num_slices to a positive integer"
 
 
 def get_topology_mesh(config):
@@ -78,18 +75,12 @@ def get_topology_mesh(config):
         num_slices=config.compile_topology_num_slices,
     ).devices
   else:
-    target_hardware = accelerator_to_spec_map.get_system_characteristics(
-        config.compile_topology
-    )
+    target_hardware = accelerator_to_spec_map.get_system_characteristics(config.compile_topology)
     if target_hardware.platform == "gpu":
       # Disable sharded autotuning. This is an optimization to distribute
       # autotuning across the fleet, but can cause hangs with AoT compilation.
-      os.environ["XLA_FLAGS"] = (
-          os.environ.get("XLA_FLAGS", "") + " --xla_gpu_shard_autotuning=false"
-      )
-      jax.config.update(
-          "mock_num_gpu_processes", config.compile_topology_num_slices
-      )
+      os.environ["XLA_FLAGS"] = os.environ.get("XLA_FLAGS", "") + " --xla_gpu_shard_autotuning=false"
+      jax.config.update("mock_num_gpu_processes", config.compile_topology_num_slices)
       topology_devices = jax.devices()
     else:
       topology_devices = get_topology_desc(
@@ -104,14 +95,8 @@ def get_topology_mesh(config):
       "jax_remove_size_one_mesh_axis_from_type",
       config.remove_size_one_mesh_axis_from_type,
   )
-  topology_device_mesh = maxtext_utils.create_device_mesh(
-      config, topology_devices
-  )
-  mesh_axis_type = (
-      AxisType.Explicit
-      if config.shard_mode == ShardMode.EXPLICIT
-      else AxisType.Auto
-  )
+  topology_device_mesh = maxtext_utils.create_device_mesh(config, topology_devices)
+  mesh_axis_type = AxisType.Explicit if config.shard_mode == ShardMode.EXPLICIT else AxisType.Auto
   topology_mesh = Mesh(
       topology_device_mesh,
       config.mesh_axes,
@@ -129,9 +114,7 @@ def _collect_nnx_activation_shardings(create_model_fn, config, mesh):
   input_shape = (config.micro_batch_size_to_train_on, config.max_target_length)
   abstract_input = jax.ShapeDtypeStruct(input_shape, jnp.int32)
 
-  def _nnx_forward(
-      decoder_input_tokens, decoder_positions, decoder_segment_ids
-  ):
+  def _nnx_forward(decoder_input_tokens, decoder_positions, decoder_segment_ids):
     model_instance = create_model_fn()
     return model_instance(
         decoder_input_tokens=decoder_input_tokens,
@@ -140,9 +123,7 @@ def _nnx_forward(
         enable_dropout=False,
     )
 
-  with jax.set_mesh(mesh), nn_partitioning.axis_rules(
-      config.logical_axis_rules
-  ):
+  with jax.set_mesh(mesh), nn_partitioning.axis_rules(config.logical_axis_rules):
     jax.eval_shape(_nnx_forward, abstract_input, abstract_input, abstract_input)
 
 
@@ -151,13 +132,9 @@ def get_shaped_inputs(topology_mesh, config):
   # Construct the model and optimizer to get shaped versions of the state
   quant = quantizations.configure_quantization(config)
   if config.pure_nnx:
-    _create_model_partial, model = (
-        model_creation_utils.create_nnx_abstract_model(config, topology_mesh)
-    )
+    _create_model_partial, model = model_creation_utils.create_nnx_abstract_model(config, topology_mesh)
   else:
-    model = Transformer(
-        config, topology_mesh, quant=quant, model_mode=MODEL_MODE_TRAIN
-    )
+    model = Transformer(config, topology_mesh, quant=quant, model_mode=MODEL_MODE_TRAIN)
   # The learning_rate_schedule is baked into the compiled object.
   learning_rate_schedule = maxtext_utils.create_learning_rate_schedule(config)
   # pass in model for muon
@@ -176,20 +153,14 @@ def create_train_state_fn():
 
     init_state_fn = create_train_state_fn
   else:
-    init_state_fn = functools.partial(
-        maxtext_utils.init_initial_state, model, tx, config, True, example_rng
-    )
+    init_state_fn = functools.partial(maxtext_utils.init_initial_state, model, tx, config, True, example_rng)
 
   # Shaped state
-  abstract_state, _, state_mesh_shardings = maxtext_utils.get_abstract_state(
-      config, topology_mesh, init_state_fn, True
-  )
+  abstract_state, _, state_mesh_shardings = maxtext_utils.get_abstract_state(config, topology_mesh, init_state_fn, True)
 
   if config.pure_nnx:
     # NNX doesn't use Linen logical annotations; derive PartitionSpecs from the physical shardings.
-    logical_annotations = maxtext_utils_nnx.get_partition_spec_nnx(
-        state_mesh_shardings
-    )
+    logical_annotations = maxtext_utils_nnx.get_partition_spec_nnx(state_mesh_shardings)
     # For NNX, get_functional_train_with_signature expects the graphdef (static structure),
     # not the raw model — mirroring how the training loop does nnx.split(train_state).
     with nn_partitioning.axis_rules(config.logical_axis_rules):
@@ -198,9 +169,7 @@ def create_train_state_fn():
     model = graphdef
   else:
     # unsharded logical annotations
-    logical_annotations = maxtext_utils.get_logical_annotations(
-        config, topology_mesh, init_state_fn
-    )
+    logical_annotations = maxtext_utils.get_logical_annotations(config, topology_mesh, init_state_fn)
 
   # Shaped batch
   shaped_batch = maxtext_utils.get_shaped_batch(config)
@@ -217,9 +186,7 @@ def create_train_state_fn():
   # Collect NNX activation shardings via an abstract forward pass (must run
   # after get_abstract_state, which only traces __init__).
   if config.debug_sharding and config.pure_nnx:
-    _collect_nnx_activation_shardings(
-        _create_model_partial, config, topology_mesh
-    )
+    _collect_nnx_activation_shardings(_create_model_partial, config, topology_mesh)
 
   return (
       shaped_train_args,
@@ -256,9 +223,7 @@ def jit_and_compile(
     maxtext_utils.maybe_dump_jaxpr(config, jitted, func_input_args)
     lowered = jitted.lower(*func_input_args, **func_input_kwargs)
   # Import libtpu flags as compiler options. Defaults to empty dict if string is empty.
-  compiler_options = max_utils.parse_libtpu_flags_to_dict(
-      config.compile_xla_flags
-  )
+  compiler_options = max_utils.parse_libtpu_flags_to_dict(config.compile_xla_flags)
   compiled = lowered.compile(compiler_options=compiler_options)
   return compiled
 
@@ -293,20 +258,11 @@ def is_oom(argv: Sequence[str]) -> bool:
   ) = get_shaped_inputs(topology_mesh, config)
 
   # Update params_shardings when shard_optimizer_over_data is enabled (Zero-1)
-  params_shardings, state_mesh_shardings = (
-      sharding.maybe_update_params_sharding_with_opt(
-          config, state_mesh_shardings
-      )
-  )
+  params_shardings, state_mesh_shardings = sharding.maybe_update_params_sharding_with_opt(config, state_mesh_shardings)
 
-  # When ZeRO-1 is enabled, we need to use the original params_shardings for input shardings
-  # but keep the updated state_mesh_shardings for the optimizer state
-  if config.shard_optimizer_over_data:
-    input_state_mesh_shardings = state_mesh_shardings.replace(
-        params=params_shardings
-    )
-  else:
-    input_state_mesh_shardings = state_mesh_shardings
+  input_state_mesh_shardings = sharding.build_zero1_input_state_mesh_shardings(
+      config, state_mesh_shardings, params_shardings
+  )
 
   # Get data sharding
   data_sharding = sharding.get_input_data_sharding(config, topology_mesh)
@@ -355,8 +311,7 @@ def is_oom(argv: Sequence[str]) -> bool:
 def main(argv: Sequence[str]) -> None:
   jax.config.update("jax_default_prng_impl", "unsafe_rbg")
   os.environ["LIBTPU_INIT_ARGS"] = (
-      os.environ.get("LIBTPU_INIT_ARGS", "")
-      + " --xla_tpu_spmd_rng_bit_generator_unsafe=true"
+      os.environ.get("LIBTPU_INIT_ARGS", "") + " --xla_tpu_spmd_rng_bit_generator_unsafe=true"
   )
   print("Starting train_compile.py...", flush=True)
 
@@ -381,41 +336,26 @@ def main(argv: Sequence[str]) -> None:
   ) = get_shaped_inputs(topology_mesh, config)
 
   # Update params_shardings when shard_optimizer_over_data is enabled (Zero-1)
-  params_shardings, state_mesh_shardings = (
-      sharding.maybe_update_params_sharding_with_opt(
-          config, state_mesh_shardings
-      )
-  )
+  params_shardings, state_mesh_shardings = sharding.maybe_update_params_sharding_with_opt(config, state_mesh_shardings)
 
-  # When ZeRO-1 is enabled, we need to use the original params_shardings for input shardings
-  # but keep the updated state_mesh_shardings for the optimizer state
-  if config.shard_optimizer_over_data:
-    input_state_mesh_shardings = state_mesh_shardings.replace(
-        params=params_shardings
-    )
-  else:
-    input_state_mesh_shardings = state_mesh_shardings
+  input_state_mesh_shardings = sharding.build_zero1_input_state_mesh_shardings(
+      config, state_mesh_shardings, params_shardings
+  )
 
   # Get data sharding
   data_sharding = sharding.get_input_data_sharding(config, topology_mesh)
   if config.enable_diloco:
     # Build abstract DiLoCo state and shardings for AOT compilation
     abstract_state = shaped_train_args[0]
-    diloco_state, state_mesh_shardings, inner_state_shardings = (
-        diloco.build_abstract_diloco_state(
-            config, abstract_state, state_mesh_shardings, topology_mesh
-        )
+    diloco_state, state_mesh_shardings, inner_state_shardings = diloco.build_abstract_diloco_state(
+        config, abstract_state, state_mesh_shardings, topology_mesh
     )
     # For NNX, shaped_train_args has 2 elements (state, batch) — no rng; pass None for prng.
-    shaped_rng_arg = (
-        shaped_train_args[2] if len(shaped_train_args) > 2 else None
-    )
+    shaped_rng_arg = shaped_train_args[2] if len(shaped_train_args) > 2 else None
     shaped_train_args = (diloco_state, shaped_train_args[1], shaped_rng_arg)
 
     # Wrap train_step with diloco
-    train_step_partial = functools.partial(
-        train.train_step, model, config, inner_state_shardings, params_shardings
-    )
+    train_step_partial = functools.partial(train.train_step, model, config, inner_state_shardings, params_shardings)
     train_step_fn = diloco.build_diloco_train_step(config, train_step_partial)
 
     # For DiLoCo, the train_step_fn is already fully wrapped and takes (state, batch, prng)
@@ -480,10 +420,7 @@ def main(argv: Sequence[str]) -> None:
   if config.compiled_trainstep_file != "":
     print("Saving compiled object...")
     save_compiled(compiled, config.compiled_trainstep_file)
-    print(
-        "Successfully saved compiled object as"
-        f" {config.compiled_trainstep_file}"
-    )
+    print("Successfully saved compiled object as" f" {config.compiled_trainstep_file}")
   print("Finished train_compile.py successfully!", flush=True)
   print(f"Cost analysis: {compiled.cost_analysis()}")
   print(f"Memory analysis: {compiled.memory_analysis()}")
diff --git a/src/maxtext/utils/sharding.py b/src/maxtext/utils/sharding.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 # pylint: disable=line-too-long, disable=bare-except, consider-using-generator
-""" Utils that are only interesting to MaxText and sharding related. """
+"""Utils that are only interesting to MaxText and sharding related."""
 
 from flax import linen as nn, nnx
 
@@ -620,6 +620,34 @@ def _update_model_var(path, var):
   return prev_params_shardings, updated_state
 
 
+def build_zero1_input_state_mesh_shardings(config, state_mesh_shardings, params_shardings):
+  """Build the train-step input shardings under shard_optimizer_over_data (Zero-1).
+
+  Model params on input use the original pre-Zero-1 sharding (params_shardings), while the rest
+  of the state — including the optimizer state — keeps the Zero-1 layout from state_mesh_shardings,
+  so the optimizer state input matches its output. When shard_optimizer_over_data is False,
+  state_mesh_shardings passes through unchanged.
+  """
+  if not config.shard_optimizer_over_data:
+    return state_mesh_shardings
+  if not config.pure_nnx:
+    return state_mesh_shardings.replace(params=params_shardings)
+  # nnx.State has no .replace. tree_map below shallow-copies state_mesh_shardings preserving
+  # nested container types; then we walk params_shardings and overwrite the matching keys under
+  # input_state.model (the NNX home of model params).
+  input_state = jax.tree_util.tree_map(lambda x: x, state_mesh_shardings, is_leaf=lambda x: isinstance(x, nnx.Variable))
+
+  def _overlay(model_node, params_node):
+    for k, pv in params_node.items():
+      if isinstance(pv, nnx.Variable):
+        model_node[k] = pv
+      elif hasattr(pv, "items"):
+        _overlay(model_node[k], pv)
+
+  _overlay(input_state.model, params_shardings)
+  return input_state
+
+
 def logical_axis_rules_pp_act_as_dp(logical_rules):
   """Add stage as a physical axes before data for each rule, so stage acts just like data instead of PP.
   This is used when we want to pipeline only a subset of layers, and leave the rest like DP.