Skip to content

Commit bac289f

Browse files
committed
Set NNX flags to true by default
1 parent f27e4f9 commit bac289f

71 files changed

Lines changed: 48619 additions & 13525 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

src/maxtext/configs/base.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1125,9 +1125,9 @@ position_id_per_seconds: 25
11251125
subslice_shape: ""
11261126

11271127
# NNX
1128-
enable_nnx: False
1129-
pure_nnx_decoder: False
1130-
pure_nnx: False
1128+
enable_nnx: True
1129+
pure_nnx_decoder: True
1130+
pure_nnx: True
11311131

11321132
################################## Qwen3-Next Specific Configs ##################################
11331133
# Kernel size for the 1D convolution in the Gated Delta Net

src/maxtext/configs/custom_mesh_and_rule/pipeline-large-moe.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,4 +72,32 @@ logical_axis_rules: [
7272
['exp_with_fsdp', 'fsdp'],
7373
['paged_kv_heads', ['tensor']],
7474
['engram_dim', ['tensor']],
75+
# Axes unsharded: sequence/context/tensor_transpose/autoregressive do not exist in this mesh
76+
['activation_attn_length_no_exp', []],
77+
['activation_length_no_exp', []],
78+
['activation_norm_length', []],
79+
['activation_q_length_no_exp', []],
80+
['prefill_activation_length', []],
81+
['prefill_activation_norm_length', []],
82+
['activation_kv_length', []],
83+
['decode_length', []],
84+
['embed_tensor_transpose', []],
85+
['q_lora_up_proj', []],
86+
['kv_lora_up_proj', []],
87+
['kv', []],
88+
['qkv', []],
89+
['kv_head_dim', []],
90+
['cache_batch_prefill', []],
91+
['cache_batch', []],
92+
['cache_heads_none', []],
93+
['cache_kv', []],
94+
['cache_sequence', []],
95+
['num_pages', []],
96+
['tokens_per_page', []],
97+
['paged_kv_head_dim_size', []],
98+
['dense_layers', []],
99+
['moe_layers', []],
100+
['num_activations', []],
101+
['mhc', []],
102+
['diloco', []],
75103
]

src/maxtext/configs/custom_mesh_and_rule/pure-fsdp.yml

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,57 @@ logical_axis_rules: [
3434
['q_lora', ['fsdp']],
3535
['kv_lora', ['fsdp']],
3636
['exp_with_fsdp', 'fsdp'],
37+
# All other axes are unsharded (tensor/sequence/expert axes do not exist in pure-fsdp)
38+
['activation_heads', []],
39+
['activation_kv_heads', []],
40+
['activation_length', []],
41+
['activation_attn_length', []],
42+
['activation_attn_length_no_exp', []],
43+
['activation_length_no_exp', []],
44+
['activation_norm_length', []],
45+
['activation_q_length', []],
46+
['activation_q_length_no_exp', []],
47+
['prefill_activation_length', []],
48+
['prefill_activation_norm_length', []],
49+
['activation_kv_length', []],
50+
['activation_attn_embed', []],
51+
['activation_embed', []],
52+
['activation_mlp', []],
53+
['activation_kv', []],
54+
['activation_kv_head_dim', []],
55+
['activation_vocab', []],
56+
['activation_stage', []],
57+
['activation_exp', []],
58+
['decode_length', []],
59+
['mlp', []],
60+
['mlp_no_fsdp', []],
61+
['vocab', []],
62+
['heads', []],
63+
['q_heads', []],
64+
['kv_heads', []],
65+
['embed_tensor_transpose', []],
66+
['q_lora_up_proj', []],
67+
['kv_lora_up_proj', []],
68+
['norm', []],
69+
['layers', []],
70+
['qkv', []],
71+
['kv', []],
72+
['kv_head_dim', []],
73+
['cache_batch_prefill', []],
74+
['cache_batch', []],
75+
['cache_heads_none', []],
76+
['cache_heads', []],
77+
['cache_kv', []],
78+
['cache_sequence', []],
79+
['exp', []],
80+
['paged_kv_heads', []],
81+
['num_pages', []],
82+
['tokens_per_page', []],
83+
['paged_kv_head_dim_size', []],
84+
['dense_layers', []],
85+
['moe_layers', []],
86+
['num_activations', []],
87+
['engram_dim', []],
88+
['mhc', []],
89+
['diloco', []],
3790
]

src/maxtext/layers/nnx_decoders.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,7 @@ def __init__(
311311

312312
num_moe = config.num_decoder_layers - config.first_num_dense_layers
313313

314-
self.moe_layer = self._create_scanned_layers(moe_cls, length=num_moe, rngs=rngs)
314+
self.moe_layers = self._create_scanned_layers(moe_cls, length=num_moe, rngs=rngs)
315315
elif self.is_gemma3:
316316
attention_pattern_length = len(gemma3.GEMMA3_ATTENTION_PATTERN)
317317
scan_length = config.num_decoder_layers // attention_pattern_length
@@ -346,7 +346,7 @@ def __init__(
346346
for i in range(config.first_num_dense_layers):
347347
self._create_and_register_layer(dense_cls, rngs, "dense_layer", i)
348348
for i in range(config.num_decoder_layers - config.first_num_dense_layers):
349-
self._create_and_register_layer(moe_cls, rngs, "moe_layer", i)
349+
self._create_and_register_layer(moe_cls, rngs, "moe_layers", i)
350350
else:
351351
layer_cls = decoder_block_classes[0]
352352

@@ -388,6 +388,8 @@ def _create_single_layer(self, decoder_layer_class, rngs, **kwargs):
388388

389389
def _create_scanned_layers(self, decoder_layer_class, length: int, rngs: nnx.Rngs, **layer_kwargs):
390390
"""Creates a VMapped stack of layers, forcing parameter init for Compact modules."""
391+
if length == 0:
392+
return nnx.List([])
391393

392394
def create_layer_fn(rng):
393395
layer = decoder_layer_class(
@@ -433,6 +435,8 @@ def pure_layer_fn(state_in, y_in):
433435

434436
def _apply_layers_sequentially(self, layers, x_in, *args, length: int, **kwargs):
435437
"""Runs the layer stack using nnx.scan."""
438+
if length == 0:
439+
return x_in, layers
436440
policy = self.get_remat_policy()
437441
prevent_cse = maxtext_utils.should_prevent_cse_in_remat(self.config)
438442
graphdef, params, state = nnx.split(
@@ -961,7 +965,7 @@ def __call__(
961965

962966
y = self._apply_interleaved_scanned_layers(
963967
y,
964-
self.moe_layer,
968+
self.moe_layers,
965969
0,
966970
(cfg.num_decoder_layers - cfg.first_num_dense_layers),
967971
[e - cfg.first_num_dense_layers for e in cfg.engram_layers],
@@ -978,7 +982,7 @@ def __call__(
978982
if cfg.use_batch_split_schedule:
979983
policy = self.get_remat_policy()
980984

981-
mock_params = self._build_linen_params(self.moe_layer)
985+
mock_params = self._build_linen_params(self.moe_layers)
982986

983987
y = deepseek_batchsplit.scan_batch_split_layers(
984988
y,
@@ -992,8 +996,8 @@ def __call__(
992996
policy=policy,
993997
)
994998
else:
995-
y, self.moe_layer = self._apply_layers_sequentially(
996-
self.moe_layer, y, *layer_args, length=num_moe, **layer_kwargs
999+
y, self.moe_layers = self._apply_layers_sequentially(
1000+
self.moe_layers, y, *layer_args, length=num_moe, **layer_kwargs
9971001
)
9981002
elif self.is_gemma3:
9991003
y = self._apply_gemma3_scanned_blocks(

src/maxtext/models/gpt_oss.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from maxtext.common.common_types import AttentionType, Config
2929
from maxtext.layers import attentions
3030
from maxtext.layers import initializers
31+
from maxtext.layers import linears
3132
from maxtext.layers import moe
3233
from maxtext.layers import nnx_wrappers
3334
from maxtext.layers import quantizations
@@ -130,6 +131,8 @@ def __init__(
130131
rngs=rngs,
131132
)
132133

134+
self.dropout = linears.Dropout(rate=config.dropout_rate, broadcast_dims=(-2,), rngs=rngs)
135+
133136
def __call__(
134137
self,
135138
inputs,
@@ -181,7 +184,7 @@ def __call__(
181184
mlp_lnx = nn.with_logical_constraint(mlp_lnx, ("activation_batch", "activation_norm_length", "activation_embed"))
182185

183186
layer_output = mlp_lnx + intermediate_inputs
184-
layer_output = nn.Dropout(rate=cfg.dropout_rate, broadcast_dims=(-2,))(layer_output, deterministic=deterministic)
187+
layer_output = self.dropout(layer_output, deterministic=deterministic)
185188

186189
layer_output = nn.with_logical_constraint(
187190
layer_output,

src/maxtext/trainers/diloco/diloco.py

Lines changed: 49 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from typing import Any, Callable
2727

2828
import drjax
29+
from flax import nnx
2930
from flax import struct
3031
from flax.training import train_state
3132
import jax
@@ -153,15 +154,23 @@ def add_diloco_dim(x):
153154
momentum=config.diloco_outer_momentum,
154155
nesterov=True,
155156
)
156-
outer_opt_state = jax.eval_shape(outer_optimizer.init, abstract_state.params)
157+
# For NNX, model params (Param variables only) live under abstract_state.model;
158+
# for Linen under abstract_state.params.
159+
if config.pure_nnx:
160+
model_params = abstract_state.model.filter(nnx.Param)
161+
model_params_sharding = state_mesh_shardings.model.filter(nnx.Param)
162+
else:
163+
model_params = abstract_state.params
164+
model_params_sharding = state_mesh_shardings.params
165+
outer_opt_state = jax.eval_shape(outer_optimizer.init, model_params)
157166

158167
# Create abstract step
159168
abstract_step = jax.ShapeDtypeStruct((), jnp.int32)
160169

161170
# Build abstract DiLoCo state
162171
diloco_state = DiLoCoTrainState(
163172
inner_state=inner_state,
164-
params=abstract_state.params,
173+
params=model_params,
165174
outer_opt_state=outer_opt_state,
166175
step=abstract_step,
167176
)
@@ -171,12 +180,12 @@ def add_diloco_dim(x):
171180
# Sharding for outer_opt_state. For SGD with momentum, it is (TraceState(trace=...), EmptyState())
172181
# We shard the momentum trace the same way as the parameters.
173182
outer_opt_state_sharding = (
174-
optax.TraceState(trace=state_mesh_shardings.params),
183+
optax.TraceState(trace=model_params_sharding),
175184
optax.EmptyState(),
176185
)
177186
diloco_state_shardings = DiLoCoTrainState(
178187
inner_state=inner_state_shardings,
179-
params=state_mesh_shardings.params,
188+
params=model_params_sharding,
180189
outer_opt_state=outer_opt_state_sharding,
181190
step=None,
182191
)
@@ -205,11 +214,15 @@ def init_diloco_state() -> tuple[DiLoCoTrainState, PyTree]:
205214
# mesh automatically when jax.set_mesh is used.
206215
inner_state = drjax.broadcast(state, mesh=mesh)
207216
# Outer state retains a single copy of the model parameters and optimizer state.
208-
outer_params = state.params
217+
# For NNX, model params (Param variables only) live under state.model;
218+
# for Linen under state.params.
219+
outer_params = state.model.filter(nnx.Param) if config.pure_nnx else state.params
209220
outer_opt_state = outer_optimizer.init(outer_params)
210221
outer_opt_state_sharding = jax.tree_util.tree_map(lambda x: x.sharding, outer_opt_state)
222+
# For NNX, the step counter lives at state.optimizer.step; for Linen at state.step.
223+
step = state.optimizer.step if config.pure_nnx else state.step
211224
return (
212-
DiLoCoTrainState(inner_state=inner_state, params=outer_params, outer_opt_state=outer_opt_state, step=state.step),
225+
DiLoCoTrainState(inner_state=inner_state, params=outer_params, outer_opt_state=outer_opt_state, step=step),
213226
outer_opt_state_sharding,
214227
)
215228

@@ -244,7 +257,11 @@ def synchronize(state):
244257
# Calculate the delta between the current replica's state and the global
245258
# state (since last synchronization).
246259
broadcast_outer_params = drjax.broadcast(state.params, mesh=mesh)
247-
model_delta = jax.tree.map(lambda x, y: y - x, state.inner_state.params, broadcast_outer_params)
260+
# For NNX, model Param vars live under inner_state.model; for Linen under inner_state.params.
261+
inner_model_params = (
262+
nnx.filter_state(state.inner_state.model, nnx.Param) if config.pure_nnx else state.inner_state.params
263+
)
264+
model_delta = jax.tree.map(lambda x, y: y - x, inner_model_params, broadcast_outer_params)
248265
# Treat the average delta as the outer optimizer's gradient and apply to
249266
# the global (outer) model params.
250267
averaged_pseudo_grad = drjax.reduce_mean(model_delta)
@@ -253,7 +270,27 @@ def synchronize(state):
253270
# Replace inner model params with the new global model params.
254271
# NOTE: inner optimizer state is retained despite the change in parameters,
255272
# see section 6.1 in https://arxiv.org/pdf/2311.08105.
256-
new_inner_state = drjax.map_fn(lambda state: state.replace(params=new_outer_params), state.inner_state, mesh=mesh)
273+
if config.pure_nnx:
274+
# For NNX: merge new Param vars back with the non-Param model vars (e.g. RNG state).
275+
def replace_nnx_model_params(s, new_params):
276+
non_param_model = nnx.filter_state(s.model, nnx.Not(nnx.Param))
277+
new_model = nnx.merge_state(non_param_model, new_params)
278+
# Build result via __setitem__ so nested States are stored as plain dicts
279+
# internally, matching the pytree structure produced by nnx.state().
280+
# (Passing State objects via the constructor dict literal stores them
281+
# as-is, causing jax.lax.cond to see mismatched pytree structures.)
282+
result = type(s)({})
283+
result["model"] = new_model
284+
result["optimizer"] = s["optimizer"]
285+
return result
286+
287+
new_inner_state = drjax.map_fn(
288+
lambda s: replace_nnx_model_params(s, new_outer_params),
289+
state.inner_state,
290+
mesh=mesh,
291+
)
292+
else:
293+
new_inner_state = drjax.map_fn(lambda s: s.replace(params=new_outer_params), state.inner_state, mesh=mesh)
257294
return state.replace(
258295
params=new_outer_params,
259296
outer_opt_state=new_opt_state,
@@ -271,14 +308,16 @@ def diloco_train_step(state, batch, prng):
271308
broadcast_rng = drjax.broadcast(prng, mesh=mesh)
272309
inner_state, metrics = drjax.map_fn(train_step, (state.inner_state, batch, broadcast_rng), mesh=mesh)
273310
avg_metrics = typed_reduce_mean(metrics)
311+
# For NNX, the step counter lives at inner_state.optimizer.step; for Linen at inner_state.step.
312+
new_step = inner_state.optimizer.step[0] if config.pure_nnx else inner_state.step[0]
274313
state = state.replace(
275314
inner_state=inner_state,
276-
step=inner_state.step[0],
315+
step=new_step,
277316
)
278317
# Either synchronize the model, or no-op, depending on whether the current
279318
# step falls on the synchronization period.
280319
state = jax.lax.cond(
281-
inner_state.step[0] % config.diloco_sync_period == 0,
320+
new_step % config.diloco_sync_period == 0,
282321
synchronize,
283322
lambda x: x, # no-op
284323
state,

src/maxtext/trainers/pre_train/train_compile.py

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from flax import nnx
3131
from flax.linen import partitioning as nn_partitioning
3232
import jax
33+
import jax.numpy as jnp
3334
from jax.experimental.serialize_executable import serialize
3435
from jax.experimental.topologies import get_topology_desc
3536
from jax.sharding import AxisType, Mesh
@@ -93,6 +94,27 @@ def get_topology_mesh(config):
9394
return topology_mesh
9495

9596

97+
def _collect_nnx_activation_shardings(create_model_fn, config, mesh):
98+
"""Run an NNX forward pass in abstract mode to populate _ACTIVATION_SHARDINGS_DUMP.
99+
100+
get_abstract_state_nnx uses nnx.eval_shape which only traces model initialization,
101+
not __call__. Activation shardings are only collected during a forward pass.
102+
"""
103+
input_shape = (config.micro_batch_size_to_train_on, config.max_target_length)
104+
105+
def _nnx_forward():
106+
model_instance = create_model_fn()
107+
return model_instance(
108+
decoder_input_tokens=jnp.ones(input_shape, dtype=jnp.int32),
109+
decoder_positions=jnp.ones(input_shape, dtype=jnp.int32),
110+
decoder_segment_ids=jnp.ones(input_shape, dtype=jnp.int32),
111+
enable_dropout=False,
112+
)
113+
114+
with nn_partitioning.axis_rules(config.logical_axis_rules):
115+
jax.eval_shape(_nnx_forward)
116+
117+
96118
def get_shaped_inputs(topology_mesh, config):
97119
"""Get shaped abstractions of inputs to train_step: state, batch and rng"""
98120
# Construct the model and optimizer to get shaped versions of the state
@@ -140,10 +162,17 @@ def create_train_state_fn():
140162
shaped_batch = maxtext_utils.get_shaped_batch(config)
141163

142164
if config.pure_nnx:
143-
shaped_train_args = (abstract_state, shaped_batch, None) # NNX doesn't use dropout_rng
165+
shaped_train_args = (abstract_state, shaped_batch) # NNX doesn't use dropout_rng
144166
else:
145167
shaped_train_args = (abstract_state, shaped_batch, shaped_rng)
146168
shaped_train_kwargs = {}
169+
170+
# Collect activation shardings for NNX by running an abstract forward pass.
171+
# This must happen after get_abstract_state (which uses nnx.eval_shape and only
172+
# traces __init__, not __call__).
173+
if config.debug_sharding and config.pure_nnx:
174+
_collect_nnx_activation_shardings(_create_model_partial, config, topology_mesh)
175+
147176
return shaped_train_args, shaped_train_kwargs, state_mesh_shardings, logical_annotations, model
148177

149178

@@ -279,7 +308,9 @@ def main(argv: Sequence[str]) -> None:
279308
diloco_state, state_mesh_shardings, inner_state_shardings = diloco.build_abstract_diloco_state(
280309
config, abstract_state, state_mesh_shardings, topology_mesh
281310
)
282-
shaped_train_args = (diloco_state, shaped_train_args[1], shaped_train_args[2])
311+
# For NNX, shaped_train_args has 2 elements (state, batch) — no rng; pass None for prng.
312+
shaped_rng_arg = shaped_train_args[2] if len(shaped_train_args) > 2 else None
313+
shaped_train_args = (diloco_state, shaped_train_args[1], shaped_rng_arg)
283314

284315
# Wrap train_step with diloco
285316
train_step_partial = functools.partial(train.train_step, model, config, inner_state_shardings, None)

0 commit comments

Comments
 (0)