AI-Hypercomputer
diff --git a/‎src/maxtext/common/gcloud_stub.py‎
Lines changed: 9 additions & 0 deletions b/‎src/maxtext/common/gcloud_stub.py‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎src/maxtext/configs/base.yml‎
Lines changed: 1 addition & 0 deletions b/‎src/maxtext/configs/base.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/maxtext/configs/custom_mesh_and_rule/pipeline-large-moe.yml‎
Lines changed: 37 additions & 9 deletions b/‎src/maxtext/configs/custom_mesh_and_rule/pipeline-large-moe.yml‎
Lines changed: 37 additions & 9 deletions
diff --git a/‎src/maxtext/configs/custom_mesh_and_rule/pure-fsdp.yml‎
Lines changed: 56 additions & 1 deletion b/‎src/maxtext/configs/custom_mesh_and_rule/pure-fsdp.yml‎
Lines changed: 56 additions & 1 deletion
diff --git a/‎src/maxtext/configs/decoupled_base_test.yml‎
Lines changed: 4 additions & 0 deletions b/‎src/maxtext/configs/decoupled_base_test.yml‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎src/maxtext/layers/nnx_decoders.py‎
Lines changed: 28 additions & 5 deletions b/‎src/maxtext/layers/nnx_decoders.py‎
Lines changed: 28 additions & 5 deletions
diff --git a/‎src/maxtext/layers/normalizations.py‎
Lines changed: 3 additions & 3 deletions b/‎src/maxtext/layers/normalizations.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎src/maxtext/models/gpt_oss.py‎
Lines changed: 4 additions & 1 deletion b/‎src/maxtext/models/gpt_oss.py‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎src/maxtext/models/llama2.py‎
Lines changed: 1 addition & 0 deletions b/‎src/maxtext/models/llama2.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/maxtext/models/olmo3.py‎
Lines changed: 4 additions & 1 deletion b/‎src/maxtext/models/olmo3.py‎
Lines changed: 4 additions & 1 deletion
@@ -43,6 +43,15 @@ def is_decoupled() -> bool:  # dynamic check so setting env after initial import
   return os.environ.get("DECOUPLE_GCLOUD", "").upper() == "TRUE"
 
 
+def is_pure_nnx() -> bool:  # dynamic check so setting env after initial import still works
+  """Return True when the PURE_NNX env var equals "TRUE" (case-insensitive), i.e. pure NNX mode.
+
+  The variable is read on every call; when unset it is treated as "FALSE" (Linen is the default test mode).
+  Set PURE_NNX=TRUE to opt in to NNX mode (skips linen_only tests, runs nnx_only tests).
+  """
+  return os.environ.get("PURE_NNX", "FALSE").upper() == "TRUE"
+
+
 T = TypeVar("T")
 
 
 
@@ -534,6 +534,7 @@ logical_axis_rules: [
                       ['paged_kv_head_dim_size', []],
                       ['dense_layers', []],
                       ['moe_layers', []],
+                      ['num_activations', []],
                       ['engram_dim', ['tensor']],
                       ['mhc', []],
                       ['diloco', 'diloco'],
 
@@ -12,17 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# This logical rule is designed to optimize pipeline parallelism for large-scale jobs. 
-# Key changes include removing expert weight sharding on the `q_lora` dimension, which 
-# is relatively small (e.g., 512 for DeepSeek), and limiting sharding strategies when 
-# EP x FSDP > 512. 
+# This logical rule is designed to optimize pipeline parallelism for large-scale jobs.
+# Key changes include removing expert weight sharding on the `q_lora` dimension, which
+# is relatively small (e.g., 512 for DeepSeek), and limiting sharding strategies when
+# EP x FSDP > 512.
 #
-# The `data` axis is preserved for two reasons: first, the pipeline stage acts as a 
-# data parallel (DP) domain externally, making the `data` axis a necessary reference; 
-# second, it may be required for DCN communication. 
+# The `data` axis is preserved for two reasons: first, the pipeline stage acts as a
+# data parallel (DP) domain externally, making the `data` axis a necessary reference;
+# second, it may be required for DCN communication.
 #
-# Finally, the `tensor` axis is used to shard weights when `pipeline_fsdp_ag_once` or 
-# `pipeline_fsdp_ag_per_repeat` is enabled, ensuring we have sufficient memory to 
+# Finally, the `tensor` axis is used to shard weights when `pipeline_fsdp_ag_once` or
+# `pipeline_fsdp_ag_per_repeat` is enabled, ensuring we have sufficient memory to
 # store prefetched weights.
 mesh_axes: ['data', 'stage', 'fsdp', 'tensor', 'expert']
 data_sharding: [['data', 'stage', 'fsdp', 'tensor', 'expert']]
@@ -71,4 +71,32 @@ logical_axis_rules: [
                       ['exp_with_fsdp', 'fsdp'],
                       ['paged_kv_heads', ['tensor']],
                       ['engram_dim', ['tensor']],
+                      # Unsharded logical axes: the sequence/context/tensor_transpose/autoregressive mesh axes are not part of this mesh
+                      ['activation_attn_length_no_exp', []],
+                      ['activation_length_no_exp', []],
+                      ['activation_norm_length', []],
+                      ['activation_q_length_no_exp', []],
+                      ['prefill_activation_length', []],
+                      ['prefill_activation_norm_length', []],
+                      ['activation_kv_length', []],
+                      ['decode_length', []],
+                      ['embed_tensor_transpose', []],
+                      ['q_lora_up_proj', []],
+                      ['kv_lora_up_proj', []],
+                      ['kv', []],
+                      ['qkv', []],
+                      ['kv_head_dim', []],
+                      ['cache_batch_prefill', []],
+                      ['cache_batch', []],
+                      ['cache_heads_none', []],
+                      ['cache_kv', []],
+                      ['cache_sequence', []],
+                      ['num_pages', []],
+                      ['tokens_per_page', []],
+                      ['paged_kv_head_dim_size', []],
+                      ['dense_layers', []],
+                      ['moe_layers', []],
+                      ['num_activations', []],
+                      ['mhc', []],
+                      ['diloco', []],
                     ]
@@ -12,11 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# This rule only uses FSDP. Pure FSDP is the go-to sharding strategy 
+# This rule only uses FSDP. Pure FSDP is the go-to sharding strategy
 # for small-scale training and this rule simplifies the overall configuration.
 mesh_axes: ['fsdp']
 data_sharding: [['fsdp']]
 logical_axis_rules: [
+                      # Batch/data dimensions sharded on fsdp
                       ['activation_batch', ['fsdp']],
                       ['activation_batch_no_exp', ['fsdp']],
                       ['activation_batch_moe', ['fsdp']],
@@ -27,11 +28,65 @@ logical_axis_rules: [
                       ['activation_kv_batch', ['fsdp']],
                       ['activation_kv_batch_no_exp', ['fsdp']],
                       ['decode_batch', ['fsdp']],
+                      # Weight dimensions sharded on fsdp
                       ['embed', ['fsdp']],
                       ['embed_no_exp', ['fsdp']],
                       ['embed_moe', ['fsdp']],
                       ['embed_no_exp_moe', ['fsdp']],
                       ['q_lora', ['fsdp']],
                       ['kv_lora', ['fsdp']],
                       ['exp_with_fsdp', 'fsdp'],
+                      # All other axes are unsharded (tensor/sequence/expert axes do not exist in pure-fsdp)
+                      ['activation_heads', []],
+                      ['activation_kv_heads', []],
+                      ['activation_length', []],
+                      ['activation_attn_length', []],
+                      ['activation_attn_length_no_exp', []],
+                      ['activation_length_no_exp', []],
+                      ['activation_norm_length', []],
+                      ['activation_q_length', []],
+                      ['activation_q_length_no_exp', []],
+                      ['prefill_activation_length', []],
+                      ['prefill_activation_norm_length', []],
+                      ['activation_kv_length', []],
+                      ['activation_attn_embed', []],
+                      ['activation_embed', []],
+                      ['activation_mlp', []],
+                      ['activation_kv', []],
+                      ['activation_kv_head_dim', []],
+                      ['activation_vocab', []],
+                      ['activation_stage', []],
+                      ['activation_exp', []],
+                      ['decode_length', []],
+                      ['mlp', []],
+                      ['mlp_no_fsdp', []],
+                      ['vocab', []],
+                      ['heads', []],
+                      ['q_heads', []],
+                      ['kv_heads', []],
+                      ['embed_tensor_transpose', []],
+                      ['q_lora_up_proj', []],
+                      ['kv_lora_up_proj', []],
+                      ['norm', []],
+                      ['layers', []],
+                      ['qkv', []],
+                      ['kv', []],
+                      ['kv_head_dim', []],
+                      ['cache_batch_prefill', []],
+                      ['cache_batch', []],
+                      ['cache_heads_none', []],
+                      ['cache_heads', []],
+                      ['cache_kv', []],
+                      ['cache_sequence', []],
+                      ['exp', []],
+                      ['paged_kv_heads', []],
+                      ['num_pages', []],
+                      ['tokens_per_page', []],
+                      ['paged_kv_head_dim_size', []],
+                      ['dense_layers', []],
+                      ['moe_layers', []],
+                      ['num_activations', []],
+                      ['engram_dim', []],
+                      ['mhc', []],
+                      ['diloco', []],
                     ]
@@ -30,6 +30,10 @@ eval_dataset_name: 'c4/en:3.1.0'
 # Use dot_product attention to avoid GPU Pallas shared memory limits on AMD GPUs
 attention: "dot_product"
 
+# Default to Linen mode for tests; NNX is opt-in via PURE_NNX=TRUE.
+pure_nnx: False
+pure_nnx_decoder: False
+
 # Avoid HLO dump overhead.
 dump_hlo: false
 jax_cache_dir: ""
 
@@ -486,8 +486,16 @@ def pure_layer_fn(state_in, y_in):
       out = merged_layer(y_in, **kwargs)
       return out, nnx.state(merged_layer)
 
-    checkpointed_fn = jax.checkpoint(pure_layer_fn, policy=policy, prevent_cse=prevent_cse)
-    out, new_state = checkpointed_fn(state, y)
+    # Linen-based FP8 ops (fp8_nanoo, fp8_gpu) store scale/amax_history in Linen
+    # mutable scope. jax.checkpoint re-traces the scan body during backward (remat),
+    # but the Linen scope retains JAX tracers from the first trace, causing
+    # UnexpectedTracerError. Skip checkpoint for these quantization types.
+    uses_linen_fp8_mutable_state = self.config.quantization in ("fp8_nanoo", "fp8_gpu")
+    if uses_linen_fp8_mutable_state:
+      out, new_state = pure_layer_fn(state, y)
+    else:
+      checkpointed_fn = jax.checkpoint(pure_layer_fn, policy=policy, prevent_cse=prevent_cse)
+      out, new_state = checkpointed_fn(state, y)
     nnx.update(layer, new_state)
 
     return out
@@ -529,9 +537,24 @@ def layer_fn(carry, scanned_vars):
       # ONLY return non-param state to prevent memory duplication of weights
       return new_carry, new_current_state
 
-    layer_fn = jax.checkpoint(layer_fn, policy=policy, prevent_cse=prevent_cse)
-
-    final_carry, scanned_other = jax.lax.scan(layer_fn, x_in, (params, state))
+    # Linen-based FP8 ops (fp8_nanoo, fp8_gpu) store scale/amax_history in Linen
+    # mutable scope. jax.lax.scan traces the body function and Linen's setup() creates
+    # intermediate tracer values (amax_history float32[1024]) that escape the scan scope,
+    # causing UnexpectedTracerError. For these types, use an unrolled Python for loop (no jax.checkpoint).
+    uses_linen_fp8_mutable_state = self.config.quantization in ("fp8_nanoo", "fp8_gpu")
+    if uses_linen_fp8_mutable_state:
+      carry = x_in
+      per_layer_states = []
+      for i in range(length):
+        current_params = jax.tree.map(lambda x, i=i: x[i], params)
+        current_state = jax.tree.map(lambda x, i=i: x[i], state)
+        carry, new_state_i = layer_fn(carry, (current_params, current_state))
+        per_layer_states.append(new_state_i)
+      final_carry = carry
+      scanned_state = jax.tree.map(lambda *xs: jnp.stack(list(xs)), *per_layer_states)
+    else:
+      layer_fn = jax.checkpoint(layer_fn, policy=policy, prevent_cse=prevent_cse)
+      final_carry, scanned_state = jax.lax.scan(layer_fn, x_in, (params, state))
 
     if scan_axis != 0:
       params = jax.tree.map(lambda x: jnp.moveaxis(x, 0, scan_axis), params)
 
@@ -104,9 +104,9 @@ def __call__(self, x: jnp.ndarray, out_sharding: NamedSharding | None = None) ->
 
 def Qwen3NextRMSNorm(
     num_features: int,
-    epsilon: float,
-    dtype: DType,
-    weight_dtype: DType,
+    epsilon: float = 1e-6,
+    dtype: DType = jnp.float32,
+    weight_dtype: DType = jnp.float32,
     shard_mode: ShardMode = ShardMode.AUTO,
     kernel_axes: tuple[None | str, ...] = (),
     parameter_memory_host_offload: bool = False,
 
@@ -28,6 +28,7 @@
 from maxtext.common.common_types import AttentionType, Config
 from maxtext.layers import attentions
 from maxtext.layers import initializers
+from maxtext.layers import linears
 from maxtext.layers import moe
 from maxtext.layers import nnx_wrappers
 from maxtext.layers import quantizations
@@ -130,6 +131,8 @@ def __init__(
         rngs=rngs,
     )
 
+    self.dropout = linears.Dropout(rate=config.dropout_rate, broadcast_dims=(-2,), rngs=rngs)
+
   def __call__(
       self,
       inputs,
@@ -181,7 +184,7 @@ def __call__(
     mlp_lnx = nn.with_logical_constraint(mlp_lnx, ("activation_batch", "activation_norm_length", "activation_embed"))
 
     layer_output = mlp_lnx + intermediate_inputs
-    layer_output = nn.Dropout(rate=cfg.dropout_rate, broadcast_dims=(-2,))(layer_output, deterministic=deterministic)
+    layer_output = self.dropout(layer_output, deterministic=deterministic)
 
     layer_output = nn.with_logical_constraint(
         layer_output,
 
@@ -70,6 +70,7 @@ def __init__(
         shard_mode=config.shard_mode,
         kernel_axes=("norm",),
         epsilon=config.normalization_layer_epsilon,
+        parameter_memory_host_offload=config.parameter_memory_host_offload,
         rngs=rngs,
     )
 
 
@@ -29,6 +29,7 @@
 from maxtext.common.common_types import AttentionType, Config
 from maxtext.layers import attentions
 from maxtext.layers import initializers
+from maxtext.layers import linears
 from maxtext.layers import nnx_wrappers
 from maxtext.layers import quantizations
 from maxtext.layers.attentions import Attention
@@ -140,6 +141,8 @@ def __init__(
         rngs=rngs,
     )
 
+    self.dropout = linears.Dropout(rate=config.dropout_rate, broadcast_dims=(-2,), rngs=rngs)
+
   def __call__(
       self,
       inputs,
@@ -193,7 +196,7 @@ def __call__(
     mlp_lnx = nn.with_logical_constraint(mlp_lnx, ("activation_batch", "activation_norm_length", "activation_embed"))
 
     layer_output = mlp_lnx + intermediate_inputs
-    layer_output = nn.Dropout(rate=cfg.dropout_rate, broadcast_dims=(-2,))(layer_output, deterministic=deterministic)
+    layer_output = self.dropout(layer_output, deterministic=deterministic)
 
     layer_output = nn.with_logical_constraint(
         layer_output,
Original file line number	Diff line number	Diff line change
`@@ -70,6 +70,7 @@ def __init__(`
`70`	`70`	`shard_mode=config.shard_mode,`
`71`	`71`	`kernel_axes=("norm",),`
`72`	`72`	`epsilon=config.normalization_layer_epsilon,`
	`73`	`+ parameter_memory_host_offload=config.parameter_memory_host_offload,`
`73`	`74`	`rngs=rngs,`
`74`	`75`	`)`
`75`	`76`