AI-Hypercomputer
diff --git a/tests/assets/logits_generation/generate_grpo_golden_logits.py b/tests/assets/logits_generation/generate_grpo_golden_logits.py
Lines changed: 8 additions & 16 deletions
diff --git a/tests/integration/deepseek_scan_engram_test.py b/tests/integration/deepseek_scan_engram_test.py
Lines changed: 10 additions & 84 deletions
diff --git a/tests/integration/diloco_test.py b/tests/integration/diloco_test.py
Lines changed: 50 additions & 102 deletions
diff --git a/tests/integration/hlo_diff_test.py b/tests/integration/hlo_diff_test.py
Lines changed: 7 additions & 2 deletions
@@ -73,26 +73,18 @@ def setUp(self):
     devices_array = maxtext_utils.create_device_mesh(self.cfg)
     mesh = Mesh(devices_array, self.cfg.mesh_axes)
     # With checkpoint
-    if self.cfg.pure_nnx:
-      # NNX has a different function to init the training state.
-      raise NotImplementedError("Pure NNX support has not been implemented yet.")
-    else:
-      self.model = models.transformer_as_linen(config=self.cfg, mesh=mesh, quant=None, model_mode=MODEL_MODE_TRAIN)
-      init_state_fn = functools.partial(maxtext_utils.init_initial_state, self.model, None, self.cfg, False, self.rng)
+    self.model = models.transformer_as_linen(config=self.cfg, mesh=mesh, quant=None, model_mode=MODEL_MODE_TRAIN)
+    init_state_fn = functools.partial(maxtext_utils.init_initial_state, self.model, None, self.cfg, False, self.rng)
     self.state, state_mesh_annotations = maxtext_utils.setup_decode_state(self.cfg, mesh, None, init_state_fn)
     self.state_mesh_shardings = nn.logical_to_mesh_sharding(state_mesh_annotations, mesh, self.cfg.logical_axis_rules)
     self.data_sharding = jax.NamedSharding(mesh, jax.sharding.PartitionSpec(None))
     # Without checkpoint
-    if self.cfg_no_ckpt_loading.pure_nnx:
-      # NNX has a different function to init the training state.
-      raise NotImplementedError("Pure NNX support has not been implemented yet.")
-    else:
-      self.model_no_ckpt_loading = models.transformer_as_linen(
-          config=self.cfg_no_ckpt_loading, mesh=mesh, quant=None, model_mode=MODEL_MODE_TRAIN
-      )
-      init_state_fn = functools.partial(
-          maxtext_utils.init_initial_state, self.model_no_ckpt_loading, None, self.cfg_no_ckpt_loading, False, self.rng
-      )
+    self.model_no_ckpt_loading = models.transformer_as_linen(
+        config=self.cfg_no_ckpt_loading, mesh=mesh, quant=None, model_mode=MODEL_MODE_TRAIN
+    )
+    init_state_fn = functools.partial(
+        maxtext_utils.init_initial_state, self.model_no_ckpt_loading, None, self.cfg_no_ckpt_loading, False, self.rng
+    )
     self.state_no_ckpt_loading, _ = maxtext_utils.setup_decode_state(self.cfg_no_ckpt_loading, mesh, None, init_state_fn)
 
     self.tokenizer_model = transformers.AutoTokenizer.from_pretrained(
 
@@ -14,22 +14,19 @@
 
 """Unit tests for DeepSeek Engram across scanned decoder layers."""
 
-import gc
-import os
 import unittest
 from unittest.mock import patch
 
-import jax
 import jax.numpy as jnp
-from jax.sharding import Mesh
 
-from maxtext.configs import pyconfig
-from maxtext.utils.globals import MAXTEXT_PKG_DIR
-from maxtext.common.common_types import MODEL_MODE_TRAIN
-from maxtext.layers.decoders import Decoder
-from maxtext.utils import maxtext_utils
 import pytest
 
+# The Linen Decoder this test exercised was removed in PR12 (Delete Linen).
+# NNX decoder coverage is in tests/unit/nnx_decoders_test.py.
+pytestmark = pytest.mark.skip(
+    reason="Linen Decoder removed in PR12 (Delete Linen); NNX decoder coverage is in tests/unit/nnx_decoders_test.py"
+)
+
 
 class DummyEmbedding:
   """Dummy embedding layer for testing."""
@@ -91,81 +88,10 @@ def _test_engram_pattern(
       base_num_decoder_layers=10,
   ):
     """Helper method to test different engram layer patterns."""
-
-    # Setup mock tokenizer
-    class MockTokenizer:
-      """Mock tokenizer for testing."""
-
-      pad_token_id = 0
-
-      def __len__(self):
-        return 128
-
-      def __call__(self, x):
-        return jnp.ones_like(x)
-
-      def convert_ids_to_tokens(self, *args, **kwargs):
-        return "a"
-
-      def decode(self, *args, **kwargs):
-        return "a"
-
-      def batch_decode(self, token_ids, *args, **kwargs):
-        return ["a" for _ in token_ids]
-
-    mock_from_pretrained.return_value = MockTokenizer()
-
-    config_path = os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")
-    config = pyconfig.initialize(
-        [None, config_path]
-        + self._COMMON_CONFIG
-        + [
-            f"engram_layers=[{engram_layers_str}]",
-            f"first_num_dense_layers={first_num_dense_layers}",
-            f"base_num_decoder_layers={base_num_decoder_layers}",
-            f"num_decoder_layers={base_num_decoder_layers}",
-        ]
-    )
-
-    devices_array = maxtext_utils.create_device_mesh(config)
-    mesh = Mesh(devices_array, config.mesh_axes)
-
-    decoder = Decoder(
-        config=config,
-        mesh=mesh,
-        model_mode=MODEL_MODE_TRAIN,
-    )
-
-    batch_size = config.global_batch_size_to_load
-    seq_len = config.max_target_length
-
-    decoder_input_tokens = jnp.ones((batch_size, seq_len), dtype=jnp.int32)
-    decoder_positions = jnp.ones((batch_size, seq_len), dtype=jnp.int32)
-    decoder_segment_ids = jnp.ones((batch_size, seq_len), dtype=jnp.int32)
-
-    shared_embedding = DummyEmbedding(emb_dim=config.emb_dim)
-
-    with jax.set_mesh(mesh), jax.disable_jit():
-      variables = decoder.init(
-          {"params": jax.random.PRNGKey(0), "dropout": jax.random.PRNGKey(1), "aqt": jax.random.PRNGKey(2)},
-          shared_embedding=shared_embedding,
-          decoder_input_tokens=decoder_input_tokens,
-          decoder_positions=decoder_positions,
-          decoder_segment_ids=decoder_segment_ids,
-          deterministic=True,
-          model_mode=MODEL_MODE_TRAIN,
-      )
-
-    self.assertIn("params", variables)
-    params = variables["params"]
-    for key in expected_keys:
-      self.assertIn(key, params)
-
-    del variables
-    del params
-    del decoder
-    jax.clear_caches()
-    gc.collect()
+    # The Linen Decoder this helper exercised was removed in PR12 (Delete Linen);
+    # NNX decoder coverage lives in tests/unit/nnx_decoders_test.py.
+    del mock_from_pretrained, engram_layers_str, expected_keys, first_num_dense_layers, base_num_decoder_layers
+    raise unittest.SkipTest("Linen Decoder removed in PR12 (Delete Linen)")
 
   @pytest.mark.tpu_only
   @patch("transformers.AutoTokenizer.from_pretrained")
 
@@ -21,7 +21,6 @@
 
 import chex
 from flax.experimental import nnx
-from flax.training import train_state
 import jax
 import jax.numpy as jnp
 import jax.sharding
@@ -84,71 +83,36 @@ def test_diloco_training_simulation_with_mesh(self):
       tx = optax.sgd(learning_rate=0.1)
       rngs = nnx.Rngs(params=jax.random.key(seed=42))
       model = SimpleNNXModel(rngs=rngs)
-      graphdef, params = nnx.split(model)
 
-      if test_config.pure_nnx:
-        optimizer = nnx.Optimizer(model, tx, wrt=nnx.Param)
-        # diloco_test_state expects a TrainStateNNX instance when pure_nnx is True.
-        initial_test_state = TrainStateNNX(model, optimizer)
+      optimizer = nnx.Optimizer(model, tx, wrt=nnx.Param)
+      # build_diloco_state (below) is handed this TrainStateNNX instance via a factory lambda.
+      initial_test_state = TrainStateNNX(model, optimizer)
 
-        # For NNX, train_step needs to take the TrainStateNNX and mutate it
+      # _test_train_step takes the TrainStateNNX, mutates it in place, and returns it with the loss.
 
-        def _test_train_step(state, batch, prng_key: diloco.PRNGKey):
-          del prng_key
+      def _test_train_step(state, batch, prng_key: diloco.PRNGKey):
+        del prng_key
 
-          def loss_fn(model, batch):
-            inputs, labels = batch
-            logits = jax.vmap(model)(inputs)
-            residual = logits - labels
-            return jnp.mean(jnp.square(residual))
+        def loss_fn(model, batch):
+          inputs, labels = batch
+          logits = jax.vmap(model)(inputs)
+          residual = logits - labels
+          return jnp.mean(jnp.square(residual))
 
-          loss, grads = nnx.value_and_grad(loss_fn)(state.model, batch)
-          state.optimizer.update(state.model, grads)
-          return state, loss
-
-      else:
-
-        def nnx_apply_fn(params, inputs):
-          model_replica = nnx.merge(graphdef, params)
-          return model_replica(inputs)
-
-        # 2. Vmap this new wrapper function
-        vmapped_apply = jax.vmap(nnx_apply_fn, in_axes=(None, 0))
-
-        def _test_train_step(state: train_state.TrainState, batch, prng_key: diloco.PRNGKey):
-          """A simple MSE loss train step to enable numerics testing."""
-          del prng_key
-
-          def loss_fn(params, batch):
-            inputs, labels = batch
-            logits = vmapped_apply(params, inputs)
-            residual = logits - labels
-            sq_residual = jnp.square(residual)
-            msq_residual = jnp.mean(sq_residual)
-            return msq_residual
-
-          loss, grad = jax.value_and_grad(loss_fn)(state.params, batch)
-          return state.apply_gradients(grads=grad), loss
-
-        initial_test_state = train_state.TrainState.create(
-            apply_fn=vmapped_apply,
-            params=params,
-            tx=tx,
-        )
+        loss, grads = nnx.value_and_grad(loss_fn)(state.model, batch)
+        state.optimizer.update(state.model, grads)
+        return state, loss
 
       diloco_test_state, _ = diloco.build_diloco_state(test_config, lambda: initial_test_state)
       chex.assert_equal(diloco_test_state.step, 0)
-      if test_config.pure_nnx:
-        _, params_pure, _ = nnx.split(initial_test_state.model, nnx.Param, ...)
+      _, params_pure, _ = nnx.split(initial_test_state.model, nnx.Param, ...)
 
-        # diloco_test_state.params might contain nnx.Variables instead of pure arrays.
-        # We need to unwrap them if they do.
-        diloco_params_pure = jax.tree_util.tree_map(
-            lambda x: x.value if hasattr(x, "value") else x, diloco_test_state.params
-        )
-        chex.assert_trees_all_equal(diloco_params_pure, params_pure.to_pure_dict())
-      else:
-        chex.assert_trees_all_equal(diloco_test_state.params, initial_test_state.params)
+      # diloco_test_state.params might contain nnx.Variables instead of pure arrays.
+      # We need to unwrap them if they do.
+      diloco_params_pure = jax.tree_util.tree_map(
+          lambda x: x.value if hasattr(x, "value") else x, diloco_test_state.params
+      )
+      chex.assert_trees_all_equal(diloco_params_pure, params_pure.to_pure_dict())
 
       diloco_train_step = diloco.build_diloco_train_step(test_config, _test_train_step)
       inputs = jnp.array(
@@ -196,17 +160,14 @@ def loss_fn(params, batch):
       chex.assert_equal(diloco_test_state.step, 1.0)
       chex.assert_equal(loss, 1.0)
       # Assert no updates to the global model yet (no synchronization)
-      if test_config.pure_nnx:
-        _, params_pure, _ = nnx.split(initial_test_state.model, nnx.Param, ...)
+      _, params_pure, _ = nnx.split(initial_test_state.model, nnx.Param, ...)
 
-        # diloco_test_state.params might contain nnx.Variables instead of pure arrays.
-        # We need to unwrap them if they do.
-        diloco_params_pure = jax.tree_util.tree_map(
-            lambda x: x.value if hasattr(x, "value") else x, diloco_test_state.params
-        )
-        chex.assert_trees_all_equal(diloco_params_pure, params_pure.to_pure_dict())
-      else:
-        chex.assert_trees_all_equal(diloco_test_state.params, initial_test_state.params)
+      # diloco_test_state.params might contain nnx.Variables instead of pure arrays.
+      # We need to unwrap them if they do.
+      diloco_params_pure = jax.tree_util.tree_map(
+          lambda x: x.value if hasattr(x, "value") else x, diloco_test_state.params
+      )
+      chex.assert_trees_all_equal(diloco_params_pure, params_pure.to_pure_dict())
 
       # Run the second step (no synchronization).
       # Replica 0:
@@ -236,17 +197,14 @@ def loss_fn(params, batch):
       chex.assert_equal(diloco_test_state.step, 2.0)
       chex.assert_trees_all_close(loss, 0.65)
       # Assert no updates to the global model yet (no synchronization)
-      if test_config.pure_nnx:
-        _, params_pure, _ = nnx.split(initial_test_state.model, nnx.Param, ...)
+      _, params_pure, _ = nnx.split(initial_test_state.model, nnx.Param, ...)
 
-        # diloco_test_state.params might contain nnx.Variables instead of pure arrays.
-        # We need to unwrap them if they do.
-        diloco_params_pure = jax.tree_util.tree_map(
-            lambda x: x.value if hasattr(x, "value") else x, diloco_test_state.params
-        )
-        chex.assert_trees_all_equal(diloco_params_pure, params_pure.to_pure_dict())
-      else:
-        chex.assert_trees_all_equal(diloco_test_state.params, initial_test_state.params)
+      # diloco_test_state.params might contain nnx.Variables instead of pure arrays.
+      # We need to unwrap them if they do.
+      diloco_params_pure = jax.tree_util.tree_map(
+          lambda x: x.value if hasattr(x, "value") else x, diloco_test_state.params
+      )
+      chex.assert_trees_all_equal(diloco_params_pure, params_pure.to_pure_dict())
 
       # Run the third step, which synchronizes afterwards.
       # Replica 0:
@@ -281,31 +239,21 @@ def loss_fn(params, batch):
       chex.assert_trees_all_close(loss, 0.4481)
       # Assert that inner and outer parameters are all equal now that
       # synchronization has happened.
-      if test_config.pure_nnx:
-        _, inner_params, _ = nnx.split(diloco_test_state.inner_state.model, nnx.Param, ...)
-        inner_params_pure = jax.tree_util.tree_map(
-            lambda x: x.value if hasattr(x, "value") else x, inner_params.to_pure_dict()
-        )
-        diloco_params_pure_3 = jax.tree_util.tree_map(
-            lambda x: x.value if hasattr(x, "value") else x, diloco_test_state.params
-        )
-        chex.assert_trees_all_equal(
-            diloco_params_pure_3,
-            jax.tree.map(lambda arr: arr[0, ...], inner_params_pure),
-        )
-        chex.assert_trees_all_equal(
-            diloco_params_pure_3,
-            jax.tree.map(lambda arr: arr[1, ...], inner_params_pure),
-        )
-      else:
-        chex.assert_trees_all_equal(
-            diloco_test_state.params,
-            jax.tree.map(lambda arr: arr[0, ...], diloco_test_state.inner_state.params),
-        )
-        chex.assert_trees_all_equal(
-            diloco_test_state.params,
-            jax.tree.map(lambda arr: arr[1, ...], diloco_test_state.inner_state.params),
-        )
+      _, inner_params, _ = nnx.split(diloco_test_state.inner_state.model, nnx.Param, ...)
+      inner_params_pure = jax.tree_util.tree_map(
+          lambda x: x.value if hasattr(x, "value") else x, inner_params.to_pure_dict()
+      )
+      diloco_params_pure_3 = jax.tree_util.tree_map(
+          lambda x: x.value if hasattr(x, "value") else x, diloco_test_state.params
+      )
+      chex.assert_trees_all_equal(
+          diloco_params_pure_3,
+          jax.tree.map(lambda arr: arr[0, ...], inner_params_pure),
+      )
+      chex.assert_trees_all_equal(
+          diloco_params_pure_3,
+          jax.tree.map(lambda arr: arr[1, ...], inner_params_pure),
+      )
 
       # Run the fourth step (no synchronization).
       # Replica 0:
 
@@ -138,12 +138,17 @@ def test_hlo_diff(self, test_id, config_file, overrides):
 
     try:
       base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
-      config_path = os.path.join(base_dir, config_file)
+      # Compile via base.yml + model_name (the normal training path) so the config inherits
+      # base.yml's logical_axis_rules and exercises the real NNX path. Loading the model yml
+      # directly as the top-level config skips base.yml, leaving logical_axis_rules empty.
+      base_config_path = os.path.join(base_dir, "src/maxtext/configs/base.yml")
+      model_name = os.path.splitext(os.path.basename(config_file))[0]
 
       # Arguments for train_compile
       test_args = [
           None,
-          config_path,
+          base_config_path,
+          f"model_name={model_name}",
           "dataset_type=synthetic",
           "override_model_config=true",
           "compile_topology_num_slices=1",