diff --git a/.github/workflows/run_pathways_tests.yml b/.github/workflows/run_pathways_tests.yml
index 60b2dd3d93..a48b4b4326 100644
--- a/.github/workflows/run_pathways_tests.yml
+++ b/.github/workflows/run_pathways_tests.yml
@@ -102,7 +102,7 @@ jobs:
           export MAXTEXT_TEST_ASSETS_ROOT=$(pwd)/tests/assets
           export MAXTEXT_PKG_DIR=$(pwd)/src/maxtext
           # TODO(b/454659463): Enable test_default_hlo_match after volume mount is supported.
-          .venv/bin/python3 -m pytest ${{ inputs.pytest_addopts }} -v -m "${FINAL_PYTEST_MARKER}" -k "not AotHloIdenticalTest and not CompileThenLoad" --durations=0
+          .venv/bin/python3 -m pytest ${{ inputs.pytest_addopts }} -v -m "${FINAL_PYTEST_MARKER}" -k "not AotHloIdenticalTest and not AotJaxprIdenticalTest and not CompileThenLoad and not test_diloco_two_slices" --durations=0
         env:
           PYTHONPATH: "${{ github.workspace }}/src"
     services:
diff --git a/.github/workflows/run_tests_against_package.yml b/.github/workflows/run_tests_against_package.yml
index 7817e2d678..814d805946 100644
--- a/.github/workflows/run_tests_against_package.yml
+++ b/.github/workflows/run_tests_against_package.yml
@@ -19,6 +19,11 @@ name: Run Tests Against MaxText Package
 on:
   workflow_call:
     inputs:
+      flavor:
+        description: 'Test flavor name (e.g. cpu-unit, tpu-unit) - used for artifact naming'
+        required: false
+        type: string
+        default: ''
       device_type:
         required: true
         type: string
@@ -164,6 +169,11 @@ jobs:
           if [ "${INPUTS_TOTAL_WORKERS}" -gt 1 ]; then
             $PYTHON_EXE -m pip install --quiet pytest-split pytest-xdist
             SPLIT_ARGS="--splits ${INPUTS_TOTAL_WORKERS} --group ${INPUTS_WORKER_GROUP} -n auto"
+            # On scheduled runs, record per-shard durations so future splits balance by time (LPT).
+            # Merge artifacts offline and commit as .test_durations at repo root.
+            if [ "${INPUTS_IS_SCHEDULED_RUN}" == "true" ]; then
+              SPLIT_ARGS="${SPLIT_ARGS} --store-durations --durations-path=.test_durations.${INPUTS_WORKER_GROUP}"
+            fi
           else
             SPLIT_ARGS=""
           fi
@@ -195,3 +205,13 @@ jobs:
           # If scheduled, upload to scheduled flag only. If PR, upload to regular flag only.
           flags: ${{ inputs.is_scheduled_run == 'true' && 'scheduled' || 'regular' }}
           verbose: true
+      - name: Upload test durations artifact
+        if: ${{ inputs.is_scheduled_run == 'true' && inputs.total_workers > 1 }}
+        uses: actions/upload-artifact@v4
+        continue-on-error: true
+        with:
+          name: test-durations-${{ inputs.flavor }}-${{ inputs.worker_group }}
+          path: .test_durations.*
+          # upload-artifact v4.4+ skips dot-files by default; every durations file is a dot-file.
+          include-hidden-files: true
+          if-no-files-found: ignore
diff --git a/.github/workflows/run_tests_coordinator.yml b/.github/workflows/run_tests_coordinator.yml
index 2b7a8b5d2a..537675bcdd 100644
--- a/.github/workflows/run_tests_coordinator.yml
+++ b/.github/workflows/run_tests_coordinator.yml
@@ -61,6 +61,7 @@ jobs:
 
     uses: ./.github/workflows/run_tests_against_package.yml
     with:
+      flavor: ${{ inputs.flavor }}
       # Infrastructure Mapping
       device_type: >-
         ${{ fromJSON('{
diff --git a/tests/unit/train_compile_test.py b/tests/unit/train_compile_test.py
index 4230c46174..5047f498ae 100644
--- a/tests/unit/train_compile_test.py
+++ b/tests/unit/train_compile_test.py
@@ -17,6 +17,12 @@
 This module contains unit tests for `train_compile.py`, ensuring that various
 model configurations and parallelism strategies can be successfully compiled
 for different hardware topologies.
+
+These tests exercise the compilation pipeline only, not numerical correctness,
+so most use `override_model_config=true` with a reduced `base_num_decoder_layers`
+to keep CPU compile times bounded. Full-scale model correctness is covered
+elsewhere. Tests that deliberately keep the full layer count do so to exercise
+sharding/pipeline edge cases and are annotated inline.
 """
 
 import unittest
@@ -373,6 +379,8 @@ def test_moe_dropping_bf16(self):
             "use_iota_embed=true",
             "compile_topology_num_slices=1",
             "model_name=mixtral-8x7b",
+            "override_model_config=true",
+            "base_num_decoder_layers=8",
             "sparse_matmul=False",
             "capacity_factor=1",
             "per_device_batch_size=4",
@@ -420,6 +428,8 @@ def test_moe_megablox_bf16(self):
             "use_iota_embed=true",
             "compile_topology_num_slices=1",
             "model_name=mixtral-8x7b",
+            "override_model_config=true",
+            "base_num_decoder_layers=8",
             "sparse_matmul=True",
             "megablox=True",
             "per_device_batch_size=4",
@@ -442,6 +452,8 @@ def test_moe_megablox_ring_ep_random(self):
             "use_iota_embed=true",
             "compile_topology_num_slices=1",
             "model_name=deepseek3-test",
+            "override_model_config=true",
+            "base_num_decoder_layers=8",
             "sparse_matmul=True",
             "megablox=True",
             "per_device_batch_size=4",
@@ -466,6 +478,8 @@ def test_moe_ragged_dot_bf16(self):
             "use_iota_embed=true",
             "compile_topology_num_slices=1",
             "model_name=mixtral-8x7b",
+            "override_model_config=true",
+            "base_num_decoder_layers=8",
             "sparse_matmul=True",
             "megablox=False",
             "per_device_batch_size=4",
@@ -488,6 +502,8 @@ def test_moe_dense_bf16(self):
             "use_iota_embed=true",
             "compile_topology_num_slices=1",
             "model_name=mixtral-8x7b",
+            "override_model_config=true",
+            "base_num_decoder_layers=8",
             "sparse_matmul=False",
             "capacity_factor=-1",
             "per_device_batch_size=4",
@@ -534,6 +550,8 @@ def test_moe_pp_bf16(self):
             "use_iota_embed=true",
             "compile_topology_num_slices=2",
             "model_name=mixtral-8x7b",
+            "override_model_config=true",
+            "base_num_decoder_layers=8",
             "sparse_matmul=False",
             "capacity_factor=1",
             "per_device_batch_size=4",
@@ -558,6 +576,8 @@ def test_moe_deepseek_scanned_bf16(self):
             "use_iota_embed=true",
             "compile_topology_num_slices=1",
             "model_name=deepseek3-test",
+            "override_model_config=true",
+            "base_num_decoder_layers=8",
             "sparse_matmul=True",
             "megablox=False",
             "per_device_batch_size=2",
@@ -606,6 +626,8 @@ def test_moe_deepseek_with_device_limit(self):
             "use_iota_embed=true",
             "compile_topology_num_slices=1",
             "model_name=deepseek3-test",
+            "override_model_config=true",
+            "base_num_decoder_layers=8",
             "sparse_matmul=True",
             "megablox=False",
             "per_device_batch_size=1",
@@ -620,14 +642,16 @@ def test_moe_deepseek_with_device_limit(self):
 
   @pytest.mark.cpu_only
   def test_moe_deepseek_pipeline_subset(self):
+    # Keeps the full layer count so pipeline_parallel_layers=56 exercises
+    # the real stage boundaries.
     compiled_trainstep_file = "/tmp/test_moe_deepseek_pipeline_subset.pickle"
     train_compile_main(
         (
             "",
             get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
-            "compile_topology=v5p-64",
-            "compile_topology_num_slices=8",
+            "compile_topology=v5p-8",
+            "compile_topology_num_slices=2",
             "use_iota_embed=true",
             "model_name=deepseek3-test",
             "megablox=True",
@@ -636,8 +660,9 @@ def test_moe_deepseek_pipeline_subset(self):
             "per_device_batch_size=1",
             "max_target_length=1024",
             "pipeline_parallel_layers=56",
-            "ici_expert_parallelism=16",
-            "dcn_pipeline_parallelism=8",
+            "ici_expert_parallelism=4",
+            "ici_fsdp_parallelism=1",
+            "dcn_pipeline_parallelism=2",
         )
     )
 
@@ -669,22 +694,23 @@ def test_moe_llama4_17b_16e(self):
             "",
             get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
-            "compile_topology=v5p-128",
+            "compile_topology=v5p-16",
             "compile_topology_num_slices=1",
             "model_name=llama4-17b-16e",
+            "override_model_config=true",
+            "base_num_decoder_layers=4",
             "per_device_batch_size=1",
             "max_target_length=1024",
             "dtype=bfloat16",
             "weight_dtype=bfloat16",
             "scan_layers=True",
-            "ici_fsdp_parallelism=16",
-            "ici_tensor_parallelism=4",
+            "ici_fsdp_parallelism=4",
+            "ici_tensor_parallelism=2",
         )
     )
 
-  @pytest.mark.cpu_only
-  def test_moe_gpt_oss_20b_sparse_matmul(self):
-    compiled_trainstep_file = "/tmp/test_moe_gpt_oss_20b_sparse_matmul.pickle"
+  def _run_moe_gpt_oss_20b(self, suffix, matmul_args):
+    compiled_trainstep_file = f"/tmp/test_moe_gpt_oss_20b_{suffix}.pickle"
     train_compile_main(
         (
             "",
@@ -693,38 +719,25 @@ def test_moe_gpt_oss_20b_sparse_matmul(self):
             "compile_topology=v5p-16",
             "compile_topology_num_slices=1",
             "model_name=gpt-oss-20b",
+            "override_model_config=true",
+            "base_num_decoder_layers=8",
             "per_device_batch_size=1",
             "max_target_length=1024",
             "dtype=bfloat16",
             "weight_dtype=bfloat16",
             "scan_layers=True",
-            "sparse_matmul=True",
-            "megablox=True",
             "attention=flash",
+            *matmul_args,
         )
     )
 
+  @pytest.mark.cpu_only
+  def test_moe_gpt_oss_20b_sparse_matmul(self):
+    self._run_moe_gpt_oss_20b("sparse_matmul", ["sparse_matmul=True", "megablox=True"])
+
   @pytest.mark.cpu_only
   def test_moe_gpt_oss_20b_dense_matmul(self):
-    compiled_trainstep_file = "/tmp/test_moe_gpt_oss_20b_dense_matmul.pickle"
-    train_compile_main(
-        (
-            "",
-            get_test_config_path(),
-            f"compiled_trainstep_file={compiled_trainstep_file}",
-            "compile_topology=v5p-16",
-            "compile_topology_num_slices=1",
-            "model_name=gpt-oss-20b",
-            "per_device_batch_size=1",
-            "max_target_length=1024",
-            "dtype=bfloat16",
-            "weight_dtype=bfloat16",
-            "scan_layers=True",
-            "sparse_matmul=False",
-            "capacity_factor=-1",
-            "attention=flash",
-        )
-    )
+    self._run_moe_gpt_oss_20b("dense_matmul", ["sparse_matmul=False", "capacity_factor=-1"])
 
   @pytest.mark.cpu_only
   def test_gpt3_6b(self):
@@ -769,6 +782,8 @@ def test_qwen3_next(self):
             "compile_topology=v5p-256",
             "compile_topology_num_slices=1",
             "model_name=qwen3-next-80b-a3b",
+            "override_model_config=true",
+            "base_num_decoder_layers=8",
             "per_device_batch_size=1",
             "max_target_length=1024",
         )
@@ -867,6 +882,8 @@ def test_olmo3_7b(self):
             "compile_topology=v5p-8",
             "compile_topology_num_slices=1",
             "model_name=olmo3-7b",
+            "override_model_config=true",
+            "base_num_decoder_layers=8",
             "per_device_batch_size=1",
             "scan_layers=True",
             "max_target_length=1024",