Speed up cpu-unit CI

gagika · Gagik Amirkhanyan · commit 8086fc14da9f · 2026-04-19T23:09:40.000Z
diff --git a/.github/workflows/run_pathways_tests.yml b/.github/workflows/run_pathways_tests.yml
@@ -102,7 +102,7 @@ jobs:
           export MAXTEXT_TEST_ASSETS_ROOT=$(pwd)/tests/assets
           export MAXTEXT_PKG_DIR=$(pwd)/src/maxtext
           # TODO(b/454659463): Enable test_default_hlo_match after volume mount is supported.
-          .venv/bin/python3 -m pytest ${{ inputs.pytest_addopts }} -v -m "${FINAL_PYTEST_MARKER}" -k "not AotHloIdenticalTest and not CompileThenLoad" --durations=0
+          .venv/bin/python3 -m pytest ${{ inputs.pytest_addopts }} -v -m "${FINAL_PYTEST_MARKER}" -k "not AotHloIdenticalTest and not AotJaxprIdenticalTest and not CompileThenLoad" --durations=0
         env:
           PYTHONPATH: "${{ github.workspace }}/src"
     services:
diff --git a/.github/workflows/run_tests_against_package.yml b/.github/workflows/run_tests_against_package.yml
@@ -164,6 +164,11 @@ jobs:
           if [ "${INPUTS_TOTAL_WORKERS}" -gt 1 ]; then
             $PYTHON_EXE -m pip install --quiet pytest-split pytest-xdist
             SPLIT_ARGS="--splits ${INPUTS_TOTAL_WORKERS} --group ${INPUTS_WORKER_GROUP} -n auto"
+            # On scheduled runs, have each shard store its test durations so future splits can balance by time (LPT).
+            # Each shard's file is uploaded as an artifact; merge them offline and commit the result as .test_durations at the repo root.
+            if [ "${INPUTS_IS_SCHEDULED_RUN}" == "true" ]; then
+              SPLIT_ARGS="${SPLIT_ARGS} --store-durations --durations-path=.test_durations.${INPUTS_WORKER_GROUP}"
+            fi
           else
             SPLIT_ARGS=""
           fi
@@ -195,3 +200,11 @@ jobs:
           # If scheduled, upload to scheduled flag only. If PR, upload to regular flag only.
           flags: ${{ inputs.is_scheduled_run == 'true' && 'scheduled' || 'regular' }}
           verbose: true
+      - name: Upload test durations artifact
+        if: ${{ inputs.is_scheduled_run == 'true' && inputs.total_workers > 1 }}
+        uses: actions/upload-artifact@v4
+        continue-on-error: true
+        with:
+          name: test-durations-${{ inputs.flavor }}-${{ inputs.worker_group }}
+          path: .test_durations.*
+          if-no-files-found: ignore
diff --git a/.github/workflows/run_tests_coordinator.yml b/.github/workflows/run_tests_coordinator.yml
@@ -57,7 +57,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        worker_group: ${{ fromJSON(contains(inputs.flavor, 'cpu-unit') && '[1, 2]' || '[1]') }}
+        worker_group: ${{ fromJSON(contains(inputs.flavor, 'cpu-unit') && '[1, 2, 3, 4]' || '[1]') }}
 
     uses: ./.github/workflows/run_tests_against_package.yml
     with:
@@ -148,5 +148,5 @@ jobs:
       is_scheduled_run: ${{ inputs.is_scheduled_run }}
       maxtext_installed: ${{ inputs.maxtext_installed }}
       worker_group: ${{ matrix.worker_group }}
-      total_workers: ${{ contains(inputs.flavor, 'cpu-unit') && 2 || 1 }}
+      total_workers: ${{ contains(inputs.flavor, 'cpu-unit') && 4 || 1 }}
       maxtext_sha: ${{ inputs.maxtext_sha }}
diff --git a/tests/unit/train_compile_test.py b/tests/unit/train_compile_test.py
@@ -626,8 +626,8 @@ def test_moe_deepseek_pipeline_subset(self):
             "",
             get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
-            "compile_topology=v5p-64",
-            "compile_topology_num_slices=8",
+            "compile_topology=v5p-8",
+            "compile_topology_num_slices=2",
             "use_iota_embed=true",
             "model_name=deepseek3-test",
             "megablox=True",
@@ -636,8 +636,8 @@ def test_moe_deepseek_pipeline_subset(self):
             "per_device_batch_size=1",
             "max_target_length=1024",
             "pipeline_parallel_layers=56",
-            "ici_expert_parallelism=16",
-            "dcn_pipeline_parallelism=8",
+            "ici_expert_parallelism=8",
+            "dcn_pipeline_parallelism=2",
         )
     )
 
@@ -669,22 +669,22 @@ def test_moe_llama4_17b_16e(self):
             "",
             get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
-            "compile_topology=v5p-128",
+            "compile_topology=v5p-16",
             "compile_topology_num_slices=1",
             "model_name=llama4-17b-16e",
             "per_device_batch_size=1",
             "max_target_length=1024",
             "dtype=bfloat16",
             "weight_dtype=bfloat16",
             "scan_layers=True",
-            "ici_fsdp_parallelism=16",
-            "ici_tensor_parallelism=4",
+            "ici_fsdp_parallelism=4",
+            "ici_tensor_parallelism=2",
+            "ici_expert_parallelism=2",
         )
     )
 
-  @pytest.mark.cpu_only
-  def test_moe_gpt_oss_20b_sparse_matmul(self):
-    compiled_trainstep_file = "/tmp/test_moe_gpt_oss_20b_sparse_matmul.pickle"
+  def _run_moe_gpt_oss_20b(self, suffix, matmul_args):
+    compiled_trainstep_file = f"/tmp/test_moe_gpt_oss_20b_{suffix}.pickle"
     train_compile_main(
         (
             "",
@@ -698,33 +698,18 @@ def test_moe_gpt_oss_20b_sparse_matmul(self):
             "dtype=bfloat16",
             "weight_dtype=bfloat16",
             "scan_layers=True",
-            "sparse_matmul=True",
-            "megablox=True",
             "attention=flash",
+            *matmul_args,
         )
     )
 
+  @pytest.mark.cpu_only
+  def test_moe_gpt_oss_20b_sparse_matmul(self):
+    self._run_moe_gpt_oss_20b("sparse_matmul", ["sparse_matmul=True", "megablox=True"])
+
   @pytest.mark.cpu_only
   def test_moe_gpt_oss_20b_dense_matmul(self):
-    compiled_trainstep_file = "/tmp/test_moe_gpt_oss_20b_dense_matmul.pickle"
-    train_compile_main(
-        (
-            "",
-            get_test_config_path(),
-            f"compiled_trainstep_file={compiled_trainstep_file}",
-            "compile_topology=v5p-16",
-            "compile_topology_num_slices=1",
-            "model_name=gpt-oss-20b",
-            "per_device_batch_size=1",
-            "max_target_length=1024",
-            "dtype=bfloat16",
-            "weight_dtype=bfloat16",
-            "scan_layers=True",
-            "sparse_matmul=False",
-            "capacity_factor=-1",
-            "attention=flash",
-        )
-    )
+    self._run_moe_gpt_oss_20b("dense_matmul", ["sparse_matmul=False", "capacity_factor=-1"])
 
   @pytest.mark.cpu_only
   def test_gpt3_6b(self):
@@ -766,7 +751,7 @@ def test_qwen3_next(self):
             "",
             get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
-            "compile_topology=v5p-256",
+            "compile_topology=v5p-8",
             "compile_topology_num_slices=1",
             "model_name=qwen3-next-80b-a3b",
             "per_device_batch_size=1",