Skip to content

Commit 4b82901

Browse files
committed
Switch from pytest-split to pytest-xdist for parallel test execution
Previously, CPU tests were distributed across multiple runners using pytest-split, which assigns the same number of tests to each worker. However, since tests have different runtimes, some workers finish quickly and stand idle while others take a long time, so the workers are not fully utilized. This change adds pytest-xdist so that, within each runner, tests are assigned to worker processes dynamically (pytest-split is still used to split tests statically across runners).
1 parent c2574ab commit 4b82901

3 files changed

Lines changed: 12 additions & 4 deletions

File tree

.github/workflows/build_and_test_maxtext.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ jobs:
5151
fail-fast: false # don't cancel all jobs on failure
5252
matrix:
5353
image_type: ["py312"]
54-
worker_group: [1, 2, 3, 4]
54+
worker_group: [1, 2]
5555
with:
5656
device_type: cpu
5757
device_name: X64
@@ -63,7 +63,7 @@ jobs:
6363
container_resource_option: "--privileged"
6464
is_scheduled_run: ${{ github.event_name == 'schedule' }}
6565
worker_group: ${{ matrix.worker_group }}
66-
total_workers: 4
66+
total_workers: 2
6767

6868
maxtext_tpu_unit_tests:
6969
needs: build_and_upload_maxtext_package

.github/workflows/run_tests_against_package.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ jobs:
7171
TF_FORCE_GPU_ALLOW_GROWTH: ${{ inputs.tf_force_gpu_allow_growth }}
7272
TPU_SKIP_MDS_QUERY: ${{ inputs.device_type == 'cpu' && '1' || '' }}
7373
MAXTEXT_PACKAGE_EXTRA: ${{ inputs.device_type == 'cpu' && 'tpu' || inputs.device_type }}
74+
ALLOW_MULTIPLE_LIBTPU_LOAD: ${{ inputs.device_type == 'cpu' && 'true' || '' }} # bypass /tmp/libtpu_lockfile check for cpu tests, which don't actually use accelerators (to allow concurrency)
7475
options: ${{ inputs.container_resource_option }}
7576
steps:
7677
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -107,6 +108,7 @@ jobs:
107108
if [ "${{ inputs.device_type }}" != "cuda12" ]; then
108109
export LIBTPU_INIT_ARGS='--xla_tpu_scoped_vmem_limit_kib=65536'
109110
fi
111+
# Use pytest-split to statically split tests across runners, and pytest-xdist to dynamically split across processes within each runner
112+
[ "${{ inputs.total_workers }}" -gt 1 ] && .venv/bin/python3 -m pip install --quiet pytest-split pytest-xdist && SPLIT_ARGS="--splits ${{ inputs.total_workers }} --group ${{ inputs.worker_group }} -n auto" || SPLIT_ARGS=""
110113
# TODO: Fix the skipped tests and remove the deselect flags
111-
[ "${{ inputs.total_workers }}" -gt 1 ] && .venv/bin/python3 -m pip install --quiet pytest-split && SPLIT_ARGS="--splits ${{ inputs.total_workers }} --group ${{ inputs.worker_group }}" || SPLIT_ARGS=""
112114
.venv/bin/python3 -m pytest ${{ inputs.pytest_addopts }} -v -m "${FINAL_PYTEST_MARKER}" --durations=0 --deselect "tests/tokenizer_test.py::TokenizerTest::test_detokenize" $SPLIT_ARGS

tests/grain_data_processing_test.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ def setUp(self):
6868
)
6969
self.train_iter = _grain_data_processing.make_grain_train_iterator(self.config, self.mesh, self.process_indices)
7070

71+
@pytest.mark.cpu_only
7172
def test_train_ds(self):
7273
expected_shape = [jax.device_count(), self.config.max_target_length]
7374
# For training we pack multiple short examples in one example.
@@ -84,7 +85,7 @@ def test_train_ds(self):
8485
"targets_segmentation": expected_shape,
8586
},
8687
)
87-
88+
@pytest.mark.cpu_only
8889
def test_batch_determinism(self):
8990
batch1 = next(self.train_iter)
9091
train_iter = _grain_data_processing.make_grain_train_iterator(self.config, self.mesh, self.process_indices)
@@ -96,6 +97,7 @@ def test_batch_determinism(self):
9697
self.assertTrue((batch1["inputs_position"] == batch2["inputs_position"]).all())
9798
self.assertTrue((batch1["targets_position"] == batch2["targets_position"]).all())
9899

100+
@pytest.mark.cpu_only
99101
def test_for_loop_repeatable(self):
100102
def get_first_batch(iterator):
101103
batch = None
@@ -223,6 +225,7 @@ def setUp(self):
223225
"and it affects batch determinism at first."
224226
)
225227
)
228+
@pytest.mark.cpu_only
226229
def test_batch_determinism(self):
227230
super().test_batch_determinism()
228231

@@ -264,6 +267,7 @@ def setUp(self):
264267
)
265268
self.train_iter = _grain_data_processing.make_grain_train_iterator(self.config, self.mesh, self.process_indices)
266269

270+
@pytest.mark.cpu_only
267271
def test_train_ds(self):
268272
expected_shape = [jax.device_count(), self.config.max_target_length]
269273
# For training we pack multiple short examples in one example.
@@ -281,6 +285,7 @@ def test_train_ds(self):
281285
},
282286
)
283287

288+
@pytest.mark.cpu_only
284289
def test_batch_determinism(self):
285290
batch1 = next(self.train_iter)
286291
train_iter = _grain_data_processing.make_grain_train_iterator(self.config, self.mesh, self.process_indices)
@@ -292,6 +297,7 @@ def test_batch_determinism(self):
292297
self.assertTrue((batch1["inputs_position"] == batch2["inputs_position"]).all())
293298
self.assertTrue((batch1["targets_position"] == batch2["targets_position"]).all())
294299

300+
@pytest.mark.cpu_only
295301
def test_for_loop_repeatable(self):
296302
def get_first_batch(iterator):
297303
batch = None

0 commit comments

Comments
 (0)