Skip to content

Commit 88a9214

Browse files
ci: fix GPU unit and integration test failures on multi-GPU runners
1 parent d892d9c commit 88a9214

1 file changed

Lines changed: 7 additions & 0 deletions

File tree

.github/workflows/run_tests_against_package.yml

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -91,6 +91,8 @@ jobs:
9191
|| (inputs.device_type == 'cpu' && 'tpu' || inputs.device_type)
9292
}}
9393
ALLOW_MULTIPLE_LIBTPU_LOAD: ${{ inputs.device_type == 'cpu' && 'true' || '' }} # bypass /tmp/libtpu_lockfile check for cpu tests, which don't actually use accelerators (to allow concurrency)
94+
NCCL_P2P_DISABLE: ${{ inputs.device_type == 'cuda12' && '1' || '' }}
95+
NCCL_IB_DISABLE: ${{ inputs.device_type == 'cuda12' && '1' || '' }}
9496
options: ${{ inputs.container_resource_option }}
9597
steps:
9698
- name: Checkout MaxText
@@ -172,6 +174,11 @@ jobs:
172174
done
173175
fi
174176
fi
177+
# Restrict GPU unit tests to a single GPU to avoid multi-device NCCL failures
178+
if [[ "${INPUTS_PYTEST_MARKER}" == *"not integration_test"* ]]; then
179+
export CUDA_VISIBLE_DEVICES=0
180+
echo "Restricting GPU unit tests to a single GPU: CUDA_VISIBLE_DEVICES=0"
181+
fi
175182
fi
176183
if [ "${INPUTS_TOTAL_WORKERS}" -gt 1 ]; then
177184
$PYTHON_EXE -m pip install --quiet pytest-split pytest-xdist

0 commit comments

Comments
 (0)