File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 9191 || (inputs.device_type == 'cpu' && 'tpu' || inputs.device_type)
9292 }}
9393 ALLOW_MULTIPLE_LIBTPU_LOAD : ${{ inputs.device_type == 'cpu' && 'true' || '' }} # bypass /tmp/libtpu_lockfile check for cpu tests, which don't actually use accelerators (to allow concurrency)
94+ NCCL_P2P_DISABLE : ${{ inputs.device_type == 'cuda12' && '1' || '' }}
95+ NCCL_IB_DISABLE : ${{ inputs.device_type == 'cuda12' && '1' || '' }}
9496 options : ${{ inputs.container_resource_option }}
9597 steps :
9698 - name : Checkout MaxText
@@ -172,6 +174,11 @@ jobs:
172174 done
173175 fi
174176 fi
177+ # Restrict GPU unit tests to a single GPU to avoid multi-device NCCL failures: when INPUTS_PYTEST_MARKER contains "not integration_test", export CUDA_VISIBLE_DEVICES=0 and log it; otherwise this block leaves CUDA_VISIBLE_DEVICES untouched
178+ if [[ "${INPUTS_PYTEST_MARKER}" == *"not integration_test"* ]]; then
179+ export CUDA_VISIBLE_DEVICES=0
180+ echo "Restricting GPU unit tests to a single GPU: CUDA_VISIBLE_DEVICES=0"
181+ fi
175182 fi
176183 if [ "${INPUTS_TOTAL_WORKERS}" -gt 1 ]; then
177184 $PYTHON_EXE -m pip install --quiet pytest-split pytest-xdist
You can’t perform that action at this time.
0 commit comments