Skip to content

Commit 34e3d62

Browse files
[QA] Add CP deterministic tests to L3 and support TE_PATH in FA test
Add deterministic CP test runs to L3 FA versions test. Support TE_PATH positional arg and fix GPU threshold for parallel test execution.

Signed-off-by: Sudhakar Singh <sudhakars@nvidia.com>
1 parent 2a49dee commit 34e3d62

2 files changed

Lines changed: 45 additions & 3 deletions

File tree

  • qa
    • L1_pytorch_distributed_unittest
    • L3_pytorch_FA_versions_test

qa/L1_pytorch_distributed_unittest/test.sh

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,7 @@ python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_torch_fsdp2.xml $TE_
3030
python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_comm_gemm_overlap.xml $TE_PATH/tests/pytorch/distributed/test_comm_gemm_overlap.py || test_fail "test_comm_gemm_overlap.py"
3131
python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops_with_userbuffers.xml $TE_PATH/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py || test_fail "test_fusible_ops_with_userbuffers.py"
3232
python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_attention_with_cp.xml $TE_PATH/tests/pytorch/attention/test_attention_with_cp.py || test_fail "test_attention_with_cp.py"
33+
NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_attention_deterministic_with_cp.xml $TE_PATH/tests/pytorch/attention/test_attention_with_cp.py || test_fail "NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 test_attention_with_cp.py"
3334
python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cp_utils.xml $TE_PATH/tests/pytorch/attention/test_cp_utils.py || test_fail "test_cp_utils.py"
3435
python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_to_fp8.xml $TE_PATH/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py || test_fail "test_cast_master_weights_to_fp8.py"
3536
python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_newton_schulz.xml $TE_PATH/tests/pytorch/distributed/test_newton_schulz.py || test_fail "test_newton_schulz.py"

qa/L3_pytorch_FA_versions_test/test.sh

Lines changed: 44 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,13 +2,25 @@
22
#
33
# See LICENSE for license information.
44

5-
set -e
5+
function error_exit() {
6+
echo "Error: $1"
7+
exit 1
8+
}
9+
10+
function test_fail() {
11+
RET=1
12+
FAILED_CASES="$FAILED_CASES $1"
13+
echo "Error: sub-test failed: $1"
14+
}
15+
16+
RET=0
17+
FAILED_CASES=""
618

719
: ${TE_PATH:=/opt/transformerengine}
820
: ${XML_LOG_DIR:=/logs}
921
mkdir -p "$XML_LOG_DIR"
1022

11-
pip3 install pytest==8.2.1
23+
pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
1224

1325
# Limit parallel build jobs to avoid overwhelming system resources
1426
export MAX_JOBS=32
@@ -41,6 +53,35 @@ do
4153
fi
4254

4355
# Run tests
44-
NVTE_TORCH_COMPILE=0 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest.xml $TE_PATH/tests/pytorch/attention/test_attention.py
56+
NUM_GPUS=$(nvidia-smi -L | wc -l)
57+
echo "Detected $NUM_GPUS GPU(s)"
58+
if [ "$NUM_GPUS" -ge 5 ]; then
59+
CP_NUM_GPUS=$(( NUM_GPUS - 1 > 4 ? 4 : NUM_GPUS - 1 ))
60+
CP_GPUS=$(seq -s, 1 $CP_NUM_GPUS)
61+
echo "Running tests in parallel: test_attention.py on GPU 0, test_attention_with_cp.py on GPUs $CP_GPUS ($CP_NUM_GPUS GPUs)"
62+
63+
CUDA_VISIBLE_DEVICES=0 NVTE_TORCH_COMPILE=0 python3 -m pytest -v -s \
64+
--junitxml=$XML_LOG_DIR/pytest.xml \
65+
$TE_PATH/tests/pytorch/attention/test_attention.py &
66+
PID_ATTN=$!
67+
68+
CUDA_VISIBLE_DEVICES=$CP_GPUS NVTE_TORCH_COMPILE=0 python3 -m pytest -v -s \
69+
--junitxml=$XML_LOG_DIR/pytest_test_attention_with_cp.xml \
70+
$TE_PATH/tests/pytorch/attention/test_attention_with_cp.py &
71+
PID_CP=$!
4572

73+
wait $PID_ATTN || test_fail "test_attention.py"
74+
wait $PID_CP || test_fail "test_attention_with_cp.py"
75+
else
76+
echo "Running tests sequentially: need >=5 GPUs for parallel execution (1 for test_attention + 4 for test_attention_with_cp)"
77+
NVTE_TORCH_COMPILE=0 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest.xml $TE_PATH/tests/pytorch/attention/test_attention.py || test_fail "test_attention.py"
78+
NVTE_TORCH_COMPILE=0 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_attention_with_cp.xml $TE_PATH/tests/pytorch/attention/test_attention_with_cp.py || test_fail "test_attention_with_cp.py"
79+
fi
4680
done
81+
82+
if [ "$RET" -ne 0 ]; then
83+
echo "Error in the following test cases:$FAILED_CASES"
84+
exit 1
85+
fi
86+
echo "All tests passed"
87+
exit 0

0 commit comments

Comments
 (0)