Skip to content

Commit f101c40

Browse files
q10 and facebook-github-bot
authored and committed
Fix bash scripts to fail correctly for ROCm jobs (#5564)
Summary:
- Fix bash scripts to fail correctly for ROCm jobs

Differential Revision: D98951961
1 parent e93e423 commit f101c40

2 files changed

Lines changed: 35 additions & 5 deletions

File tree

.github/scripts/fbgemm_gpu_test.bash

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,18 @@ run_python_test () {
6464
# shellcheck disable=SC2155
6565
local start=$(date +%s)
6666

67+
# Run pytest directly (not through eval) so that the exit code from conda run
68+
# is captured reliably. This is defense-in-depth against conda run + eval
69+
# silently swallowing non-zero exit codes.
6770
# shellcheck disable=SC2086
68-
if print_exec conda run --no-capture-output ${env_prefix} python -m pytest "${pytest_args[@]}" --cache-clear "${python_test_file}"; then
71+
echo "+ conda run --no-capture-output ${env_prefix} python -m pytest ${pytest_args[*]} --cache-clear ${python_test_file}"
72+
echo ""
73+
# shellcheck disable=SC2086
74+
conda run --no-capture-output ${env_prefix} python -m pytest "${pytest_args[@]}" --cache-clear "${python_test_file}"
75+
local retcode=$?
76+
echo ""
77+
78+
if [ $retcode -eq 0 ]; then
6979
echo "[TEST] Python test suite PASSED: ${python_test_file}"
7080
local test_time=$(($(date +%s)-start))
7181
echo "[TEST] Python test time for ${python_test_file}: ${test_time} seconds"
@@ -75,7 +85,7 @@ run_python_test () {
7585
return 0
7686
fi
7787

78-
echo "[TEST] Some tests FAILED. Re-attempting only FAILED tests: ${python_test_file}"
88+
echo "[TEST] Some tests FAILED (exit code: ${retcode}). Re-attempting only FAILED tests: ${python_test_file}"
7989
echo ""
8090
echo ""
8191

@@ -84,8 +94,28 @@ run_python_test () {
8494
# suites, we only run tests that have failed in the previous round. This is
8595
# enabled by using the pytest cache and the --lf flag.
8696

87-
# shellcheck disable=SC2086
88-
if exec_with_retries 2 conda run --no-capture-output ${env_prefix} python -m pytest "${pytest_args[@]}" --lf --last-failed-no-failures none "${python_test_file}"; then
97+
local max_retries=2
98+
local retry_retcode=0
99+
for i in $(seq 0 ${max_retries}); do
100+
# shellcheck disable=SC2086
101+
echo "[EXEC] [ATTEMPT ${i}/${max_retries}] + conda run --no-capture-output ${env_prefix} python -m pytest ${pytest_args[*]} --lf --last-failed-no-failures none ${python_test_file}"
102+
# shellcheck disable=SC2086
103+
conda run --no-capture-output ${env_prefix} python -m pytest "${pytest_args[@]}" --lf --last-failed-no-failures none "${python_test_file}"
104+
retry_retcode=$?
105+
echo ""
106+
107+
if [ $retry_retcode -eq 0 ]; then
108+
break
109+
fi
110+
111+
echo "[EXEC] [ATTEMPT ${i}/${max_retries}] Command attempt failed (exit code: ${retry_retcode})."
112+
echo ""
113+
if [ "$i" -ne "$max_retries" ]; then
114+
sleep 2
115+
fi
116+
done
117+
118+
if [ $retry_retcode -eq 0 ]; then
89119
echo "[TEST] Python test suite PASSED with retries: ${python_test_file}"
90120
local test_time=$(($(date +%s)-start))
91121
echo "[TEST] Python test time with retries for ${python_test_file}: ${test_time} seconds"

.github/scripts/nova_postscript.bash

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ echo "[NOVA] Time taken to install wheel: ${runtime} seconds"
4747
# Test with PyTest
4848
$CONDA_RUN python3 -c "import torch; print('cuda.is_available() ', torch.cuda.is_available()); print ('device_count() ',torch.cuda.device_count());"
4949
cd "${FBGEMM_REPO}" || { echo "[NOVA] Failed to cd to ${FBGEMM_REPO} from $(pwd)"; };
50-
test_all_fbgemm_gpu_modules "${BUILD_ENV_NAME}"
50+
test_all_fbgemm_gpu_modules "${BUILD_ENV_NAME}" || exit 1
5151
end_time=$(date +%s)
5252
runtime=$((end_time-start_time))
5353
start_time=${end_time}

0 commit comments

Comments
 (0)