Skip to content

Commit af7d87c

Browse files
q10 authored and facebook-github-bot committed
Fix bash scripts to fail correctly for ROCm jobs (#5564)
Summary:
- Fix bash scripts to fail correctly for ROCm jobs

Differential Revision: D98951961
1 parent 38f11d1 commit af7d87c

2 files changed

Lines changed: 46 additions & 8 deletions

File tree

.github/scripts/fbgemm_gpu_test.bash

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,19 @@ run_python_test () {
6464
# shellcheck disable=SC2155
6565
local start=$(date +%s)
6666

67+
# NOTE: conda run in conda 26.x does not reliably propagate exit codes unless
68+
# the -- separator is used to explicitly delimit conda's own flags from the
69+
# command to execute.
6770
# shellcheck disable=SC2086
68-
if print_exec conda run --no-capture-output ${env_prefix} python -m pytest "${pytest_args[@]}" --cache-clear "${python_test_file}"; then
71+
echo "+ conda run --no-capture-output ${env_prefix} -- python -m pytest ${pytest_args[*]} --cache-clear ${python_test_file}"
72+
echo ""
73+
# shellcheck disable=SC2086
74+
conda run --no-capture-output ${env_prefix} -- python -m pytest "${pytest_args[@]}" --cache-clear "${python_test_file}"
75+
local retcode=$?
76+
echo "[TEST] Initial run exit code: ${retcode}"
77+
echo ""
78+
79+
if [ $retcode -eq 0 ]; then
6980
echo "[TEST] Python test suite PASSED: ${python_test_file}"
7081
local test_time=$(($(date +%s)-start))
7182
echo "[TEST] Python test time for ${python_test_file}: ${test_time} seconds"
@@ -75,7 +86,7 @@ run_python_test () {
7586
return 0
7687
fi
7788

78-
echo "[TEST] Some tests FAILED. Re-attempting only FAILED tests: ${python_test_file}"
89+
echo "[TEST] Some tests FAILED (exit code: ${retcode}). Re-attempting only FAILED tests: ${python_test_file}"
7990
echo ""
8091
echo ""
8192

@@ -84,8 +95,29 @@ run_python_test () {
8495
# suites, we only run tests that have failed in the previous round. This is
8596
# enabled by using the pytest cache and the --lf flag.
8697

87-
# shellcheck disable=SC2086
88-
if exec_with_retries 2 conda run --no-capture-output ${env_prefix} python -m pytest "${pytest_args[@]}" --lf --last-failed-no-failures none "${python_test_file}"; then
98+
local max_retries=2
99+
local retry_retcode=0
100+
for i in $(seq 0 ${max_retries}); do
101+
# shellcheck disable=SC2086
102+
echo "[EXEC] [ATTEMPT ${i}/${max_retries}] + conda run --no-capture-output ${env_prefix} -- python -m pytest ${pytest_args[*]} --lf --last-failed-no-failures none ${python_test_file}"
103+
# shellcheck disable=SC2086
104+
conda run --no-capture-output ${env_prefix} -- python -m pytest "${pytest_args[@]}" --lf --last-failed-no-failures none "${python_test_file}"
105+
retry_retcode=$?
106+
echo "[TEST] Retry run exit code: ${retry_retcode}"
107+
echo ""
108+
109+
if [ $retry_retcode -eq 0 ]; then
110+
break
111+
fi
112+
113+
echo "[EXEC] [ATTEMPT ${i}/${max_retries}] Command attempt failed (exit code: ${retry_retcode})."
114+
echo ""
115+
if [ "$i" -ne "$max_retries" ]; then
116+
sleep 2
117+
fi
118+
done
119+
120+
if [ $retry_retcode -eq 0 ]; then
89121
echo "[TEST] Python test suite PASSED with retries: ${python_test_file}"
90122
local test_time=$(($(date +%s)-start))
91123
echo "[TEST] Python test time with retries for ${python_test_file}: ${test_time} seconds"
@@ -257,7 +289,7 @@ __setup_fbgemm_gpu_test () {
257289
__run_fbgemm_gpu_tests_in_directory () {
258290
echo "################################################################################"
259291
# shellcheck disable=SC2154
260-
echo "# Run FBGEMM-GPU Tests: ${pwd}"
292+
echo "# Run FBGEMM-GPU Tests: $(pwd)"
261293
echo "#"
262294
echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}"
263295
echo "################################################################################"
@@ -269,6 +301,10 @@ __run_fbgemm_gpu_tests_in_directory () {
269301
echo "[TEST] Enumerating ALL test files ..."
270302
# shellcheck disable=SC2155
271303
local all_test_files=$(find . -type f -name '*_test.py' -print | sort)
304+
if [ -z "$all_test_files" ]; then
305+
echo "[ERROR] No test files (*_test.py) found in directory: $(pwd)"
306+
return 1
307+
fi
272308
for f in $all_test_files; do echo "$f"; done
273309
echo ""
274310

.github/scripts/nova_postscript.bash

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@
55
# This source code is licensed under the BSD-style license found in the
66
# LICENSE file in the root directory of this source tree.
77

8+
set -eo pipefail
9+
810
echo "[NOVA] Current working directory: $(pwd)"
9-
cd "${FBGEMM_REPO}" || echo "[NOVA] Failed to cd to ${FBGEMM_REPO}"
11+
cd "${FBGEMM_REPO}" || { echo "[NOVA] Failed to cd to ${FBGEMM_REPO}"; exit 1; }
1012
PRELUDE="${FBGEMM_REPO}/.github/scripts/setup_env.bash"
1113
BUILD_ENV_NAME=${CONDA_ENV}
1214
GITHUB_ENV=TRUE
@@ -46,8 +48,8 @@ echo "[NOVA] Time taken to install wheel: ${runtime} seconds"
4648

4749
# Test with PyTest
4850
$CONDA_RUN python3 -c "import torch; print('cuda.is_available() ', torch.cuda.is_available()); print ('device_count() ',torch.cuda.device_count());"
49-
cd "${FBGEMM_REPO}" || { echo "[NOVA] Failed to cd to ${FBGEMM_REPO} from $(pwd)"; };
50-
test_all_fbgemm_gpu_modules "${BUILD_ENV_NAME}"
51+
cd "${FBGEMM_REPO}" || { echo "[NOVA] Failed to cd to ${FBGEMM_REPO} from $(pwd)"; exit 1; };
52+
test_all_fbgemm_gpu_modules "${BUILD_ENV_NAME}" || exit 1
5153
end_time=$(date +%s)
5254
runtime=$((end_time-start_time))
5355
start_time=${end_time}

0 commit comments

Comments
 (0)