Skip to content

Commit ddd95ac

Browse files
sbryngelson and claude committed
Use nick-fields/retry for Frontier builds; reduce -j to 4
Move build retry logic from shell scripts to GHA using nick-fields/retry with 60s backoff between attempts. This gives better visibility into retries and lets login node memory pressure subside between attempts. Also reduce build parallelism from -j 8 to -j 4 to lower peak memory on shared Frontier login nodes, and remove the outdated Node 16 version overrides from self-hosted runner env.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 475caa3 commit ddd95ac

4 files changed

Lines changed: 27 additions & 82 deletions

File tree

.github/workflows/bench.yml

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -86,9 +86,6 @@ jobs:
8686
group: ${{ matrix.group }}
8787
labels: ${{ matrix.labels }}
8888
timeout-minutes: 480
89-
env:
90-
ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
91-
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
9289
steps:
9390
- name: Clone - PR
9491
uses: actions/checkout@v4
@@ -104,10 +101,14 @@ jobs:
104101

105102
- name: Setup & Build
106103
if: matrix.build_script != ''
107-
run: |
108-
(cd pr && ${{ matrix.build_script }}) &
109-
(cd master && ${{ matrix.build_script }}) &
110-
wait %1 && wait %2
104+
uses: nick-fields/retry@v3
105+
with:
106+
max_attempts: 3
107+
retry_wait_seconds: 60
108+
command: |
109+
(cd pr && ${{ matrix.build_script }}) &
110+
(cd master && ${{ matrix.build_script }}) &
111+
wait %1 && wait %2
111112
112113
- name: Bench (Master v. PR)
113114
run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}

.github/workflows/frontier/build.sh

Lines changed: 7 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -18,39 +18,10 @@ fi
1818

1919
. ./mfc.sh load -c f -m g
2020

21-
max_attempts=3
22-
attempt=1
23-
while [ $attempt -le $max_attempts ]; do
24-
echo "Build attempt $attempt of $max_attempts..."
25-
if [ "$run_bench" == "bench" ]; then
26-
build_cmd_ok=true
27-
for dir in benchmarks/*/; do
28-
dirname=$(basename "$dir")
29-
if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
30-
build_cmd_ok=false
31-
break
32-
fi
33-
done
34-
else
35-
if ./mfc.sh test -v -a --dry-run --rdma-mpi -j 8 $build_opts; then
36-
build_cmd_ok=true
37-
else
38-
build_cmd_ok=false
39-
fi
40-
fi
41-
42-
if [ "$build_cmd_ok" = true ]; then
43-
echo "Build succeeded on attempt $attempt."
44-
exit 0
45-
fi
46-
47-
if [ $attempt -lt $max_attempts ]; then
48-
echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
49-
./mfc.sh clean
50-
sleep 30
51-
fi
52-
attempt=$((attempt + 1))
53-
done
54-
55-
echo "Build failed after $max_attempts attempts."
56-
exit 1
21+
if [ "$run_bench" == "bench" ]; then
22+
for dir in benchmarks/*/; do
23+
./mfc.sh run -v "$dir/case.py" --case-optimization -j 4 --dry-run $build_opts
24+
done
25+
else
26+
./mfc.sh test -v -a --dry-run --rdma-mpi -j 4 $build_opts
27+
fi

.github/workflows/frontier_amd/build.sh

Lines changed: 7 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -18,39 +18,10 @@ fi
1818

1919
. ./mfc.sh load -c famd -m g
2020

21-
max_attempts=3
22-
attempt=1
23-
while [ $attempt -le $max_attempts ]; do
24-
echo "Build attempt $attempt of $max_attempts..."
25-
if [ "$run_bench" == "bench" ]; then
26-
build_cmd_ok=true
27-
for dir in benchmarks/*/; do
28-
dirname=$(basename "$dir")
29-
if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
30-
build_cmd_ok=false
31-
break
32-
fi
33-
done
34-
else
35-
if ./mfc.sh test -v -a --dry-run -j 8 $build_opts; then
36-
build_cmd_ok=true
37-
else
38-
build_cmd_ok=false
39-
fi
40-
fi
41-
42-
if [ "$build_cmd_ok" = true ]; then
43-
echo "Build succeeded on attempt $attempt."
44-
exit 0
45-
fi
46-
47-
if [ $attempt -lt $max_attempts ]; then
48-
echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
49-
./mfc.sh clean
50-
sleep 30
51-
fi
52-
attempt=$((attempt + 1))
53-
done
54-
55-
echo "Build failed after $max_attempts attempts."
56-
exit 1
21+
if [ "$run_bench" == "bench" ]; then
22+
for dir in benchmarks/*/; do
23+
./mfc.sh run -v "$dir/case.py" --case-optimization -j 4 --dry-run $build_opts
24+
done
25+
else
26+
./mfc.sh test -v -a --dry-run -j 4 $build_opts
27+
fi

.github/workflows/test.yml

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -245,15 +245,17 @@ jobs:
245245
labels: ${{ matrix.runner }}
246246
env:
247247
NODE_OPTIONS: ${{ matrix.cluster == 'phoenix' && '--max-old-space-size=2048' || '' }}
248-
ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
249-
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
250248
steps:
251249
- name: Clone
252250
uses: actions/checkout@v4
253251

254252
- name: Build
255253
if: matrix.cluster != 'phoenix'
256-
run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
254+
uses: nick-fields/retry@v3
255+
with:
256+
max_attempts: 3
257+
retry_wait_seconds: 60
258+
command: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
257259

258260
- name: Test
259261
run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.shard }}

0 commit comments

Comments
 (0)