Skip to content

Commit c37c57f

Browse files
sbryngelson and claude committed
Add test sharding, proactive clean, and retry logic for self-hosted CI
- Shard Frontier GPU tests into 2 parts for faster parallel execution
- Add proactive ./mfc.sh clean in Phoenix test scripts to prevent cross-compiler contamination from stale build artifacts
- Add --requeue to Phoenix SLURM jobs for preemption recovery
- Add lint-gate job that must pass before self-hosted tests run
- Add retry logic for GitHub runner tests (retry only when <=5 tests fail)
- Add Frontier AMD test support with dedicated submit/test scripts
- Restructure self-hosted matrix with explicit cluster names

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 4ff0bef commit c37c57f

7 files changed

Lines changed: 85 additions & 17 deletions

File tree

.github/workflows/frontier/submit.sh

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,13 @@ output_file="$job_slug.out"
3434
submit_output=$(sbatch <<EOT
3535
#!/bin/bash
3636
#SBATCH -J MFC-$job_slug # Job name
37-
#SBATCH -A ENG160 # charge account
37+
#SBATCH -A CFD154 # charge account
3838
#SBATCH -N 1 # Number of nodes required
3939
$sbatch_device_opts
40-
#SBATCH -t 05:59:00 # Duration of the job (Ex: 15 mins)
40+
#SBATCH -t 01:59:00 # Duration of the job
4141
#SBATCH -o$output_file # Combined output and error messages file
42-
#SBATCH -p extended # Extended partition for shorter queues
42+
#SBATCH -p batch # Batch partition (concurrent jobs)
43+
#SBATCH --qos=hackathon # Hackathon QOS for batch access
4344
4445
set -e
4546
set -x
@@ -50,6 +51,7 @@ echo "Running in $(pwd):"
5051
job_slug="$job_slug"
5152
job_device="$2"
5253
job_interface="$3"
54+
job_shard="$4"
5355
5456
. ./mfc.sh load -c f -m $([ "$2" = "gpu" ] && echo "g" || echo "c")
5557

.github/workflows/frontier/test.sh

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,13 @@ if [ "$job_device" = "gpu" ]; then
1313
fi
1414
fi
1515

16+
shard_opts=""
17+
if [ -n "$job_shard" ]; then
18+
shard_opts="--shard $job_shard"
19+
fi
20+
1621
if [ "$job_device" = "gpu" ]; then
17-
./mfc.sh test -v -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts -- -c frontier
22+
./mfc.sh test -v -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts $shard_opts -- -c frontier
1823
else
1924
./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu -- -c frontier
2025
fi

.github/workflows/frontier_amd/submit.sh

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,13 @@ output_file="$job_slug.out"
3434
submit_output=$(sbatch <<EOT
3535
#!/bin/bash
3636
#SBATCH -J MFC-$job_slug # Job name
37-
#SBATCH -A ENG160 # charge account
37+
#SBATCH -A CFD154 # charge account
3838
#SBATCH -N 1 # Number of nodes required
3939
$sbatch_device_opts
40-
#SBATCH -t 05:59:00 # Duration of the job (Ex: 15 mins)
40+
#SBATCH -t 01:59:00 # Duration of the job
4141
#SBATCH -o$output_file # Combined output and error messages file
42-
#SBATCH -p extended # Extended partition for shorter queues
42+
#SBATCH -p batch # Batch partition (concurrent jobs)
43+
#SBATCH --qos=hackathon # Hackathon QOS for batch access
4344
4445
set -e
4546
set -x
@@ -50,6 +51,7 @@ echo "Running in $(pwd):"
5051
job_slug="$job_slug"
5152
job_device="$2"
5253
job_interface="$3"
54+
job_shard="$4"
5355
5456
. ./mfc.sh load -c famd -m $([ "$2" = "gpu" ] && echo "g" || echo "c")
5557

.github/workflows/frontier_amd/test.sh

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,13 @@ if [ "$job_device" = "gpu" ]; then
1313
fi
1414
fi
1515

16+
shard_opts=""
17+
if [ -n "$job_shard" ]; then
18+
shard_opts="--shard $job_shard"
19+
fi
20+
1621
if [ "$job_device" = "gpu" ]; then
17-
./mfc.sh test -v -a --max-attempts 3 -j $ngpus $device_opts -- -c frontier_amd
22+
./mfc.sh test -v -a --max-attempts 3 -j $ngpus $device_opts $shard_opts -- -c frontier_amd
1823
else
1924
./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu -- -c frontier_amd
2025
fi

.github/workflows/phoenix/submit.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ submit_output=$(sbatch <<EOT
4848
$sbatch_device_opts
4949
#SBATCH -t 03:00:00 # Duration of the job (Ex: 15 mins)
5050
#SBATCH -q embers # QOS Name
51+
#SBATCH --requeue # Auto-requeue on preemption
5152
#SBATCH -o$output_file # Combined output and error messages file
5253
5354
set -e

.github/workflows/phoenix/test.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
#!/bin/bash
22

3+
# Clean stale build artifacts from previous CI runs to prevent
4+
# cross-compiler contamination (e.g. gfortran LAPACK linked by NVHPC)
5+
./mfc.sh clean
6+
37
build_opts=""
48
if [ "$job_device" = "gpu" ]; then
59
build_opts="--gpu"

.github/workflows/test.yml

Lines changed: 58 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -137,14 +137,33 @@ jobs:
137137
TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
138138

139139
- name: Test
140-
run: |
141-
/bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT
140+
run: |
141+
rm -f tests/failed_uuids.txt
142+
TEST_EXIT=0
143+
/bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || TEST_EXIT=$?
144+
145+
# Retry only if a small number of tests failed (sporadic failures)
146+
if [ -f tests/failed_uuids.txt ]; then
147+
NUM_FAILED=$(wc -l < tests/failed_uuids.txt)
148+
if [ "$NUM_FAILED" -le 5 ]; then
149+
FAILED=$(cat tests/failed_uuids.txt | tr '\n' ' ')
150+
echo ""
151+
echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ==="
152+
echo ""
153+
/bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) --only $FAILED $TEST_ALL || exit $?
154+
else
155+
echo "Too many failures ($NUM_FAILED) to retry — likely a real issue."
156+
exit 1
157+
fi
158+
elif [ "$TEST_EXIT" -ne 0 ]; then
159+
exit $TEST_EXIT
160+
fi
142161
env:
143162
TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
144163
TEST_PCT: ${{ matrix.debug == 'debug' && '-% 20' || '' }}
145164

146165
self:
147-
name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
166+
name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }}${{ matrix.shard != '' && format(' [{0}]', matrix.shard) || '' }})"
148167
if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true'
149168
needs: [lint-gate, file-changes]
150169
continue-on-error: false
@@ -158,60 +177,90 @@ jobs:
158177
cluster_name: 'Georgia Tech | Phoenix'
159178
device: 'gpu'
160179
interface: 'acc'
180+
shard: ''
161181
- runner: 'gt'
162182
cluster: 'phoenix'
163183
cluster_name: 'Georgia Tech | Phoenix'
164184
device: 'gpu'
165185
interface: 'omp'
186+
shard: ''
166187
- runner: 'gt'
167188
cluster: 'phoenix'
168189
cluster_name: 'Georgia Tech | Phoenix'
169190
device: 'cpu'
170191
interface: 'none'
171-
# Frontier (ORNL) — build on login node, test via SLURM
192+
shard: ''
193+
# Frontier (ORNL) — build on login node, GPU tests sharded for batch partition
194+
- runner: 'frontier'
195+
cluster: 'frontier'
196+
cluster_name: 'Oak Ridge | Frontier'
197+
device: 'gpu'
198+
interface: 'acc'
199+
shard: '1/2'
172200
- runner: 'frontier'
173201
cluster: 'frontier'
174202
cluster_name: 'Oak Ridge | Frontier'
175203
device: 'gpu'
176204
interface: 'acc'
205+
shard: '2/2'
177206
- runner: 'frontier'
178207
cluster: 'frontier'
179208
cluster_name: 'Oak Ridge | Frontier'
180209
device: 'gpu'
181210
interface: 'omp'
211+
shard: '1/2'
212+
- runner: 'frontier'
213+
cluster: 'frontier'
214+
cluster_name: 'Oak Ridge | Frontier'
215+
device: 'gpu'
216+
interface: 'omp'
217+
shard: '2/2'
182218
- runner: 'frontier'
183219
cluster: 'frontier'
184220
cluster_name: 'Oak Ridge | Frontier'
185221
device: 'cpu'
186222
interface: 'none'
187-
# Frontier AMD — build on login node, test via SLURM
223+
shard: ''
224+
# Frontier AMD — build on login node, GPU tests sharded for batch partition
188225
- runner: 'frontier'
189226
cluster: 'frontier_amd'
190227
cluster_name: 'Oak Ridge | Frontier (AMD)'
191228
device: 'gpu'
192229
interface: 'omp'
230+
shard: '1/2'
231+
- runner: 'frontier'
232+
cluster: 'frontier_amd'
233+
cluster_name: 'Oak Ridge | Frontier (AMD)'
234+
device: 'gpu'
235+
interface: 'omp'
236+
shard: '2/2'
193237
- runner: 'frontier'
194238
cluster: 'frontier_amd'
195239
cluster_name: 'Oak Ridge | Frontier (AMD)'
196240
device: 'cpu'
197241
interface: 'none'
242+
shard: ''
198243
runs-on:
199244
group: phoenix
200245
labels: ${{ matrix.runner }}
201246
env:
202247
NODE_OPTIONS: ${{ matrix.cluster == 'phoenix' && '--max-old-space-size=2048' || '' }}
203-
ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
204-
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
205248
steps:
206249
- name: Clone
207250
uses: actions/checkout@v4
208251

209252
- name: Build
210253
if: matrix.cluster != 'phoenix'
211-
run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
254+
uses: nick-fields/retry@v3
255+
with:
256+
max_attempts: 3
257+
retry_wait_seconds: 60
258+
timeout_minutes: 480
259+
command: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
260+
on_retry_command: ./mfc.sh clean
212261

213262
- name: Test
214-
run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }}
263+
run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.shard }}
215264

216265
- name: Print Logs
217266
if: always()

0 commit comments

Comments
 (0)