@@ -137,14 +137,33 @@ jobs:
137137 TEST_ALL : ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
138138
139139 - name : Test
140- run : |
141- /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT
140+ run : |
141+ rm -f tests/failed_uuids.txt
142+ TEST_EXIT=0
143+ /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || TEST_EXIT=$?
144+
145+ # Retry only if a small number of tests failed (sporadic failures)
146+ if [ -f tests/failed_uuids.txt ]; then
147+ NUM_FAILED=$(wc -l < tests/failed_uuids.txt)
148+ if [ "$NUM_FAILED" -le 5 ]; then
149+ FAILED=$(cat tests/failed_uuids.txt | tr '\n' ' ')
150+ echo ""
151+ echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ==="
152+ echo ""
153+ /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) --only $FAILED $TEST_ALL || exit $?
154+ else
155+ echo "Too many failures ($NUM_FAILED) to retry — likely a real issue."
156+ exit 1
157+ fi
158+ elif [ "$TEST_EXIT" -ne 0 ]; then
159+ exit $TEST_EXIT
160+ fi
142161 env :
143162 TEST_ALL : ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
144163 TEST_PCT : ${{ matrix.debug == 'debug' && '-% 20' || '' }}
145164
146165 self :
147- name : " ${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
166+ name : " ${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }}${{ matrix.shard != '' && format(' [{0}]', matrix.shard) || '' }} )"
148167 if : github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true'
149168 needs : [lint-gate, file-changes]
150169 continue-on-error : false
@@ -158,60 +177,90 @@ jobs:
158177 cluster_name : ' Georgia Tech | Phoenix'
159178 device : ' gpu'
160179 interface : ' acc'
180+ shard : ' '
161181 - runner : ' gt'
162182 cluster : ' phoenix'
163183 cluster_name : ' Georgia Tech | Phoenix'
164184 device : ' gpu'
165185 interface : ' omp'
186+ shard : ' '
166187 - runner : ' gt'
167188 cluster : ' phoenix'
168189 cluster_name : ' Georgia Tech | Phoenix'
169190 device : ' cpu'
170191 interface : ' none'
171- # Frontier (ORNL) — build on login node, test via SLURM
192+ shard : ' '
193+ # Frontier (ORNL) — build on login node, GPU tests sharded for batch partition
194+ - runner : ' frontier'
195+ cluster : ' frontier'
196+ cluster_name : ' Oak Ridge | Frontier'
197+ device : ' gpu'
198+ interface : ' acc'
199+ shard : ' 1/2'
172200 - runner : ' frontier'
173201 cluster : ' frontier'
174202 cluster_name : ' Oak Ridge | Frontier'
175203 device : ' gpu'
176204 interface : ' acc'
205+ shard : ' 2/2'
177206 - runner : ' frontier'
178207 cluster : ' frontier'
179208 cluster_name : ' Oak Ridge | Frontier'
180209 device : ' gpu'
181210 interface : ' omp'
211+ shard : ' 1/2'
212+ - runner : ' frontier'
213+ cluster : ' frontier'
214+ cluster_name : ' Oak Ridge | Frontier'
215+ device : ' gpu'
216+ interface : ' omp'
217+ shard : ' 2/2'
182218 - runner : ' frontier'
183219 cluster : ' frontier'
184220 cluster_name : ' Oak Ridge | Frontier'
185221 device : ' cpu'
186222 interface : ' none'
187- # Frontier AMD — build on login node, test via SLURM
223+ shard : ' '
224+ # Frontier AMD — build on login node, GPU tests sharded for batch partition
188225 - runner : ' frontier'
189226 cluster : ' frontier_amd'
190227 cluster_name : ' Oak Ridge | Frontier (AMD)'
191228 device : ' gpu'
192229 interface : ' omp'
230+ shard : ' 1/2'
231+ - runner : ' frontier'
232+ cluster : ' frontier_amd'
233+ cluster_name : ' Oak Ridge | Frontier (AMD)'
234+ device : ' gpu'
235+ interface : ' omp'
236+ shard : ' 2/2'
193237 - runner : ' frontier'
194238 cluster : ' frontier_amd'
195239 cluster_name : ' Oak Ridge | Frontier (AMD)'
196240 device : ' cpu'
197241 interface : ' none'
242+ shard : ' '
198243 runs-on :
199244 group : phoenix
200245 labels : ${{ matrix.runner }}
201246 env :
202247 NODE_OPTIONS : ${{ matrix.cluster == 'phoenix' && '--max-old-space-size=2048' || '' }}
203- ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION : node16
204- ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION : true
205248 steps :
206249 - name : Clone
207250 uses : actions/checkout@v4
208251
209252 - name : Build
210253 if : matrix.cluster != 'phoenix'
211- run : bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
254+ uses : nick-fields/retry@v3
255+ with :
256+ max_attempts : 3
257+ retry_wait_seconds : 60
258+ timeout_minutes : 480
259+ command : bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
260+ on_retry_command : ./mfc.sh clean
212261
213262 - name : Test
214- run : bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }}
263+ run : bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.shard }}
215264
216265 - name : Print Logs
217266 if : always()
0 commit comments