Skip to content

Commit 9ef978f

Browse files
sbryngelsonclaude
andcommitted
Harden benchmark workflow: retry builds, proactive clean, robust monitoring
- Wrap bench builds in nick-fields/retry with 3 attempts and automatic ./mfc.sh clean between retries - Add proactive ./mfc.sh clean at start of all build scripts to prevent cross-compiler contamination from stale artifacts on persistent runners - Improve monitor_slurm_job.sh with better state detection and heartbeats - Add concurrency group to prevent duplicate bench runs per branch - Reduce timeout from 1400 to 480 minutes Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 4ff0bef commit 9ef978f

6 files changed

Lines changed: 136 additions & 190 deletions

File tree

.github/scripts/monitor_slurm_job.sh

Lines changed: 88 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,17 @@
44

55
set -euo pipefail
66

7-
# Cleanup handler to prevent orphaned tail processes
7+
# Cleanup handler to prevent orphaned tail processes and cancel orphaned jobs
88
cleanup() {
99
if [ -n "${tail_pid:-}" ]; then
1010
kill "${tail_pid}" 2>/dev/null || true
1111
fi
12+
# Cancel the SLURM job if the monitor is exiting due to an error
13+
# (e.g., the CI runner is being killed). Don't cancel on success.
14+
if [ "${monitor_success:-0}" -ne 1 ] && [ -n "${job_id:-}" ]; then
15+
echo "Monitor exiting abnormally — cancelling SLURM job $job_id"
16+
scancel "$job_id" 2>/dev/null || true
17+
fi
1218
}
1319
trap cleanup EXIT
1420

@@ -23,30 +29,78 @@ output_file="$2"
2329
echo "Submitted batch job $job_id"
2430
echo "Monitoring output file: $output_file"
2531

26-
# Wait for file to appear with retry logic for transient squeue failures
32+
# Robustly check SLURM job state using squeue with sacct fallback.
33+
# Returns the state string (PENDING, RUNNING, COMPLETED, FAILED, etc.)
34+
# or "UNKNOWN" if both commands fail.
35+
get_job_state() {
36+
local jid="$1"
37+
local state
38+
39+
# Try squeue first (fast, works for active jobs)
40+
state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || true)
41+
if [ -n "$state" ]; then
42+
echo "$state"
43+
return
44+
fi
45+
46+
# Fallback to sacct (works for completed/historical jobs)
47+
if command -v sacct >/dev/null 2>&1; then
48+
state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 || true)
49+
if [ -n "$state" ]; then
50+
echo "$state"
51+
return
52+
fi
53+
fi
54+
55+
echo "UNKNOWN"
56+
}
57+
58+
# Check if a state is terminal (job is done, for better or worse)
59+
is_terminal_state() {
60+
case "$1" in
61+
COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE)
62+
return 0 ;;
63+
*)
64+
return 1 ;;
65+
esac
66+
}
67+
68+
# Wait for file to appear, using robust state checking.
69+
# Never give up due to transient squeue/sacct failures — the CI job timeout
70+
# is the ultimate backstop.
2771
echo "Waiting for job to start..."
28-
squeue_retries=0
29-
max_squeue_retries=5
72+
unknown_count=0
3073
while [ ! -f "$output_file" ]; do
31-
# Check if job is still queued/running
32-
if squeue -j "$job_id" &>/dev/null; then
33-
squeue_retries=0 # Reset on success
34-
sleep 5
35-
else
36-
squeue_retries=$((squeue_retries + 1))
37-
if [ $squeue_retries -ge $max_squeue_retries ]; then
38-
# Job not in queue and output file doesn't exist
39-
if [ ! -f "$output_file" ]; then
40-
echo "ERROR: Job $job_id not in queue and output file not created"
74+
state=$(get_job_state "$job_id")
75+
76+
case "$state" in
77+
PENDING|CONFIGURING)
78+
unknown_count=0
79+
sleep 5
80+
;;
81+
RUNNING|COMPLETING)
82+
unknown_count=0
83+
# Job is running but output file not yet visible (NFS delay)
84+
sleep 2
85+
;;
86+
UNKNOWN)
87+
unknown_count=$((unknown_count + 1))
88+
# Only print warning periodically to avoid log spam
89+
if [ $((unknown_count % 12)) -eq 1 ]; then
90+
echo "Warning: Could not query job $job_id state (SLURM may be temporarily unavailable)..."
91+
fi
92+
sleep 5
93+
;;
94+
*)
95+
# Terminal state — job finished without creating output
96+
if is_terminal_state "$state"; then
97+
echo "ERROR: Job $job_id reached terminal state ($state) without creating output file"
4198
exit 1
4299
fi
43-
break
44-
fi
45-
# Exponential backoff
46-
sleep_time=$((2 ** squeue_retries))
47-
echo "Warning: squeue check failed, retrying in ${sleep_time}s..."
48-
sleep $sleep_time
49-
fi
100+
# Unrecognized state, keep waiting
101+
sleep 5
102+
;;
103+
esac
50104
done
51105

52106
echo "=== Streaming output for job $job_id ==="
@@ -57,14 +111,13 @@ exec 3< <(stdbuf -oL -eL tail -f "$output_file" 2>&1)
57111
tail_pid=$!
58112

59113
# Monitor job status and stream output simultaneously
60-
squeue_failures=0
61114
last_heartbeat=$(date +%s)
62115

63116
while true; do
64117
# Try to read from tail output (non-blocking via timeout)
65118
# Read multiple lines if available to avoid falling behind
66119
lines_read=0
67-
while IFS= read -r -t 0.1 line <&3 2>/dev/null; do
120+
while IFS= read -r -t 1 line <&3 2>/dev/null; do
68121
echo "$line"
69122
lines_read=$((lines_read + 1))
70123
last_heartbeat=$(date +%s)
@@ -73,49 +126,30 @@ while true; do
73126
break
74127
fi
75128
done
76-
129+
77130
# Check job status
78131
current_time=$(date +%s)
79-
if ! squeue -j "$job_id" &>/dev/null; then
80-
squeue_failures=$((squeue_failures + 1))
81-
# Check if job actually completed using sacct (if available)
82-
if [ $squeue_failures -ge 3 ]; then
83-
if command -v sacct >/dev/null 2>&1; then
84-
state=$(sacct -j "$job_id" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}')
85-
# Consider job done only if it reached a terminal state
86-
case "$state" in
87-
COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY)
88-
echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
89-
break
90-
;;
91-
*)
92-
# treat as transient failure, reset failures and continue polling
93-
squeue_failures=0
94-
;;
95-
esac
96-
else
97-
# No sacct: assume job completed after 3 failures
98-
echo "[$(date +%H:%M:%S)] Job $job_id no longer in queue"
99-
break
100-
fi
101-
fi
132+
state=$(get_job_state "$job_id")
133+
134+
if is_terminal_state "$state"; then
135+
echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
136+
break
102137
else
103-
squeue_failures=0
104138
# Print heartbeat if no output for 60 seconds
105139
if [ $((current_time - last_heartbeat)) -ge 60 ]; then
106-
echo "[$(date +%H:%M:%S)] Job $job_id still running (no new output for 60s)..."
140+
echo "[$(date +%H:%M:%S)] Job $job_id state=$state (no new output for 60s)..."
107141
last_heartbeat=$current_time
108142
fi
109143
fi
110-
144+
111145
# Sleep briefly between status checks
112146
sleep 1
113147
done
114148

115149
# Drain any remaining output from tail after job completes
116150
echo "Draining remaining output..."
117151
drain_count=0
118-
while IFS= read -r -t 0.5 line <&3 2>/dev/null; do
152+
while IFS= read -r -t 1 line <&3 2>/dev/null; do
119153
echo "$line"
120154
drain_count=$((drain_count + 1))
121155
# Safety limit to avoid infinite loop
@@ -128,8 +162,9 @@ done
128162
# Close the file descriptor and kill tail
129163
exec 3<&-
130164
kill "${tail_pid}" 2>/dev/null || true
165+
tail_pid=""
131166

132-
# Wait for output file to finish growing (stabilize) before stopping tail
167+
# Wait for output file to stabilize (NFS flush) before final read
133168
if [ -f "$output_file" ]; then
134169
last_size=-1
135170
same_count=0
@@ -149,9 +184,6 @@ if [ -f "$output_file" ]; then
149184
done
150185
fi
151186

152-
# Stop tailing (trap will also handle this on exit)
153-
kill "${tail_pid}" 2>/dev/null || true
154-
155187
echo ""
156188
echo "=== Final output ==="
157189
cat "$output_file"
@@ -187,6 +219,6 @@ if [ "$exit_code" != "0:0" ]; then
187219
exit 1
188220
fi
189221

222+
monitor_success=1
190223
echo "Job $job_id completed successfully"
191224
exit 0
192-

.github/workflows/bench.yml

Lines changed: 21 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,85 +1,35 @@
11
name: 'Benchmark'
22

33
on:
4-
# Trigger when Test Suite completes (no polling needed)
5-
workflow_run:
6-
workflows: ["Test Suite"]
7-
types: [completed]
4+
pull_request:
5+
pull_request_review:
6+
types: [submitted]
87
workflow_dispatch:
98

109
concurrency:
11-
group: ${{ github.workflow }}-${{ github.event.workflow_run.head_branch || github.ref }}
10+
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
1211
cancel-in-progress: true
1312

1413
jobs:
1514
file-changes:
1615
name: Detect File Changes
17-
# Only run if Test Suite passed (or manual dispatch)
18-
if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
1916
runs-on: 'ubuntu-latest'
2017
outputs:
2118
checkall: ${{ steps.changes.outputs.checkall }}
22-
pr_number: ${{ steps.pr-info.outputs.pr_number }}
23-
pr_approved: ${{ steps.pr-info.outputs.approved }}
24-
pr_author: ${{ steps.pr-info.outputs.author }}
2519
steps:
2620
- name: Clone
2721
uses: actions/checkout@v4
28-
with:
29-
ref: ${{ github.event.workflow_run.head_sha || github.sha }}
3022

3123
- name: Detect Changes
3224
uses: dorny/paths-filter@v3
3325
id: changes
3426
with:
3527
filters: ".github/file-filter.yml"
3628

37-
- name: Get PR Info
38-
id: pr-info
39-
env:
40-
GH_TOKEN: ${{ github.token }}
41-
run: |
42-
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
43-
echo "pr_number=" >> $GITHUB_OUTPUT
44-
echo "approved=true" >> $GITHUB_OUTPUT
45-
echo "author=${{ github.actor }}" >> $GITHUB_OUTPUT
46-
else
47-
# Get PR number from workflow_run
48-
PR_NUMBER="${{ github.event.workflow_run.pull_requests[0].number }}"
49-
if [ -n "$PR_NUMBER" ]; then
50-
echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT
51-
52-
# Fetch actual PR author from API (workflow_run.actor is the re-runner, not PR author)
53-
PR_AUTHOR=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER --jq '.user.login')
54-
echo "author=$PR_AUTHOR" >> $GITHUB_OUTPUT
55-
56-
# Check if PR is approved
57-
APPROVED=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews \
58-
--jq '[.[] | select(.state == "APPROVED")] | length')
59-
if [ "$APPROVED" -gt 0 ]; then
60-
echo "approved=true" >> $GITHUB_OUTPUT
61-
else
62-
echo "approved=false" >> $GITHUB_OUTPUT
63-
fi
64-
else
65-
echo "pr_number=" >> $GITHUB_OUTPUT
66-
echo "approved=false" >> $GITHUB_OUTPUT
67-
echo "author=" >> $GITHUB_OUTPUT
68-
fi
69-
fi
70-
7129
self:
7230
name: "${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
73-
if: >
74-
github.repository == 'MFlowCode/MFC' &&
75-
needs.file-changes.outputs.checkall == 'true' &&
76-
(
77-
github.event_name == 'workflow_dispatch' ||
78-
needs.file-changes.outputs.pr_approved == 'true' ||
79-
needs.file-changes.outputs.pr_author == 'sbryngelson' ||
80-
needs.file-changes.outputs.pr_author == 'wilfonba'
81-
)
82-
needs: [file-changes]
31+
if: ${{ github.repository=='MFlowCode/MFC' && needs.file-changes.outputs.checkall=='true' && ((github.event_name=='pull_request_review' && github.event.review.state=='approved') || (github.event_name=='pull_request' && (github.event.pull_request.user.login=='sbryngelson' || github.event.pull_request.user.login=='wilfonba')) || github.event_name=='workflow_dispatch') }}
32+
needs: file-changes
8333
strategy:
8434
fail-fast: false
8535
matrix:
@@ -136,14 +86,10 @@ jobs:
13686
group: ${{ matrix.group }}
13787
labels: ${{ matrix.labels }}
13888
timeout-minutes: 480
139-
env:
140-
ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
141-
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
14289
steps:
14390
- name: Clone - PR
14491
uses: actions/checkout@v4
14592
with:
146-
ref: ${{ github.event.workflow_run.head_sha || github.sha }}
14793
path: pr
14894

14995
- name: Clone - Master
@@ -155,10 +101,21 @@ jobs:
155101

156102
- name: Setup & Build
157103
if: matrix.build_script != ''
158-
run: |
159-
(cd pr && ${{ matrix.build_script }}) &
160-
(cd master && ${{ matrix.build_script }}) &
161-
wait %1 && wait %2
104+
uses: nick-fields/retry@v3
105+
with:
106+
max_attempts: 3
107+
retry_wait_seconds: 60
108+
timeout_minutes: 480
109+
command: |
110+
(cd pr && ${{ matrix.build_script }}) &
111+
pid1=$!
112+
(cd master && ${{ matrix.build_script }}) &
113+
pid2=$!
114+
wait $pid1 && wait $pid2
115+
on_retry_command: |
116+
(cd pr && ./mfc.sh clean) &
117+
(cd master && ./mfc.sh clean) &
118+
wait
162119
163120
- name: Bench (Master v. PR)
164121
run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}

0 commit comments

Comments
 (0)