Skip to content

Commit c6b6f81

Browse files
sbryngelson and claude committed
Harden SLURM monitor: robust state checks, orphan cleanup
Replace squeue exit-code polling with get_job_state() that parses the actual state string (squeue + sacct fallback). Never give up on UNKNOWN state — CI timeout is the backstop. Cancel orphaned SLURM jobs on abnormal monitor exit. Include job state in heartbeats. Incorporates changes from PR #1140. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 0658bd3 commit c6b6f81

1 file changed

Lines changed: 85 additions & 53 deletions

File tree

.github/scripts/monitor_slurm_job.sh

Lines changed: 85 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,17 @@
44

55
set -euo pipefail
66

7-
# Cleanup handler to prevent orphaned tail processes
7+
# Cleanup handler to prevent orphaned tail processes and cancel orphaned jobs
88
cleanup() {
    # EXIT-trap handler.
    # Globals read: tail_pid, monitor_success, job_id (each may be unset).
    # 1. Stops the background tail process, if one was recorded.
    # 2. Cancels the SLURM job unless the monitor flagged success, so an
    #    aborted monitor (e.g. the CI runner being killed) leaves no orphan.
    local finished_ok="${monitor_success:-0}"
    local tracked_job="${job_id:-}"

    # Best effort: the tail process may already be gone.
    [ -z "${tail_pid:-}" ] || kill "${tail_pid}" 2>/dev/null || true

    # Successful run: leave the job alone.
    [ "$finished_ok" -ne 1 ] || return 0
    # No job id recorded yet: nothing to cancel.
    [ -n "$tracked_job" ] || return 0

    echo "Monitor exiting abnormally — cancelling SLURM job $tracked_job"
    # Best effort: the job may already have finished or scancel may be unavailable.
    scancel "$tracked_job" 2>/dev/null || true
}
1319
trap cleanup EXIT
1420

@@ -23,30 +29,78 @@ output_file="$2"
2329
echo "Submitted batch job $job_id"
2430
echo "Monitoring output file: $output_file"
2531

26-
# Wait for file to appear with retry logic for transient squeue failures
32+
# Robustly check SLURM job state using squeue with sacct fallback.
33+
# Returns the state string (PENDING, RUNNING, COMPLETED, FAILED, etc.)
34+
# or "UNKNOWN" if both commands fail.
35+
get_job_state() {
    # Query the SLURM state of a job, preferring squeue and falling back to sacct.
    # Arguments: $1 - SLURM job id
    # Outputs:   one state string on stdout (PENDING, RUNNING, COMPLETED,
    #            FAILED, ...), or "UNKNOWN" if neither command yields a state.
    # Returns:   always 0, so a failed query never aborts a caller running
    #            under `set -e` / `pipefail`.
    local jid="$1"
    local state

    # Try squeue first (fast, works for active jobs).
    # `|| true`: squeue can exit non-zero (job no longer known to the
    # controller, controller unreachable). With pipefail that would fail the
    # assignment and, where errexit is active, abort before the fallback runs.
    state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || true)
    if [ -n "$state" ]; then
        echo "$state"
        return 0
    fi

    # Fallback to sacct (works for completed/historical jobs).
    if command -v sacct >/dev/null 2>&1; then
        # -X: report the job allocation only, not its steps.
        # -P: parsable output, so long states are not padded or truncated
        #     (the default column would show e.g. "OUT_OF_ME+").
        # awk keeps the first word, turning "CANCELLED by <uid>" into "CANCELLED".
        state=$(sacct -j "$jid" -X -P --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}' || true)
        if [ -n "$state" ]; then
            echo "$state"
            return 0
        fi
    fi

    echo "UNKNOWN"
}
57+
58+
# Check if a state is terminal (job is done, for better or worse)
59+
is_terminal_state() {
    # Decide whether a SLURM state string means the job is finished.
    # Arguments: $1 - state string as reported by squeue or sacct
    # Returns:   0 if the state is terminal, 1 otherwise (including UNKNOWN,
    #            empty, and unrecognised states).
    case "${1:-}" in
        COMPLETED|FAILED|TIMEOUT|NODE_FAIL|PREEMPTED|BOOT_FAIL|DEADLINE)
            return 0 ;;
        # sacct may report "CANCELLED by <uid>" or, in its default
        # fixed-width column, the truncated form "CANCELLED+".
        CANCELLED*)
            return 0 ;;
        # OUT_OF_MEMORY is wider than sacct's default State column and is
        # then shown as "OUT_OF_ME+"; accept both spellings.
        OUT_OF_MEMORY|OUT_OF_ME+)
            return 0 ;;
        *)
            return 1 ;;
    esac
}
67+
68+
# Wait for file to appear, using robust state checking.
69+
# Never give up due to transient squeue/sacct failures — the CI job timeout
70+
# is the ultimate backstop.
2771
echo "Waiting for job to start..."
28-
squeue_retries=0
29-
max_squeue_retries=5
72+
unknown_count=0
3073
while [ ! -f "$output_file" ]; do
31-
# Check if job is still queued/running
32-
if squeue -j "$job_id" &>/dev/null; then
33-
squeue_retries=0 # Reset on success
34-
sleep 5
35-
else
36-
squeue_retries=$((squeue_retries + 1))
37-
if [ $squeue_retries -ge $max_squeue_retries ]; then
38-
# Job not in queue and output file doesn't exist
39-
if [ ! -f "$output_file" ]; then
40-
echo "ERROR: Job $job_id not in queue and output file not created"
74+
state=$(get_job_state "$job_id")
75+
76+
case "$state" in
77+
PENDING|CONFIGURING)
78+
unknown_count=0
79+
sleep 5
80+
;;
81+
RUNNING|COMPLETING)
82+
unknown_count=0
83+
# Job is running but output file not yet visible (NFS delay)
84+
sleep 2
85+
;;
86+
UNKNOWN)
87+
unknown_count=$((unknown_count + 1))
88+
# Only print warning periodically to avoid log spam
89+
if [ $((unknown_count % 12)) -eq 1 ]; then
90+
echo "Warning: Could not query job $job_id state (SLURM may be temporarily unavailable)..."
91+
fi
92+
sleep 5
93+
;;
94+
*)
95+
# Terminal state — job finished without creating output
96+
if is_terminal_state "$state"; then
97+
echo "ERROR: Job $job_id reached terminal state ($state) without creating output file"
4198
exit 1
4299
fi
43-
break
44-
fi
45-
# Exponential backoff
46-
sleep_time=$((2 ** squeue_retries))
47-
echo "Warning: squeue check failed, retrying in ${sleep_time}s..."
48-
sleep $sleep_time
49-
fi
100+
# Unrecognized state, keep waiting
101+
sleep 5
102+
;;
103+
esac
50104
done
51105

52106
echo "=== Streaming output for job $job_id ==="
@@ -57,7 +111,6 @@ exec 3< <(stdbuf -oL -eL tail -f "$output_file" 2>&1)
57111
tail_pid=$!
58112

59113
# Monitor job status and stream output simultaneously
60-
squeue_failures=0
61114
last_heartbeat=$(date +%s)
62115

63116
while true; do
@@ -73,41 +126,22 @@ while true; do
73126
break
74127
fi
75128
done
76-
129+
77130
# Check job status
78131
current_time=$(date +%s)
79-
if ! squeue -j "$job_id" &>/dev/null; then
80-
squeue_failures=$((squeue_failures + 1))
81-
# Check if job actually completed using sacct (if available)
82-
if [ $squeue_failures -ge 3 ]; then
83-
if command -v sacct >/dev/null 2>&1; then
84-
state=$(sacct -j "$job_id" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}')
85-
# Consider job done only if it reached a terminal state
86-
case "$state" in
87-
COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY)
88-
echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
89-
break
90-
;;
91-
*)
92-
# treat as transient failure, reset failures and continue polling
93-
squeue_failures=0
94-
;;
95-
esac
96-
else
97-
# No sacct: assume job completed after 3 failures
98-
echo "[$(date +%H:%M:%S)] Job $job_id no longer in queue"
99-
break
100-
fi
101-
fi
132+
state=$(get_job_state "$job_id")
133+
134+
if is_terminal_state "$state"; then
135+
echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
136+
break
102137
else
103-
squeue_failures=0
104138
# Print heartbeat if no output for 60 seconds
105139
if [ $((current_time - last_heartbeat)) -ge 60 ]; then
106-
echo "[$(date +%H:%M:%S)] Job $job_id still running (no new output for 60s)..."
140+
echo "[$(date +%H:%M:%S)] Job $job_id state=$state (no new output for 60s)..."
107141
last_heartbeat=$current_time
108142
fi
109143
fi
110-
144+
111145
# Sleep briefly between status checks
112146
sleep 1
113147
done
@@ -128,6 +162,7 @@ done
128162
# Close the file descriptor and kill tail
129163
exec 3<&-
130164
kill "${tail_pid}" 2>/dev/null || true
165+
tail_pid=""
131166

132167
# Wait for output file to finish growing (stabilize) before stopping tail
133168
if [ -f "$output_file" ]; then
@@ -149,9 +184,6 @@ if [ -f "$output_file" ]; then
149184
done
150185
fi
151186

152-
# Stop tailing (trap will also handle this on exit)
153-
kill "${tail_pid}" 2>/dev/null || true
154-
155187
echo ""
156188
echo "=== Final output ==="
157189
cat "$output_file"
@@ -187,6 +219,6 @@ if [ "$exit_code" != "0:0" ]; then
187219
exit 1
188220
fi
189221

222+
monitor_success=1
190223
echo "Job $job_id completed successfully"
191224
exit 0
192-

0 commit comments

Comments
 (0)