Skip to content

Commit c83429b

Browse files
sbryngelson and claude committed
Harden SLURM job monitor against transient squeue failures
The monitor script used `squeue -j $id &>/dev/null`, which only checks the exit code. When squeue itself fails transiently (SLURM daemon overloaded, network hiccup), this is indistinguishable from "job doesn't exist," causing the monitor to give up on jobs that are still PENDING in the queue — leaving orphaned SLURM jobs.

Changes:
- Add get_job_state() that parses squeue output for the actual state string, with sacct fallback for completed/historical jobs
- Never give up on UNKNOWN state (let CI timeout be the backstop)
- Cancel orphaned SLURM jobs on abnormal monitor exit
- Fix fractional read timeouts that caused bash segfaults
- Include job state in heartbeat messages for better diagnostics

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 0ba6c02 commit c83429b

1 file changed

Lines changed: 87 additions & 55 deletions

File tree

.github/scripts/monitor_slurm_job.sh

Lines changed: 87 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,17 @@
44

55
set -euo pipefail
66

7-
# Cleanup handler to prevent orphaned tail processes
7+
# Exit-time cleanup: stop the background tail process (if one was started)
# and cancel the SLURM job when the monitor did not finish successfully.
# Globals:
#   tail_pid        (read) PID of the background tail; may be unset or empty
#   monitor_success (read) 1 once the monitor has finished successfully
#   job_id          (read) SLURM job id to cancel; may be unset or empty
cleanup() {
    # Best effort: the tail process may already be gone.
    [ -z "${tail_pid:-}" ] || kill "${tail_pid}" 2>/dev/null || true

    # Anything other than an explicit success flag counts as an abnormal exit
    # (e.g. the CI runner is being killed), so the job must not be left
    # orphaned in the queue. A successful run never cancels.
    if [ "${monitor_success:-0}" -ne 1 ]; then
        if [ -n "${job_id:-}" ]; then
            echo "Monitor exiting abnormally — cancelling SLURM job $job_id"
            scancel "$job_id" 2>/dev/null || true
        fi
    fi
}
1319
trap cleanup EXIT
1420

@@ -23,30 +29,78 @@ output_file="$2"
2329
echo "Submitted batch job $job_id"
2430
echo "Monitoring output file: $output_file"
2531

26-
# Wait for file to appear with retry logic for transient squeue failures
32+
# Robustly check SLURM job state using squeue with sacct fallback.
# Arguments:
#   $1 - SLURM job id
# Outputs:
#   Writes a single state token to stdout (PENDING, RUNNING, COMPLETED,
#   FAILED, etc.), or "UNKNOWN" if neither command yields a state.
# Returns:
#   Always 0; a failing squeue/sacct is reported as "UNKNOWN", never as an
#   error, so callers running under `set -e` are not aborted.
get_job_state() {
    local jid="$1"
    local state

    # Try squeue first (fast, works for active jobs).
    # `|| true`: under `set -o pipefail` a failing squeue (or a SIGPIPE caused
    # by `head -n1` closing the pipe early) makes the assignment return
    # non-zero, which would abort the script under `set -e` instead of falling
    # through to sacct. Any output captured before the failure is kept.
    state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ') || true
    if [ -n "$state" ]; then
        echo "$state"
        return 0
    fi

    # Fallback to sacct (works for completed/historical jobs).
    # --parsable2 disables fixed-width columns, so long state names such as
    # OUT_OF_MEMORY are not truncated to "OUT_OF_ME+". awk keeps only the
    # first word, turning "CANCELLED by <uid>" into "CANCELLED".
    if command -v sacct >/dev/null 2>&1; then
        state=$(sacct -j "$jid" --format=State --noheader --parsable2 2>/dev/null \
            | head -n1 | awk '{print $1}') || true
        if [ -n "$state" ]; then
            echo "$state"
            return 0
        fi
    fi

    echo "UNKNOWN"
}
57+
58+
# Check if a state is terminal (job is done, for better or worse).
# Arguments:
#   $1 - SLURM state token as printed by squeue/sacct (may be empty/omitted)
# Returns:
#   0 if the state is terminal, 1 otherwise (including empty or unknown input).
is_terminal_state() {
    # Default to empty so a call without arguments does not trip `set -u`.
    local state="${1:-}"

    case "$state" in
        COMPLETED|FAILED|TIMEOUT|NODE_FAIL|PREEMPTED|BOOT_FAIL|DEADLINE)
            return 0 ;;
        # sacct marks values truncated to its 10-character State column with
        # a trailing "+": "CANCELLED by <uid>" -> "CANCELLED+" and
        # "OUT_OF_MEMORY" -> "OUT_OF_ME+". Accept both spellings.
        CANCELLED|CANCELLED+|OUT_OF_MEMORY|OUT_OF_ME+)
            return 0 ;;
        *)
            return 1 ;;
    esac
}
67+
68+
# Wait for file to appear, using robust state checking.
69+
# Never give up due to transient squeue/sacct failures — the CI job timeout
70+
# is the ultimate backstop.
2771
echo "Waiting for job to start..."
28-
squeue_retries=0
29-
max_squeue_retries=5
72+
unknown_count=0
3073
while [ ! -f "$output_file" ]; do
31-
# Check if job is still queued/running
32-
if squeue -j "$job_id" &>/dev/null; then
33-
squeue_retries=0 # Reset on success
34-
sleep 5
35-
else
36-
squeue_retries=$((squeue_retries + 1))
37-
if [ $squeue_retries -ge $max_squeue_retries ]; then
38-
# Job not in queue and output file doesn't exist
39-
if [ ! -f "$output_file" ]; then
40-
echo "ERROR: Job $job_id not in queue and output file not created"
74+
state=$(get_job_state "$job_id")
75+
76+
case "$state" in
77+
PENDING|CONFIGURING)
78+
unknown_count=0
79+
sleep 5
80+
;;
81+
RUNNING|COMPLETING)
82+
unknown_count=0
83+
# Job is running but output file not yet visible (NFS delay)
84+
sleep 2
85+
;;
86+
UNKNOWN)
87+
unknown_count=$((unknown_count + 1))
88+
# Only print warning periodically to avoid log spam
89+
if [ $((unknown_count % 12)) -eq 1 ]; then
90+
echo "Warning: Could not query job $job_id state (SLURM may be temporarily unavailable)..."
91+
fi
92+
sleep 5
93+
;;
94+
*)
95+
# Terminal state — job finished without creating output
96+
if is_terminal_state "$state"; then
97+
echo "ERROR: Job $job_id reached terminal state ($state) without creating output file"
4198
exit 1
4299
fi
43-
break
44-
fi
45-
# Exponential backoff
46-
sleep_time=$((2 ** squeue_retries))
47-
echo "Warning: squeue check failed, retrying in ${sleep_time}s..."
48-
sleep $sleep_time
49-
fi
100+
# Unrecognized state, keep waiting
101+
sleep 5
102+
;;
103+
esac
50104
done
51105

52106
echo "=== Streaming output for job $job_id ==="
@@ -57,14 +111,13 @@ exec 3< <(stdbuf -oL -eL tail -f "$output_file" 2>&1)
57111
tail_pid=$!
58112

59113
# Monitor job status and stream output simultaneously
60-
squeue_failures=0
61114
last_heartbeat=$(date +%s)
62115

63116
while true; do
64117
# Try to read from tail output (non-blocking via timeout)
65118
# Read multiple lines if available to avoid falling behind
66119
lines_read=0
67-
while IFS= read -r -t 0.1 line <&3 2>/dev/null; do
120+
while IFS= read -r -t 1 line <&3 2>/dev/null; do
68121
echo "$line"
69122
lines_read=$((lines_read + 1))
70123
last_heartbeat=$(date +%s)
@@ -73,49 +126,30 @@ while true; do
73126
break
74127
fi
75128
done
76-
129+
77130
# Check job status
78131
current_time=$(date +%s)
79-
if ! squeue -j "$job_id" &>/dev/null; then
80-
squeue_failures=$((squeue_failures + 1))
81-
# Check if job actually completed using sacct (if available)
82-
if [ $squeue_failures -ge 3 ]; then
83-
if command -v sacct >/dev/null 2>&1; then
84-
state=$(sacct -j "$job_id" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}')
85-
# Consider job done only if it reached a terminal state
86-
case "$state" in
87-
COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY)
88-
echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
89-
break
90-
;;
91-
*)
92-
# treat as transient failure, reset failures and continue polling
93-
squeue_failures=0
94-
;;
95-
esac
96-
else
97-
# No sacct: assume job completed after 3 failures
98-
echo "[$(date +%H:%M:%S)] Job $job_id no longer in queue"
99-
break
100-
fi
101-
fi
132+
state=$(get_job_state "$job_id")
133+
134+
if is_terminal_state "$state"; then
135+
echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
136+
break
102137
else
103-
squeue_failures=0
104138
# Print heartbeat if no output for 60 seconds
105139
if [ $((current_time - last_heartbeat)) -ge 60 ]; then
106-
echo "[$(date +%H:%M:%S)] Job $job_id still running (no new output for 60s)..."
140+
echo "[$(date +%H:%M:%S)] Job $job_id state=$state (no new output for 60s)..."
107141
last_heartbeat=$current_time
108142
fi
109143
fi
110-
144+
111145
# Sleep briefly between status checks
112146
sleep 1
113147
done
114148

115149
# Drain any remaining output from tail after job completes
116150
echo "Draining remaining output..."
117151
drain_count=0
118-
while IFS= read -r -t 0.5 line <&3 2>/dev/null; do
152+
while IFS= read -r -t 1 line <&3 2>/dev/null; do
119153
echo "$line"
120154
drain_count=$((drain_count + 1))
121155
# Safety limit to avoid infinite loop
@@ -128,6 +162,7 @@ done
128162
# Close the file descriptor and kill tail
129163
exec 3<&-
130164
kill "${tail_pid}" 2>/dev/null || true
165+
tail_pid=""
131166

132167
# Wait for output file to finish growing (stabilize) before stopping tail
133168
if [ -f "$output_file" ]; then
@@ -149,9 +184,6 @@ if [ -f "$output_file" ]; then
149184
done
150185
fi
151186

152-
# Stop tailing (trap will also handle this on exit)
153-
kill "${tail_pid}" 2>/dev/null || true
154-
155187
echo ""
156188
echo "=== Final output ==="
157189
cat "$output_file"
@@ -187,6 +219,6 @@ if [ "$exit_code" != "0:0" ]; then
187219
exit 1
188220
fi
189221

222+
monitor_success=1
190223
echo "Job $job_id completed successfully"
191224
exit 0
192-

0 commit comments

Comments
 (0)