44
55set -euo pipefail
66
7- # Cleanup handler to prevent orphaned tail processes
# EXIT-trap handler: stops the background tail reader and, when the monitor
# did not finish successfully, cancels the SLURM job it was watching.
# Globals: tail_pid (read), monitor_success (read), job_id (read)
cleanup() {
  # Stop the tail reader if one was started; a failed kill is ignored.
  [ -z "${tail_pid:-}" ] || kill "${tail_pid}" 2>/dev/null || true

  # monitor_success is set to 1 only on the success path; in that case the
  # job must not be cancelled.
  [ "${monitor_success:-0}" -ne 1 ] || return 0

  # No job id recorded means there is nothing to cancel.
  [ -n "${job_id:-}" ] || return 0

  # Abnormal exit (e.g. the CI runner is being killed): best-effort cancel so
  # the job is not left orphaned.
  echo "Monitor exiting abnormally — cancelling SLURM job $job_id"
  scancel "$job_id" 2>/dev/null || true
}
1319trap cleanup EXIT
1420
@@ -23,30 +29,78 @@ output_file="$2"
2329echo " Submitted batch job $job_id "
2430echo " Monitoring output file: $output_file "
2531
26- # Wait for file to appear with retry logic for transient squeue failures
# Robustly check SLURM job state using squeue with sacct fallback.
#
# Arguments: $1 - SLURM job id
# Outputs:   the state string (PENDING, RUNNING, COMPLETED, FAILED, etc.) on
#            stdout, or "UNKNOWN" if neither command yields a state
# Returns:   0 in normal operation; query failures are reported as "UNKNOWN"
#            rather than as a non-zero status
get_job_state() {
  local jid=$1
  local state

  # Try squeue first (fast, works for active jobs). The `|| true` keeps a
  # failing query from aborting the caller under `set -e` / `pipefail`.
  state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ') || true
  if [ -n "$state" ]; then
    printf '%s\n' "$state"
    return 0
  fi

  # Fallback to sacct (works for completed/historical jobs). State%20 widens
  # the column so long names such as OUT_OF_MEMORY are not cut to "OUT_OF_ME+".
  if command -v sacct >/dev/null 2>&1; then
    state=$(sacct -j "$jid" --format=State%20 --noheader 2>/dev/null \
      | head -n1 \
      | awk '{print $1}') || true
    if [ -n "$state" ]; then
      printf '%s\n' "$state"
      return 0
    fi
  fi

  printf '%s\n' "UNKNOWN"
}
57+
# Check if a state is terminal (job is done, for better or worse).
#
# Arguments: $1 - SLURM job state string
# Returns:   0 if the state is terminal, 1 otherwise (including empty input)
#
# sacct's default 10-character State column abbreviates long values with a
# trailing '+', so the truncated spellings "CANCELLED+" (from
# "CANCELLED by <uid>") and "OUT_OF_ME+" (from "OUT_OF_MEMORY") are accepted
# alongside the full names.
is_terminal_state() {
  case "${1:-}" in
    COMPLETED | FAILED | TIMEOUT | DEADLINE) return 0 ;;
    CANCELLED | CANCELLED+) return 0 ;;
    OUT_OF_MEMORY | OUT_OF_ME+) return 0 ;;
    NODE_FAIL | PREEMPTED | BOOT_FAIL) return 0 ;;
    *) return 1 ;;
  esac
}
67+
68+ # Wait for file to appear, using robust state checking.
69+ # Never give up due to transient squeue/sacct failures — the CI job timeout
70+ # is the ultimate backstop.
2771echo " Waiting for job to start..."
28- squeue_retries=0
29- max_squeue_retries=5
72+ unknown_count=0
3073while [ ! -f " $output_file " ]; do
31- # Check if job is still queued/running
32- if squeue -j " $job_id " & > /dev/null; then
33- squeue_retries=0 # Reset on success
34- sleep 5
35- else
36- squeue_retries=$(( squeue_retries + 1 ))
37- if [ $squeue_retries -ge $max_squeue_retries ]; then
38- # Job not in queue and output file doesn't exist
39- if [ ! -f " $output_file " ]; then
40- echo " ERROR: Job $job_id not in queue and output file not created"
74+ state=$( get_job_state " $job_id " )
75+
76+ case " $state " in
77+ PENDING|CONFIGURING)
78+ unknown_count=0
79+ sleep 5
80+ ;;
81+ RUNNING|COMPLETING)
82+ unknown_count=0
83+ # Job is running but output file not yet visible (NFS delay)
84+ sleep 2
85+ ;;
86+ UNKNOWN)
87+ unknown_count=$(( unknown_count + 1 ))
88+ # Only print warning periodically to avoid log spam
89+ if [ $(( unknown_count % 12 )) -eq 1 ]; then
90+ echo " Warning: Could not query job $job_id state (SLURM may be temporarily unavailable)..."
91+ fi
92+ sleep 5
93+ ;;
94+ * )
95+ # Terminal state — job finished without creating output
96+ if is_terminal_state " $state " ; then
97+ echo " ERROR: Job $job_id reached terminal state ($state ) without creating output file"
4198 exit 1
4299 fi
43- break
44- fi
45- # Exponential backoff
46- sleep_time=$(( 2 ** squeue_retries))
47- echo " Warning: squeue check failed, retrying in ${sleep_time} s..."
48- sleep $sleep_time
49- fi
100+ # Unrecognized state, keep waiting
101+ sleep 5
102+ ;;
103+ esac
50104done
51105
52106echo " === Streaming output for job $job_id ==="
@@ -57,7 +111,6 @@ exec 3< <(stdbuf -oL -eL tail -f "$output_file" 2>&1)
57111tail_pid=$!
58112
59113# Monitor job status and stream output simultaneously
60- squeue_failures=0
61114last_heartbeat=$( date +%s)
62115
63116while true ; do
@@ -73,41 +126,22 @@ while true; do
73126 break
74127 fi
75128 done
76-
129+
77130 # Check job status
78131 current_time=$( date +%s)
79- if ! squeue -j " $job_id " & > /dev/null; then
80- squeue_failures=$(( squeue_failures + 1 ))
81- # Check if job actually completed using sacct (if available)
82- if [ $squeue_failures -ge 3 ]; then
83- if command -v sacct > /dev/null 2>&1 ; then
84- state=$( sacct -j " $job_id " --format=State --noheader 2> /dev/null | head -n1 | awk ' {print $1}' )
85- # Consider job done only if it reached a terminal state
86- case " $state " in
87- COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY)
88- echo " [$( date +%H:%M:%S) ] Job $job_id reached terminal state: $state "
89- break
90- ;;
91- * )
92- # treat as transient failure, reset failures and continue polling
93- squeue_failures=0
94- ;;
95- esac
96- else
97- # No sacct: assume job completed after 3 failures
98- echo " [$( date +%H:%M:%S) ] Job $job_id no longer in queue"
99- break
100- fi
101- fi
132+ state=$( get_job_state " $job_id " )
133+
134+ if is_terminal_state " $state " ; then
135+ echo " [$( date +%H:%M:%S) ] Job $job_id reached terminal state: $state "
136+ break
102137 else
103- squeue_failures=0
104138 # Print heartbeat if no output for 60 seconds
105139 if [ $(( current_time - last_heartbeat)) -ge 60 ]; then
106- echo " [$( date +%H:%M:%S) ] Job $job_id still running (no new output for 60s)..."
140+ echo " [$( date +%H:%M:%S) ] Job $job_id state= $state (no new output for 60s)..."
107141 last_heartbeat=$current_time
108142 fi
109143 fi
110-
144+
111145 # Sleep briefly between status checks
112146 sleep 1
113147done
128162# Close the file descriptor and kill tail
129163exec 3< & -
130164kill " ${tail_pid} " 2> /dev/null || true
165+ tail_pid=" "
131166
132167# Wait for output file to finish growing (stabilize) before stopping tail
133168if [ -f " $output_file " ]; then
@@ -149,9 +184,6 @@ if [ -f "$output_file" ]; then
149184 done
150185fi
151186
152- # Stop tailing (trap will also handle this on exit)
153- kill " ${tail_pid} " 2> /dev/null || true
154-
155187echo " "
156188echo " === Final output ==="
157189cat " $output_file "
@@ -187,6 +219,6 @@ if [ "$exit_code" != "0:0" ]; then
187219 exit 1
188220fi
189221
222+ monitor_success=1
190223echo " Job $job_id completed successfully"
191224exit 0
192-
0 commit comments