@@ -9,11 +9,17 @@ cleanup() {
99 if [ -n " ${tail_pid:- } " ]; then
1010 kill " ${tail_pid} " 2> /dev/null || true
1111 fi
12- # Cancel the SLURM job if the monitor is exiting due to an error
13- # (e.g., the CI runner is being killed). Don't cancel on success.
12+ # Cancel the SLURM job only if it is still active in the scheduler.
13+ # If squeue prints nothing (the job left the queue, or the query failed and its error was discarded), assume it has finished
14+ # and run_monitored_slurm_job.sh will recover via sacct — don't cancel it.
1415 if [ " ${monitor_success:- 0} " -ne 1 ] && [ -n " ${job_id:- } " ]; then
15- echo " Monitor exiting abnormally — cancelling SLURM job $job_id "
16- scancel " $job_id " 2> /dev/null || true
16+ active_state=$( squeue -j " $job_id " -h -o ' %T' 2> /dev/null | head -n1 | tr -d ' ' || echo " " )
17+ if [ -n " $active_state " ]; then
18+ echo " Monitor exiting abnormally — cancelling SLURM job $job_id (state: $active_state )"
19+ scancel " $job_id " 2> /dev/null || true
20+ else
21+ echo " Monitor exiting abnormally — SLURM job $job_id already left queue, not cancelling"
22+ fi
1723 fi
1824}
1925trap cleanup EXIT
@@ -56,9 +62,11 @@ get_job_state() {
5662}
5763
5864# Check if a state is terminal (job is done, for better or worse)
65+ # PREEMPTED is intentionally excluded: with --requeue the job restarts under
66+ # the same job ID and we must keep monitoring rather than exiting early.
5967is_terminal_state () {
6068 case " $1 " in
61- COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|PREEMPTED| REVOKED)
69+ COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|REVOKED)
6270 return 0 ;;
6371 * )
6472 return 1 ;;
@@ -74,7 +82,7 @@ while [ ! -f "$output_file" ]; do
7482 state=$( get_job_state " $job_id " )
7583
7684 case " $state " in
77- PENDING|CONFIGURING)
85+ PENDING|CONFIGURING|PREEMPTED )
7886 unknown_count=0
7987 sleep 5
8088 ;;
0 commit comments