File tree Expand file tree Collapse file tree
#!/bin/bash
# Run monitor_slurm_job.sh and recover if the monitor is killed (e.g. SIGKILL
# from the runner OS) before the SLURM job completes. When the monitor exits
# non-zero, sacct is used to verify the job's actual final state; if the SLURM
# job succeeded we exit 0 so the CI step is not falsely marked as failed.
#
# Usage: run_monitored_slurm_job.sh <job_id> <output_file>
#
# Arguments:
#   job_id       SLURM job id; forwarded to the monitor and queried via sacct.
#   output_file  Forwarded unchanged to monitor_slurm_job.sh.
#
# Exit status:
#   0  the monitor succeeded, or it failed but sacct reports
#      State=COMPLETED and ExitCode=0:0 for the job.
#   1  wrong number of arguments, or the job could not be confirmed successful.

set -euo pipefail

if [ $# -ne 2 ]; then
    echo "Usage: $0 <job_id> <output_file>" >&2
    exit 1
fi

job_id="$1"
output_file="$2"

# The monitor script is expected to sit next to this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Capture the monitor's status instead of letting `set -e` abort the script.
monitor_exit=0
bash "$SCRIPT_DIR/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?

if [ "$monitor_exit" -ne 0 ]; then
    echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..."
    # Give the SLURM epilog time to finalize if the job just finished
    sleep 30

    # Query State and ExitCode in a single sacct call so both values describe
    # the same accounting record. awk reads its whole input (no early exit),
    # so sacct cannot be killed by SIGPIPE and trip `pipefail` the way a
    # `head -n1` stage could. Any sacct failure yields an empty record.
    acct_record=$(sacct -j "$job_id" -n -X -P -o State,ExitCode 2>/dev/null \
        | awk 'NR == 1 { gsub(/ /, ""); print }') || acct_record=""

    # Record format with -P is "State|ExitCode"; split on the first '|'.
    final_state="${acct_record%%|*}"
    final_exit=""
    if [[ "$acct_record" == *"|"* ]]; then
        final_exit="${acct_record#*|}"
    fi
    # Also covers the case where sacct succeeded but returned no rows.
    final_state="${final_state:-UNKNOWN}"

    echo "Final SLURM state=$final_state exit=$final_exit"
    if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then
        echo "SLURM job $job_id completed successfully despite monitor failure — continuing."
    else
        echo "ERROR: SLURM job $job_id did not complete successfully (state=$final_state exit=$final_exit)" >&2
        exit 1
    fi
fi
Original file line number Diff line number Diff line change 102102
103103echo " Submitted batch job $job_id "
104104
105- # Use resilient monitoring instead of sbatch -W
106- bash " $SCRIPT_DIR /../../scripts/monitor_slurm_job.sh" " $job_id " " $output_file "
105+ bash " $SCRIPT_DIR /../../scripts/run_monitored_slurm_job.sh" " $job_id " " $output_file "
Original file line number Diff line number Diff line change 9494
9595echo " Submitted batch job $job_id "
9696
97- # Use resilient monitoring instead of sbatch -W
9897SCRIPT_DIR=" $( cd " $( dirname " ${BASH_SOURCE[0]} " ) " && pwd) "
99- monitor_exit=0
100- bash " $SCRIPT_DIR /../../scripts/monitor_slurm_job.sh" " $job_id " " $output_file " || monitor_exit=$?
101-
102- if [ " $monitor_exit " -ne 0 ]; then
103- echo " Monitor exited with code $monitor_exit ; re-checking SLURM job $job_id final state..."
104- # Give the SLURM epilog time to finalize if the job just finished
105- sleep 30
106- final_state=$( sacct -j " $job_id " -n -X -P -o State 2> /dev/null | head -n1 | cut -d' |' -f1 | tr -d ' ' || echo " UNKNOWN" )
107- final_exit=$( sacct -j " $job_id " --format=ExitCode --noheader --parsable2 2> /dev/null | head -n1 | tr -d ' ' || echo " " )
108- echo " Final SLURM state=$final_state exit=$final_exit "
109- if [ " $final_state " = " COMPLETED" ] && [ " $final_exit " = " 0:0" ]; then
110- echo " SLURM job $job_id completed successfully despite monitor failure — continuing."
111- else
112- echo " ERROR: SLURM job $job_id did not complete successfully (state=$final_state exit=$final_exit )"
113- exit 1
114- fi
115- fi
98+ bash " $SCRIPT_DIR /../../scripts/run_monitored_slurm_job.sh" " $job_id " " $output_file "
You can’t perform that action at this time.
0 commit comments