Skip to content

Commit 61924d8

Browse files
sbryngelson and claude committed
Extract monitor SIGKILL recovery into shared run_monitored_slurm_job.sh
All three submit.sh scripts (phoenix, frontier, and the frontier_amd symlink) now call a single helper that wraps monitor_slurm_job.sh with a sacct fallback: if the monitor exits non-zero (e.g. because it was killed before the SLURM job completed), the helper re-checks the job's final state and exits 0 if the job succeeded.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 6e97695 commit 61924d8

3 files changed

Lines changed: 39 additions & 20 deletions

File tree

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
#!/bin/bash
# Run monitor_slurm_job.sh and recover if the monitor is killed (e.g. SIGKILL
# from the runner OS) before the SLURM job completes. When the monitor exits
# non-zero, sacct is used to verify the job's actual final state; if the SLURM
# job succeeded we exit 0 so the CI step is not falsely marked as failed.
#
# Usage: run_monitored_slurm_job.sh <job_id> <output_file>
#
# Environment:
#   MONITOR_RECHECK_DELAY  Seconds to wait before querying sacct after a
#                          monitor failure (default: 30).
#
# Exit status:
#   0  the monitor succeeded, or sacct reports State=COMPLETED and
#      ExitCode=0:0 for the job
#   1  wrong number of arguments, or the job could not be confirmed as
#      successful
#
# NOTE: the job state is queried exactly once. A job that is still PENDING or
# RUNNING at that moment is reported as a failure.

set -euo pipefail

if [ $# -ne 2 ]; then
    echo "Usage: $0 <job_id> <output_file>" >&2
    exit 1
fi

job_id="$1"
output_file="$2"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Record the monitor's status with `|| ...` so `set -e` does not abort the
# script before the fallback check below can run.
monitor_exit=0
bash "$SCRIPT_DIR/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?

if [ "$monitor_exit" -ne 0 ]; then
    echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..."
    # Give the SLURM epilog time to finalize if the job just finished
    sleep "${MONITOR_RECHECK_DELAY:-30}"

    # Query State and ExitCode in a single call so both values come from the
    # same allocation record (-X), without header (-n), '|'-delimited (-P).
    # The fallback is applied outside the command substitution: an in-pipeline
    # `|| echo ...` under pipefail would append to partial output rather than
    # replace it.
    record=$(sacct -j "$job_id" -n -X -P -o State,ExitCode 2>/dev/null) || record=""
    record=${record%%$'\n'*} # first record only

    IFS='|' read -r state_field exit_field _ <<<"$record" || true
    # Keep only the first word of the state (e.g. "CANCELLED by 1234").
    read -r final_state _ <<<"$state_field" || true
    final_state=${final_state:-UNKNOWN}
    final_exit=${exit_field// /}

    echo "Final SLURM state=$final_state exit=$final_exit"
    if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then
        echo "SLURM job $job_id completed successfully despite monitor failure — continuing."
    else
        echo "ERROR: SLURM job $job_id did not complete successfully (state=$final_state exit=$final_exit)" >&2
        exit 1
    fi
fi

.github/workflows/frontier/submit.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,5 +102,4 @@ fi
102102

103103
echo "Submitted batch job $job_id"
104104

105-
# Use resilient monitoring instead of sbatch -W
106-
bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file"
105+
bash "$SCRIPT_DIR/../../scripts/run_monitored_slurm_job.sh" "$job_id" "$output_file"

.github/workflows/phoenix/submit.sh

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -94,22 +94,5 @@ fi
9494

9595
echo "Submitted batch job $job_id"
9696

97-
# Use resilient monitoring instead of sbatch -W
9897
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
99-
monitor_exit=0
100-
bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?
101-
102-
if [ "$monitor_exit" -ne 0 ]; then
103-
echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..."
104-
# Give the SLURM epilog time to finalize if the job just finished
105-
sleep 30
106-
final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || echo "UNKNOWN")
107-
final_exit=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "")
108-
echo "Final SLURM state=$final_state exit=$final_exit"
109-
if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then
110-
echo "SLURM job $job_id completed successfully despite monitor failure — continuing."
111-
else
112-
echo "ERROR: SLURM job $job_id did not complete successfully (state=$final_state exit=$final_exit)"
113-
exit 1
114-
fi
115-
fi
98+
bash "$SCRIPT_DIR/../../scripts/run_monitored_slurm_job.sh" "$job_id" "$output_file"

0 commit comments

Comments
 (0)