Skip to content

Commit e331547

Browse files
authored
ci: sequential benchmark monitoring to avoid OOM kills (#1309)
1 parent f528e23 commit e331547

2 files changed

Lines changed: 80 additions & 56 deletions

File tree

.github/scripts/run_parallel_benchmarks.sh

Lines changed: 74 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
#!/bin/bash
2-
# Run PR and master benchmarks in parallel and verify outputs
2+
# Run PR and master benchmarks and verify outputs.
3+
# Both SLURM jobs are submitted up front so they run concurrently on
4+
# compute nodes (fair comparison under the same cluster load), but
5+
# monitoring happens sequentially to stay within the per-user cgroup
6+
# memory limit on login nodes (4 GB on Phoenix shared by 7 runners).
37
# Usage: run_parallel_benchmarks.sh <device> <interface> <cluster>
48

59
set -euo pipefail
@@ -17,88 +21,104 @@ cluster="$3"
1721
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
1822

1923
echo "=========================================="
20-
echo "Starting parallel benchmark jobs..."
24+
echo "Starting benchmark jobs..."
2125
echo "=========================================="
2226

23-
# For Phoenix GPU benchmarks, select a consistent GPU partition before launching
24-
# both parallel jobs so PR and master always land on the same GPU type.
27+
# For Phoenix GPU benchmarks, select a consistent GPU partition so PR and
28+
# master always land on the same GPU type.
2529
if [ "$device" = "gpu" ] && [ "$cluster" = "phoenix" ]; then
2630
echo "Selecting Phoenix GPU partition for benchmark consistency..."
27-
# Require 2 nodes so both PR and master jobs can run concurrently.
31+
# Require 2 nodes so both jobs can run concurrently on compute nodes.
2832
GPU_PARTITION_MIN_NODES=2 source "${SCRIPT_DIR}/select-gpu-partition.sh"
2933
BENCH_GPU_PARTITION="$SELECTED_GPU_PARTITION"
3034
export BENCH_GPU_PARTITION
3135
fi
3236

33-
# Run both jobs with monitoring using dedicated script from PR
34-
# Use stdbuf for line-buffered output and prefix each line for clarity
35-
(set -o pipefail; stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" pr "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[PR] $line"; done) &
36-
pr_pid=$!
37-
echo "PR job started in background (PID: $pr_pid)"
38-
39-
(set -o pipefail; stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" master "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[MASTER] $line"; done) &
40-
master_pid=$!
41-
echo "Master job started in background (PID: $master_pid)"
42-
43-
echo "Waiting for both jobs to complete..."
44-
45-
# Wait and capture exit codes reliably.
46-
# Use `wait ... || exit=$?` to avoid set -e aborting on the first failure
47-
# (which would orphan the second job).
37+
# The bench script must come from the PR tree (master may not have it).
38+
PR_BENCH_SCRIPT="$(cd "${SCRIPT_DIR}/../workflows/common" && pwd)/bench.sh"
39+
# Must match the slug computed by submit-slurm-job.sh:
40+
# basename("bench.sh") → "bench" → "bench-${device}-${interface}"
41+
job_slug="bench-${device}-${interface}"
42+
43+
# --- Phase 1: Submit both SLURM jobs (no monitoring yet) ---
44+
echo "Submitting PR benchmark..."
45+
(cd pr && SUBMIT_ONLY=1 bash "${SCRIPT_DIR}/submit-slurm-job.sh" "$PR_BENCH_SCRIPT" "$device" "$interface" "$cluster")
46+
pr_job_id=$(cat "pr/${job_slug}.slurm_job_id")
47+
echo "PR job submitted: $pr_job_id"
48+
49+
echo "Submitting master benchmark..."
50+
(cd master && SUBMIT_ONLY=1 bash "${SCRIPT_DIR}/submit-slurm-job.sh" "$PR_BENCH_SCRIPT" "$device" "$interface" "$cluster")
51+
master_job_id=$(cat "master/${job_slug}.slurm_job_id")
52+
echo "Master job submitted: $master_job_id"
53+
54+
echo "Both SLURM jobs submitted — running concurrently on compute nodes."
55+
echo "Monitoring sequentially to conserve login node memory."
56+
57+
# --- Phase 2: Monitor sequentially (one at a time on login node) ---
58+
echo ""
59+
echo "=== Monitoring PR job $pr_job_id ==="
4860
pr_exit=0
49-
master_exit=0
50-
51-
wait "$pr_pid" || pr_exit=$?
61+
bash "${SCRIPT_DIR}/run_monitored_slurm_job.sh" "$pr_job_id" "pr/${job_slug}.out" || pr_exit=$?
5262
if [ "$pr_exit" -ne 0 ]; then
53-
echo "PR job exited with code: $pr_exit"
54-
echo "Last 50 lines of PR job log:"
55-
tail -n 50 "pr/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read PR log"
63+
echo "PR job exited with code: $pr_exit"
64+
tail -n 50 "pr/${job_slug}.out" 2>/dev/null || echo " Could not read PR log"
5665
else
57-
echo "PR job completed successfully"
66+
echo "PR job completed successfully"
5867
fi
5968

60-
wait "$master_pid" || master_exit=$?
69+
echo ""
70+
echo "=== Monitoring master job $master_job_id ==="
71+
master_exit=0
72+
bash "${SCRIPT_DIR}/run_monitored_slurm_job.sh" "$master_job_id" "master/${job_slug}.out" || master_exit=$?
6173
if [ "$master_exit" -ne 0 ]; then
62-
echo "Master job exited with code: $master_exit"
63-
echo "Last 50 lines of master job log:"
64-
tail -n 50 "master/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read master log"
74+
echo "Master job exited with code: $master_exit"
75+
tail -n 50 "master/${job_slug}.out" 2>/dev/null || echo " Could not read master log"
6576
else
66-
echo "Master job completed successfully"
77+
echo "Master job completed successfully"
6778
fi
6879

69-
# Warn if either job failed (partial results may still be usable)
80+
# --- Phase 3: Verify outputs ---
7081
if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
71-
echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
72-
echo "Checking for partial results..."
82+
echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
83+
echo "Checking for partial results..."
7384
else
74-
echo "=========================================="
75-
echo "Both benchmark jobs completed successfully!"
76-
echo "=========================================="
85+
echo "=========================================="
86+
echo "Both benchmark jobs completed successfully!"
87+
echo "=========================================="
7788
fi
7889

79-
# Final verification that output files exist before proceeding
80-
pr_yaml="pr/bench-${device}-${interface}.yaml"
81-
master_yaml="master/bench-${device}-${interface}.yaml"
90+
pr_yaml="pr/${job_slug}.yaml"
91+
master_yaml="master/${job_slug}.yaml"
92+
93+
# Wait up to 30s per file for YAML files to appear on NFS. When monitoring starts
94+
# after a job has already completed (common for the second job), the
95+
# recovery path in run_monitored_slurm_job.sh sleeps 30s, but NFS
96+
# propagation can take longer under load.
97+
for yaml in "$pr_yaml" "$master_yaml"; do
98+
attempts=0
99+
while [ ! -f "$yaml" ] && [ $attempts -lt 6 ]; do
100+
echo "Waiting for $yaml to appear (NFS propagation)..."
101+
sleep 5
102+
attempts=$((attempts + 1))
103+
done
104+
done
82105

83106
if [ ! -f "$pr_yaml" ]; then
84-
echo "ERROR: PR benchmark output not found: $pr_yaml"
85-
ls -la pr/ || true
86-
echo ""
87-
echo "Last 100 lines of PR log:"
88-
tail -n 100 "pr/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read PR log"
89-
exit 1
107+
echo "ERROR: PR benchmark output not found: $pr_yaml"
108+
ls -la pr/ || true
109+
echo ""
110+
tail -n 100 "pr/${job_slug}.out" 2>/dev/null || echo " Could not read PR log"
111+
exit 1
90112
fi
91113

92114
if [ ! -f "$master_yaml" ]; then
93-
echo "ERROR: Master benchmark output not found: $master_yaml"
94-
ls -la master/ || true
95-
echo ""
96-
echo "Last 100 lines of master log:"
97-
tail -n 100 "master/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read master log"
98-
exit 1
115+
echo "ERROR: Master benchmark output not found: $master_yaml"
116+
ls -la master/ || true
117+
echo ""
118+
tail -n 100 "master/${job_slug}.out" 2>/dev/null || echo " Could not read master log"
119+
exit 1
99120
fi
100121

101122
echo "Verified both YAML files exist:"
102123
echo " - $pr_yaml"
103124
echo " - $master_yaml"
104-

.github/scripts/submit-slurm-job.sh

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -200,5 +200,9 @@ echo "Submitted batch job $job_id"
200200
echo "$job_id" > "$id_file"
201201
echo "Job ID written to $id_file"
202202

203-
# --- Monitor ---
204-
bash "$SCRIPT_DIR/run_monitored_slurm_job.sh" "$job_id" "$output_file"
203+
# --- Monitor (skip if SUBMIT_ONLY=1, e.g. when the caller submits several jobs up front and monitors them itself) ---
204+
if [ "${SUBMIT_ONLY:-0}" = "1" ]; then
205+
echo "SUBMIT_ONLY mode: skipping monitor (job_id=$job_id output=$output_file)"
206+
else
207+
bash "$SCRIPT_DIR/run_monitored_slurm_job.sh" "$job_id" "$output_file"
208+
fi

0 commit comments

Comments
 (0)