11#! /bin/bash
2- # Run PR and master benchmarks in parallel and verify outputs
2+ # Run PR and master benchmarks and verify outputs.
3+ # Both SLURM jobs are submitted up front so they run concurrently on
4+ # compute nodes (fair comparison under the same cluster load), but
5+ # monitoring happens sequentially to stay within the per-user cgroup
6+ # memory limit on login nodes (4 GB on Phoenix shared by 7 runners).
37# Usage: run_parallel_benchmarks.sh <device> <interface> <cluster>
48
59set -euo pipefail
@@ -17,88 +21,104 @@ cluster="$3"
1721SCRIPT_DIR=" $( cd " $( dirname " ${BASH_SOURCE[0]} " ) " && pwd) "
1822
1923echo " =========================================="
20- echo " Starting parallel benchmark jobs..."
24+ echo " Starting benchmark jobs..."
2125echo " =========================================="
2226
23- # For Phoenix GPU benchmarks, select a consistent GPU partition before launching
24- # both parallel jobs so PR and master always land on the same GPU type.
27+ # For Phoenix GPU benchmarks, select a consistent GPU partition so PR and
28+ # master always land on the same GPU type.
2529if [ " $device " = " gpu" ] && [ " $cluster " = " phoenix" ]; then
2630 echo " Selecting Phoenix GPU partition for benchmark consistency..."
27- # Require 2 nodes so both PR and master jobs can run concurrently.
31+ # Require 2 nodes so both jobs can run concurrently on compute nodes.
2832 GPU_PARTITION_MIN_NODES=2 source " ${SCRIPT_DIR} /select-gpu-partition.sh"
2933 BENCH_GPU_PARTITION=" $SELECTED_GPU_PARTITION "
3034 export BENCH_GPU_PARTITION
3135fi
3236
33- # Run both jobs with monitoring using dedicated script from PR
34- # Use stdbuf for line-buffered output and prefix each line for clarity
35- (set -o pipefail; stdbuf -oL -eL bash " ${SCRIPT_DIR} /submit_and_monitor_bench.sh" pr " $device " " $interface " " $cluster " 2>&1 | while IFS= read -r line; do echo " [PR] $line " ; done) &
36- pr_pid=$!
37- echo " PR job started in background (PID: $pr_pid )"
38-
39- (set -o pipefail; stdbuf -oL -eL bash " ${SCRIPT_DIR} /submit_and_monitor_bench.sh" master " $device " " $interface " " $cluster " 2>&1 | while IFS= read -r line; do echo " [MASTER] $line " ; done) &
40- master_pid=$!
41- echo " Master job started in background (PID: $master_pid )"
42-
43- echo " Waiting for both jobs to complete..."
44-
45- # Wait and capture exit codes reliably.
46- # Use `wait ... || exit=$?` to avoid set -e aborting on the first failure
47- # (which would orphan the second job).
37+ # The bench script must come from the PR tree (master may not have it).
38+ PR_BENCH_SCRIPT=" $( cd " ${SCRIPT_DIR} /../workflows/common" && pwd) /bench.sh"
39+ # Must match the slug computed by submit-slurm-job.sh:
40+ # basename("bench.sh") → "bench" → "bench-${device}-${interface}"
41+ job_slug=" bench-${device} -${interface} "
42+
43+ # --- Phase 1: Submit both SLURM jobs (no monitoring yet) ---
44+ echo " Submitting PR benchmark..."
45+ (cd pr && SUBMIT_ONLY=1 bash " ${SCRIPT_DIR} /submit-slurm-job.sh" " $PR_BENCH_SCRIPT " " $device " " $interface " " $cluster " )
46+ pr_job_id=$( cat " pr/${job_slug} .slurm_job_id" )
47+ echo " PR job submitted: $pr_job_id "
48+
49+ echo " Submitting master benchmark..."
50+ (cd master && SUBMIT_ONLY=1 bash " ${SCRIPT_DIR} /submit-slurm-job.sh" " $PR_BENCH_SCRIPT " " $device " " $interface " " $cluster " )
51+ master_job_id=$( cat " master/${job_slug} .slurm_job_id" )
52+ echo " Master job submitted: $master_job_id "
53+
54+ echo " Both SLURM jobs submitted — running concurrently on compute nodes."
55+ echo " Monitoring sequentially to conserve login node memory."
56+
57+ # --- Phase 2: Monitor sequentially (one at a time on login node) ---
58+ echo " "
59+ echo " === Monitoring PR job $pr_job_id ==="
4860pr_exit=0
49- master_exit=0
50-
51- wait " $pr_pid " || pr_exit=$?
61+ bash " ${SCRIPT_DIR} /run_monitored_slurm_job.sh" " $pr_job_id " " pr/${job_slug} .out" || pr_exit=$?
5262if [ " $pr_exit " -ne 0 ]; then
53- echo " PR job exited with code: $pr_exit "
54- echo " Last 50 lines of PR job log:"
55- tail -n 50 " pr/bench-${device} -${interface} .out" 2> /dev/null || echo " Could not read PR log"
63+ echo " PR job exited with code: $pr_exit "
64+ tail -n 50 " pr/${job_slug} .out" 2> /dev/null || echo " Could not read PR log"
5665else
57- echo " PR job completed successfully"
66+ echo " PR job completed successfully"
5867fi
5968
60- wait " $master_pid " || master_exit=$?
69+ echo " "
70+ echo " === Monitoring master job $master_job_id ==="
71+ master_exit=0
72+ bash " ${SCRIPT_DIR} /run_monitored_slurm_job.sh" " $master_job_id " " master/${job_slug} .out" || master_exit=$?
6173if [ " $master_exit " -ne 0 ]; then
62- echo " Master job exited with code: $master_exit "
63- echo " Last 50 lines of master job log:"
64- tail -n 50 " master/bench-${device} -${interface} .out" 2> /dev/null || echo " Could not read master log"
74+ echo " Master job exited with code: $master_exit "
75+ tail -n 50 " master/${job_slug} .out" 2> /dev/null || echo " Could not read master log"
6576else
66- echo " Master job completed successfully"
77+ echo " Master job completed successfully"
6778fi
6879
69- # Warn if either job failed (partial results may still be usable)
80+ # --- Phase 3: Verify outputs ---
7081if [ " ${pr_exit} " -ne 0 ] || [ " ${master_exit} " -ne 0 ]; then
71- echo " WARNING: Benchmark jobs had failures: pr=${pr_exit} , master=${master_exit} "
72- echo " Checking for partial results..."
82+ echo " WARNING: Benchmark jobs had failures: pr=${pr_exit} , master=${master_exit} "
83+ echo " Checking for partial results..."
7384else
74- echo " =========================================="
75- echo " Both benchmark jobs completed successfully!"
76- echo " =========================================="
85+ echo " =========================================="
86+ echo " Both benchmark jobs completed successfully!"
87+ echo " =========================================="
7788fi
7889
79- # Final verification that output files exist before proceeding
80- pr_yaml=" pr/bench-${device} -${interface} .yaml"
81- master_yaml=" master/bench-${device} -${interface} .yaml"
90+ pr_yaml=" pr/${job_slug} .yaml"
91+ master_yaml=" master/${job_slug} .yaml"
92+
93+ # Poll for each YAML file to appear on NFS (up to 6 checks, 5 s apart).
94+ # When monitoring starts after a job has already completed (common for
95+ # the second job), the recovery path in run_monitored_slurm_job.sh
96+ # sleeps 30s, but NFS propagation can take longer under load.
97+ for yaml in " $pr_yaml " " $master_yaml " ; do
98+ attempts=0
99+ while [ ! -f " $yaml " ] && [ $attempts -lt 6 ]; do
100+ echo " Waiting for $yaml to appear (NFS propagation)..."
101+ sleep 5
102+ attempts=$(( attempts + 1 ))
103+ done
104+ done
82105
83106if [ ! -f " $pr_yaml " ]; then
84- echo " ERROR: PR benchmark output not found: $pr_yaml "
85- ls -la pr/ || true
86- echo " "
87- echo " Last 100 lines of PR log:"
88- tail -n 100 " pr/bench-${device} -${interface} .out" 2> /dev/null || echo " Could not read PR log"
89- exit 1
107+ echo " ERROR: PR benchmark output not found: $pr_yaml "
108+ ls -la pr/ || true
109+ echo " "
110+ tail -n 100 " pr/${job_slug} .out" 2> /dev/null || echo " Could not read PR log"
111+ exit 1
90112fi
91113
92114if [ ! -f " $master_yaml " ]; then
93- echo " ERROR: Master benchmark output not found: $master_yaml "
94- ls -la master/ || true
95- echo " "
96- echo " Last 100 lines of master log:"
97- tail -n 100 " master/bench-${device} -${interface} .out" 2> /dev/null || echo " Could not read master log"
98- exit 1
115+ echo " ERROR: Master benchmark output not found: $master_yaml "
116+ ls -la master/ || true
117+ echo " "
118+ tail -n 100 " master/${job_slug} .out" 2> /dev/null || echo " Could not read master log"
119+ exit 1
99120fi
100121
101122echo " Verified both YAML files exist:"
102123echo " - $pr_yaml "
103124echo " - $master_yaml "
104-
0 commit comments