Skip to content

Commit a3b2bb4

Browse files
authored
Merge branch 'master' into fix/time-stepping-order
2 parents d9bab25 + 25a074e commit a3b2bb4

8 files changed

Lines changed: 101 additions & 72 deletions

File tree

.github/scripts/run_parallel_benchmarks.sh

Lines changed: 74 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,9 @@
11
#!/bin/bash
2-
# Run PR and master benchmarks in parallel and verify outputs
2+
# Run PR and master benchmarks and verify outputs.
3+
# Both SLURM jobs are submitted up front so they run concurrently on
4+
# compute nodes (fair comparison under the same cluster load), but
5+
# monitoring happens sequentially to stay within the per-user cgroup
6+
# memory limit on login nodes (4 GB on Phoenix shared by 7 runners).
37
# Usage: run_parallel_benchmarks.sh <device> <interface> <cluster>
48

59
set -euo pipefail
@@ -17,88 +21,104 @@ cluster="$3"
1721
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
1822

1923
echo "=========================================="
20-
echo "Starting parallel benchmark jobs..."
24+
echo "Starting benchmark jobs..."
2125
echo "=========================================="
2226

23-
# For Phoenix GPU benchmarks, select a consistent GPU partition before launching
24-
# both parallel jobs so PR and master always land on the same GPU type.
27+
# For Phoenix GPU benchmarks, select a consistent GPU partition so PR and
28+
# master always land on the same GPU type.
2529
if [ "$device" = "gpu" ] && [ "$cluster" = "phoenix" ]; then
2630
echo "Selecting Phoenix GPU partition for benchmark consistency..."
27-
# Require 2 nodes so both PR and master jobs can run concurrently.
31+
# Require 2 nodes so both jobs can run concurrently on compute.
2832
GPU_PARTITION_MIN_NODES=2 source "${SCRIPT_DIR}/select-gpu-partition.sh"
2933
BENCH_GPU_PARTITION="$SELECTED_GPU_PARTITION"
3034
export BENCH_GPU_PARTITION
3135
fi
3236

33-
# Run both jobs with monitoring using dedicated script from PR
34-
# Use stdbuf for line-buffered output and prefix each line for clarity
35-
(set -o pipefail; stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" pr "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[PR] $line"; done) &
36-
pr_pid=$!
37-
echo "PR job started in background (PID: $pr_pid)"
38-
39-
(set -o pipefail; stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" master "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[MASTER] $line"; done) &
40-
master_pid=$!
41-
echo "Master job started in background (PID: $master_pid)"
42-
43-
echo "Waiting for both jobs to complete..."
44-
45-
# Wait and capture exit codes reliably.
46-
# Use `wait ... || exit=$?` to avoid set -e aborting on the first failure
47-
# (which would orphan the second job).
37+
# The bench script must come from the PR tree (master may not have it).
38+
PR_BENCH_SCRIPT="$(cd "${SCRIPT_DIR}/../workflows/common" && pwd)/bench.sh"
39+
# Must match the slug computed by submit-slurm-job.sh:
40+
# basename("bench.sh") → "bench" → "bench-${device}-${interface}"
41+
job_slug="bench-${device}-${interface}"
42+
43+
# --- Phase 1: Submit both SLURM jobs (no monitoring yet) ---
44+
echo "Submitting PR benchmark..."
45+
(cd pr && SUBMIT_ONLY=1 bash "${SCRIPT_DIR}/submit-slurm-job.sh" "$PR_BENCH_SCRIPT" "$device" "$interface" "$cluster")
46+
pr_job_id=$(cat "pr/${job_slug}.slurm_job_id")
47+
echo "PR job submitted: $pr_job_id"
48+
49+
echo "Submitting master benchmark..."
50+
(cd master && SUBMIT_ONLY=1 bash "${SCRIPT_DIR}/submit-slurm-job.sh" "$PR_BENCH_SCRIPT" "$device" "$interface" "$cluster")
51+
master_job_id=$(cat "master/${job_slug}.slurm_job_id")
52+
echo "Master job submitted: $master_job_id"
53+
54+
echo "Both SLURM jobs submitted — running concurrently on compute nodes."
55+
echo "Monitoring sequentially to conserve login node memory."
56+
57+
# --- Phase 2: Monitor sequentially (one at a time on login node) ---
58+
echo ""
59+
echo "=== Monitoring PR job $pr_job_id ==="
4860
pr_exit=0
49-
master_exit=0
50-
51-
wait "$pr_pid" || pr_exit=$?
61+
bash "${SCRIPT_DIR}/run_monitored_slurm_job.sh" "$pr_job_id" "pr/${job_slug}.out" || pr_exit=$?
5262
if [ "$pr_exit" -ne 0 ]; then
53-
echo "PR job exited with code: $pr_exit"
54-
echo "Last 50 lines of PR job log:"
55-
tail -n 50 "pr/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read PR log"
63+
echo "PR job exited with code: $pr_exit"
64+
tail -n 50 "pr/${job_slug}.out" 2>/dev/null || echo " Could not read PR log"
5665
else
57-
echo "PR job completed successfully"
66+
echo "PR job completed successfully"
5867
fi
5968

60-
wait "$master_pid" || master_exit=$?
69+
echo ""
70+
echo "=== Monitoring master job $master_job_id ==="
71+
master_exit=0
72+
bash "${SCRIPT_DIR}/run_monitored_slurm_job.sh" "$master_job_id" "master/${job_slug}.out" || master_exit=$?
6173
if [ "$master_exit" -ne 0 ]; then
62-
echo "Master job exited with code: $master_exit"
63-
echo "Last 50 lines of master job log:"
64-
tail -n 50 "master/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read master log"
74+
echo "Master job exited with code: $master_exit"
75+
tail -n 50 "master/${job_slug}.out" 2>/dev/null || echo " Could not read master log"
6576
else
66-
echo "Master job completed successfully"
77+
echo "Master job completed successfully"
6778
fi
6879

69-
# Warn if either job failed (partial results may still be usable)
80+
# --- Phase 3: Verify outputs ---
7081
if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
71-
echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
72-
echo "Checking for partial results..."
82+
echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
83+
echo "Checking for partial results..."
7384
else
74-
echo "=========================================="
75-
echo "Both benchmark jobs completed successfully!"
76-
echo "=========================================="
85+
echo "=========================================="
86+
echo "Both benchmark jobs completed successfully!"
87+
echo "=========================================="
7788
fi
7889

79-
# Final verification that output files exist before proceeding
80-
pr_yaml="pr/bench-${device}-${interface}.yaml"
81-
master_yaml="master/bench-${device}-${interface}.yaml"
90+
pr_yaml="pr/${job_slug}.yaml"
91+
master_yaml="master/${job_slug}.yaml"
92+
93+
# Wait briefly for YAML files to appear on NFS. When monitoring starts
94+
# after a job has already completed (common for the second job), the
95+
# recovery path in run_monitored_slurm_job.sh sleeps 30s, but NFS
96+
# propagation can take longer under load.
97+
for yaml in "$pr_yaml" "$master_yaml"; do
98+
attempts=0
99+
while [ ! -f "$yaml" ] && [ $attempts -lt 6 ]; do
100+
echo "Waiting for $yaml to appear (NFS propagation)..."
101+
sleep 5
102+
attempts=$((attempts + 1))
103+
done
104+
done
82105

83106
if [ ! -f "$pr_yaml" ]; then
84-
echo "ERROR: PR benchmark output not found: $pr_yaml"
85-
ls -la pr/ || true
86-
echo ""
87-
echo "Last 100 lines of PR log:"
88-
tail -n 100 "pr/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read PR log"
89-
exit 1
107+
echo "ERROR: PR benchmark output not found: $pr_yaml"
108+
ls -la pr/ || true
109+
echo ""
110+
tail -n 100 "pr/${job_slug}.out" 2>/dev/null || echo " Could not read PR log"
111+
exit 1
90112
fi
91113

92114
if [ ! -f "$master_yaml" ]; then
93-
echo "ERROR: Master benchmark output not found: $master_yaml"
94-
ls -la master/ || true
95-
echo ""
96-
echo "Last 100 lines of master log:"
97-
tail -n 100 "master/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read master log"
98-
exit 1
115+
echo "ERROR: Master benchmark output not found: $master_yaml"
116+
ls -la master/ || true
117+
echo ""
118+
tail -n 100 "master/${job_slug}.out" 2>/dev/null || echo " Could not read master log"
119+
exit 1
99120
fi
100121

101122
echo "Verified both YAML files exist:"
102123
echo " - $pr_yaml"
103124
echo " - $master_yaml"
104-

.github/scripts/submit-slurm-job.sh

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -200,5 +200,9 @@ echo "Submitted batch job $job_id"
200200
echo "$job_id" > "$id_file"
201201
echo "Job ID written to $id_file"
202202

203-
# --- Monitor ---
204-
bash "$SCRIPT_DIR/run_monitored_slurm_job.sh" "$job_id" "$output_file"
203+
# --- Monitor (skip if SUBMIT_ONLY=1, e.g. for parallel submission) ---
204+
if [ "${SUBMIT_ONLY:-0}" = "1" ]; then
205+
echo "SUBMIT_ONLY mode: skipping monitor (job_id=$job_id output=$output_file)"
206+
else
207+
bash "$SCRIPT_DIR/run_monitored_slurm_job.sh" "$job_id" "$output_file"
208+
fi

.github/workflows/test.yml

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -234,6 +234,9 @@ jobs:
234234
# submit-slurm-job.sh can detect and cancel stale SLURM jobs on retry.
235235
clean: false
236236

237+
- name: Clean stale output files
238+
run: rm -f *.out
239+
237240
- name: Build (login node)
238241
if: matrix.cluster != 'phoenix'
239242
timeout-minutes: 60
@@ -317,6 +320,9 @@ jobs:
317320
with:
318321
clean: false
319322

323+
- name: Clean stale output files
324+
run: rm -f *.out
325+
320326
- name: Pre-Build (SLURM)
321327
if: matrix.cluster == 'phoenix'
322328
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh cpu ${{ matrix.interface }} ${{ matrix.cluster }}

src/common/m_model.fpp

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -241,14 +241,14 @@ contains
241241
end subroutine s_read_stl
242242

243243
!> This procedure reads an OBJ file.
244-
!! @param filepath Path to the odj file.
244+
!! @param filepath Path to the obj file.
245245
!! @param model The obj file.
246246
impure subroutine s_read_obj(filepath, model)
247247

248248
character(LEN=*), intent(in) :: filepath
249249
type(t_model), intent(out) :: model
250250

251-
integer :: i, j, k, l, iunit, iostat, nVertices
251+
integer :: i, j, k, l, iv3, iunit, iostat, nVertices
252252

253253
real(wp), dimension(1:3), allocatable :: vertices(:, :)
254254

@@ -297,10 +297,10 @@ contains
297297
read (line(3:), *) vertices(i, :)
298298
i = i + 1
299299
case ("f ")
300-
read (line(3:), *) k, l, j
300+
read (line(3:), *) k, l, iv3
301301
model%trs(j)%v(1, :) = vertices(k, :)
302302
model%trs(j)%v(2, :) = vertices(l, :)
303-
model%trs(j)%v(3, :) = vertices(j, :)
303+
model%trs(j)%v(3, :) = vertices(iv3, :)
304304
j = j + 1
305305
case default
306306
print *, "Error: unknown line type in OBJ file ", filepath

src/post_process/m_data_output.fpp

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1102,6 +1102,7 @@ contains
11021102
call MPI_BCAST(file_time, 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
11031103
call MPI_BCAST(file_dt, 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
11041104
call MPI_BCAST(file_num_procs, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
1105+
time_real = file_time
11051106
11061107
allocate (proc_bubble_counts(file_num_procs))
11071108
@@ -1271,6 +1272,7 @@ contains
12711272
call MPI_BCAST(file_time, 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
12721273
call MPI_BCAST(file_dt, 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
12731274
call MPI_BCAST(file_num_procs, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
1275+
time_real = file_time
12741276
12751277
allocate (proc_bubble_counts(file_num_procs))
12761278
@@ -1546,19 +1548,16 @@ contains
15461548
counter = counter + 1
15471549
x_d1(counter) = x_cc(j)
15481550
y_d1(counter) = y_cc(k)
1549-
euc_d = sqrt((x_cc(j) - x_d1(i))**2 + (y_cc(k) - y_d1(i))**2)
1550-
tgp = sqrt(dx(j)**2 + dy(k)**2)
15511551
else
1552-
euc_d = sqrt((x_cc(j) - x_d1(i))**2 + (y_cc(k) - y_d1(i))**2)
15531552
tgp = sqrt(dx(j)**2 + dy(k)**2)
15541553
do i = 1, counter
1554+
euc_d = sqrt((x_cc(j) - x_d1(i))**2 + (y_cc(k) - y_d1(i))**2)
15551555
if (euc_d < tgp) then
1556-
cycle
1557-
elseif (euc_d > tgp .and. i == counter) then
1556+
exit
1557+
elseif (i == counter) then
15581558
counter = counter + 1
15591559
x_d1(counter) = x_cc(j)
15601560
y_d1(counter) = y_cc(k)
1561-
15621561
end if
15631562
end do
15641563
end if

src/pre_process/m_simplex_noise.fpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -205,8 +205,8 @@ contains
205205
x2 = x0 - 1._wp + 2._wp*G2
206206
y2 = y0 - 1._wp + 2._wp*G2
207207

208-
ii = mod(i, 255)
209-
jj = mod(j, 255)
208+
ii = iand(i, 255)
209+
jj = iand(j, 255)
210210

211211
gi0 = mod(p_vec(ii + p_vec(jj)), 10) + 1
212212
gi1 = mod(p_vec(ii + i1 + p_vec(jj + j1)), 10) + 1

src/pre_process/m_start_up.fpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -357,7 +357,7 @@ contains
357357
! the time-step directory that will contain the new grid and initial
358358
! condition data are also generated.
359359
if (old_ic .neqv. .true.) then
360-
call s_delete_directory(trim(proc_rank_dir)//'/*')
360+
call s_delete_directory(trim(proc_rank_dir))
361361
call s_create_directory(trim(proc_rank_dir)//'/0')
362362
end if
363363
@@ -507,7 +507,7 @@ contains
507507
! process may be cleaned out to make room for new pre-process data.
508508
! In addition, the time-step folder that will contain the new grid
509509
! and initial condition data are also generated.
510-
call s_create_directory(trim(proc_rank_dir)//'/*')
510+
call s_delete_directory(trim(proc_rank_dir))
511511
call s_create_directory(trim(proc_rank_dir)//'/0')
512512
513513
end subroutine s_read_serial_ic_data_files

src/simulation/m_global_parameters.fpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -815,8 +815,8 @@ contains
815815
integral(i)%xmax = dflt_real
816816
integral(i)%ymin = dflt_real
817817
integral(i)%ymax = dflt_real
818-
integral(i)%ymin = dflt_real
819-
integral(i)%ymax = dflt_real
818+
integral(i)%zmin = dflt_real
819+
integral(i)%zmax = dflt_real
820820
end do
821821
822822
! GRCBC flags

0 commit comments

Comments
 (0)