Skip to content

Commit 92abc4d

Browse files
authored
Merge branch 'master' into bala_force_changes
2 parents 47afbfe + edff972 commit 92abc4d

60 files changed

Lines changed: 1982 additions & 835 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/file-filter.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ yml: &yml
2525
- '.github/workflows/phoenix/**'
2626
- '.github/workflows/frontier/**'
2727
- '.github/workflows/frontier_amd/**'
28+
- '.github/scripts/**'
2829
- '.github/workflows/bench.yml'
2930
- '.github/workflows/test.yml'
3031
- '.github/workflows/formatting.yml'

.github/scripts/bench-preamble.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash
# Common setup sourced by the benchmark scripts: GPU detection plus the
# selection of build and device options.
# Sets: $gpu_opts, $build_opts, $device_opts, $n_ranks, $ngpus, $gpu_ids
# Usage: source .github/scripts/bench-preamble.sh

# detect-gpus.sh provides $ngpus and $gpu_ids; gpu-opts.sh provides $gpu_opts.
. .github/scripts/detect-gpus.sh
. .github/scripts/gpu-opts.sh

# Defaults for non-GPU jobs: 12 ranks and no device-specific options.
device_opts=""
build_opts="${gpu_opts}"
n_ranks=12

# GPU jobs: rank count follows the detected GPU count, and the detected GPU
# ids are appended to the device options after -g.
case "$job_device" in
    gpu)
        n_ranks=${ngpus}
        device_opts="${gpu_opts} -g ${gpu_ids}"
        ;;
esac
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#!/usr/bin/env python3

"""Validate case-optimization output: check D/*.dat for NaN/Inf via the packer."""

import math
import os
import sys


def _first_bad_value(values):
    """Return (index, label) of the first NaN/Inf in `values`, or None if there is none.

    `label` is 'NaN' for a NaN and 'Inf' for a positive or negative infinity.
    """
    for idx, value in enumerate(values):
        if math.isnan(value):
            return idx, 'NaN'
        if math.isinf(value):
            return idx, 'Inf'
    return None


# Exactly one positional argument is required: the case directory.
if len(sys.argv) != 2:
    print(f"Usage: {sys.argv[0]} <case_directory>", file=sys.stderr)
    sys.exit(1)

# Make the directory two levels above this script importable so that the
# `toolchain` package below can be resolved.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))

from toolchain.mfc.packer.pack import compile as pack_compile

# A path to a file is accepted too: its parent directory is used instead.
case_dir = sys.argv[1]
if os.path.isfile(case_dir):
    case_dir = os.path.dirname(case_dir)

pack, err = pack_compile(case_dir)
if err is not None:
    print(f"ERROR: {err}")
    sys.exit(1)

if not pack.entries:
    print(f"ERROR: No data found in {case_dir}/D/")
    sys.exit(1)

if pack.has_bad_values():
    print("ERROR: NaN or Inf detected in output:")
    # Report only the first offending value of each entry.
    for name, entry in pack.entries.items():
        hit = _first_bad_value(entry.doubles)
        if hit is not None:
            position, label = hit
            print(f" {label} at index {position} in {name}")
    sys.exit(1)

file_count = len(pack.entries)
value_count = 0
for entry in pack.entries.values():
    value_count += len(entry.doubles)
print(f"OK: {file_count} files, {value_count} values — no NaN/Inf found")

.github/scripts/detect-gpus.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/bin/bash
# Detects GPUs (NVIDIA or AMD), sets $ngpus and $gpu_ids.
# Usage: source .github/scripts/detect-gpus.sh
#
# Sourcing this file defines detect_gpus() and runs it once.

#######################################
# Probe for GPUs using nvidia-smi, falling back to rocm-smi.
# Globals:   ngpus   (written) - number of GPUs found, 0 if none
#            gpu_ids (written) - space-separated GPU ids, empty if none
# Arguments: none
# Returns:   0 (safe to call from scripts running under `set -e`)
#######################################
detect_gpus() {
    local i
    ngpus=0
    gpu_ids=""
    if command -v nvidia-smi &>/dev/null; then
        # Count only "GPU <n>: ..." lines: indented MIG sub-device lines and
        # driver error text must not be counted as GPUs.
        # `|| true`: grep -c exits 1 when the count is 0, which would abort
        # callers that use `set -e`.
        ngpus=$(nvidia-smi -L | grep -c '^GPU ' || true)
        # Build "0 1 ... ngpus-1" without calling seq on an empty range.
        i=0
        while [ "$i" -lt "$ngpus" ]; do
            gpu_ids+="${gpu_ids:+ }$i"
            i=$((i + 1))
        done
    elif command -v rocm-smi &>/dev/null; then
        gpu_ids=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ')
        # tr leaves a trailing separator behind; drop it.
        gpu_ids="${gpu_ids% }"
        ngpus=$(echo "$gpu_ids" | wc -w)
        # Normalise any padding some wc implementations add around the count.
        ngpus=$((ngpus + 0))
    fi
}

detect_gpus

.github/scripts/gpu-opts.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/bin/bash
# Derives $gpu_opts from $job_device and $job_interface.
# Usage: source .github/scripts/gpu-opts.sh
#
# Result: empty unless $job_device is "gpu"; otherwise "--gpu", followed by
# "mp" for the omp interface or "acc" for the acc interface.

gpu_opts=""
case "$job_device" in
    gpu)
        case "$job_interface" in
            omp) gpu_opts="--gpu mp" ;;
            acc) gpu_opts="--gpu acc" ;;
            *)   gpu_opts="--gpu" ;;
        esac
        ;;
esac

.github/scripts/monitor_slurm_job.sh

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,17 @@ cleanup() {
99
if [ -n "${tail_pid:-}" ]; then
1010
kill "${tail_pid}" 2>/dev/null || true
1111
fi
12-
# Cancel the SLURM job if the monitor is exiting due to an error
13-
# (e.g., the CI runner is being killed). Don't cancel on success.
12+
# Cancel the SLURM job only if it is still active in the scheduler.
13+
# If the job already left the queue (squeue returns empty), it has finished
14+
# and run_monitored_slurm_job.sh will recover via sacct — don't cancel it.
1415
if [ "${monitor_success:-0}" -ne 1 ] && [ -n "${job_id:-}" ]; then
15-
echo "Monitor exiting abnormally — cancelling SLURM job $job_id"
16-
scancel "$job_id" 2>/dev/null || true
16+
active_state=$(squeue -j "$job_id" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || echo "")
17+
if [ -n "$active_state" ]; then
18+
echo "Monitor exiting abnormally — cancelling SLURM job $job_id (state: $active_state)"
19+
scancel "$job_id" 2>/dev/null || true
20+
else
21+
echo "Monitor exiting abnormally — SLURM job $job_id already left queue, not cancelling"
22+
fi
1723
fi
1824
}
1925
trap cleanup EXIT
@@ -56,9 +62,11 @@ get_job_state() {
5662
}
5763

5864
# Check if a state is terminal (job is done, for better or worse)
65+
# PREEMPTED is intentionally excluded: with --requeue the job restarts under
66+
# the same job ID and we must keep monitoring rather than exiting early.
5967
is_terminal_state() {
6068
case "$1" in
61-
COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|PREEMPTED|REVOKED)
69+
COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|REVOKED)
6270
return 0 ;;
6371
*)
6472
return 1 ;;
@@ -74,7 +82,7 @@ while [ ! -f "$output_file" ]; do
7482
state=$(get_job_state "$job_id")
7583

7684
case "$state" in
77-
PENDING|CONFIGURING)
85+
PENDING|CONFIGURING|PREEMPTED)
7886
unknown_count=0
7987
sleep 5
8088
;;
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#!/bin/bash

# Builds every benchmark case ahead of time with --case-optimization.
# Two ways to invoke it:
#   1. Direct (Frontier login nodes): cluster/device/interface given as args
#   2. Inside SLURM (Phoenix): $job_device/$job_interface come from submit.sh
# Usage: bash prebuild-case-optimization.sh [<cluster> <device> <interface>]

set -e

# Positional arguments win; otherwise fall back to the environment
# (and to "phoenix" when no cluster is given at all).
cluster="${1:-${job_cluster:-phoenix}}"
job_device="${2:-$job_device}"
job_interface="${3:-$job_interface}"

# Map the cluster name to the value passed to `mfc.sh load -c`.
if [ "$cluster" = "phoenix" ]; then
    flag="p"
elif [ "$cluster" = "frontier" ]; then
    flag="f"
elif [ "$cluster" = "frontier_amd" ]; then
    flag="famd"
else
    echo "ERROR: Unknown cluster '$cluster'"
    exit 1
fi

# Always start from an empty build directory.
rm -rf build

source ./mfc.sh load -c "$flag" -m g
. .github/scripts/gpu-opts.sh

# $gpu_opts is deliberately left unquoted: it may hold several words
# (e.g. "--gpu acc") that must reach mfc.sh as separate arguments.
for case_file in benchmarks/*/case.py; do
    echo "=== Pre-building: $case_file ==="
    ./mfc.sh build -i "$case_file" --case-optimization $gpu_opts -j 8
done

.github/scripts/retry-build.sh

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#!/bin/bash
# Provides retry_build(): 2-attempt loop.
# On failure of attempt 1, nukes the entire build directory before attempt 2.
# Set RETRY_VALIDATE_CMD to run a post-build validation; failure triggers a retry.
# Usage: source .github/scripts/retry-build.sh
#        retry_build ./mfc.sh build -j 8 --gpu acc

#######################################
# Run a build command, retrying once from a clean ./build on failure.
# Globals:   RETRY_VALIDATE_CMD (read) - optional shell snippet evaluated with
#            `eval` after a successful build; a non-zero status counts as a
#            failed attempt. It is eval'd, so it must come from trusted CI
#            configuration only.
# Arguments: the build command followed by its arguments (required)
# Outputs:   progress messages on stdout; the usage error on stderr
# Returns:   0 on success, 1 when every attempt failed, 2 when called
#            without a command
#######################################
retry_build() {
    local validate_cmd="${RETRY_VALIDATE_CMD:-}"
    local max_attempts=2
    local attempt=1
    local retry_msg fail_msg delay

    # An empty "$@" would run nothing and report success; reject it instead.
    if [ "$#" -eq 0 ]; then
        echo "retry_build: no build command given" >&2
        return 2
    fi

    while [ "$attempt" -le "$max_attempts" ]; do
        echo "Build attempt $attempt of $max_attempts..."
        if ! "$@"; then
            retry_msg=" Build failed — nuking build directory before retry..."
            fail_msg="Build failed after $max_attempts attempts."
            delay=30
        elif [ -n "$validate_cmd" ] && ! eval "$validate_cmd"; then
            echo "Post-build validation failed on attempt $attempt."
            retry_msg=" Nuking build directory before retry..."
            fail_msg="Validation still failing after $max_attempts attempts."
            delay=5
        else
            echo "Build succeeded on attempt $attempt."
            return 0
        fi

        # Shared failure path for both build and validation errors.
        if [ "$attempt" -ge "$max_attempts" ]; then
            echo "$fail_msg"
            return 1
        fi
        echo "$retry_msg"
        # Best effort: a leftover directory must not prevent the retry.
        rm -rf build 2>/dev/null || true
        sleep "$delay"
        attempt=$((attempt + 1))
    done
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/bin/bash
# Runs ./mfc.sh test with all provided arguments, then retries a small number
# of sporadic failures (up to 5). Exits non-zero on real failures.
# Usage: bash .github/scripts/run-tests-with-retry.sh [mfc test args...]

# Largest number of failed tests still treated as sporadic and retried.
MAX_RETRYABLE=5

# Extract flags that should carry over to retries (retries build their own
# argument list with --only, so we capture passthrough flags here).
PASSTHROUGH=()
for arg in "$@"; do
    case "$arg" in
        --test-all) PASSTHROUGH+=("--test-all") ;;
    esac
done

rm -f tests/failed_uuids.txt
TEST_EXIT=0
/bin/bash mfc.sh test "$@" || TEST_EXIT=$?

# Collect the failed UUIDs into an array. Counting array elements instead of
# newlines stays correct when the file has no trailing newline or contains
# blank lines, and the entries are never subject to glob expansion.
FAILED=()
if [ -s tests/failed_uuids.txt ]; then
    while read -r -a fields || [ "${#fields[@]}" -gt 0 ]; do
        if [ "${#fields[@]}" -gt 0 ]; then
            FAILED+=("${fields[@]}")
        fi
    done < tests/failed_uuids.txt
fi
NUM_FAILED=${#FAILED[@]}

# Retry only if a small number of tests failed (sporadic failures)
if [ "$NUM_FAILED" -gt 0 ]; then
    if [ "$NUM_FAILED" -le "$MAX_RETRYABLE" ]; then
        echo ""
        echo "=== Retrying $NUM_FAILED failed test(s): ${FAILED[*]} ==="
        echo ""
        /bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" --only "${FAILED[@]}" "${PASSTHROUGH[@]}" || exit $?
    else
        echo "Too many failures ($NUM_FAILED) to retry — likely a real issue."
        exit 1
    fi
elif [ "$TEST_EXIT" -ne 0 ]; then
    # The run failed without recording any failed UUIDs: propagate its status.
    exit "$TEST_EXIT"
fi
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#!/bin/bash

# CI driver for the case-optimization tests.
# Meant to run inside a SLURM job; $job_device and $job_interface are expected
# to be provided by submit.sh.

set -e

. .github/scripts/detect-gpus.sh
. .github/scripts/gpu-opts.sh

# GPU mode with no GPU detected: assume a single GPU.
if [ "$job_device" = "gpu" ]; then
    if [ "$ngpus" -eq 0 ]; then
        ngpus=1
    fi
fi

# The venv interpreter (created by ./mfc.sh build) is needed for validation.
[ -x build/venv/bin/python3 ] || {
    echo "ERROR: build/venv/bin/python3 not found."
    echo "The MFC build venv may not have been created. Was the pre-build step successful?"
    exit 1
}

benchmarks=(
    benchmarks/5eq_rk3_weno3_hllc/case.py
    benchmarks/viscous_weno5_sgb_acoustic/case.py
    benchmarks/hypo_hll/case.py
    benchmarks/ibm/case.py
    benchmarks/igr/case.py
)

passed=0
failed=0
failed_cases=""

# Remove the output directories of the case located in directory $1.
purge_outputs() {
    rm -rf "$1/D" "$1/p_all" "$1/restart_data"
}

# Report case $1 as failed for reason $2 and record it for the summary.
note_failure() {
    echo "FAIL: $1 ($2)"
    failed=$((failed + 1))
    failed_cases="$failed_cases $1"
}

for case_file in "${benchmarks[@]}"; do
    case_dir="$(dirname "$case_file")"
    case_name="$(basename "$case_dir")"
    echo ""
    echo "========================================"
    echo "Case-optimization test: $case_name"
    echo "========================================"

    # Start from a clean slate.
    purge_outputs "$case_dir"

    # Build + run with --case-optimization, small grid, 10 timesteps.
    # $gpu_opts is intentionally unquoted so that it splits into words.
    if ! ./mfc.sh run "$case_file" --case-optimization $gpu_opts -n "$ngpus" -j "$(nproc)" -- --gbpp 1 --steps 10; then
        note_failure "$case_name" "build or run error"
    elif ! build/venv/bin/python3 .github/scripts/check_case_optimization_output.py "$case_dir"; then
        note_failure "$case_name" "validation error"
    else
        echo "PASS: $case_name"
        passed=$((passed + 1))
    fi

    # Do not leave output behind for the next case.
    purge_outputs "$case_dir"
done

echo ""
echo "========================================"
echo "Case-optimization summary: $passed passed, $failed failed"
if [ "$failed" -gt 0 ]; then
    echo "Failed cases:$failed_cases"
fi
echo "========================================"

if [ "$failed" -eq 0 ]; then
    exit 0
fi
exit 1

0 commit comments

Comments
 (0)