Skip to content

Commit 8b37c79

Browse files
authored
Merge branch 'master' into fix/mhd-cleaning-speed
2 parents e553bbc + edff972 commit 8b37c79

161 files changed

Lines changed: 9578 additions & 3169 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.codeant/configuration.json

Lines changed: 0 additions & 14 deletions
This file was deleted.

.coderabbit.yaml

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,52 @@ language: en-US
33

44
reviews:
55
profile: chill
6+
7+
# Hide the “Review skipped / Draft detected” status message
8+
review_status: false
9+
10+
# Remove the “Prompt for all review comments with AI agents” block
11+
enable_prompt_for_ai_agents: false
12+
13+
# Disable extra/non-default summary sections
14+
high_level_summary: false
15+
related_issues: false
16+
sequence_diagrams: false
17+
changed_files_summary: false
18+
19+
# Further reduce walkthrough noise (footer itself isn't currently configurable)
20+
collapse_walkthrough: true
21+
review_details: false
22+
estimate_code_review_effort: false
23+
assess_linked_issues: false
24+
related_prs: false
25+
suggested_labels: false
26+
suggested_reviewers: false
27+
in_progress_fortune: false
28+
poem: false
29+
30+
# Remove "Finishing Touches" section content
31+
finishing_touches:
32+
docstrings:
33+
enabled: false
34+
unit_tests:
35+
enabled: false
36+
37+
auto_review:
38+
enabled: true # initial PR-open review
39+
auto_incremental_review: false # no re-run on every push
40+
641
path_instructions:
42+
- path: "**/*"
43+
instructions: |
44+
IMPORTANT: Only comment on code that is within this PR’s diff (changed lines / files).
45+
Do not add feedback that requires commenting outside the diff range.
46+
747
- path: "src/**/*.fpp"
848
instructions: |
949
Fortran source (Fypp-preprocessed). Follow the coding standards in
1050
docs/documentation/contributing.md and the GPU macro API in
11-
docs/documentation/gpuParallelization.md.
12-
- path: "src/**/*.f90"
13-
instructions: |
14-
Fortran source. Follow the coding standards in
51+
docs/documentation/gpuParallelization.md and the coding standards in
1552
docs/documentation/contributing.md.
1653
- path: "toolchain/**/*.py"
1754
instructions: |

.github/file-filter.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ yml: &yml
2525
- '.github/workflows/phoenix/**'
2626
- '.github/workflows/frontier/**'
2727
- '.github/workflows/frontier_amd/**'
28+
- '.github/scripts/**'
2829
- '.github/workflows/bench.yml'
2930
- '.github/workflows/test.yml'
3031
- '.github/workflows/formatting.yml'

.github/pull_request_template.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,11 @@ See the [developer guide](https://mflowcode.github.io/documentation/contributing
3030
- [ ] Tested on NVIDIA GPU or AMD GPU
3131

3232
</details>
33+
34+
## AI code reviews
35+
36+
Reviews are not triggered automatically. To request a review, comment on the PR:
37+
- `@coderabbitai review` — incremental review (new changes only)
38+
- `@coderabbitai full review` — full review from scratch
39+
- `/review` — Qodo review
40+
- `/improve` — Qodo code suggestions

.github/scripts/bench-preamble.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
#!/bin/bash
# Common setup sourced by the benchmark scripts.
#
# Pulls in GPU detection and GPU option selection, then derives the
# variables the benchmark drivers consume.
# Sets: $gpu_opts, $build_opts, $device_opts, $n_ranks, $ngpus, $gpu_ids
# Usage: source .github/scripts/bench-preamble.sh

source .github/scripts/detect-gpus.sh
source .github/scripts/gpu-opts.sh

# Defaults used for non-GPU jobs: fixed rank count, no device options.
n_ranks=12
build_opts="$gpu_opts"
device_opts=""

# GPU jobs: the rank count follows the number of detected GPUs, and the
# detected GPU ids are forwarded through -g.
case "$job_device" in
  gpu)
    n_ranks=$ngpus
    device_opts="$gpu_opts -g $gpu_ids"
    ;;
esac
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env python3

"""Validate case-optimization output: check D/*.dat for NaN/Inf via the packer.

Usage: <script> <case_directory>

The single argument may be a case directory or a file inside it; when a file
is given, its containing directory is used. Exits with status 1 when the
argument count is wrong, when the packer reports an error, when no entries
are found, or when any value is NaN/Inf. Otherwise prints a one-line summary
and exits normally.
"""

import math
import sys
import os

if len(sys.argv) != 2:
    print(f"Usage: {sys.argv[0]} <case_directory>", file=sys.stderr)
    sys.exit(1)

# Allow importing from the repo root
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))

from toolchain.mfc.packer.pack import compile as pack_compile

case_dir = sys.argv[1]
if os.path.isfile(case_dir):
    # A bare filename such as "case.py" has an empty dirname; fall back to the
    # current directory so the packer is not handed an empty path.
    case_dir = os.path.dirname(case_dir) or "."

# pack_compile returns a (pack, error) pair; a non-None error means failure.
pack, err = pack_compile(case_dir)
if err is not None:
    print(f"ERROR: {err}")
    sys.exit(1)

if not pack.entries:
    print(f"ERROR: No data found in {case_dir}/D/")
    sys.exit(1)

if pack.has_bad_values():
    print("ERROR: NaN or Inf detected in output:")
    for name, entry in pack.entries.items():
        for i, val in enumerate(entry.doubles):
            if math.isnan(val) or math.isinf(val):
                label = 'NaN' if math.isnan(val) else 'Inf'
                print(f"  {label} at index {i} in {name}")
                # Report only the first offending value of each entry.
                break
    sys.exit(1)

total = sum(len(e.doubles) for e in pack.entries.values())
print(f"OK: {len(pack.entries)} files, {total} values — no NaN/Inf found")

.github/scripts/detect-gpus.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Detects GPUs (NVIDIA or AMD), sets $ngpus and $gpu_ids.
# Usage: source .github/scripts/detect-gpus.sh

#######################################
# Probe the vendor SMI tool and publish the GPUs it lists.
# Globals:   ngpus (written)   - number of GPUs found, 0 if none
#            gpu_ids (written) - space-separated GPU indices, empty if none
# Arguments: none
# Returns:   0; a missing tool or an empty listing leaves ngpus=0
#######################################
_detect_gpus() {
  ngpus=0
  gpu_ids=""
  if command -v nvidia-smi &>/dev/null; then
    # Count only "GPU <n>: ..." lines. Counting every output line would also
    # count indented MIG sub-device lines and any diagnostic text the tool
    # prints when it cannot list devices. grep -c exits 1 on zero matches,
    # so `|| true` keeps callers using `set -e`/`pipefail` alive.
    ngpus=$(nvidia-smi -L 2>/dev/null | grep -c '^GPU ' || true)
    if [ "${ngpus:-0}" -gt 0 ]; then
      gpu_ids=$(seq -s ' ' 0 $((ngpus - 1)))
    else
      ngpus=0
    fi
  elif command -v rocm-smi &>/dev/null; then
    # The first column of each row carries the GPU index; extract the digits.
    # `|| true`: an empty listing makes grep exit 1, which must not abort
    # callers that enabled `set -e`/`pipefail`.
    gpu_ids=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ' || true)
    ngpus=$(echo "$gpu_ids" | wc -w)
  fi
}

_detect_gpus

.github/scripts/gpu-opts.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Derives $gpu_opts from $job_device and $job_interface.
# Usage: source .github/scripts/gpu-opts.sh

# Non-GPU jobs get no GPU options at all.
gpu_opts=""
case "$job_device" in
  gpu)
    gpu_opts="--gpu"
    # Append the interface selector; any other interface value leaves
    # the bare --gpu flag.
    case "$job_interface" in
      omp) gpu_opts="$gpu_opts mp" ;;
      acc) gpu_opts="$gpu_opts acc" ;;
    esac
    ;;
esac

.github/scripts/monitor_slurm_job.sh

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,17 @@ cleanup() {
99
if [ -n "${tail_pid:-}" ]; then
1010
kill "${tail_pid}" 2>/dev/null || true
1111
fi
12-
# Cancel the SLURM job if the monitor is exiting due to an error
13-
# (e.g., the CI runner is being killed). Don't cancel on success.
12+
# Cancel the SLURM job only if it is still active in the scheduler.
13+
# If the job already left the queue (squeue returns empty), it has finished
14+
# and run_monitored_slurm_job.sh will recover via sacct — don't cancel it.
1415
if [ "${monitor_success:-0}" -ne 1 ] && [ -n "${job_id:-}" ]; then
15-
echo "Monitor exiting abnormally — cancelling SLURM job $job_id"
16-
scancel "$job_id" 2>/dev/null || true
16+
active_state=$(squeue -j "$job_id" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || echo "")
17+
if [ -n "$active_state" ]; then
18+
echo "Monitor exiting abnormally — cancelling SLURM job $job_id (state: $active_state)"
19+
scancel "$job_id" 2>/dev/null || true
20+
else
21+
echo "Monitor exiting abnormally — SLURM job $job_id already left queue, not cancelling"
22+
fi
1723
fi
1824
}
1925
trap cleanup EXIT
@@ -56,9 +62,11 @@ get_job_state() {
5662
}
5763

5864
# Check if a state is terminal (job is done, for better or worse)
65+
# PREEMPTED is intentionally excluded: with --requeue the job restarts under
66+
# the same job ID and we must keep monitoring rather than exiting early.
5967
is_terminal_state() {
6068
case "$1" in
61-
COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|PREEMPTED|REVOKED)
69+
COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|REVOKED)
6270
return 0 ;;
6371
*)
6472
return 1 ;;
@@ -74,7 +82,7 @@ while [ ! -f "$output_file" ]; do
7482
state=$(get_job_state "$job_id")
7583

7684
case "$state" in
77-
PENDING|CONFIGURING)
85+
PENDING|CONFIGURING|PREEMPTED)
7886
unknown_count=0
7987
sleep 5
8088
;;
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
#!/bin/bash

# Pre-builds every benchmark case with --case-optimization.
# Two ways to run it:
#   1. Direct (Frontier login nodes): pass cluster/device/interface as args
#   2. Inside SLURM (Phoenix): uses $job_device/$job_interface from submit.sh
# Usage: bash prebuild-case-optimization.sh [<cluster> <device> <interface>]

set -e

# Positional arguments win; otherwise fall back to the environment
# (cluster defaults to phoenix when neither is provided).
cluster="${1:-${job_cluster:-phoenix}}"
job_device="${2:-$job_device}"
job_interface="${3:-$job_interface}"

# Map the cluster name to the flag passed to `mfc.sh load -c`.
if [ "$cluster" = "phoenix" ]; then
  flag="p"
elif [ "$cluster" = "frontier" ]; then
  flag="f"
elif [ "$cluster" = "frontier_amd" ]; then
  flag="famd"
else
  echo "ERROR: Unknown cluster '$cluster'"
  exit 1
fi

# Start from a clean build tree.
rm -rf build

. ./mfc.sh load -c "$flag" -m g
source .github/scripts/gpu-opts.sh

for case_file in benchmarks/*/case.py; do
  echo "=== Pre-building: $case_file ==="
  # $gpu_opts is intentionally unquoted: it may hold several words
  # (e.g. "--gpu acc") or be empty.
  ./mfc.sh build -i "$case_file" --case-optimization $gpu_opts -j 8
done

0 commit comments

Comments
 (0)