Skip to content

Commit f7afee9

Browse files
authored
fp-stability: scale-free pass/fail + Verrou cancellation-origin reporting; auto-installed prebuilt (#1526)
1 parent 16fed9b commit f7afee9

8 files changed

Lines changed: 1190 additions & 947 deletions

File tree

.github/workflows/fp-stability.yml

Lines changed: 11 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,9 @@ name: FP Stability
2424
# On FAIL: verrou_dd_sym runs to identify the responsible function symbols.
2525
# Logs are uploaded as CI artifacts.
2626
#
27-
# Verrou (Valgrind 3.26.0 + edf-hpc/verrou@a58d434) is built once and cached.
28-
# Build takes ~20 min uncached; cached runs restore in ~30 s.
27+
# Verrou (the pinned Valgrind+Verrou pair; versions live in toolchain/bootstrap/verrou.sh)
28+
# is installed by fp-stability on first use and cached. The prebuilt download is seconds;
29+
# a cache miss with no prebuilt falls back to a ~20-min source build.
2930

3031
on:
3132
push:
@@ -68,37 +69,21 @@ jobs:
6869
uses: actions/cache@v4
6970
with:
7071
path: ~/.local/verrou
71-
key: verrou-a58d434-valgrind-3.26.0-${{ runner.os }}
72+
# Key off the installer's content so any version bump (or other edit) in
73+
# verrou.sh auto-busts the cache and forces a fresh install — no hand-synced
74+
# version string to drift out of date.
75+
key: verrou-${{ hashFiles('toolchain/bootstrap/verrou.sh') }}-${{ runner.os }}
7276

7377
- name: Install system dependencies
7478
run: |
7579
sudo apt-get update -y
7680
sudo apt-get install -y \
7781
build-essential automake python3 python3-numpy libc6-dbg \
78-
cmake gfortran
82+
cmake gfortran zstd
7983
80-
- name: Build Verrou
81-
if: steps.cache-verrou.outputs.cache-hit != 'true'
82-
run: |
83-
cd /tmp
84-
wget -q https://sourceware.org/pub/valgrind/valgrind-3.26.0.tar.bz2
85-
tar xf valgrind-3.26.0.tar.bz2
86-
87-
git clone https://github.com/edf-hpc/verrou.git
88-
git -C verrou checkout a58d434
89-
90-
# Merge Verrou into Valgrind source tree and patch
91-
cp -r verrou valgrind-3.26.0/verrou
92-
cd valgrind-3.26.0
93-
cat verrou/valgrind.*diff | patch -p1
94-
95-
./autogen.sh
96-
./configure --enable-only64bit --prefix="$HOME/.local/verrou"
97-
make -j"$(nproc)"
98-
make install
99-
100-
- name: Verify Verrou
101-
run: ~/.local/verrou/bin/valgrind --version
84+
# Verrou is installed by `fp-stability` itself on first use (downloads the
85+
# prebuilt artifact, falling back to a source build; aborts if the install fails). The cache above restores it across
86+
# runs so the download only happens on a cache miss.
10287

10388
- name: Build MFC (debug, serial)
10489
# FFLAGS=-fno-inline prevents gfortran from inlining small functions into

toolchain/bootstrap/verrou.sh

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
#!/bin/bash
2+
#
3+
# Opt-in installer for Verrou (the Valgrind FP-perturbation tool used by
4+
# `./mfc.sh fp-stability`). Verrou is NOT a Python/pip package - it is a fork of
5+
# Valgrind. By default this downloads a prebuilt, hash-verified artifact (seconds);
6+
# if none is available for this tag/arch it falls back to a source build (~20 min).
7+
# fp-stability auto-runs this on first use when Verrou is absent (printing what it
8+
# does); it is also safe to run by hand. A failed install aborts, never a silent skip.
9+
#
10+
# bash toolchain/bootstrap/verrou.sh # install into $HOME/.local/verrou
11+
# VERROU_HOME=/path bash toolchain/bootstrap/verrou.sh
12+
# bash toolchain/bootstrap/verrou.sh --force # reinstall even if present
13+
# VERROU_BUILD_FROM_SOURCE=1 bash toolchain/bootstrap/verrou.sh # skip the prebuilt
14+
#
15+
# Versions are pinned here (single source of truth); the fp-stability CI workflow keys its cache off this file's content.
16+
17+
set -euo pipefail
18+
19+
VALGRIND_VERSION="3.26.0"
20+
VERROU_COMMIT="a58d434"
21+
# Prebuilt artifacts (built once per arch) live in a small companion repo. The tag
22+
# pins to the (valgrind, verrou) pair above - bump all three together.
23+
VERROU_DIST_REPO="${VERROU_DIST_REPO:-sbryngelson/verrou-dist}"
24+
VERROU_DIST_TAG="${VERROU_DIST_TAG:-v1}"
25+
PREFIX="${VERROU_HOME:-$HOME/.local/verrou}"
26+
FORCE="${1:-}"
27+
28+
echo "==> Verrou bootstrap (Valgrind ${VALGRIND_VERSION} + edf-hpc/verrou@${VERROU_COMMIT}) -> ${PREFIX}"
29+
30+
# Idempotent: skip if already installed and working. Source env.sh first if present
31+
# (a prebuilt tree needs VALGRIND_LIB to run; a source build works either way).
32+
if [ "$FORCE" != "--force" ] && [ -x "${PREFIX}/bin/valgrind" ] \
33+
&& ( [ -f "${PREFIX}/env.sh" ] && . "${PREFIX}/env.sh"; "${PREFIX}/bin/valgrind" --tool=verrou --version >/dev/null 2>&1 ); then
34+
echo "==> Verrou already installed at ${PREFIX} (use --force to rebuild). Nothing to do."
35+
exit 0
36+
fi
37+
38+
# Platform: Valgrind has no working modern-macOS support; Linux only.
39+
if [ "$(uname -s)" != "Linux" ]; then
40+
echo "ERROR: Verrou requires Linux (Valgrind does not support modern macOS, incl. Apple Silicon)." >&2
41+
exit 1
42+
fi
43+
arch_tag=""
44+
case "$(uname -m)" in
45+
x86_64) arch_tag="x86_64" ;;
46+
aarch64|arm64)
47+
arch_tag="aarch64"
48+
echo "WARNING: $(uname -m) detected. Valgrind builds here, but Verrou's FP backends are" >&2
49+
echo " best-validated on x86_64 - treat results as experimental on this arch." >&2
50+
;;
51+
*)
52+
echo "WARNING: unrecognised arch $(uname -m); the build may fail. Proceeding anyway." >&2
53+
;;
54+
esac
55+
56+
# Fast path: download a prebuilt, hash-verified artifact and install it under
# $PREFIX instead of building from source. Any failure (no asset for this arch/tag,
# missing zstd/sha256sum/downloader, checksum mismatch, extract error, staged tree
# won't run, swap into $PREFIX fails) returns non-zero so the caller falls through
# to the source build.
#
# Globals read: arch_tag, PREFIX, VERROU_COMMIT, VALGRIND_VERSION,
#               VERROU_DIST_REPO, VERROU_DIST_TAG, VERROU_BUILD_FROM_SOURCE (optional)
# Returns:      0 once the verified prebuilt tree is in place at $PREFIX, 1 otherwise.
#               On failure an existing $PREFIX is left in place (or moved back).
try_prebuilt() {
  [ -n "$arch_tag" ] || return 1
  [ "${VERROU_BUILD_FROM_SOURCE:-}" = "1" ] && return 1
  command -v sha256sum >/dev/null 2>&1 || return 1
  tar --zstd --help >/dev/null 2>&1 || command -v zstd >/dev/null 2>&1 || return 1
  command -v curl >/dev/null 2>&1 || command -v wget >/dev/null 2>&1 || return 1

  local asset base parent dl stage old extracted
  asset="verrou-${VERROU_COMMIT}-valgrind-${VALGRIND_VERSION}-linux-${arch_tag}.tar.zst"
  base="https://github.com/${VERROU_DIST_REPO}/releases/download/${VERROU_DIST_TAG}/${asset}"

  # Scratch space lives NEXT TO $PREFIX (not in the system temp dir) so the final
  # `mv` is a same-filesystem rename rather than a copy that can fail half-way,
  # and so the staged valgrind can be executed even when /tmp is mounted noexec.
  # The parent is made absolute so later `cd`/`tar -C` cannot change what the
  # scratch paths refer to.
  parent="$(dirname "$PREFIX")"
  if ! { mkdir -p "$parent" && parent="$(cd "$parent" && pwd)" \
         && dl="$(mktemp -d "${parent}/.verrou-prebuilt.XXXXXX")"; }; then
    echo "WARNING: cannot create a staging directory next to ${PREFIX} - building from source instead." >&2
    return 1
  fi

  echo "==> Trying prebuilt ${VERROU_DIST_REPO}@${VERROU_DIST_TAG} (${asset})"
  _fetch() { # url dest
    if command -v curl >/dev/null 2>&1; then curl -fsSL -o "$2" "$1"; else wget -q -O "$2" "$1"; fi
  }
  if ! _fetch "$base" "$dl/$asset" || ! _fetch "$base.sha256" "$dl/$asset.sha256"; then
    echo "==> No prebuilt for this tag/arch - building from source instead."
    rm -rf "$dl"; return 1
  fi
  if ! ( cd "$dl" && sha256sum -c "$asset.sha256" >/dev/null 2>&1 ); then
    echo "WARNING: prebuilt checksum mismatch - building from source instead." >&2
    rm -rf "$dl"; return 1
  fi

  # Extract + verify in the staging dir, then swap into $PREFIX. set -e is
  # suppressed inside a function used as an `if` condition, so every step is
  # checked explicitly - otherwise a failed extract would fall through and the
  # source build would install on top of a half-written tree.
  stage="$dl/stage"
  extracted=1
  if ! mkdir -p "$stage"; then
    extracted=0
  elif tar --zstd --help >/dev/null 2>&1; then
    tar -C "$stage" --zstd -xf "$dl/$asset" || extracted=0
  else
    zstd -dc "$dl/$asset" | tar -C "$stage" -xf - || extracted=0
  fi
  if [ "$extracted" -ne 1 ]; then
    echo "WARNING: prebuilt extract failed - building from source instead." >&2
    rm -rf "$dl"; return 1
  fi

  # Valgrind bakes its build prefix into the binary; the artifact's env.sh sets
  # VALGRIND_LIB relative to the tree so the relocated install works. Verify the
  # staged tree runs before committing it.
  if ! ( . "${stage}/env.sh" && "${stage}/bin/valgrind" --tool=verrou --version >/dev/null 2>&1 ); then
    echo "WARNING: prebuilt did not run - building from source instead." >&2
    rm -rf "$dl"; return 1
  fi

  # Commit only now. Move any existing install aside (instead of deleting it),
  # rename the staged tree into place, and roll back if that rename fails. This
  # avoids two failure modes of `rm -rf "$PREFIX"; mv stage "$PREFIX"`: losing a
  # working install when the move fails, and silently nesting the new tree at
  # $PREFIX/stage when the old directory could not be removed.
  old=""
  if [ -e "$PREFIX" ] || [ -L "$PREFIX" ]; then
    old="$dl/old"
    if ! mv "$PREFIX" "$old"; then
      echo "WARNING: could not install prebuilt to ${PREFIX} - building from source instead." >&2
      rm -rf "$dl"; return 1
    fi
  fi
  if ! mv "$stage" "$PREFIX"; then
    echo "WARNING: could not install prebuilt to ${PREFIX} - building from source instead." >&2
    if [ -n "$old" ] && ! mv "$old" "$PREFIX"; then
      # Do not delete the scratch dir: it holds the only copy of the old install.
      echo "WARNING: could not restore the previous install; it is preserved at ${old}" >&2
      return 1
    fi
    rm -rf "$dl"; return 1
  fi
  rm -rf "$dl"
  return 0
}
114+
115+
if try_prebuilt; then
116+
echo "==> Verifying"
117+
( . "${PREFIX}/env.sh" && "${PREFIX}/bin/valgrind" --tool=verrou --version )
118+
echo "==> Done (prebuilt). Verrou installed at ${PREFIX}"
119+
echo " Run: ./mfc.sh fp-stability (or set VERROU_HOME=${PREFIX} if you used a custom prefix)"
120+
exit 0
121+
fi
122+
123+
# Build dependencies.
124+
missing=""
125+
for tool in tar git make patch autoconf automake; do
126+
command -v "$tool" >/dev/null 2>&1 || missing="$missing $tool"
127+
done
128+
command -v cc >/dev/null 2>&1 || command -v gcc >/dev/null 2>&1 || missing="$missing gcc"
129+
command -v wget >/dev/null 2>&1 || command -v curl >/dev/null 2>&1 || missing="$missing wget/curl"
130+
if [ -n "$missing" ]; then
131+
echo "ERROR: missing build dependencies:$missing" >&2
132+
echo " Install them (e.g. apt: build-essential automake autoconf libtool; or load HPC modules) and retry." >&2
133+
exit 1
134+
fi
135+
136+
workdir="$(mktemp -d)"
137+
trap 'rm -rf "$workdir"' EXIT
138+
cd "$workdir"
139+
140+
tarball="valgrind-${VALGRIND_VERSION}.tar.bz2"
141+
url="https://sourceware.org/pub/valgrind/${tarball}"
142+
echo "==> Downloading ${tarball}"
143+
if command -v wget >/dev/null 2>&1; then
144+
wget -q "$url"
145+
else
146+
curl -fsSL -o "$tarball" "$url"
147+
fi
148+
tar xf "$tarball"
149+
150+
echo "==> Cloning Verrou @ ${VERROU_COMMIT}"
151+
git clone --quiet https://github.com/edf-hpc/verrou.git
152+
git -C verrou checkout --quiet "$VERROU_COMMIT"
153+
154+
# Merge Verrou into the Valgrind tree and apply its patch.
155+
cp -r verrou "valgrind-${VALGRIND_VERSION}/verrou"
156+
cd "valgrind-${VALGRIND_VERSION}"
157+
cat verrou/valgrind.*diff | patch -p1
158+
159+
echo "==> Building (this takes ~20 min)"
160+
./autogen.sh
161+
./configure --enable-only64bit --prefix="$PREFIX"
162+
make -j"$(nproc)"
163+
make install
164+
165+
echo "==> Verifying"
166+
"${PREFIX}/bin/valgrind" --tool=verrou --version
167+
echo "==> Done. Verrou installed at ${PREFIX}"
168+
echo " Run: ./mfc.sh fp-stability (or set VERROU_HOME=${PREFIX} if you used a custom prefix)"

toolchain/mfc/cli/commands.py

Lines changed: 30 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -898,27 +898,36 @@
898898
name="fp-stability",
899899
help="Run floating-point stability tests using Verrou.",
900900
description=(
901-
"Runs each registered test case N times under Verrou's random IEEE-754 "
902-
"rounding mode and compares against a nearest-rounding reference run. "
903-
"Reports the max L∞ deviation and PASS/FAIL against per-case thresholds.\n\n"
904-
"Requires a Verrou-enabled Valgrind at $VERROU_HOME/bin/valgrind "
905-
"(defaults to $HOME/.local/verrou). The simulation and pre_process "
906-
"binaries must be serial (no-MPI, no-GPU) debug builds.\n\n"
907-
"Test cases:\n"
908-
" sod_standard 1-D standard Sod, p_L/p_R=10 (well-conditioned baseline)\n"
909-
" sod_strong 1-D Sod, p_L/p_R=100,000 — HLLC xi-factor cancellation\n"
910-
" water_stiffened 1-D water shock (pi_inf=4046) — pressure-recovery cancellation\n"
911-
" air_water_interface 1-D air/water contact (two-fluid) — mixed-cell cancellation\n\n"
912-
"Additional features (skip with --no-* flags):\n"
901+
"Runs Verrou random-rounding stability analysis on a built-in suite of small "
902+
"1-D cases, or - given a case .py (positional INPUT) - on your own case. Each "
903+
"case is run N times under Verrou's random IEEE-754 rounding and compared "
904+
"against a nearest-rounding reference. PASS/FAIL is scale-free: a case must "
905+
"retain at least ~24 significant bits (single precision) under random rounding "
906+
"(no per-case thresholds).\n\n"
907+
"With a case .py, that case is run as a SINGLE serial CPU process under Verrou "
908+
"(~30x slower, and run many times), so it must be a small, short proxy - large "
909+
"grids or long runs are rejected with guidance; serial .dat I/O is forced. "
910+
"Example: ./mfc.sh fp-stability my_case.py\n\n"
911+
"Uses a Verrou-enabled Valgrind at $VERROU_HOME/bin/valgrind (defaults to "
912+
"$HOME/.local/verrou); if absent it is installed automatically (a pinned, "
913+
"hash-verified prebuilt is downloaded, with a source build as fallback) - "
914+
"aborts if that install fails. The simulation and pre_process binaries must "
915+
"be serial (no-MPI, no-GPU) debug builds.\n\n"
916+
"Analysis passes (skip with --no-* flags):\n"
913917
" float proxy One run with --rounding-mode=float (single-precision sensitivity)\n"
914918
" vprec sweep Runs at mantissa bits [52, 23, 16, 10] (precision floor curve)\n"
915-
" dd_sym verrou_dd_sym bisection to responsible functions (on failure)\n"
916-
" dd_line verrou_dd_line bisection to responsible source lines (on failure)\n"
917-
" cancellation --check-cancellation detection of catastrophic cancellation sites\n"
918-
" mca-sigbits Monte Carlo Arithmetic (mcaquad) significant-bits lower bound\n"
919-
" float-max --check-max-float detection of double→float overflow sites\n"
919+
" cancellation --check-cancellation origins, ranked by significant digits lost\n"
920+
" float-max --check-max-float detection of double->float overflow sites\n"
920921
),
921922
include_common=["mfc_config", "verbose", "debug_log"],
923+
positionals=[
924+
Positional(
925+
name="input",
926+
help="Optional case .py to analyze instead of the built-in suite (run as a single serial CPU process under Verrou; must be small/short).",
927+
nargs="?",
928+
completion=Completion(type=CompletionType.FILES_PY),
929+
),
930+
],
922931
arguments=[
923932
Argument(
924933
name="sim-binary",
@@ -960,34 +969,13 @@
960969
default=False,
961970
dest="no_vprec",
962971
),
963-
Argument(
964-
name="no-dd-sym",
965-
help="Skip verrou_dd_sym function-level delta-debug on failure.",
966-
action=ArgAction.STORE_TRUE,
967-
default=False,
968-
dest="no_dd_sym",
969-
),
970-
Argument(
971-
name="no-dd-line",
972-
help="Skip verrou_dd_line source-line delta-debug on failure.",
973-
action=ArgAction.STORE_TRUE,
974-
default=False,
975-
dest="no_dd_line",
976-
),
977972
Argument(
978973
name="no-cancellation",
979974
help="Skip --check-cancellation catastrophic-cancellation detection.",
980975
action=ArgAction.STORE_TRUE,
981976
default=False,
982977
dest="no_cancellation",
983978
),
984-
Argument(
985-
name="no-mca",
986-
help="Skip Monte Carlo Arithmetic (mcaquad) significant-bits estimate.",
987-
action=ArgAction.STORE_TRUE,
988-
default=False,
989-
dest="no_mca",
990-
),
991979
Argument(
992980
name="no-float-max",
993981
help="Skip --check-max-float float32 overflow detection.",
@@ -997,14 +985,15 @@
997985
),
998986
],
999987
examples=[
1000-
Example("./mfc.sh fp-stability", "Auto-discover binaries and run all cases"),
988+
Example("./mfc.sh fp-stability", "Auto-discover binaries and run the built-in suite"),
989+
Example("./mfc.sh fp-stability my_case.py", "Analyze your own case (small/short, serial, CPU)"),
1001990
Example(
1002991
"./mfc.sh fp-stability --sim-binary build/install/abc123/bin/simulation",
1003992
"Specify simulation binary explicitly",
1004993
),
1005994
Example("./mfc.sh fp-stability -N 10", "Run 10 random-rounding samples per case"),
1006-
Example("./mfc.sh fp-stability --no-vprec --no-dd-line", "Skip VPREC sweep and line debug"),
1007-
Example("./mfc.sh fp-stability --no-cancellation --no-mca --no-float-max", "Skip new analysis passes"),
995+
Example("./mfc.sh fp-stability --no-vprec --no-cancellation", "Skip VPREC sweep and cancellation detection"),
996+
Example("./mfc.sh fp-stability --no-cancellation --no-float-max", "Skip analysis passes"),
1008997
],
1009998
key_options=[
1010999
("--sim-binary PATH", "Serial simulation binary (debug, no-MPI)"),
@@ -1013,10 +1002,7 @@
10131002
("-N, --samples N", "Random-rounding samples per case (default: 5)"),
10141003
("--no-float-proxy", "Skip float-rounding proxy run"),
10151004
("--no-vprec", "Skip VPREC mantissa-bit sweep"),
1016-
("--no-dd-sym", "Skip verrou_dd_sym on failure"),
1017-
("--no-dd-line", "Skip verrou_dd_line on failure"),
10181005
("--no-cancellation", "Skip cancellation detection"),
1019-
("--no-mca", "Skip MCA significant-bits estimate"),
10201006
("--no-float-max", "Skip float32 overflow detection"),
10211007
],
10221008
)

0 commit comments

Comments
 (0)