Skip to content

Commit 2276eb1

Browse files
committed
fp-stability: accept a user case.py (positional, like run), with a feasibility guard
Following the native convention (run/validate/viz take the case .py as a positional 'input'), fp-stability now does too — './mfc.sh fp-stability my_case.py' analyzes your case instead of the built-in suite; omitting it runs the suite as before. It loads the case via the shared loader (run.input.load), runs it as a single case, and auto-detects the files to diff from the reference run (_autodetect_compare: conserved-var .dat at the final step, prim fallback). Output is forced to serial .dat I/O (parallel_io=F) since the no-MPI binary is run as one process and the suite diffs serial files. Guard (Verrou is ~30x slower and the suite runs the sim many times): the case must be a small, short, single-process proxy — it errors if cells > 100k or work (cells x t_step_stop) > 200k cell-steps, with guidance to coarsen. Validated end-to-end on a real case .py (auto-compare + sig-bits PASS + cancellation digits); the guard correctly rejects 1D_sodshocktube (400k cell-steps). 60 toolchain tests, ruff, and precheck (all 7 checks) pass.
1 parent 982ec89 commit 2276eb1

4 files changed

Lines changed: 111 additions & 2 deletions

File tree

toolchain/mfc/cli/commands.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -919,6 +919,14 @@
919919
" float-max --check-max-float detection of double→float overflow sites\n"
920920
),
921921
include_common=["mfc_config", "verbose", "debug_log"],
922+
positionals=[
923+
Positional(
924+
name="input",
925+
help="Optional case .py to analyze instead of the built-in suite (run as a single serial CPU process under Verrou; must be small/short).",
926+
nargs="?",
927+
completion=Completion(type=CompletionType.FILES_PY),
928+
),
929+
],
922930
arguments=[
923931
Argument(
924932
name="sim-binary",
@@ -997,7 +1005,8 @@
9971005
),
9981006
],
9991007
examples=[
1000-
Example("./mfc.sh fp-stability", "Auto-discover binaries and run all cases"),
1008+
Example("./mfc.sh fp-stability", "Auto-discover binaries and run the built-in suite"),
1009+
Example("./mfc.sh fp-stability my_case.py", "Analyze your own case (small/short, serial, CPU)"),
10011010
Example(
10021011
"./mfc.sh fp-stability --sim-binary build/install/abc123/bin/simulation",
10031012
"Specify simulation binary explicitly",

toolchain/mfc/fp_stability.py

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@
7575
from .fp_stability_metrics import (
7676
CANCEL_BIT_LEVELS,
7777
MIN_SIG_BITS,
78+
_autodetect_compare,
7879
_cancellation_severity,
7980
_mark_cancellation,
8081
_max_abs_np,
@@ -431,6 +432,15 @@ def _run_case(
431432
cons.print(" [dim]reference run (rounding=nearest)...[/dim]")
432433
_run_simulation_verrou(verrou_bin, sim_bin, work_dir, ref_dir, rounding_mode="nearest")
433434

435+
# For a user case with no fixed compare list, diff whatever the reference
436+
# run actually wrote (conserved vars at the final step).
437+
if not compare:
438+
compare = _autodetect_compare(os.listdir(ref_dir))
439+
case["compare"] = compare
440+
if not compare:
441+
raise MFCException("case produced no cons.*/prim.* output to compare (check t_step_save/t_step_stop and parallel_io)")
442+
cons.print(f" [dim]comparing: {', '.join(compare)}[/dim]")
443+
434444
# --- A: random-rounding stability samples ---
435445
# Pass/fail is scale-free: bits retained = -log2(max_dev / field-scale),
436446
# vs one global floor (no per-case hand-tuned absolute threshold).
@@ -587,6 +597,51 @@ def _run_case(
587597
return result
588598

589599

600+
# Feasibility limits for a user-supplied case. Verrou is ~30x slower and the
# suite runs the simulation many times, so only a small, short, single-process
# proxy is accepted. Work is measured as cells x time steps, which rejects both
# a huge grid and a long run (built-in cases are ~1k cell-steps).
FP_CASE_MAX_CELLS = 100_000
FP_CASE_MAX_WORK = 200_000  # cells x t_step_stop


def _load_user_case(input_path: str) -> dict:
    """Turn a user case .py into a single fp-stability case dict.

    The case parameters are loaded through the shared run loader, serial I/O is
    forced (``parallel_io = "F"``), and the grid size / step count are checked
    against ``FP_CASE_MAX_CELLS`` and ``FP_CASE_MAX_WORK`` before anything is
    run. The returned dict leaves ``compare`` empty; the files to diff are
    auto-detected later from the reference run's output.

    Args:
        input_path: Path to the user's case .py file.

    Returns:
        A case dict with the keys ``name``, ``description``, ``compare``,
        ``ill_cond``, ``pre`` and ``sim``.

    Raises:
        MFCException: If the case has more than ``FP_CASE_MAX_CELLS`` cells, or
            if cells x ``t_step_stop`` exceeds ``FP_CASE_MAX_WORK``.
    """
    from .run import input as run_input  # lazy import: avoids a circular import

    params = run_input.load(input_path, None, {}, do_print=False).params

    # Force serial .dat I/O: the suite runs the no-MPI binary as one process and
    # diffs serial cons.*/prim.* files (not the parallel SILO/HDF5 path).
    params["parallel_io"] = "F"

    def _int_param(key: str) -> int:
        # Missing, None and other falsy values all count as 0.
        return int(params.get(key, 0) or 0)

    # m/n/p are treated as last-cell indices: each direction has index + 1
    # cells, so an unused direction (0) contributes a factor of 1.
    cells = 1
    for axis in ("m", "n", "p"):
        cells *= _int_param(axis) + 1

    t_stop = _int_param("t_step_stop")
    # A missing or zero t_step_stop is counted as a single step, so in that
    # situation only the cell-count limit is effectively enforced.
    work = cells * max(t_stop, 1)

    if cells > FP_CASE_MAX_CELLS:
        raise MFCException(
            f"case has {cells:,} cells — too large for Verrou (~30x slowdown, run many times). "
            f"Use a coarsened proxy (<= {FP_CASE_MAX_CELLS:,} cells)."
        )
    if work > FP_CASE_MAX_WORK:
        raise MFCException(
            f"case is ~{work:,} cell-steps ({cells:,} cells x {t_stop} time steps) — too slow under "
            f"Verrou (~30x, run many times). Reduce m/n/p or t_step_stop (target <= {FP_CASE_MAX_WORK:,} cell-steps)."
        )

    # Name the case after the file; for examples/<name>/case.py the directory
    # name is more telling than the generic "case".
    label, _ext = os.path.splitext(os.path.basename(input_path))
    if label == "case":
        parent_dir = os.path.basename(os.path.dirname(os.path.abspath(input_path)))
        if parent_dir:
            label = parent_dir

    # NOTE(review): "pre" and "sim" reference the same dict object, so an
    # in-place change made through one is visible through the other.
    return {
        "name": label,
        "description": f"user case {input_path} ({cells} cells, run single-rank on CPU)",
        "compare": [],  # auto-detected from the reference run's output
        "ill_cond": "",
        "pre": params,
        "sim": params,
    }
643+
644+
590645
def fp_stability():
591646
verrou_bin = ARG("verrou_binary") or _find_verrou()
592647
if not verrou_bin or not os.path.isfile(verrou_bin):
@@ -610,6 +665,8 @@ def fp_stability():
610665
run_mca = not ARG("no_mca")
611666
run_float_max = not ARG("no_float_max")
612667

668+
cases_to_run = [_load_user_case(ARG("input"))] if ARG("input") else CASES
669+
613670
log_dir = os.path.join(MFC_ROOT_DIR, "fp-stability-logs")
614671
os.makedirs(log_dir, exist_ok=True)
615672

@@ -618,6 +675,8 @@ def fp_stability():
618675
cons.print(f" verrou: {verrou_bin}")
619676
cons.print(f" simulation: {sim_bin}")
620677
cons.print(f" pre_process: {pp_bin}")
678+
if ARG("input"):
679+
cons.print(f" case: {ARG('input')} (single serial CPU run under Verrou)")
621680
cons.print(f" samples: {n_samples}")
622681
features = []
623682
if run_float:
@@ -640,7 +699,7 @@ def fp_stability():
640699

641700
start = time.time()
642701
results = []
643-
for case in CASES:
702+
for case in cases_to_run:
644703
try:
645704
r = _run_case(
646705
case,

toolchain/mfc/fp_stability_metrics.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,24 @@
1616
# 52 = full double, 23 = single, 16 = half-ish, 10 = ultra-low.
1717
VPREC_MANTISSA_BITS = [52, 23, 16, 10]
1818

19+
# Field output file names: (cons|prim).<digits>.<digits>.<step>.dat. Group 1 is
# the file family, group 2 the time-step index.
_OUTPUT_DAT = re.compile(r"^(cons|prim)\.\d+\.\d+\.(\d+)\.dat$")


def _autodetect_compare(filenames: list) -> list:
    """Choose the output files to diff for a user-supplied case.

    Only names matching ``_OUTPUT_DAT`` are considered, and any directory part
    is stripped before matching. Among the matches, the files at the highest
    time-step index are kept: the ``cons`` files if there are any at that step,
    otherwise the ``prim`` files at that step.

    Args:
        filenames: File names or paths to scan.

    Returns:
        The sorted base names of the selected files, or ``[]`` when no name
        matches the field-output pattern.
    """
    latest_step = None
    at_latest = {"cons": [], "prim": []}

    for entry in filenames:
        base = os.path.basename(entry)
        hit = _OUTPUT_DAT.match(base)
        if hit is None:
            continue
        step = int(hit.group(2))
        if latest_step is None or step > latest_step:
            # A newer step supersedes everything collected so far.
            latest_step = step
            at_latest = {"cons": [], "prim": []}
        if step == latest_step:
            at_latest[hit.group(1)].append(base)

    chosen = at_latest["cons"] if at_latest["cons"] else at_latest["prim"]
    return sorted(chosen)
35+
36+
1937
# Stability pass/fail (stage A) is scale-free: a case must retain at least this
2038
# many significant bits under random rounding (sig_bits = -log2(max_dev/scale)).
2139
# 24 ~= single precision. One global floor replaces per-case absolute thresholds

toolchain/mfc/test_fp_stability.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from mfc.fp_stability_metrics import (
1010
MIN_SIG_BITS,
11+
_autodetect_compare,
1112
_build_source_filter,
1213
_cancellation_by_file,
1314
_cancellation_severity,
@@ -242,6 +243,28 @@ def test_cancellation_severity_empty():
242243
assert _cancellation_severity([]) == {}
243244

244245

246+
# --- auto-detect which output files to compare (for a user case) ---
247+
248+
249+
def test_autodetect_compare_picks_cons_at_latest_step():
    """Conserved files at the highest step win over earlier steps and prim files."""
    final_step_cons = ["cons.1.00.000050.dat", "cons.2.00.000050.dat"]
    listing = ["cons.1.00.000000.dat", *final_step_cons, "prim.1.00.000050.dat"]
    assert _autodetect_compare(listing) == final_step_cons
257+
258+
259+
def test_autodetect_compare_falls_back_to_prim_when_no_cons():
    """With no cons files present, the prim files at the latest step are returned."""
    prim_only = ["prim.1.00.000010.dat", "prim.3.00.000010.dat"]
    expected = ["prim.1.00.000010.dat", "prim.3.00.000010.dat"]
    assert _autodetect_compare(prim_only) == expected
262+
263+
264+
def test_autodetect_compare_empty_when_no_field_output():
    """Names that are not cons.*/prim.* field output yield an empty list."""
    unrelated = ["indices.dat", "pre_time_data.dat", "foo.txt"]
    assert _autodetect_compare(unrelated) == []
266+
267+
245268
# --- scale-free pass/fail: significant bits retained ---
246269

247270

0 commit comments

Comments
 (0)