Skip to content

Commit c6637a0

Browse files
committed
fp-stability: address PR review — silent failures, 1-row crash, dead code, tests, comment rot
From a multi-agent PR review: - Silent failures (critical): a crashed cancellation/float-max/MCA run was reported as a clean result ('none detected' / 'no overflows' / 'dev=0.0'). The run helpers now log the failure and return None (cancellation/float-max) or a completed-sample count (MCA, distinct from measured zero); _run_case reports 'run failed' instead of a false all-clear. - Crash (important): np.loadtxt(...)[:,1] raised IndexError on a single-row .dat (reachable via a 1-cell user case), aborting the whole suite. Added _dat_column using np.atleast_2d; also fixed the generated dd_cmp.py oracle. - Dead code: removed _cancellation_by_file (superseded by the digits-lost severity view) and _stability_pass (the orchestrator inlines the comparison), plus their tests. - Tests: smoke tests for the CI-only report emitters (blank + populated result, and unconfirmed->::notice:: downgrade), _digits_left clamp, and #:block/#:call/unbalanced macro cases. - Comment rot: stage-E docstring now says confirmation is set-level (not per-line); dropped the non-existent 'per-file density' reference; fixed the '48 ~ full mantissa' note (53-bit). 45 tests, ruff, precheck all 7.
1 parent 3b662db commit c6637a0

4 files changed

Lines changed: 147 additions & 103 deletions

File tree

toolchain/mfc/fp_stability.py

Lines changed: 42 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,14 @@
2424
Each reported line is then *confirmed* by a positive control: --gen-source
2525
captures the symbol-correct executed lines, those are filtered to the suspect
2626
set, and a float-mode run with --source restricted to just them must
27-
reproduce the instability. Lines that do not reproduce it are reported as
28-
unconfirmed (downgraded from ::warning:: to ::notice::). Each line is then
29-
perturbed alone and ranked by the share of the single-precision deviation it
30-
reproduces. NOTE: this is a *sensitivity* measure — where reduced precision
31-
most moves the output — and is typically dominated by the time integrator /
32-
final accumulation, NOT by where cancellation originates. Stage F (and its
33-
per-file density) is the cancellation-origin view; the two usually differ.
27+
reproduce the instability. If perturbing the suspect set does not reproduce
28+
it, the case's hotspots are reported as unconfirmed (downgraded from
29+
::warning:: to ::notice::) — this is a single set-level verdict, not per line.
30+
Each line is then perturbed alone and ranked by the share of the single-
31+
precision deviation it reproduces. NOTE: that share is a *sensitivity*
32+
measure — where reduced precision most moves the output — typically dominated
33+
by the time integrator / final accumulation, NOT by where cancellation
34+
originates. Stage F is the cancellation-origin view; the two usually differ.
3435
Hotspots are cross-referenced against the stage-F cancellation sites and
3536
flagged as instance-ambiguous when the .fpp line sits inside a #:for/#:def
3637
expansion.
@@ -550,35 +551,42 @@ def _run_case(
550551
if run_cancellation:
551552
cons.print(" [dim]cancellation detection...[/dim]")
552553
try:
553-
# sweep bit thresholds to get per-site severity (bits lost)
554+
# sweep bit thresholds to get per-site severity (bits lost); each
555+
# run returns None if it failed (distinct from [] = ran, found none)
554556
level_sites = [(level, _run_cancellation_check(verrou_bin, sim_bin, work_dir, threshold=level)) for level in CANCEL_BIT_LEVELS]
555-
locs = level_sites[0][1] # lowest threshold = full list
556-
bits = _cancellation_severity(level_sites)
557-
result["cancellation_locs"] = locs
558-
result["cancellation_bits"] = bits
559-
if locs:
560-
worst = max(bits.values()) if bits else 0
561-
cons.print(f" cancellation: {len(locs)} site(s), worst loses ≥ {worst / math.log2(10):.0f} of ~16 digits")
557+
locs = next((s for lvl, s in level_sites if lvl == CANCEL_BIT_LEVELS[0]), None)
558+
if locs is None:
559+
cons.print(" [bold yellow]cancellation: detection run failed (see logs); not reported[/bold yellow]")
562560
else:
563-
cons.print(" cancellation: none detected")
564-
# cross-reference: label dd_line hotspots that sit on a cancellation site
565-
if result["dd_line_locs"] and locs:
566-
_mark_cancellation(result["dd_line_locs"], locs)
567-
n_xref = sum(1 for loc in result["dd_line_locs"] if loc.get("cancellation"))
568-
if n_xref:
569-
cons.print(f" {n_xref} hotspot(s) coincide with a catastrophic-cancellation site")
561+
bits = _cancellation_severity([(lvl, s) for lvl, s in level_sites if s is not None])
562+
result["cancellation_locs"] = locs
563+
result["cancellation_bits"] = bits
564+
if locs:
565+
worst = max(bits.values()) if bits else 0
566+
cons.print(f" cancellation: {len(locs)} site(s), worst loses ≥ {worst / math.log2(10):.0f} of ~16 digits")
567+
else:
568+
cons.print(" cancellation: none detected")
569+
# cross-reference: label dd_line hotspots that sit on a cancellation site
570+
if result["dd_line_locs"] and locs:
571+
_mark_cancellation(result["dd_line_locs"], locs)
572+
n_xref = sum(1 for loc in result["dd_line_locs"] if loc.get("cancellation"))
573+
if n_xref:
574+
cons.print(f" {n_xref} hotspot(s) coincide with a catastrophic-cancellation site")
570575
except Exception as exc:
571576
cons.print(f" [bold yellow]cancellation check error[/bold yellow]: {exc}")
572577

573578
# --- G: MCA significant-bits estimate ---
574579
if run_mca:
575580
cons.print(f" [dim]MCA significant-bits estimate (N={n_samples})...[/dim]")
576581
try:
577-
mca_dev, mca_sigbits = _run_mca_samples(case, verrou_bin, sim_bin, work_dir, ref_dir, n_samples)
578-
result["mca_dev"] = mca_dev
579-
result["mca_sigbits"] = mca_sigbits
580-
bits_str = f"~{mca_sigbits} sig bits" if mca_sigbits is not None else "n/a"
581-
cons.print(f" MCA: dev={mca_dev:.3e} ({bits_str})")
582+
mca_dev, mca_sigbits, n_ok = _run_mca_samples(case, verrou_bin, sim_bin, work_dir, ref_dir, n_samples)
583+
if n_ok == 0:
584+
cons.print(f" [bold yellow]MCA: no samples completed (0/{n_samples}; see logs)[/bold yellow]")
585+
else:
586+
result["mca_dev"] = mca_dev
587+
result["mca_sigbits"] = mca_sigbits
588+
bits_str = f"~{mca_sigbits} sig bits" if mca_sigbits is not None else "n/a"
589+
cons.print(f" MCA: dev={mca_dev:.3e} ({bits_str}) [{n_ok}/{n_samples} samples]")
582590
except Exception as exc:
583591
cons.print(f" [bold yellow]MCA error[/bold yellow]: {exc}")
584592

@@ -587,11 +595,14 @@ def _run_case(
587595
cons.print(" [dim]float-max overflow check...[/dim]")
588596
try:
589597
locs = _run_float_max_check(verrou_bin, sim_bin, work_dir)
590-
result["float_max_locs"] = locs
591-
if locs:
592-
cons.print(f" [bold yellow]float-max[/bold yellow]: {len(locs)} overflow site(s)")
598+
if locs is None:
599+
cons.print(" [bold yellow]float-max: run failed (see logs); not reported[/bold yellow]")
593600
else:
594-
cons.print(" float-max: no overflows")
601+
result["float_max_locs"] = locs
602+
if locs:
603+
cons.print(f" [bold yellow]float-max[/bold yellow]: {len(locs)} overflow site(s)")
604+
else:
605+
cons.print(" float-max: no overflows")
595606
except Exception as exc:
596607
cons.print(f" [bold yellow]float-max check error[/bold yellow]: {exc}")
597608

toolchain/mfc/fp_stability_metrics.py

Lines changed: 12 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -58,11 +58,6 @@ def _sig_bits(max_dev: float, ref_scale: float) -> float:
5858
return -math.log2(max_dev / ref_scale)
5959

6060

61-
def _stability_pass(max_dev: float, ref_scale: float, floor: float) -> bool:
62-
"""A case passes when it retains at least `floor` significant bits."""
63-
return _sig_bits(max_dev, ref_scale) >= floor
64-
65-
6661
# Matches "path/file.f90:123" or "path/file.fpp:123-456" in dd_line rddmin_summary.
6762
_LOC_RE = re.compile(r"(\S+\.(?:f90|fpp|c|cpp|h|F90))\s*:(\d+)(?:-(\d+))?", re.IGNORECASE)
6863

@@ -232,6 +227,14 @@ def _get_source_context(fname: str, lineno: int, context: int = 2) -> str:
232227
return "\n".join(rows)
233228

234229

230+
def _dat_column(path: str):
231+
"""Load column 1 (the field value) from an MFC .dat file, robust to a
232+
single-row file (np.loadtxt returns 1-D then, which [:, 1] would crash on)."""
233+
import numpy as np
234+
235+
return np.atleast_2d(np.loadtxt(path))[:, 1]
236+
237+
235238
def _max_diff_np(ref_dir: str, run_dir: str, compare_files: list) -> float:
236239
import numpy as np
237240

@@ -240,9 +243,7 @@ def _max_diff_np(ref_dir: str, run_dir: str, compare_files: list) -> float:
240243
ref_p, run_p = os.path.join(ref_dir, fname), os.path.join(run_dir, fname)
241244
if not os.path.exists(ref_p) or not os.path.exists(run_p):
242245
return float("inf")
243-
ref = np.loadtxt(ref_p)[:, 1]
244-
run = np.loadtxt(run_p)[:, 1]
245-
total = max(total, float(np.max(np.abs(ref - run))))
246+
total = max(total, float(np.max(np.abs(_dat_column(ref_p) - _dat_column(run_p)))))
246247
return total
247248

248249

@@ -255,8 +256,7 @@ def _max_abs_np(ref_dir: str, compare_files: list) -> float:
255256
ref_p = os.path.join(ref_dir, fname)
256257
if not os.path.exists(ref_p):
257258
continue
258-
ref = np.loadtxt(ref_p)[:, 1]
259-
total = max(total, float(np.max(np.abs(ref))))
259+
total = max(total, float(np.max(np.abs(_dat_column(ref_p)))))
260260
return total
261261

262262

@@ -321,7 +321,8 @@ def _parse_vg_error_locs(log_path: str, error_keyword: str) -> list:
321321
# Verrou exposes no per-site bit-count, but --cc-threshold-double is a severity
322322
# filter: a site is reported only if it lost >= the threshold bits. Sweeping these
323323
# levels and taking the highest each site survives gives a per-site "bits lost"
324-
# severity (a lower bound — no false positives). 48 ~ full double mantissa.
324+
# severity (a lower bound — no false positives). 48 is near the full 53-bit
325+
# double mantissa (the top of the sweep), not the mantissa width itself.
325326
CANCEL_BIT_LEVELS = [10, 20, 30, 40, 48]
326327

327328

@@ -474,19 +475,3 @@ def _mark_cancellation(dd_line_locs: list, cancellation_locs: list) -> list:
474475
lines = by_base.get(os.path.basename(loc["path"]), set())
475476
loc["cancellation"] = any(ln in lines for ln in range(loc["start"], loc["end"] + 1))
476477
return dd_line_locs
477-
478-
479-
def _cancellation_by_file(cancellation_locs: list) -> list:
480-
"""Aggregate cancellation sites by source file → [(basename, count)] sorted by
481-
count (desc), ties by name.
482-
483-
This is the cancellation-*origin* view (where ill-conditioning concentrates),
484-
as opposed to the per-line --source share, which is a *sensitivity* view
485-
(where reduced precision most moves the output — typically the time
486-
integrator / final accumulation, regardless of where error originates).
487-
"""
488-
counts = {}
489-
for fname, _lineno in cancellation_locs:
490-
base = os.path.basename(fname)
491-
counts[base] = counts.get(base, 0) + 1
492-
return sorted(counts.items(), key=lambda kv: (-kv[1], kv[0]))

toolchain/mfc/fp_stability_runners.py

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,10 @@ def _run_simulation_verrou(
118118
shutil.copy2(os.path.join(tmpdir, "D", fn), run_dir)
119119

120120

121-
def _run_cancellation_check(verrou_bin: str, sim_bin: str, work_dir: str, threshold: int = 10) -> list:
121+
def _run_cancellation_check(verrou_bin: str, sim_bin: str, work_dir: str, threshold: int = 10):
122122
"""Run --check-cancellation at the given bit threshold; return [(fname, line)]
123-
of MFC cancellation sites (subtractions losing >= `threshold` significant bits)."""
123+
of MFC cancellation sites (subtractions losing >= `threshold` significant bits),
124+
or None if the run itself failed (distinct from [] = ran and found none)."""
124125
tag = f"cancellation_{threshold}"
125126
run_dir = os.path.join(work_dir, tag)
126127
os.makedirs(run_dir, exist_ok=True)
@@ -132,8 +133,9 @@ def _run_cancellation_check(verrou_bin: str, sim_bin: str, work_dir: str, thresh
132133
]
133134
try:
134135
_run_simulation_verrou(verrou_bin, sim_bin, work_dir, run_dir, rounding_mode="nearest", extra_flags=flags)
135-
except MFCException:
136-
pass
136+
except MFCException as exc:
137+
cons.print(f" [yellow]cancellation run (threshold {threshold}) failed: {exc}[/yellow]")
138+
return None
137139
raw = _parse_cancel_gen(gen_path)
138140
filtered = [(f, ln) for f, ln in raw if _is_arithmetic_loc(f, ln, ln)]
139141
skipped = len(raw) - len(filtered)
@@ -150,27 +152,31 @@ def _run_mca_samples(
150152
ref_dir: str,
151153
n_mca: int,
152154
) -> tuple:
153-
"""Run N mcaquad samples; return (max_dev, sig_bits_lower_bound)."""
155+
"""Run N mcaquad samples; return (max_dev, sig_bits_lower_bound, n_ok) where
156+
n_ok is how many samples actually completed (0 => no usable measurement)."""
154157
compare = case["compare"]
155158
ref_scale = _max_abs_np(ref_dir, compare)
156159
max_dev = 0.0
160+
n_ok = 0
157161
flags = ["--backend=mcaquad", "--mca-mode=mca"]
158162
for i in range(n_mca):
159163
run_dir = os.path.join(work_dir, f"mca_{i:02d}")
160164
os.makedirs(run_dir, exist_ok=True)
161165
try:
162166
_run_simulation_verrou(verrou_bin, sim_bin, work_dir, run_dir, extra_flags=flags)
163167
max_dev = max(max_dev, _max_diff_np(ref_dir, run_dir, compare))
164-
except MFCException:
165-
pass
168+
n_ok += 1
169+
except MFCException as exc:
170+
cons.print(f" [dim]MCA sample {i} failed: {exc}[/dim]")
166171
sig_bits = None
167-
if max_dev > 0.0 and ref_scale > 0.0:
172+
if n_ok and max_dev > 0.0 and ref_scale > 0.0:
168173
sig_bits = max(0, int(math.floor(-math.log2(max_dev / ref_scale))))
169-
return max_dev, sig_bits
174+
return max_dev, sig_bits, n_ok
170175

171176

172-
def _run_float_max_check(verrou_bin: str, sim_bin: str, work_dir: str) -> list:
173-
"""Run with --check-max-float=yes; return [(fname, line)] of overflow sites."""
177+
def _run_float_max_check(verrou_bin: str, sim_bin: str, work_dir: str):
178+
"""Run with --check-max-float=yes; return [(fname, line)] of overflow sites,
179+
or None if the run failed (distinct from [] = ran and found none)."""
174180
run_dir = os.path.join(work_dir, "float_max")
175181
os.makedirs(run_dir, exist_ok=True)
176182
try:
@@ -182,8 +188,9 @@ def _run_float_max_check(verrou_bin: str, sim_bin: str, work_dir: str) -> list:
182188
rounding_mode="nearest",
183189
extra_flags=["--check-max-float=yes"],
184190
)
185-
except MFCException:
186-
pass
191+
except MFCException as exc:
192+
cons.print(f" [yellow]float-max run failed: {exc}[/yellow]")
193+
return None
187194
return _parse_vg_error_locs(os.path.join(run_dir, "verrou.log"), "Max float")
188195

189196

@@ -291,8 +298,8 @@ def _write_dd_cmp_py(path: str, compare_files: list, threshold: float):
291298
if not os.path.exists(ref_p) or not os.path.exists(run_p):
292299
print(f"MISSING: {{fname}}")
293300
sys.exit(1)
294-
ref = np.loadtxt(ref_p)[:, 1]
295-
run = np.loadtxt(run_p)[:, 1]
301+
ref = np.atleast_2d(np.loadtxt(ref_p))[:, 1]
302+
run = np.atleast_2d(np.loadtxt(run_p))[:, 1]
296303
dev = float(np.max(np.abs(ref - run)))
297304
max_dev = max(max_dev, dev)
298305

0 commit comments

Comments
 (0)