Merge pull request #747 from PlanExeOrg/napkin-math/generate-bounds-phase4-runtime

neoneye · web-flow · commit 372ddc33a9d3 · 2026-05-21T16:09:35.000+02:00
napkin-math(bounds): Phase 4 runtime + schema readiness
diff --git a/experiments/napkin_math/.claude/skills/generate-bounds/system-prompt.txt b/experiments/napkin_math/.claude/skills/generate-bounds/system-prompt.txt
@@ -187,6 +187,8 @@ sampling_discipline must be one of:
 - "integer"         — countable units (people, households, days, kits, sites, …); downstream samplers round draws to the nearest integer and re-clamp to [low, high]
 - "fraction"        — bounded in [0, 1]; downstream samplers clamp draws to that interval
 - "continuous"      — real-valued; downstream samplers do not round or clamp beyond the [low, high] range
+- "lognormal"       — fat-tailed real-valued draw whose [low, high] are interpreted as the 5th / 95th percentiles (P5 / P95), not as hard cutoffs. **Schema-reserved: the Monte Carlo runner accepts this value at validation time but raises NotImplementedError at sample time until Phase 8 lands the matching sampler.** Do not emit yet unless a follow-up rule (megaproject CAPEX default) explicitly directs you to.
+- "pert"            — three-point modified-beta (PERT) draw peaked at base, with [low, high] as the bounds of its support. **Schema-reserved with the same Phase-8 caveat as lognormal: accepted at validation time, but raises NotImplementedError at sample time.** Do not emit yet.
 
 Choose sampling_discipline by the variable's nature, not by lexical tokens in its id or unit. The downstream Monte Carlo runner does not pattern-match on unit strings; it reads sampling_discipline directly. There must be no fallback path that re-guesses the discipline.
 
@@ -196,11 +198,13 @@ default_pass_probability:
 - For sampling_discipline "bernoulli_gate": must be a number in [0, 1]; this is the assumed pass probability when the caller does not override it
 - For every other sampling_discipline: must be null
 
+A single optional top-level key `correlations` is reserved alongside the per-variable entries for declaring cross-variable correlation groups. **Schema-reserved with the same Phase-8 caveat as lognormal/pert: the runner preserves the key but does not yet apply correlated sampling.** Do not emit a `correlations` block yet; the detailed selection rules will land alongside the copula sampler.
+
 Rules for the output:
 
 - Output only variables selected by the rules above.
 - Do not invent ids.
-- Every top-level key must correspond to a declared id in key_values or missing_values_to_estimate.
+- Every top-level key must correspond to a declared id in key_values or missing_values_to_estimate (except for the reserved `correlations` key described above, which is not yet emitted).
 - Order keys by importance: critical-priority first, then high, then medium, then remaining missing_values_to_estimate not already placed.
 - rationale must be at most 50 words. The cap exists to discourage prose, not to suppress required disclosures: the named-anchor paraphrase required by ACTUAL-VS-COMMITMENT and the base-vs-threshold clause required by SANITY CHECK are exempt from the cap if they push the rationale past 50 words.
 - Split rationale on whitespace for word count; hyphenated and slash-joined tokens count as one word.
diff --git a/experiments/napkin_math/run_monte_carlo.py b/experiments/napkin_math/run_monte_carlo.py
@@ -35,7 +35,26 @@
     "correlation_groups": [],
 }
 
-VALID_DISCIPLINES = {"fixed", "bernoulli_gate", "integer", "fraction", "continuous"}
+VALID_DISCIPLINES = {
+    "fixed", "bernoulli_gate", "integer", "fraction", "continuous",
+    # Phase 4 reserved; sampler raises NotImplementedError until Phase 8
+    # implements the matching samplers. Validation passes so that
+    # generate-bounds can begin emitting these for plan_type-driven
+    # megaproject CAPEX without a downstream schema error.
+    "lognormal", "pert",
+}
+
+# Disciplines whose schema is reserved but whose sampler is not yet
+# implemented. ``sample_one`` raises ``NotImplementedError`` loudly so a
+# user trying to run Monte Carlo against bounds that use them gets an
+# explicit failure rather than a silent fall-back to triangular.
+UNIMPLEMENTED_DISCIPLINES = frozenset({"lognormal", "pert"})
+
+# Top-level keys in ``bounds.json`` that are not per-variable bound
+# entries. ``correlations`` carries the optional R1.3 declared-correlation
+# groups; it is preserved as-is through ``strip_threshold_bounds`` and
+# never reaches per-variable validation or sampling.
+RESERVED_TOP_LEVEL_KEYS = frozenset({"correlations"})
 
 THRESHOLD_SUFFIXES = (
     "_threshold", "_target", "_ceiling", "_floor",
@@ -135,10 +154,30 @@ def _collect_formula_threshold_ids(parameters: dict) -> set[str]:
     return threshold_ids
 
 
+def _collect_calculation_output_names(parameters: dict) -> set[str]:
+    """``output_name`` values declared by any calculation in
+    ``recommended_first_calculations`` or ``derived_questions``. These
+    names refer to *computed* quantities — a bound on the output would
+    sample the aggregate independently of its formula's named inputs,
+    so a single Monte Carlo trial can pair sub-component p95s with a
+    total p05 (Gemini R1.1, disconnected aggregates).
+    """
+    out: set[str] = set()
+    for src in ("recommended_first_calculations", "derived_questions"):
+        for entry in parameters.get(src, []) or []:
+            if not isinstance(entry, dict):
+                continue
+            name = entry.get("output_name")
+            if isinstance(name, str) and name:
+                out.add(name)
+    return out
+
+
 def strip_threshold_bounds(
     bounds: dict, parameters: dict,
 ) -> tuple[dict, list[dict]]:
-    """Remove bounds entries for threshold/target variables.
+    """Remove bounds entries for threshold/target variables and for
+    variables that are calculation outputs.
 
     Bounds entries on threshold variables silently change what `pass_rate`
     measures from "does actual >= stated_threshold" to
@@ -147,28 +186,41 @@ def strip_threshold_bounds(
     `collect_input_specs` does this automatically when the threshold variable
     is absent from bounds. This function enforces that absence.
 
+    Bounds entries on calculation outputs (variables that appear as a
+    declared ``output_name`` in ``recommended_first_calculations`` or
+    ``derived_questions``) would let a single Monte Carlo trial pair
+    sub-component p95s with a total p05; the output is computed from its
+    inputs, not sampled.
+
     Deterministic backstop for the rule documented in
     `.claude/skills/generate-bounds/system-prompt.txt`. The LLM is asked to
-    skip threshold variables but does not reliably do so when parameter-JSON
-    metadata (medium uncertainty + critical/high priority) signals strongly
-    to include them.
+    skip threshold and aggregate variables but does not reliably do so when
+    parameter-JSON metadata (medium uncertainty + critical/high priority)
+    signals strongly to include them.
 
     Identification:
       1. id suffix match against `THRESHOLD_SUFFIXES`
       2. id appears as the non-`actual_` operand in a binary-subtraction
          margin formula declared in `recommended_first_calculations` or
          `derived_questions`
+      3. id matches the ``output_name`` of any declared calculation
 
-    Variables prefixed `actual_` are never stripped.
+    Variables prefixed `actual_` are never stripped. Reserved top-level
+    keys (currently ``correlations``) pass through unchanged so the
+    optional correlations block survives this pre-processor.
 
     Returns ``(cleaned_bounds, stripped)``. ``stripped`` is an ordered list
-    of ``{"id": str, "reason": "suffix" | "formula-side"}`` records. The
-    input ``bounds`` is not mutated.
+    of ``{"id": str, "reason": "suffix" | "formula-side" | "calculation-output"}``
+    records. The input ``bounds`` is not mutated.
     """
     formula_threshold_ids = _collect_formula_threshold_ids(parameters)
+    calculation_output_names = _collect_calculation_output_names(parameters)
     cleaned: dict = {}
     stripped: list[dict] = []
     for bound_id, bound in bounds.items():
+        if bound_id in RESERVED_TOP_LEVEL_KEYS:
+            cleaned[bound_id] = bound
+            continue
         if bound_id.startswith("actual_"):
             cleaned[bound_id] = bound
             continue
@@ -178,6 +230,9 @@ def strip_threshold_bounds(
         if bound_id in formula_threshold_ids:
             stripped.append({"id": bound_id, "reason": "formula-side"})
             continue
+        if bound_id in calculation_output_names:
+            stripped.append({"id": bound_id, "reason": "calculation-output"})
+            continue
         cleaned[bound_id] = bound
     return cleaned, stripped
 
@@ -226,6 +281,15 @@ def sample_one(rng: np.random.Generator, bound: dict, distribution_default: str,
         p = gate_probabilities.get(var_id, bound["default_pass_probability"])
         return high if rng.random() < p else low
 
+    if discipline in UNIMPLEMENTED_DISCIPLINES:
+        raise NotImplementedError(
+            f"bound '{var_id}' has sampling_discipline {discipline!r}; "
+            f"the matching sampler is not yet implemented in run_monte_carlo. "
+            f"This value is accepted by the schema (Phase 4) but the sampler "
+            f"lands in Phase 8 — switch to triangular/continuous in bounds.json "
+            f"or wait for Phase 8."
+        )
+
     if low == high:
         val = low
     elif distribution_default == "uniform":
@@ -427,11 +491,18 @@ def run(params_path: Path, bounds_path: Path, calc_path: Path,
 
     bounds, stripped_threshold_bounds = strip_threshold_bounds(bounds, params)
     for entry in stripped_threshold_bounds:
-        warnings_text.append(
-            f"stripped threshold variable '{entry['id']}' from bounds "
-            f"(reason: {entry['reason']}); simulation will use the stated value "
-            f"from parameters.json"
-        )
+        if entry["reason"] == "calculation-output":
+            warnings_text.append(
+                f"stripped calculation output '{entry['id']}' from bounds; "
+                f"simulation will compute it from calculations.py instead of "
+                f"sampling it independently"
+            )
+        else:
+            warnings_text.append(
+                f"stripped threshold variable '{entry['id']}' from bounds "
+                f"(reason: {entry['reason']}); simulation will use the stated value "
+                f"from parameters.json"
+            )
 
     input_specs = collect_input_specs(params, bounds)
     plan, plan_warnings = build_calculation_plan(params, module)
diff --git a/experiments/napkin_math/tests/test_run_monte_carlo.py b/experiments/napkin_math/tests/test_run_monte_carlo.py
@@ -186,6 +186,136 @@ def test_continuous_with_negatives_allowed(self):
         self.assertTrue(any(v < 0 for v in out))
         self.assertTrue(all(-10 <= v <= 10 for v in out))
 
+    def test_lognormal_passes_schema_validation(self):
+        """The schema accepts lognormal so that generate-bounds can begin
+        emitting it (Phase 4 readiness for the megaproject CAPEX default
+        that lands in the prompt-side follow-up)."""
+        bound = make_bound(unit="EUR", low=1e6, base=5e6, high=2e7,
+                           sampling_discipline="lognormal")
+        rmc.validate_bound("capex", bound)
+
+    def test_pert_passes_schema_validation(self):
+        bound = make_bound(unit="EUR", low=1e6, base=5e6, high=2e7,
+                           sampling_discipline="pert")
+        rmc.validate_bound("opex", bound)
+
+    def test_lognormal_sampler_raises_not_implemented(self):
+        """Sampling raises loudly until Phase 8 lands the sampler. A silent
+        fall-back to triangular would let the user see "100% Robust" on a
+        megaproject whose CAPEX bounds are actually fat-tailed — exactly
+        the megaproject illusion Phase 4 is laying groundwork to fix."""
+        bound = make_bound(unit="EUR", low=1e6, base=5e6, high=2e7,
+                           sampling_discipline="lognormal")
+        with self.assertRaises(NotImplementedError) as ctx:
+            rmc.sample_one(self.rng, bound, "triangular", {}, "capex")
+        self.assertIn("lognormal", str(ctx.exception))
+        self.assertIn("Phase 8", str(ctx.exception))
+
+    def test_pert_sampler_raises_not_implemented(self):
+        bound = make_bound(unit="EUR", low=1e6, base=5e6, high=2e7,
+                           sampling_discipline="pert")
+        with self.assertRaises(NotImplementedError) as ctx:
+            rmc.sample_one(self.rng, bound, "triangular", {}, "opex")
+        self.assertIn("pert", str(ctx.exception))
+
+
+class TestStrippedBoundsWarnings(unittest.TestCase):
+    """Reason-branched warning text for ``strip_threshold_bounds`` results.
+
+    Review feedback on PR #747: the original warning said every stripped
+    item was a "threshold variable" whose value would be read from
+    parameters.json — false for calculation outputs, which are computed
+    from calculations.py instead of read as a stated value.
+    """
+
+    def test_calculation_output_warning_mentions_calculations_py(self):
+        with tempfile.TemporaryDirectory() as td:
+            tmpdir = Path(td)
+            calc = (
+                "def total(a: float, b: float) -> float:\n"
+                "    return a + b\n"
+            )
+            bound_a = make_bound(unit="EUR", low=1, base=2, high=3,
+                                 sampling_discipline="continuous")
+            bound_b = make_bound(unit="EUR", low=4, base=5, high=6,
+                                 sampling_discipline="continuous")
+            bound_total = make_bound(unit="EUR", low=10, base=20, high=30,
+                                     sampling_discipline="continuous")
+            result = run_with_fixture(
+                tmpdir,
+                missing_values=[
+                    {"id": "a", "label": "a", "unit": "EUR",
+                     "why_needed": "x", "suggested_estimation_method": "x"},
+                    {"id": "b", "label": "b", "unit": "EUR",
+                     "why_needed": "x", "suggested_estimation_method": "x"},
+                ],
+                recommended=[{
+                    "id": "c_total", "label": "total",
+                    "formula_hint": "total = a + b",
+                    "output_name": "total", "output_unit": "EUR",
+                    "depends_on": ["a", "b"], "why_first": "x",
+                }],
+                bounds={"a": bound_a, "b": bound_b, "total": bound_total},
+                calc_source=calc,
+                _settings={"n_runs": 100, "seed": 1},
+            )
+
+        stripped = result["bounds_post_processor"]["stripped_threshold_ids"]
+        self.assertEqual(stripped, [{"id": "total", "reason": "calculation-output"}])
+
+        messages = [w["message"] for w in result["warnings"]]
+        calc_warnings = [m for m in messages if "calculation output" in m]
+        self.assertEqual(len(calc_warnings), 1, msg=messages)
+        warning = calc_warnings[0]
+        self.assertIn("'total'", warning)
+        self.assertIn("calculations.py", warning)
+        # The threshold-variable phrasing must NOT appear for a
+        # calculation-output strip — that wording is reserved for
+        # suffix/formula-side strips where the value really does come
+        # from parameters.json.
+        self.assertNotIn("threshold variable", warning)
+        self.assertNotIn("stated value", warning)
+
+    def test_threshold_strip_warning_unchanged(self):
+        """Sibling guard: the original threshold-variable wording is
+        preserved for suffix/formula-side strips."""
+        with tempfile.TemporaryDirectory() as td:
+            tmpdir = Path(td)
+            calc = "def out(x: float) -> float:\n    return x\n"
+            bound_x = make_bound(low=0.1, base=0.2, high=0.3,
+                                 sampling_discipline="fraction")
+            bound_threshold = make_bound(low=0.4, base=0.5, high=0.6,
+                                         sampling_discipline="fraction")
+            result = run_with_fixture(
+                tmpdir,
+                missing_values=[{"id": "x", "label": "x", "unit": "fraction",
+                                 "why_needed": "x", "suggested_estimation_method": "x"}],
+                key_values=[{
+                    "id": "x_threshold", "label": "threshold",
+                    "category": "operational", "value_type": "explicit",
+                    "unit": "fraction", "value": 0.5,
+                    "comment": "x", "formula_hint": None,
+                    "output_name": None, "output_unit": None,
+                    "depends_on": [], "modelling_priority": "high",
+                    "uncertainty": "low", "source_text": "x",
+                }],
+                recommended=[{
+                    "id": "c", "label": "c",
+                    "formula_hint": "out = x",
+                    "output_name": "out", "output_unit": "fraction",
+                    "depends_on": ["x"], "why_first": "x",
+                }],
+                bounds={"x": bound_x, "x_threshold": bound_threshold},
+                calc_source=calc,
+                _settings={"n_runs": 100, "seed": 1},
+            )
+
+        messages = [w["message"] for w in result["warnings"]]
+        threshold_warnings = [m for m in messages if "threshold variable" in m]
+        self.assertEqual(len(threshold_warnings), 1, msg=messages)
+        self.assertIn("'x_threshold'", threshold_warnings[0])
+        self.assertIn("stated value", threshold_warnings[0])
+
 
 class TestSchemaValidation(unittest.TestCase):
     def test_missing_sampling_discipline(self):
diff --git a/experiments/napkin_math/tests/test_strip_threshold_bounds.py b/experiments/napkin_math/tests/test_strip_threshold_bounds.py