sriumcp
diff --git a/orchestrator/deployment_recommendation.py b/orchestrator/deployment_recommendation.py
Lines changed: 38 additions & 4 deletions
diff --git a/orchestrator/iteration.py b/orchestrator/iteration.py
Lines changed: 105 additions & 12 deletions
diff --git a/orchestrator/principles_classifier.py b/orchestrator/principles_classifier.py
Lines changed: 183 additions & 0 deletions
@@ -173,14 +173,48 @@ def make_deployment_recommendation(
     caveats) when there's nothing to recommend — never raises.
     """
     work_dir = Path(work_dir)
-    best_found = _read_json(work_dir / "best_found.json")
+    best_found_path = work_dir / "best_found.json"
+    best_found = _read_json(best_found_path)
+
+    # Issue #178: distinguish "no candidate beat baseline" (genuine
+    # fall-back) from "best_found.json is missing" (upstream wiring
+    # gap — see #177). Both keep the conservative fall_back_to_baseline
+    # verdict, but the caveats now tell the operator what actually
+    # happened. Each caveat passes meta_findings.validate_caveat
+    # (cites a concrete artifact name + numeric / issue reference).
+    if best_found is None:
+        return DeploymentRecommendation(
+            verdict="fall_back_to_baseline",
+            caveats=[
+                f"best_found.json not present at {best_found_path}; "
+                f"cannot rank candidates. The iteration finalize step "
+                f"either did not run or did not call update_best_found. "
+                f"See issue #177 in orchestrator/iteration.py."
+            ],
+        )
 
-    if not best_found or not best_found.get("top_k"):
-        return DeploymentRecommendation(verdict="fall_back_to_baseline")
+    if not best_found.get("top_k"):
+        return DeploymentRecommendation(
+            verdict="fall_back_to_baseline",
+            caveats=[
+                f"best_found.json present at {best_found_path} but "
+                f"top_k is empty (k={best_found.get('k', 0)}); no "
+                f"candidate scored above baseline across the iterations "
+                f"recorded in runs/iter-N/findings.json."
+            ],
+        )
 
     top = best_found["top_k"][0]
     if not isinstance(top, dict):
-        return DeploymentRecommendation(verdict="fall_back_to_baseline")
+        return DeploymentRecommendation(
+            verdict="fall_back_to_baseline",
+            caveats=[
+                f"best_found.json top_k[0] has unexpected type "
+                f"{type(top).__name__!r} at {best_found_path}; "
+                f"expected dict. Investigate whether update_best_found "
+                f"wrote a corrupt entry — see issue #177."
+            ],
+        )
 
     best_score = float(top.get("score", 0.0))
     iteration = int(top.get("iteration", 0))
 
@@ -162,6 +162,102 @@ def _enter_phase(engine, phase):
     return True
 
 
+def _resolve_objective(campaign: dict):
+    """Resolve campaign.yaml's objective block to an ObjectiveSpec, or None.
+
+    Issue #177: the iteration finalize step calls update_best_found with
+    this objective. Legacy campaigns without `objective` or `objective_preset`
+    fall through to the legacy status-based ranking inside update_best_found.
+
+    Resolution order:
+      1. A truthy ``objective_preset`` is looked up with ``get_preset``.
+         If the lookup raises ``ValueError`` the result is None; the
+         ``objective`` block is NOT consulted as a second chance.
+      2. Otherwise an ``objective`` dict with non-empty ``weights`` is
+         turned into an ``ObjectiveSpec`` (``deploy_threshold`` defaults
+         to 0.1, ``metric_extractors`` to an empty dict).
+
+    Returns None when ``campaign`` is not a dict, when neither key is
+    usable, or when the block is malformed. Malformed blocks are logged
+    at WARNING so a silent downgrade to legacy ranking is visible.
+    """
+    if not isinstance(campaign, dict):
+        return None
+    from orchestrator.composite_score import ObjectiveSpec, get_preset
+
+    preset = campaign.get("objective_preset")
+    if preset:
+        try:
+            return get_preset(str(preset))
+        except ValueError as exc:
+            logger.warning(
+                "Ignoring objective_preset %r (%s); using legacy ranking",
+                preset, exc,
+            )
+            return None
+
+    obj = campaign.get("objective")
+    if not isinstance(obj, dict) or not obj.get("weights"):
+        return None
+    try:
+        return ObjectiveSpec(
+            weights={str(k): float(v) for k, v in obj["weights"].items()},
+            metric_extractors=dict(obj.get("metric_extractors") or {}),
+            deploy_threshold=float(obj.get("deploy_threshold", 0.1)),
+        )
+    except (AttributeError, TypeError, ValueError) as exc:
+        # AttributeError: ``weights`` is truthy but not a mapping (e.g. a
+        # YAML list), so ``.items()`` does not exist. TypeError/ValueError:
+        # non-numeric weight or threshold, or un-dict-able extractors.
+        logger.warning(
+            "Ignoring malformed objective block (%s); using legacy ranking",
+            exc,
+        )
+        return None
+
+
+def finalize_iteration(
+    *,
+    work_dir: Path,
+    iter_dir: Path,
+    iteration: int,
+    campaign: dict,
+) -> None:
+    """Run the deterministic post-gate finalize steps for an iteration.
+
+    Public seam (issue #177) so integration tests can drive the same
+    code path that ``run_iteration`` calls after HUMAN_FINDINGS_GATE
+    approves. The sort_bench dry-run on 2026-05-25 surfaced the gap:
+    ``update_best_found`` shipped in PR #172 with passing unit tests
+    but no caller — this function is the caller.
+
+    Steps (deterministic Python, no LLM):
+      1. Classify principle_updates.json in place — fill empirical_content
+         / derivation_type from text heuristics (issue #179).
+      2. Merge ``principle_updates.json`` into ``principles.json``.
+      3. Re-rank candidates and atomically rewrite ``best_found.json``
+         (issue #168 / #177).
+      4. Surface validator warnings for any residual unclassified
+         domain principles (issue #179, #86).
+      5. Regenerate per-campaign ``CLAUDE.md`` so the next iteration's
+         session sees the updated principles + handoff (issue #131).
+
+    Tolerant of partial fixtures: missing principle_updates.json,
+    missing findings.json, and CLAUDE.md regeneration failures all
+    soft-fail — the iteration's terminal artifacts (``best_found.json``,
+    ``principles.json``) are still written. The advisory validator pass
+    (step 4) likewise never raises on an unreadable, undecodable, or
+    non-object ``principles.json``; it logs and moves on.
+
+    Args:
+        work_dir: Campaign working directory holding ``principles.json``
+            and ``best_found.json``.
+        iter_dir: Directory of the iteration being finalized; holds
+            ``principle_updates.json``.
+        iteration: Iteration number, forwarded to the CLAUDE.md
+            regeneration.
+        campaign: Parsed campaign config; its objective block selects the
+            ranking objective.
+    """
+    from orchestrator.composite_score import update_best_found
+    from orchestrator.principles_classifier import classify_principle_updates_in_place
+    from orchestrator.validate import validate_principles_have_empirical_content
+
+    # Classify BEFORE merge so principles.json reflects the tags on its
+    # very first write (issue #179).
+    classify_principle_updates_in_place(iter_dir)
+
+    _merge_principles(work_dir, iter_dir)
+
+    objective = _resolve_objective(campaign)
+    update_best_found(work_dir, objective=objective, top_k=5)
+
+    # Surface validator warnings for residual unclassified domain
+    # principles. Advisory only — doesn't roll back the merge.
+    principles_path = work_dir / "principles.json"
+    store = None
+    try:
+        store = json.loads(principles_path.read_text(encoding="utf-8"))
+    except FileNotFoundError:
+        # Nothing has been merged yet; there is nothing to validate.
+        pass
+    except (OSError, ValueError) as exc:
+        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+        logger.warning(
+            "Skipping principle validation; cannot read %s: %s",
+            principles_path, exc,
+        )
+    if isinstance(store, dict):
+        # A non-object top level (list, null, ...) has no "principles"
+        # key to inspect, so it is skipped rather than crashing on .get.
+        for warning in validate_principles_have_empirical_content(
+            store.get("principles") or [],
+        ):
+            logger.warning("%s", warning)
+
+    # CLAUDE.md regenerate is best-effort; failure here doesn't roll back
+    # the merged principles or the best_found ranking.
+    try:
+        from orchestrator.claude_md import regenerate_from_disk
+        regenerate_from_disk(work_dir, campaign, iteration=iteration)
+    except (OSError, RuntimeError) as exc:
+        logger.warning("Failed to regenerate CLAUDE.md: %s", exc)
+
+
 def _merge_principles(work_dir: Path, iter_dir: Path) -> None:
     """Merge principle_updates.json into the shared principles.json store."""
     updates_path = iter_dir / "principle_updates.json"
@@ -534,19 +630,16 @@ def _max_turns_for(phase_key: str) -> int:
             print("Aborted.")
             return IterationOutcome.ABORTED
 
-    # ─── PRINCIPLE MERGE (Python, no LLM) ─────────────────────────────────
-    _merge_principles(work_dir, iter_dir)
+    # ─── FINALIZE: merge principles + write best_found.json + CLAUDE.md ───
+    # Issue #177: the sort_bench dry-run on 2026-05-25 surfaced that
+    # update_best_found (#168) had no caller in the production path.
+    # finalize_iteration is the caller. Tests drive it directly.
+    finalize_iteration(
+        work_dir=work_dir, iter_dir=iter_dir,
+        iteration=iteration, campaign=campaign,
+    )
     print(f"  -> Principles merged into {work_dir / 'principles.json'}")
-
-    # ─── CLAUDE.md REGENERATE (Python, no LLM) — issue #131 ───────────────
-    # Refresh per-campaign CLAUDE.md so the next iteration's session loads
-    # the updated principles + handoff via Claude Code's auto-context loading.
-    try:
-        from orchestrator.claude_md import regenerate_from_disk
-        regenerate_from_disk(work_dir, campaign, iteration=iteration)
-    except (OSError, RuntimeError) as exc:
-        # Best-effort: a CLAUDE.md write failure shouldn't abort the iteration.
-        logger.warning("Failed to regenerate CLAUDE.md: %s", exc)
+    print(f"  -> best_found.json updated at {work_dir / 'best_found.json'}")
 
     if final:
         engine.transition("DONE")
 
@@ -0,0 +1,183 @@
+"""Deterministic post-extraction classifier for principle empiricism (issue #179).
+
+The sort_bench dry-run on 2026-05-25 surfaced that extracted principles
+ship with `empirical_content` and `derivation_type` (issue #86) unset
+because the methodology prompt is advisory and the schema treats them
+as optional. RP-2 in that run was a clear empirical observation
+(*"timsort uses 460 comparisons on nearly-sorted input"*) but was
+silently filed without tags.
+
+This module provides a deterministic Python heuristic that runs on
+``principle_updates.json`` before merge into ``principles.json``,
+filling the fields when the statement is classifiable. Residual
+unclassifiable principles are caught by the validator warning in
+``orchestrator.validate.validate_principles_have_empirical_content``.
+
+Approach: composable A+B from issue #179.
+  * A: This module — deterministic auto-classifier.
+  * B: ``validate.py`` — soft validator emitting WARN on residual misses.
+
+Heuristic priority:
+  1. Existing explicit tags are preserved (explicit > heuristic); when only
+     one of the two fields is set, the other is derived from it.
+  2. Definitional markers (`is defined as`, `by definition`) → definitional.
+  3. Algebraic markers (`iff`, `algebraic`, `identity`, `theorem`) → algebraic,
+     provided they are at least as numerous as the empirical markers.
+  4. Empirical markers (`iter-N`, numeric measurements with units,
+     `observed`, `measured`, `experiments`) → empirical, when at least two
+     fire and they outnumber the algebraic markers.
+  5. Otherwise leave None — the validator warning surfaces to the human.
+
+No LLM, no live calls. Tests assert on the heuristic's verdicts for
+known statement shapes.
+"""
+from __future__ import annotations
+
+import json
+import re
+from copy import deepcopy
+from pathlib import Path
+
+from orchestrator.util import atomic_write
+
+
+# Each table is a tuple of compiled, case-insensitive patterns. The
+# classifier counts how many patterns in a table match the statement at
+# least once, so one pattern contributes at most one "hit".
+
+# Vocabulary of proofs and logical equivalence.
+_ALGEBRAIC_MARKERS = (
+    re.compile(r"\biff\b", re.IGNORECASE),
+    re.compile(r"\bif\s+and\s+only\s+if\b", re.IGNORECASE),
+    re.compile(r"\balgebraic(?:ally)?\b", re.IGNORECASE),
+    re.compile(r"\bidentity\b", re.IGNORECASE),
+    re.compile(r"\bequivalent(?:ly)?\s+to\b", re.IGNORECASE),
+    re.compile(r"\bfollows\s+from\b", re.IGNORECASE),
+    re.compile(r"\btheorem\b", re.IGNORECASE),
+    re.compile(r"\baxiom\b", re.IGNORECASE),
+    re.compile(r"\bproof\b", re.IGNORECASE),
+)
+
+# Phrases that state a definition rather than a finding.
+_DEFINITIONAL_MARKERS = (
+    re.compile(r"\bis\s+defined\s+as\b", re.IGNORECASE),
+    re.compile(r"\bby\s+definition\b", re.IGNORECASE),
+    re.compile(r"\bdefinitional(?:ly)?\b", re.IGNORECASE),
+)
+
+_EMPIRICAL_MARKERS = (
+    # Iteration / arm citations
+    re.compile(r"\biter[-_ ]?\d+\b", re.IGNORECASE),
+    # NOTE(review): the separator is optional, so this also matches ordinary
+    # words that start with "arm" ("army", "armed"). Confirm the arm naming
+    # scheme before tightening it.
+    re.compile(r"\barm[-_]?\w+\b", re.IGNORECASE),
+    # Empirical-process verbs
+    re.compile(r"\bobserved\b", re.IGNORECASE),
+    re.compile(r"\bmeasured\b", re.IGNORECASE),
+    re.compile(r"\bfound\s+that\b", re.IGNORECASE),
+    re.compile(r"\bexperiments?\b", re.IGNORECASE),
+    re.compile(r"\bdiscover(?:ed|y)?\b", re.IGNORECASE),
+    re.compile(r"\bempirical(?:ly)?\b", re.IGNORECASE),
+    # Numeric measurements with units (high signal). The trailing \b is
+    # applied to the alphabetic units only: "%" is a non-word character,
+    # so "%\b" would match only when a letter or digit follows directly
+    # and would miss "12% faster" or a sentence-final "12%".
+    re.compile(
+        r"\b\d+(?:\.\d+)?\s*"
+        r"(?:%|(?:ms|us|s|MB|GB|comparisons?|tokens?|seeds?|x)\b)",
+        re.IGNORECASE,
+    ),
+    # Concrete equations / values: "= 460", "ratio 0.85", "ratio=2"
+    re.compile(r"=\s*\d{2,}"),
+    re.compile(r"\bratio\s*=?\s*\d", re.IGNORECASE),
+)
+
+
+def classify_principle(p: dict) -> dict:
+    """Tag a principle with ``empirical_content`` / ``derivation_type``.
+
+    Works on a deep copy, so the caller's dict is never mutated. A field
+    that already holds a non-None value is never overwritten. A non-dict
+    input is handed back untouched for downstream validators to reject.
+
+    Decision table:
+      * both fields set — returned as-is.
+      * only ``empirical_content`` set — ``True`` gives "empirical";
+        anything else gives "definitional" when definitional markers are
+        present and at least as numerous as algebraic ones, otherwise
+        "algebraic".
+      * only ``derivation_type`` set — ``empirical_content`` becomes
+        ``derivation_type == "empirical"``.
+      * neither set — definitional markers win outright; then algebraic
+        markers if at least as numerous as empirical ones; then empirical
+        if two or more fire and they outnumber algebraic ones; otherwise
+        the fields are left alone for the validator warning to surface.
+    """
+    if not isinstance(p, dict):
+        return p  # malformed; let downstream validators catch it
+
+    tagged = deepcopy(p)
+    empirical_flag = tagged.get("empirical_content")
+    derivation = tagged.get("derivation_type")
+
+    # Nothing to infer when both tags were supplied explicitly.
+    if empirical_flag is not None and derivation is not None:
+        return tagged
+
+    text = str(tagged.get("statement") or "")
+
+    def hits(patterns) -> int:
+        # One hit per pattern that matches anywhere in the statement.
+        return len([rx for rx in patterns if rx.search(text)])
+
+    n_algebraic = hits(_ALGEBRAIC_MARKERS)
+    n_definitional = hits(_DEFINITIONAL_MARKERS)
+    n_empirical = hits(_EMPIRICAL_MARKERS)
+
+    # Explicit empirical_content: derive the matching derivation_type.
+    if empirical_flag is not None:
+        if empirical_flag is True:
+            derived = "empirical"
+        elif n_definitional >= 1 and n_definitional >= n_algebraic:
+            derived = "definitional"
+        else:
+            derived = "algebraic"
+        tagged["derivation_type"] = derived
+        return tagged
+
+    # Explicit derivation_type: only "empirical" implies empirical content.
+    if derivation is not None:
+        tagged["empirical_content"] = derivation == "empirical"
+        return tagged
+
+    # Neither tag present: definitional > algebraic > empirical.
+    if n_definitional >= 1:
+        verdict = (False, "definitional")
+    elif n_algebraic >= 1 and n_algebraic >= n_empirical:
+        verdict = (False, "algebraic")
+    elif n_empirical >= 2 and n_empirical > n_algebraic:
+        # A lone empirical marker is too weak; corroboration is required.
+        verdict = (True, "empirical")
+    else:
+        # No confident verdict; leave the fields for the validator.
+        return tagged
+
+    tagged["empirical_content"], tagged["derivation_type"] = verdict
+    return tagged
+
+
+def classify_principles(principles: list[dict]) -> list[dict]:
+    """Run ``classify_principle`` over every entry and return a fresh list.
+
+    A non-list argument is returned unchanged.
+    """
+    if isinstance(principles, list):
+        return list(map(classify_principle, principles))
+    return principles
+
+
+def classify_principle_updates_in_place(iter_dir: Path) -> None:
+    """Read ``runs/iter-N/principle_updates.json``, classify, and write back atomically.
+
+    No-op if the file is missing, unreadable, not valid UTF-8, not valid
+    JSON, or does not hold a top-level list. Idempotent: re-running on
+    an already-classified file produces byte-equal output.
+
+    This is the seam ``finalize_iteration`` calls before
+    ``_merge_principles``, so the merged ``principles.json`` reflects
+    the tags on its very first write.
+
+    Args:
+        iter_dir: Iteration directory containing ``principle_updates.json``.
+    """
+    updates_path = Path(iter_dir) / "principle_updates.json"
+    try:
+        # Read directly instead of checking exists() first: a missing file
+        # surfaces as FileNotFoundError (an OSError), with no window for
+        # the file to vanish between check and read.
+        updates = json.loads(updates_path.read_text(encoding="utf-8"))
+    except (OSError, ValueError):
+        # ValueError covers json.JSONDecodeError and UnicodeDecodeError,
+        # so a file with undecodable bytes is skipped like any other
+        # malformed file rather than aborting the finalize step.
+        return
+    if not isinstance(updates, list):
+        return
+
+    classified = classify_principles(updates)
+    atomic_write(updates_path, json.dumps(classified, indent=2) + "\n")