Harden deterministic operational failure taxonomy registration and coverage

ProfRandom92 · web-flow · commit 973c0f18b2d4 · 2026-05-19T10:27:33.000-07:00
diff --git a/docs/failure_taxonomy.md b/docs/failure_taxonomy.md
@@ -0,0 +1,32 @@
+# Deterministic Operational Failure Taxonomy
+
+This taxonomy defines stable replay/admissibility failure labels with explicit operational semantics. Every label maps to an observable deterministic condition, a failed contract/invariant type, or explicit artifact/metric drift.
+
+Non-goals (kinds of labels this taxonomy deliberately excludes):
+- semantic-only labels
+- fuzzy-matching labels
+- model-judged labels
+
+Canonical source for registered labels and field definitions: `src/validation/failure_taxonomy.py`.
+
+## Required fields per label
+
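+Each registered label includes:
+- label name (the registry key)
+- operational meaning (`operational_meaning`)
+- observable trigger (`observable_trigger`)
+- linked contract or invariant type (`contract_or_invariant_type`)
+- severity class (`severity_class`)
+- explicit non-goal, i.e. what the label must not mean (`non_goal`)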
+
+## Preferred labels hardened in this taxonomy
+
+- `TOOL_ORDER_VIOLATION`
+- `RECOVERY_PATH_LOSS`
+- `BLOCKER_DETACHMENT`
+- `GOVERNANCE_DRIFT`
+- `DEPENDENCY_CHAIN_BREAK`
+- `EVIDENCE_SURVIVAL_LOSS`
+- `HIGH_CRITICAL_EVIDENCE_LOSS`
+
+These preferred labels are operationally defined in the canonical registry, regardless of whether a given fixture family currently emits each one.
diff --git a/src/validation/failure_taxonomy.py b/src/validation/failure_taxonomy.py
@@ -0,0 +1,157 @@
+"""Deterministic operational failure taxonomy for replay admissibility validation."""
+
+from __future__ import annotations
+
+from typing import Final
+
+BANNED_FUZZY_TERMS: Final[tuple[str, ...]] = (  # lowercase substrings that no registered label name may contain
+    "ambiguous",
+    "semantic",
+    "fuzzy",
+    "llm_judge",
+    "reasoning",
+    "confusion",
+)
+
+FAILURE_TAXONOMY: Final[dict[str, dict[str, str]]] = {  # label -> {operational_meaning, observable_trigger, contract_or_invariant_type, severity_class, non_goal}
+    "TOOL_ORDER_VIOLATION": {
+        "operational_meaning": "Observed tool execution sequence diverges from required deterministic step ordering.",
+        "observable_trigger": "A strict order assertion fails when replayed tool-call indices are compared to contract order.",
+        "contract_or_invariant_type": "ordering",
+        "severity_class": "high",
+        "non_goal": "Not a semantic quality judgment; only deterministic ordering mismatch.",
+    },
+    "RECOVERY_PATH_LOSS": {
+        "operational_meaning": "At least one required recovery route is not preserved in reconstructed dependency paths.",
+        "observable_trigger": "Reachability from a required failure node to one or more recovery targets is absent.",
+        "contract_or_invariant_type": "reachability",
+        "severity_class": "high",
+        "non_goal": "Not a probabilistic prediction about future recovery behavior.",
+    },
+    "BLOCKER_DETACHMENT": {
+        "operational_meaning": "Blocking constraints no longer remain attached at full survival during replay.",
+        "observable_trigger": "blocker_survival_rate < 1.0 in deterministic replay metrics.",
+        "contract_or_invariant_type": "operational_metric",
+        "severity_class": "high",
+        "non_goal": "Not a narrative claim about intent; metric-derived only.",
+    },
+    "GOVERNANCE_DRIFT": {
+        "operational_meaning": "Governance-linked constraints drift below full deterministic preservation.",
+        "observable_trigger": "Governance layer contract score or required governance metric falls below 1.0.",
+        "contract_or_invariant_type": "governance",
+        "severity_class": "medium",
+        "non_goal": "Not a policy interpretation beyond explicit contract outputs.",
+    },
+    "DEPENDENCY_CHAIN_BREAK": {
+        "operational_meaning": "Required dependency edges or causal chains are missing in reconstructed graphs.",
+        "observable_trigger": "Comparator reports missing dependency or causal-edge preservation below full coverage.",
+        "contract_or_invariant_type": "relational_dependency",
+        "severity_class": "high",
+        "non_goal": "Not an inferred semantic relation; strictly graph-structural.",
+    },
+    "EVIDENCE_SURVIVAL_LOSS": {
+        "operational_meaning": "Evidence units expected in replay are not fully preserved.",
+        "observable_trigger": "has_evidence is true and (evidence_survived < evidence_total or evidence_survival_rate < 1.0).",
+        "contract_or_invariant_type": "operational_metric",
+        "severity_class": "medium",
+        "non_goal": "Not a free-form evidence relevance judgement.",
+    },
+    "HIGH_CRITICAL_EVIDENCE_LOSS": {
+        "operational_meaning": "High-critical evidence survival is below full preservation.",
+        "observable_trigger": "has_high_critical_evidence is true and high_critical_evidence_survival_rate < 1.0.",
+        "contract_or_invariant_type": "operational_metric",
+        "severity_class": "critical",
+        "non_goal": "Not any low-priority evidence drop; only high-critical metric-gated loss.",
+    },
+    "POLICY_ORDER_BROKEN": {
+        "operational_meaning": "Ordering contract failed for policy-ordered replay steps.",
+        "observable_trigger": "ContractValidator ordering contract returns passed == false.",
+        "contract_or_invariant_type": "ordering",
+        "severity_class": "high",
+        "non_goal": "Not an evaluation of policy correctness, only order conformance.",
+    },
+    "RECOVERY_PATH_INVALID": {
+        "operational_meaning": "Reachability contract indicates required recovery path set is not satisfied.",
+        "observable_trigger": "ContractValidator reachability contract returns passed == false.",
+        "contract_or_invariant_type": "reachability",
+        "severity_class": "high",
+        "non_goal": "Not a runtime incident-resolution claim outside replay graph checks.",
+    },
+    "CAUSAL_DEPENDENCY_LOSS": {
+        "operational_meaning": "Required causal dependency edges are missing in reconstruction.",
+        "observable_trigger": "Causality contract or comparator reports missing required causal edges.",
+        "contract_or_invariant_type": "causality",
+        "severity_class": "high",
+        "non_goal": "Not temporal speculation; explicit causal-edge checks only.",
+    },
+    "INVARIANT_VIOLATION": {
+        "operational_meaning": "A predefined deterministic invariant is violated.",
+        "observable_trigger": "Invariant contract evaluation returns passed == false.",
+        "contract_or_invariant_type": "invariant",
+        "severity_class": "high",
+        "non_goal": "Not a broad quality label; only declared invariant failure.",
+    },
+    "ORPHAN_DEPENDENCY": {
+        "operational_meaning": "Nodes requiring upstream dependencies became orphaned after replay.",
+        "observable_trigger": "Comparator orphan_rate > 0 with affected node evidence.",
+        "contract_or_invariant_type": "relational_dependency",
+        "severity_class": "high",
+        "non_goal": "Not missing optional links; only required incoming dependency loss.",
+    },
+    "DETACHED_DEPENDENCY": {
+        "operational_meaning": "Required dependency edges are detached in reconstructed graph.",
+        "observable_trigger": "Comparator detached_dependency_rate > 0.",
+        "contract_or_invariant_type": "relational_dependency",
+        "severity_class": "high",
+        "non_goal": "Not a semantic mismatch; edge-presence based only.",
+    },
+    "CYCLE_INTRODUCED": {
+        "operational_meaning": "Reconstructed graph introduces cyclic dependency where baseline is acyclic.",
+        "observable_trigger": "Comparator acyclicity_preserved == false.",
+        "contract_or_invariant_type": "relational_dependency",
+        "severity_class": "high",
+        "non_goal": "Not a cycle severity estimate; binary structural condition only.",
+    },
+    "GRAPH_FRAGMENTATION": {
+        "operational_meaning": "Connected dependency structure fragments across replayed graph segments.",
+        "observable_trigger": "Comparator dependency_integrity/reachability evidence indicates fragmentation failure.",
+        "contract_or_invariant_type": "relational_dependency",
+        "severity_class": "medium",
+        "non_goal": "Not about organizational context loss; graph-connectivity only.",
+    },
+    "TEMPORAL_ORDER_VIOLATION": {
+        "operational_meaning": "Relative deterministic topological order over shared nodes is violated.",
+        "observable_trigger": "Comparator temporal_order_violation_rate > 0.",
+        "contract_or_invariant_type": "relational_temporal",
+        "severity_class": "medium",
+        "non_goal": "Not wall-clock latency drift; ordering relation only.",
+    },
+    "ARTIFACT_INTEGRITY_VIOLATION": {
+        "operational_meaning": "Expected artifact fields drift from checked deterministic contract bundle.",
+        "observable_trigger": "Artifact drift or expected artifact parity checks fail.",
+        "contract_or_invariant_type": "artifact_integrity",
+        "severity_class": "critical",
+        "non_goal": "Not formatting-only differences; contract-relevant drift.",
+    },
+    "REPLAY_NON_REPRODUCIBLE": {
+        "operational_meaning": "Replay output fails reproducibility requirements under fixed deterministic inputs.",
+        "observable_trigger": "Reproducibility check reports non-stable artifact or score output.",
+        "contract_or_invariant_type": "reproducibility",
+        "severity_class": "critical",
+        "non_goal": "Not a single-run runtime fault outside deterministic replay validation.",
+    },
+    "CONSTRAINT_DRIFT": {
+        "operational_meaning": "Constraint preservation falls below full deterministic survival.",
+        "observable_trigger": "constraint_survival_rate < 1.0 in replay metrics.",
+        "contract_or_invariant_type": "operational_metric",
+        "severity_class": "medium",
+        "non_goal": "Not policy reinterpretation; metric threshold only.",
+    },
+    "EVIDENCE_LOSS": {  # NOTE(review): trigger overlaps with EVIDENCE_SURVIVAL_LOSS above; confirm both labels are intended
+        "operational_meaning": "Evidence preservation falls below required full survival.",
+        "observable_trigger": "has_evidence is true and evidence metrics indicate < 1.0 survival.",
+        "contract_or_invariant_type": "operational_metric",
+        "severity_class": "medium",
+        "non_goal": "Not semantic evidence relevance scoring.",
+    },
+}
diff --git a/tests/test_failure_taxonomy.py b/tests/test_failure_taxonomy.py
@@ -0,0 +1,78 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from src.validation.failure_taxonomy import BANNED_FUZZY_TERMS, FAILURE_TAXONOMY
+
+
+ROOT = Path(__file__).resolve().parent.parent  # two levels up from this file; fixtures/ and artifacts/ are resolved against it
+
+
+def _load_json(path: Path) -> object:  # read ``path`` as UTF-8 text and return the parsed JSON value
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def _collect_fixture_failure_labels() -> set[str]:
+    """Gather, as strings, all labels listed under the failure keys of fixture failures.json files."""
+    keys = ("expected_failures", "allowed_failures", "disallowed_failures")
+    found: set[str] = set()
+    for failures_file in sorted((ROOT / "fixtures").glob("**/expected/failures.json")):
+        document = _load_json(failures_file)
+        if isinstance(document, dict):  # non-object payloads contribute nothing
+            for entries in (document.get(key, []) for key in keys):
+                if isinstance(entries, list):
+                    found.update(map(str, entries))
+    return found
+
+
+def _collect_artifact_failure_labels() -> set[str]:
+    """Gather, as strings, the items of every list stored under a "failure_labels" key in artifacts/*.json."""
+    found: set[str] = set()
+    for artifact_file in sorted((ROOT / "artifacts").glob("*.json")):
+        pending: list[object] = [_load_json(artifact_file)]
+        # Explicit-stack walk over nested dicts/lists; visit order is irrelevant because results go into a set.
+        while pending:
+            node = pending.pop()
+            if isinstance(node, list):
+                pending.extend(node)
+            elif isinstance(node, dict):
+                for key, child in node.items():
+                    if key == "failure_labels" and isinstance(child, list):
+                        found.update(map(str, child))
+                    pending.append(child)
+
+    return found
+
+
+def test_fixture_expected_failure_labels_are_registered() -> None:
+    """Every label referenced by a fixture failures.json must be a key of FAILURE_TAXONOMY."""
+    missing = sorted(_collect_fixture_failure_labels().difference(FAILURE_TAXONOMY))
+    assert not missing, f"fixture labels missing from failure taxonomy: {missing}"
+
+
+def test_artifact_failure_labels_are_registered() -> None:
+    """Every label found under "failure_labels" in an artifact must be a key of FAILURE_TAXONOMY."""
+    missing = sorted(_collect_artifact_failure_labels().difference(FAILURE_TAXONOMY))
+    assert not missing, f"artifact labels missing from failure taxonomy: {missing}"
+
+
+def test_registered_labels_have_required_operational_fields() -> None:
+    """Each registry entry must hold a non-blank string for every required field."""
+    required_fields = (
+        "operational_meaning",
+        "observable_trigger",
+        "contract_or_invariant_type",
+        "severity_class",
+        "non_goal",
+    )
+    for label, field in ((name, key) for name in FAILURE_TAXONOMY for key in required_fields):
+        value = FAILURE_TAXONOMY[label].get(field, "")
+        assert isinstance(value, str) and value.strip(), f"label {label} missing required field {field}"
+
+
+def test_registered_labels_do_not_use_banned_fuzzy_terms() -> None:
+    """No registry key may contain a banned term; the key is lower-cased before the substring check."""
+    for label, lowered in ((name, name.lower()) for name in FAILURE_TAXONOMY):
+        for banned in BANNED_FUZZY_TERMS:
+            assert banned not in lowered, f"label '{label}' contains banned fuzzy term '{banned}'"