Skip to content

Commit 973c0f1

Browse files
authored
Harden deterministic operational failure taxonomy registration and coverage
1 parent 6800975 commit 973c0f1

3 files changed

Lines changed: 267 additions & 0 deletions

File tree

docs/failure_taxonomy.md

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
# Deterministic Operational Failure Taxonomy

This taxonomy defines stable replay/admissibility failure labels with explicit operational semantics. Every label maps to an observable deterministic condition, a failed contract/invariant type, or explicit artifact/metric drift.

Non-goals:
- no semantic-only labels
- no fuzzy-matching labels
- no model-judged labels

Canonical source for registered labels and field definitions: `src/validation/failure_taxonomy.py`.

## Required fields per label

Each registered label includes:
- label name
- operational meaning
- observable trigger
- linked contract or invariant type
- severity class
- explicit non-goal (what it must not mean)

## Preferred labels hardened in this taxonomy

- `TOOL_ORDER_VIOLATION`
- `RECOVERY_PATH_LOSS`
- `BLOCKER_DETACHMENT`
- `GOVERNANCE_DRIFT`
- `DEPENDENCY_CHAIN_BREAK`
- `EVIDENCE_SURVIVAL_LOSS`
- `HIGH_CRITICAL_EVIDENCE_LOSS`

These preferred labels are operationally defined in the canonical registry, regardless of whether a given fixture family currently emits each one.

src/validation/failure_taxonomy.py

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
"""Deterministic operational failure taxonomy for replay admissibility validation."""
2+
3+
from __future__ import annotations
4+
5+
from typing import Final
6+
7+
# Substrings that must never appear in a registered label name. The taxonomy
# tests lower-case every label and reject any label containing one of these,
# keeping the registry free of semantic / model-judged wording.
BANNED_FUZZY_TERMS: Final[tuple[str, ...]] = (
    "ambiguous",
    "semantic",
    "fuzzy",
    "llm_judge",
    "reasoning",
    "confusion",
)
15+
16+
# Canonical registry of failure labels.
#
# Each key is a label name; each value is a spec with exactly five string
# fields:
#   operational_meaning        - what the label asserts happened
#   observable_trigger         - the deterministic condition that raises it
#   contract_or_invariant_type - the contract/invariant family it is linked to
#   severity_class             - "medium", "high" or "critical"
#   non_goal                   - what the label must NOT be read as meaning
FAILURE_TAXONOMY: Final[dict[str, dict[str, str]]] = {
    # --- Preferred labels listed in docs/failure_taxonomy.md ---
    "TOOL_ORDER_VIOLATION": {
        "operational_meaning": "Observed tool execution sequence diverges from required deterministic step ordering.",
        "observable_trigger": "A strict order assertion fails when replayed tool-call indices are compared to contract order.",
        "contract_or_invariant_type": "ordering",
        "severity_class": "high",
        "non_goal": "Not a semantic quality judgment; only deterministic ordering mismatch.",
    },
    "RECOVERY_PATH_LOSS": {
        "operational_meaning": "At least one required recovery route is not preserved in reconstructed dependency paths.",
        "observable_trigger": "Reachability from a required failure node to one or more recovery targets is absent.",
        "contract_or_invariant_type": "reachability",
        "severity_class": "high",
        "non_goal": "Not a probabilistic prediction about future recovery behavior.",
    },
    "BLOCKER_DETACHMENT": {
        "operational_meaning": "Blocking constraints no longer remain attached at full survival during replay.",
        "observable_trigger": "blocker_survival_rate < 1.0 in deterministic replay metrics.",
        "contract_or_invariant_type": "operational_metric",
        "severity_class": "high",
        "non_goal": "Not a narrative claim about intent; metric-derived only.",
    },
    "GOVERNANCE_DRIFT": {
        "operational_meaning": "Governance-linked constraints drift below full deterministic preservation.",
        "observable_trigger": "Governance layer contract score or required governance metric falls below 1.0.",
        "contract_or_invariant_type": "governance",
        "severity_class": "medium",
        "non_goal": "Not a policy interpretation beyond explicit contract outputs.",
    },
    "DEPENDENCY_CHAIN_BREAK": {
        "operational_meaning": "Required dependency edges or causal chains are missing in reconstructed graphs.",
        "observable_trigger": "Comparator reports missing dependency or causal-edge preservation below full coverage.",
        "contract_or_invariant_type": "relational_dependency",
        "severity_class": "high",
        "non_goal": "Not an inferred semantic relation; strictly graph-structural.",
    },
    "EVIDENCE_SURVIVAL_LOSS": {
        "operational_meaning": "Evidence units expected in replay are not fully preserved.",
        "observable_trigger": "has_evidence is true and (evidence_survived < evidence_total or evidence_survival_rate < 1.0).",
        "contract_or_invariant_type": "operational_metric",
        "severity_class": "medium",
        "non_goal": "Not a free-form evidence relevance judgement.",
    },
    "HIGH_CRITICAL_EVIDENCE_LOSS": {
        "operational_meaning": "High-critical evidence survival is below full preservation.",
        "observable_trigger": "has_high_critical_evidence is true and high_critical_evidence_survival_rate < 1.0.",
        "contract_or_invariant_type": "operational_metric",
        "severity_class": "critical",
        "non_goal": "Not any low-priority evidence drop; only high-critical metric-gated loss.",
    },
    # --- Contract-validator labels ---
    "POLICY_ORDER_BROKEN": {
        "operational_meaning": "Ordering contract failed for policy-ordered replay steps.",
        "observable_trigger": "ContractValidator ordering contract returns passed == false.",
        "contract_or_invariant_type": "ordering",
        "severity_class": "high",
        "non_goal": "Not an evaluation of policy correctness, only order conformance.",
    },
    "RECOVERY_PATH_INVALID": {
        "operational_meaning": "Reachability contract indicates required recovery path set is not satisfied.",
        "observable_trigger": "ContractValidator reachability contract returns passed == false.",
        "contract_or_invariant_type": "reachability",
        "severity_class": "high",
        "non_goal": "Not a runtime incident-resolution claim outside replay graph checks.",
    },
    "CAUSAL_DEPENDENCY_LOSS": {
        "operational_meaning": "Required causal dependency edges are missing in reconstruction.",
        "observable_trigger": "Causality contract or comparator reports missing required causal edges.",
        "contract_or_invariant_type": "causality",
        "severity_class": "high",
        "non_goal": "Not temporal speculation; explicit causal-edge checks only.",
    },
    "INVARIANT_VIOLATION": {
        "operational_meaning": "A predefined deterministic invariant is violated.",
        "observable_trigger": "Invariant contract evaluation returns passed == false.",
        "contract_or_invariant_type": "invariant",
        "severity_class": "high",
        "non_goal": "Not a broad quality label; only declared invariant failure.",
    },
    # --- Comparator / graph-structure labels ---
    "ORPHAN_DEPENDENCY": {
        "operational_meaning": "Nodes requiring upstream dependencies became orphaned after replay.",
        "observable_trigger": "Comparator orphan_rate > 0 with affected node evidence.",
        "contract_or_invariant_type": "relational_dependency",
        "severity_class": "high",
        "non_goal": "Not missing optional links; only required incoming dependency loss.",
    },
    "DETACHED_DEPENDENCY": {
        "operational_meaning": "Required dependency edges are detached in reconstructed graph.",
        "observable_trigger": "Comparator detached_dependency_rate > 0.",
        "contract_or_invariant_type": "relational_dependency",
        "severity_class": "high",
        "non_goal": "Not a semantic mismatch; edge-presence based only.",
    },
    "CYCLE_INTRODUCED": {
        "operational_meaning": "Reconstructed graph introduces cyclic dependency where baseline is acyclic.",
        "observable_trigger": "Comparator acyclicity_preserved == false.",
        "contract_or_invariant_type": "relational_dependency",
        "severity_class": "high",
        "non_goal": "Not a cycle severity estimate; binary structural condition only.",
    },
    "GRAPH_FRAGMENTATION": {
        "operational_meaning": "Connected dependency structure fragments across replayed graph segments.",
        "observable_trigger": "Comparator dependency_integrity/reachability evidence indicates fragmentation failure.",
        "contract_or_invariant_type": "relational_dependency",
        "severity_class": "medium",
        "non_goal": "Not about organizational context loss; graph-connectivity only.",
    },
    "TEMPORAL_ORDER_VIOLATION": {
        "operational_meaning": "Relative deterministic topological order over shared nodes is violated.",
        "observable_trigger": "Comparator temporal_order_violation_rate > 0.",
        "contract_or_invariant_type": "relational_temporal",
        "severity_class": "medium",
        "non_goal": "Not wall-clock latency drift; ordering relation only.",
    },
    # --- Artifact / reproducibility labels ---
    "ARTIFACT_INTEGRITY_VIOLATION": {
        "operational_meaning": "Expected artifact fields drift from checked deterministic contract bundle.",
        "observable_trigger": "Artifact drift or expected artifact parity checks fail.",
        "contract_or_invariant_type": "artifact_integrity",
        "severity_class": "critical",
        "non_goal": "Not formatting-only differences; contract-relevant drift.",
    },
    "REPLAY_NON_REPRODUCIBLE": {
        "operational_meaning": "Replay output fails reproducibility requirements under fixed deterministic inputs.",
        "observable_trigger": "Reproducibility check reports non-stable artifact or score output.",
        "contract_or_invariant_type": "reproducibility",
        "severity_class": "critical",
        "non_goal": "Not a single-run runtime fault outside deterministic replay validation.",
    },
    # --- Additional replay-metric labels ---
    "CONSTRAINT_DRIFT": {
        "operational_meaning": "Constraint preservation falls below full deterministic survival.",
        "observable_trigger": "constraint_survival_rate < 1.0 in replay metrics.",
        "contract_or_invariant_type": "operational_metric",
        "severity_class": "medium",
        "non_goal": "Not policy reinterpretation; metric threshold only.",
    },
    "EVIDENCE_LOSS": {
        "operational_meaning": "Evidence preservation falls below required full survival.",
        "observable_trigger": "has_evidence is true and evidence metrics indicate < 1.0 survival.",
        "contract_or_invariant_type": "operational_metric",
        "severity_class": "medium",
        "non_goal": "Not semantic evidence relevance scoring.",
    },
}

tests/test_failure_taxonomy.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
from __future__ import annotations
2+
3+
import json
4+
from pathlib import Path
5+
6+
from src.validation.failure_taxonomy import BANNED_FUZZY_TERMS, FAILURE_TAXONOMY
7+
8+
9+
# Directory two levels above this test module; the label collectors below
# look for "fixtures" and "artifacts" directly under it.
ROOT = Path(__file__).resolve().parent.parent
10+
11+
12+
def _load_json(path: Path) -> object:
    """Read *path* as UTF-8 text and return the decoded JSON value.

    The return type is ``object`` because the document root may be any JSON
    value; callers narrow it with ``isinstance`` before use.
    """
    return json.loads(path.read_text(encoding="utf-8"))
14+
15+
16+
def _collect_fixture_failure_labels() -> set[str]:
    """Gather every failure label named by fixture expectation files.

    Scans each ``expected/failures.json`` found anywhere under
    ``ROOT / "fixtures"`` and unions the entries of its
    ``expected_failures``, ``allowed_failures`` and ``disallowed_failures``
    lists. Files whose root is not a JSON object, and keys whose value is not
    a list, are skipped. Entries are coerced with ``str``.
    """
    label_keys = ("expected_failures", "allowed_failures", "disallowed_failures")
    labels: set[str] = set()
    for path in sorted((ROOT / "fixtures").glob("**/expected/failures.json")):
        payload = _load_json(path)
        if not isinstance(payload, dict):
            continue
        for key in label_keys:
            values = payload.get(key, [])
            if isinstance(values, list):
                labels.update(str(value) for value in values)
    return labels
27+
28+
29+
def _collect_artifact_failure_labels() -> set[str]:
    """Gather every failure label recorded in top-level artifact JSON files.

    Each ``*.json`` file directly under ``ROOT / "artifacts"`` is walked
    recursively; wherever a dict holds a ``failure_labels`` key whose value
    is a list, that list's items (coerced with ``str``) are added to the
    result.
    """
    labels: set[str] = set()

    # Defined once, outside the file loop: the closure only depends on
    # ``labels``, so there is no need to rebuild it for every artifact.
    def walk(value: object) -> None:
        if isinstance(value, dict):
            for key, nested in value.items():
                if key == "failure_labels" and isinstance(nested, list):
                    labels.update(str(item) for item in nested)
                # Always descend, so nested "failure_labels" keys at any
                # depth (including inside a label list) are also found.
                walk(nested)
        elif isinstance(value, list):
            for nested in value:
                walk(nested)

    for path in sorted((ROOT / "artifacts").glob("*.json")):
        walk(_load_json(path))
    return labels
46+
47+
48+
def test_fixture_expected_failure_labels_are_registered() -> None:
    """Every label referenced by a fixture ``failures.json`` must be a registry key."""
    fixture_labels = _collect_fixture_failure_labels()
    # Set difference against the registry keys leaves only unregistered labels;
    # sorting keeps the failure message stable between runs.
    missing = sorted(fixture_labels - FAILURE_TAXONOMY.keys())
    assert not missing, f"fixture labels missing from failure taxonomy: {missing}"
52+
53+
54+
def test_artifact_failure_labels_are_registered() -> None:
    """Every label found under a ``failure_labels`` key in artifacts must be a registry key."""
    artifact_labels = _collect_artifact_failure_labels()
    # Set difference against the registry keys leaves only unregistered labels;
    # sorting keeps the failure message stable between runs.
    missing = sorted(artifact_labels - FAILURE_TAXONOMY.keys())
    assert not missing, f"artifact labels missing from failure taxonomy: {missing}"
58+
59+
60+
def test_registered_labels_have_required_operational_fields() -> None:
    """Each registered spec must carry a non-blank string for every required field."""
    required_fields = (
        "operational_meaning",
        "observable_trigger",
        "contract_or_invariant_type",
        "severity_class",
        "non_goal",
    )
    for label, spec in FAILURE_TAXONOMY.items():
        for field in required_fields:
            # A missing key falls back to "", which fails the same check as a
            # blank or whitespace-only value.
            value = spec.get(field, "")
            assert isinstance(value, str) and value.strip(), f"label {label} missing required field {field}"
72+
73+
74+
def test_registered_labels_do_not_use_banned_fuzzy_terms() -> None:
    """No registered label name may contain a banned term (case-insensitive substring match)."""
    for label in FAILURE_TAXONOMY:
        # Banned terms are lower-case, so compare against the lower-cased label.
        normalized = label.lower()
        for banned in BANNED_FUZZY_TERMS:
            assert banned not in normalized, f"label '{label}' contains banned fuzzy term '{banned}'"

0 commit comments

Comments
 (0)