Skip to content

Commit e465d7c

Browse files
authored
Add deterministic degradation curve artifacts
Add deterministic degradation curve artifacts - Add DegradationCurveGenerator with stable JSON and Markdown artifact output. - Add expected/disallowed failure validation for fixture expectations. - Read fixture_version from fixture metadata and require explicit curve_id generation. - Add committed layered admissibility artifact and benchmark documentation. Validation reported in PR: degradation curve generator tests, full pytest suite, and npm run check passed.
1 parent 3d9c6fc commit e465d7c

5 files changed

Lines changed: 372 additions & 0 deletions

File tree

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
{
2+
"curve_id": "coding_workflow_pr_review_curve_v1",
3+
"generated_by": "DegradationCurveGenerator",
4+
"points": [
5+
{
6+
"expected_admissible": true,
7+
"failed_contracts": [],
8+
"failure_labels": [],
9+
"fixture_id": "coding_workflow_pr_review_v1",
10+
"fixture_path": "fixtures/coding_workflow_pr_review_v1",
11+
"fixture_version": "1.0.0",
12+
"governance_score": 1.0,
13+
"observed_admissible": true,
14+
"operational_score": 1.0,
15+
"overall_admissibility_score": 1.0,
16+
"passed_contracts": [
17+
"no_orphan_tool_calls",
18+
"pre_merge_review",
19+
"recovery_path_available",
20+
"security_causal_block"
21+
],
22+
"relational_score": 1.0,
23+
"structural_score": 1.0
24+
},
25+
{
26+
"expected_admissible": false,
27+
"failed_contracts": [
28+
"no_orphan_tool_calls",
29+
"pre_merge_review",
30+
"recovery_path_available",
31+
"security_causal_block"
32+
],
33+
"failure_labels": [
34+
"CAUSAL_DEPENDENCY_LOSS",
35+
"INVARIANT_VIOLATION",
36+
"POLICY_ORDER_BROKEN",
37+
"RECOVERY_PATH_INVALID"
38+
],
39+
"fixture_id": "coding_workflow_pr_review_degraded_v1",
40+
"fixture_path": "fixtures/coding_workflow_pr_review_degraded_v1",
41+
"fixture_version": "1.0.0",
42+
"governance_score": 1.0,
43+
"observed_admissible": false,
44+
"operational_score": 0.0,
45+
"overall_admissibility_score": 0.5,
46+
"passed_contracts": [],
47+
"relational_score": 0.0,
48+
"structural_score": 1.0
49+
}
50+
],
51+
"version": "1.0"
52+
}

docs/LAYERED_ADMISSIBILITY_SCORE_v1.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,3 +68,9 @@ For each layer (`structural`, `relational`, `operational`, `governance`):
6868

6969
- v1 uses unweighted averages only.
7070
- Future versions may add explicit configured weights, but not learned weights.
71+
72+
73+
## Generated artifacts
74+
75+
- `artifacts/layered_admissibility_results.json`
76+
- `docs/benchmarks/layered_admissibility.md`
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Layered Admissibility Degradation Benchmark
2+
3+
## Purpose
4+
5+
Deterministically compare admissibility outcomes across fixture bundles using ContractValidator and AdmissibilityScorer.
6+
7+
## Fixture results
8+
9+
| fixture_id | expected_admissible | observed_admissible | structural_score | relational_score | operational_score | governance_score | overall_admissibility_score | failure_labels |
10+
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
11+
| coding_workflow_pr_review_v1 | true | true | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | none |
12+
| coding_workflow_pr_review_degraded_v1 | false | false | 1.000 | 0.000 | 0.000 | 1.000 | 0.500 | CAUSAL_DEPENDENCY_LOSS, INVARIANT_VIOLATION, POLICY_ORDER_BROKEN, RECOVERY_PATH_INVALID |
13+
14+
## Interpretation
15+
16+
The positive fixture remains fully admissible while the degraded fixture shows deterministic score loss and explicit failure labels.
17+
18+
## Non-goals
19+
20+
- no LLM judges
21+
- no embeddings
22+
- no fuzzy matching
23+
- no semantic equivalence
24+
25+
## Future
26+
27+
- add more fixture families
28+
- add progressive degradation levels
29+
- add SVG curve visualization later
Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
from __future__ import annotations
2+
3+
import json
4+
from dataclasses import asdict, dataclass
5+
from pathlib import Path
6+
from typing import Any
7+
8+
from src.validation.admissibility_scorer import AdmissibilityScorer
9+
from src.validation.contract_validator import ContractValidator
10+
11+
12+
@dataclass(frozen=True, slots=True)
13+
class FixtureScorePoint:
14+
fixture_id: str
15+
fixture_version: str
16+
fixture_path: str
17+
expected_admissible: bool
18+
observed_admissible: bool
19+
structural_score: float
20+
relational_score: float
21+
operational_score: float
22+
governance_score: float
23+
overall_admissibility_score: float
24+
passed_contracts: tuple[str, ...]
25+
failed_contracts: tuple[str, ...]
26+
failure_labels: tuple[str, ...]
27+
28+
29+
@dataclass(frozen=True, slots=True)
30+
class DegradationCurve:
31+
curve_id: str
32+
version: str
33+
generated_by: str
34+
points: tuple[FixtureScorePoint, ...]
35+
36+
37+
class DegradationCurveGenerator:
    """Evaluate fixture bundles and emit the results as JSON and Markdown.

    Each fixture directory is validated with ``ContractValidator`` and scored
    with ``AdmissibilityScorer``. The per-fixture results are collected into a
    ``DegradationCurve`` that ``write_json`` and ``write_markdown`` serialize.
    """

    VERSION = "1.0"

    @staticmethod
    def _markdown_cell(text: str) -> str:
        """Escape ``|`` so *text* cannot split a Markdown table cell."""
        return text.replace("|", "\\|")

    def _load_json(self, path: Path) -> dict[str, Any]:
        """Read and parse the JSON document at *path*.

        Raises:
            FileNotFoundError: if *path* does not exist.
            json.JSONDecodeError: if the file is not valid JSON; the message
                names the offending file.
        """
        if not path.exists():
            raise FileNotFoundError(f"missing required fixture file: {path}")
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except json.JSONDecodeError as err:
            # Re-raise the same exception type so existing handlers keep
            # working, but say which fixture file is malformed.
            raise json.JSONDecodeError(f"invalid JSON in {path}: {err.msg}", err.doc, err.pos) from err

    def _load_bundle(self, fixture_path: Path, side: str) -> dict[str, Any]:
        """Merge the trace, state and dependency graph stored under *side*.

        *side* is the sub-directory name (``"original"`` or
        ``"reconstructed"``). Keys from ``state.json`` override equal keys from
        ``trace.json``; the graph is stored under ``"dependency_graph"``.
        """
        return {
            **self._load_json(fixture_path / f"{side}/trace.json"),
            **self._load_json(fixture_path / f"{side}/state.json"),
            "dependency_graph": self._load_json(fixture_path / f"{side}/dependency_graph.json"),
        }

    def _fixture_version(self, fixture_path: Path, expected_admissibility: dict[str, Any]) -> str:
        """Return the ``fixture_version`` entry of the fixture metadata as a string.

        Raises:
            ValueError: if the metadata has no ``fixture_version`` key.
        """
        if "fixture_version" not in expected_admissibility:
            raise ValueError(f"missing fixture_version in {fixture_path / 'expected/admissibility.json'}")
        return str(expected_admissibility["fixture_version"])

    def _validate_expected_failures(
        self,
        fixture_path: Path,
        expected_failures_payload: dict[str, Any],
        observed_failure_labels: tuple[str, ...],
    ) -> None:
        """Check observed failure labels against the fixture's expectations.

        Raises:
            ValueError: if a label listed under ``expected_failures`` was not
                observed, or if a label listed under ``disallowed_failures``
                was observed.
        """
        # ``or ()`` treats both a missing key and an explicit JSON null as
        # "no labels" instead of failing inside ``set(None)``.
        expected = set(expected_failures_payload.get("expected_failures") or ())
        disallowed = set(expected_failures_payload.get("disallowed_failures") or ())
        observed = set(observed_failure_labels)

        missing_expected = sorted(expected - observed)
        if missing_expected:
            raise ValueError(f"missing expected failure labels for {fixture_path}: {missing_expected}")

        emitted_disallowed = sorted(disallowed & observed)
        if emitted_disallowed:
            raise ValueError(f"emitted disallowed failure labels for {fixture_path}: {emitted_disallowed}")

    def evaluate_fixture(self, fixture_path: Path) -> FixtureScorePoint:
        """Validate and score one fixture directory.

        Reads ``original/`` and ``reconstructed/`` (trace, state, dependency
        graph), every ``original/contracts/*.json`` file in sorted order, and
        ``expected/admissibility.json`` plus ``expected/failures.json``.

        Raises:
            FileNotFoundError: if a required file is missing or the contracts
                directory contains no ``*.json`` file.
            KeyError: if the metadata has no ``expected_admissible`` key.
            ValueError: if ``fixture_version`` is missing or the observed
                failure labels contradict ``expected/failures.json``.
        """
        # Accept plain string paths as well as Path objects.
        fixture_path = Path(fixture_path)

        original = self._load_bundle(fixture_path, "original")
        reconstructed = self._load_bundle(fixture_path, "reconstructed")

        contracts_dir = fixture_path / "original/contracts"
        # Sorted so the contract order does not depend on directory listing order.
        contracts = [self._load_json(contract_path) for contract_path in sorted(contracts_dir.glob("*.json"))]
        if not contracts:
            raise FileNotFoundError(f"no contract files found in fixture: {contracts_dir}")

        expected_admissibility = self._load_json(fixture_path / "expected/admissibility.json")
        expected_admissible = bool(expected_admissibility["expected_admissible"])
        fixture_version = self._fixture_version(fixture_path, expected_admissibility)
        expected_failures = self._load_json(fixture_path / "expected/failures.json")

        results = ContractValidator().validate_contracts(
            original=original,
            reconstructed=reconstructed,
            contracts=contracts,
        )
        score = AdmissibilityScorer().score(results, expected_admissible=expected_admissible)
        self._validate_expected_failures(fixture_path, expected_failures, score.failure_labels)

        return FixtureScorePoint(
            fixture_id=fixture_path.name,
            fixture_version=fixture_version,
            fixture_path=fixture_path.as_posix(),
            expected_admissible=score.expected_admissible,
            observed_admissible=score.observed_admissible,
            structural_score=score.structural_score,
            relational_score=score.relational_score,
            operational_score=score.operational_score,
            governance_score=score.governance_score,
            overall_admissibility_score=score.overall_admissibility_score,
            # Sorted so the serialized artifact is stable across runs.
            passed_contracts=tuple(sorted(score.passed_contracts)),
            failed_contracts=tuple(sorted(score.failed_contracts)),
            failure_labels=tuple(sorted(score.failure_labels)),
        )

    def generate(self, fixtures: list[Path], curve_id: str) -> DegradationCurve:
        """Evaluate *fixtures* in the given order and wrap them in a curve."""
        points = tuple(self.evaluate_fixture(path) for path in fixtures)
        return DegradationCurve(
            curve_id=curve_id,
            version=self.VERSION,
            generated_by=self.__class__.__name__,
            points=points,
        )

    def to_dict(self, curve: DegradationCurve) -> dict[str, object]:
        """Convert *curve* into a JSON-serializable dictionary."""
        return {
            "curve_id": curve.curve_id,
            "version": curve.version,
            "generated_by": curve.generated_by,
            "points": [
                {
                    **asdict(point),
                    # asdict() keeps tuples as tuples; emit plain lists instead.
                    "passed_contracts": list(point.passed_contracts),
                    "failed_contracts": list(point.failed_contracts),
                    "failure_labels": list(point.failure_labels),
                }
                for point in curve.points
            ],
        }

    def write_json(self, curve: DegradationCurve, output_path: Path) -> None:
        """Write *curve* to *output_path* as indented, key-sorted JSON.

        Parent directories are created when missing.
        """
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(json.dumps(self.to_dict(curve), indent=2, sort_keys=True) + "\n", encoding="utf-8")

    def write_markdown(self, curve: DegradationCurve, output_path: Path) -> None:
        """Write the benchmark report for *curve* to *output_path*.

        The report contains one table row per point. Pipe characters in the
        fixture id and in failure labels are escaped so they cannot break the
        table. Parent directories are created when missing.
        """
        output_path.parent.mkdir(parents=True, exist_ok=True)
        rows = []
        for point in curve.points:
            if point.failure_labels:
                labels = ", ".join(self._markdown_cell(label) for label in point.failure_labels)
            else:
                labels = "none"
            fixture_id = self._markdown_cell(point.fixture_id)
            rows.append(
                f"| {fixture_id} | {str(point.expected_admissible).lower()} | {str(point.observed_admissible).lower()} | "
                f"{point.structural_score:.3f} | {point.relational_score:.3f} | {point.operational_score:.3f} | "
                f"{point.governance_score:.3f} | {point.overall_admissibility_score:.3f} | {labels} |"
            )

        markdown = "\n".join(
            [
                "# Layered Admissibility Degradation Benchmark",
                "",
                "## Purpose",
                "",
                "Deterministically compare admissibility outcomes across fixture bundles using ContractValidator and AdmissibilityScorer.",
                "",
                "## Fixture results",
                "",
                "| fixture_id | expected_admissible | observed_admissible | structural_score | relational_score | operational_score | governance_score | overall_admissibility_score | failure_labels |",
                "| --- | --- | --- | --- | --- | --- | --- | --- | --- |",
                *rows,
                "",
                "## Interpretation",
                "",
                "The positive fixture remains fully admissible while the degraded fixture shows deterministic score loss and explicit failure labels.",
                "",
                "## Non-goals",
                "",
                "- no LLM judges",
                "- no embeddings",
                "- no fuzzy matching",
                "- no semantic equivalence",
                "",
                "## Future",
                "",
                "- add more fixture families",
                "- add progressive degradation levels",
                "- add SVG curve visualization later",
                # The trailing empty item makes the file end with a newline.
                "",
            ]
        )
        output_path.write_text(markdown, encoding="utf-8")
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
from __future__ import annotations
2+
3+
import json
4+
from pathlib import Path
5+
6+
import pytest
7+
8+
from src.validation.degradation_curve_generator import DegradationCurveGenerator
9+
10+
11+
POS_FIXTURE = Path("fixtures/coding_workflow_pr_review_v1")
12+
NEG_FIXTURE = Path("fixtures/coding_workflow_pr_review_degraded_v1")
13+
ARTIFACT_PATH = Path("artifacts/layered_admissibility_results.json")
14+
CURVE_ID = "coding_workflow_pr_review_curve_v1"
15+
16+
17+
def test_evaluate_positive_fixture_scores_one() -> None:
    """The positive fixture is admissible with a perfect score and no failures."""
    generator = DegradationCurveGenerator()
    positive = generator.evaluate_fixture(POS_FIXTURE)

    assert positive.fixture_version == "1.0.0"
    assert positive.observed_admissible is True
    assert positive.overall_admissibility_score == 1.0
    # Nothing may fail on the positive fixture.
    assert positive.failed_contracts == ()
    assert positive.failure_labels == ()
24+
25+
26+
def test_evaluate_negative_fixture_detects_expected_failures() -> None:
    """The degraded fixture is inadmissible and emits every required label."""
    required_labels = {
        "POLICY_ORDER_BROKEN",
        "RECOVERY_PATH_INVALID",
        "CAUSAL_DEPENDENCY_LOSS",
        "INVARIANT_VIOLATION",
    }

    degraded = DegradationCurveGenerator().evaluate_fixture(NEG_FIXTURE)

    assert degraded.fixture_version == "1.0.0"
    assert degraded.observed_admissible is False
    assert degraded.overall_admissibility_score < 1.0
    # Extra labels are tolerated; the required ones must all be present.
    assert required_labels <= set(degraded.failure_labels)
37+
38+
39+
def test_generate_curve_is_deterministic() -> None:
    """Generating the same curve twice yields equal dictionaries."""
    generator = DegradationCurveGenerator()
    fixtures = [POS_FIXTURE, NEG_FIXTURE]

    first_run = generator.to_dict(generator.generate(fixtures, curve_id=CURVE_ID))
    second_run = generator.to_dict(generator.generate(fixtures, curve_id=CURVE_ID))

    assert first_run == second_run
45+
46+
47+
def test_to_dict_is_json_compatible_and_sorted() -> None:
    """The curve dictionary serializes to JSON and keeps the fixture order."""
    generator = DegradationCurveGenerator()
    payload = generator.to_dict(generator.generate([POS_FIXTURE, NEG_FIXTURE], curve_id=CURVE_ID))

    # Raises if the payload holds a value json cannot serialize.
    json.dumps(payload, sort_keys=True)

    observed_paths = [entry["fixture_path"] for entry in payload["points"]]
    assert observed_paths == [POS_FIXTURE.as_posix(), NEG_FIXTURE.as_posix()]
56+
57+
58+
def test_write_json_matches_committed_artifact(tmp_path: Path) -> None:
    """A freshly written JSON artifact parses equal to the committed one."""
    generator = DegradationCurveGenerator()
    target = tmp_path / "layered_admissibility_results.json"
    generator.write_json(generator.generate([POS_FIXTURE, NEG_FIXTURE], curve_id=CURVE_ID), target)

    fresh_payload = json.loads(target.read_text(encoding="utf-8"))
    committed_payload = json.loads(ARTIFACT_PATH.read_text(encoding="utf-8"))

    assert fresh_payload == committed_payload
67+
68+
69+
def test_write_markdown_contains_fixture_rows(tmp_path: Path) -> None:
    """The Markdown report mentions both fixtures and key failure labels."""
    generator = DegradationCurveGenerator()
    report_path = tmp_path / "layered_admissibility.md"
    generator.write_markdown(generator.generate([POS_FIXTURE, NEG_FIXTURE], curve_id=CURVE_ID), report_path)

    report = report_path.read_text(encoding="utf-8")

    assert "coding_workflow_pr_review_v1" in report
    assert "coding_workflow_pr_review_degraded_v1" in report
    assert "POLICY_ORDER_BROKEN" in report
    assert "RECOVERY_PATH_INVALID" in report
80+
81+
82+
def test_missing_fixture_file_raises_clear_error(tmp_path: Path) -> None:
    """An empty fixture directory is reported as a missing fixture file."""
    empty_fixture = tmp_path / "fixture"
    empty_fixture.mkdir(parents=True)
    generator = DegradationCurveGenerator()

    with pytest.raises(FileNotFoundError, match="missing required fixture file"):
        generator.evaluate_fixture(empty_fixture)
87+
88+
89+
def test_missing_expected_failure_label_raises_clear_error() -> None:
    """An expected label that was not observed is rejected with ValueError."""
    generator = DegradationCurveGenerator()
    expectations = {"expected_failures": ["MISSING_EXPECTED_FAILURE"], "disallowed_failures": []}

    with pytest.raises(ValueError, match="missing expected failure labels"):
        generator._validate_expected_failures(
            fixture_path=Path("fixtures/example"),
            expected_failures_payload=expectations,
            observed_failure_labels=("OBSERVED_FAILURE",),
        )
98+
99+
100+
def test_disallowed_failure_label_raises_clear_error() -> None:
    """An observed label that is disallowed is rejected with ValueError."""
    generator = DegradationCurveGenerator()
    expectations = {"expected_failures": [], "disallowed_failures": ["DISALLOWED_FAILURE"]}

    with pytest.raises(ValueError, match="emitted disallowed failure labels"):
        generator._validate_expected_failures(
            fixture_path=Path("fixtures/example"),
            expected_failures_payload=expectations,
            observed_failure_labels=("DISALLOWED_FAILURE",),
        )

0 commit comments

Comments
 (0)