Skip to content

Commit f7bdd8a

Browse files
authored
Manifest-drive layered admissibility curve fixture ordering
Manifest-drive layered admissibility curve fixture ordering

- Load layered admissibility fixtures from fixtures/manifest.json
- Preserve deterministic fixture ordering by family and degradation level
- Use exact rational arithmetic for admissibility score calculation
- Align benchmark artifact with exact deterministic score output
1 parent 10873ce commit f7bdd8a

4 files changed

Lines changed: 56 additions & 9 deletions

File tree

artifacts/layered_admissibility_results.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@
6161
"governance_score": 1.0,
6262
"observed_admissible": false,
6363
"operational_score": 1.0,
64-
"overall_admissibility_score": 0.8333333333333333,
64+
"overall_admissibility_score": 0.8333333333333334,
6565
"passed_contracts": [
6666
"no_orphan_tool_calls",
6767
"pre_merge_review"

src/validation/admissibility_scorer.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
from dataclasses import dataclass
4+
from fractions import Fraction
45

56
from src.validation.contract_validator import Layer, ValidationResult
67

@@ -47,15 +48,18 @@ def score(self, results: list[ValidationResult], expected_admissible: bool | Non
4748

4849
layer_scores: list[LayerScore] = []
4950
score_by_layer: dict[Layer, float] = {}
51+
score_fraction_by_layer: dict[Layer, Fraction] = {}
5052

5153
for layer in self._LAYER_ORDER:
5254
layer_results = [result for result in results if result.layer == layer]
5355
passed_in_layer = tuple(sorted(result.contract_id for result in layer_results if result.passed))
5456
failed_in_layer = tuple(sorted(result.contract_id for result in layer_results if not result.passed))
5557
labels_in_layer = tuple(sorted({result.failure_label for result in layer_results if result.failure_label is not None}))
5658
total_contracts = len(layer_results)
57-
layer_score = 1.0 if total_contracts == 0 else len(passed_in_layer) / total_contracts
59+
layer_score_fraction = Fraction(1, 1) if total_contracts == 0 else Fraction(len(passed_in_layer), total_contracts)
60+
layer_score = float(layer_score_fraction)
5861
score_by_layer[layer] = layer_score
62+
score_fraction_by_layer[layer] = layer_score_fraction
5963
layer_scores.append(
6064
LayerScore(
6165
layer=layer,
@@ -66,7 +70,8 @@ def score(self, results: list[ValidationResult], expected_admissible: bool | Non
6670
)
6771
)
6872

69-
overall_admissibility_score = sum(score_by_layer[layer] for layer in self._LAYER_ORDER) / len(self._LAYER_ORDER)
73+
overall_score_fraction = sum(score_fraction_by_layer[layer] for layer in self._LAYER_ORDER) / len(self._LAYER_ORDER)
74+
overall_admissibility_score = float(overall_score_fraction)
7075

7176
return AdmissibilityScore(
7277
structural_score=score_by_layer[Layer.STRUCTURAL],

src/validation/degradation_curve_generator.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@
99
from src.validation.contract_validator import ContractValidator
1010

1111

12+
MANIFEST_PATH = Path("fixtures/manifest.json")
13+
LAYERED_CURVE_FAMILY = "coding_workflow_pr_review"
14+
LAYERED_CURVE_LEVELS = ("baseline", "mild", "moderate", "severe")
15+
16+
1217
@dataclass(frozen=True, slots=True)
1318
class FixtureScorePoint:
1419
fixture_id: str
@@ -106,7 +111,33 @@ def evaluate_fixture(self, fixture_path: Path) -> FixtureScorePoint:
106111
failure_labels=tuple(sorted(score.failure_labels)),
107112
)
108113

109-
def generate(self, fixtures: list[Path], curve_id: str) -> DegradationCurve:
114+
def _load_fixture_manifest(self, manifest_path: Path = MANIFEST_PATH) -> tuple[dict[str, Any], ...]:
115+
manifest = self._load_json(manifest_path)
116+
fixtures = manifest.get("fixtures")
117+
if not isinstance(fixtures, list):
118+
raise ValueError(f"invalid fixture manifest format: {manifest_path}")
119+
return tuple(fixtures)
120+
121+
def fixtures_for_layered_admissibility_curve(self, manifest_path: Path = MANIFEST_PATH) -> tuple[Path, ...]:
122+
level_to_path: dict[str, Path] = {}
123+
124+
for entry in self._load_fixture_manifest(manifest_path):
125+
if entry.get("family") != LAYERED_CURVE_FAMILY:
126+
continue
127+
level = entry.get("degradation_level")
128+
if level in LAYERED_CURVE_LEVELS:
129+
path_str = entry.get("path")
130+
if not path_str:
131+
raise ValueError(f"missing path for fixture in manifest: {entry.get('fixture_id')}")
132+
level_to_path[str(level)] = Path(path_str)
133+
134+
missing_levels = [level for level in LAYERED_CURVE_LEVELS if level not in level_to_path]
135+
if missing_levels:
136+
raise ValueError(f"missing layered admissibility fixtures for levels: {missing_levels}")
137+
138+
return tuple(level_to_path[level] for level in LAYERED_CURVE_LEVELS)
139+
140+
def generate(self, fixtures: list[Path] | tuple[Path, ...], curve_id: str) -> DegradationCurve:
110141
points = tuple(self.evaluate_fixture(path) for path in fixtures)
111142
return DegradationCurve(curve_id=curve_id, version=self.VERSION, generated_by=self.__class__.__name__, points=points)
112143

tests/test_degradation_curve_generator.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -40,15 +40,26 @@ def test_evaluate_negative_fixture_detects_expected_failures() -> None:
4040

4141
def test_generate_curve_is_deterministic() -> None:
4242
generator = DegradationCurveGenerator()
43-
fixtures = [POS_FIXTURE, MILD_FIXTURE, MODERATE_FIXTURE, NEG_FIXTURE]
43+
fixtures = generator.fixtures_for_layered_admissibility_curve()
4444
assert generator.to_dict(generator.generate(fixtures, curve_id=CURVE_ID)) == generator.to_dict(
4545
generator.generate(fixtures, curve_id=CURVE_ID)
4646
)
4747

4848

49+
50+
51+
def test_layered_curve_fixtures_are_loaded_from_manifest_order() -> None:
52+
fixtures = DegradationCurveGenerator().fixtures_for_layered_admissibility_curve()
53+
assert [fixture.as_posix() for fixture in fixtures] == [
54+
POS_FIXTURE.as_posix(),
55+
MILD_FIXTURE.as_posix(),
56+
MODERATE_FIXTURE.as_posix(),
57+
NEG_FIXTURE.as_posix(),
58+
]
59+
4960
def test_to_dict_is_json_compatible_and_sorted() -> None:
5061
generator = DegradationCurveGenerator()
51-
curve = generator.generate([POS_FIXTURE, MILD_FIXTURE, MODERATE_FIXTURE, NEG_FIXTURE], curve_id=CURVE_ID)
62+
curve = generator.generate(generator.fixtures_for_layered_admissibility_curve(), curve_id=CURVE_ID)
5263
curve_dict = generator.to_dict(curve)
5364
json.dumps(curve_dict, sort_keys=True)
5465
assert [point["fixture_path"] for point in curve_dict["points"]] == [
@@ -61,7 +72,7 @@ def test_to_dict_is_json_compatible_and_sorted() -> None:
6172

6273
def test_write_json_matches_committed_artifact(tmp_path: Path) -> None:
6374
generator = DegradationCurveGenerator()
64-
curve = generator.generate([POS_FIXTURE, MILD_FIXTURE, MODERATE_FIXTURE, NEG_FIXTURE], curve_id=CURVE_ID)
75+
curve = generator.generate(generator.fixtures_for_layered_admissibility_curve(), curve_id=CURVE_ID)
6576
generated_path = tmp_path / "layered_admissibility_results.json"
6677
generator.write_json(curve, generated_path)
6778

@@ -72,7 +83,7 @@ def test_write_json_matches_committed_artifact(tmp_path: Path) -> None:
7283

7384
def test_write_markdown_contains_fixture_rows(tmp_path: Path) -> None:
7485
generator = DegradationCurveGenerator()
75-
curve = generator.generate([POS_FIXTURE, MILD_FIXTURE, MODERATE_FIXTURE, NEG_FIXTURE], curve_id=CURVE_ID)
86+
curve = generator.generate(generator.fixtures_for_layered_admissibility_curve(), curve_id=CURVE_ID)
7687
markdown_path = tmp_path / "layered_admissibility.md"
7788
generator.write_markdown(curve, markdown_path)
7889

@@ -116,7 +127,7 @@ def test_disallowed_failure_label_raises_clear_error() -> None:
116127

117128
def test_progressive_curve_scores_are_monotonic_or_non_increasing() -> None:
118129
generator = DegradationCurveGenerator()
119-
curve = generator.generate([POS_FIXTURE, MILD_FIXTURE, MODERATE_FIXTURE, NEG_FIXTURE], curve_id=CURVE_ID)
130+
curve = generator.generate(generator.fixtures_for_layered_admissibility_curve(), curve_id=CURVE_ID)
120131
points = {point.fixture_id: point for point in curve.points}
121132

122133
assert points["coding_workflow_pr_review_v1"].overall_admissibility_score == 1.0

0 commit comments

Comments
 (0)