Skip to content

Commit 3d9c6fc

Browse files
authored
Add layered admissibility scoring
Add layered admissibility scoring - Add deterministic AdmissibilityScorer with layer scores and overall admissibility score. - Add stable JSON-safe score serialization and fixture integration tests. - Document v1 scoring rules, determinism guarantees, and non-goals. Validation reported in PR: targeted scorer, comparator, validator, fixture tests, full pytest, and npm run check passed.
1 parent bcbd43a commit 3d9c6fc

3 files changed

Lines changed: 326 additions & 0 deletions

File tree

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Layered Admissibility Score v1
2+
3+
## Purpose
4+
5+
`layered-admissibility-score-v1` adds a deterministic scoring layer on top of `ContractValidator` results. It converts fixture-level pass/fail outcomes into explicit, serializable layer scores and an overall admissibility score.
6+
7+
## Score fields
8+
9+
The scorer emits:
10+
11+
- `structural_score`
12+
- `relational_score`
13+
- `operational_score`
14+
- `governance_score`
15+
- `overall_admissibility_score`
16+
- `expected_admissible`
17+
- `observed_admissible`
18+
- `passed_contracts`
19+
- `failed_contracts`
20+
- `failure_labels`
21+
- `layer_scores` (per-layer contract lists, labels, and score)
22+
23+
## Layer scoring rules
24+
25+
For each layer (`structural`, `relational`, `operational`, `governance`):
26+
27+
- Score is `passed_contract_count / total_contract_count` in that layer.
28+
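- If a layer has no contracts in the input result set, that layer's score is `1.0`; an absent layer therefore counts as fully passing in the overall mean (for example, a single failing operational contract with no other results yields an overall score of `0.75`).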
29+
- Passed/failed contract IDs are sorted deterministically.
30+
- Failure labels are sorted, unique, and derived from non-null `failure_label` values.
31+
32+
## Overall scoring rule
33+
34+
`overall_admissibility_score` is the unweighted arithmetic mean of the four layer scores:
35+
36+
- structural
37+
- relational
38+
- operational
39+
- governance
40+
41+
`observed_admissible` is true only when every `ValidationResult.passed` is true; an empty result set is therefore (vacuously) admissible.
42+
43+
`expected_admissible` defaults to `observed_admissible` unless an explicit override is provided.
44+
45+
## Determinism guarantees
46+
47+
- No randomness.
48+
- No clock/time dependencies.
49+
- No external API/network calls.
50+
- Stable sorted outputs for contract IDs and failure labels.
51+
- `to_dict` produces JSON-compatible structures, with tuple fields serialized as lists and each layer serialized as its enum `value`.
52+
53+
## Non-goals
54+
55+
- No learned weighting.
56+
- No LLM judges.
57+
- No embeddings.
58+
- No fuzzy matching.
59+
- No semantic equivalence.
60+
61+
## How this connects
62+
63+
- **ContractValidator:** consumes `ValidationResult` objects produced by contract validation.
64+
- **Positive/negative fixtures:** scores both `coding_workflow_pr_review_v1` and `coding_workflow_pr_review_degraded_v1` deterministically.
65+
- **Future degradation curves:** provides stable primitives for trajectory/degradation analysis across fixture families.
66+
67+
## Prototype caveat
68+
69+
- v1 uses unweighted averages only.
70+
- Future versions may add explicit configured weights, but not learned weights.
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
from __future__ import annotations
2+
3+
from dataclasses import dataclass
4+
5+
from src.validation.contract_validator import Layer, ValidationResult
6+
7+
8+
@dataclass(frozen=True, slots=True)
9+
class LayerScore:
10+
layer: Layer
11+
passed_contracts: tuple[str, ...]
12+
failed_contracts: tuple[str, ...]
13+
failure_labels: tuple[str, ...]
14+
score: float
15+
16+
17+
@dataclass(frozen=True, slots=True)
18+
class AdmissibilityScore:
19+
structural_score: float
20+
relational_score: float
21+
operational_score: float
22+
governance_score: float
23+
overall_admissibility_score: float
24+
expected_admissible: bool
25+
observed_admissible: bool
26+
passed_contracts: tuple[str, ...]
27+
failed_contracts: tuple[str, ...]
28+
failure_labels: tuple[str, ...]
29+
layer_scores: tuple[LayerScore, ...]
30+
31+
32+
class AdmissibilityScorer:
    """Turn contract validation results into per-layer and overall scores."""

    # Fixed layer order; drives both the order of ``layer_scores`` and the
    # order in which layer scores are summed for the overall mean.
    _LAYER_ORDER: tuple[Layer, ...] = (
        Layer.STRUCTURAL,
        Layer.RELATIONAL,
        Layer.OPERATIONAL,
        Layer.GOVERNANCE,
    )

    @staticmethod
    def _score_layer(layer: Layer, results: list[ValidationResult]) -> LayerScore:
        """Summarize the results that belong to ``layer``."""
        members = [item for item in results if item.layer == layer]

        passed_ids = [item.contract_id for item in members if item.passed]
        passed_ids.sort()
        failed_ids = [item.contract_id for item in members if not item.passed]
        failed_ids.sort()
        labels = sorted({item.failure_label for item in members if item.failure_label is not None})

        # A layer with no contracts is scored as fully passing.
        if members:
            ratio = len(passed_ids) / len(members)
        else:
            ratio = 1.0

        return LayerScore(
            layer=layer,
            passed_contracts=tuple(passed_ids),
            failed_contracts=tuple(failed_ids),
            failure_labels=tuple(labels),
            score=ratio,
        )

    @staticmethod
    def _layer_to_dict(entry: LayerScore) -> dict[str, object]:
        """Serialize one layer summary, converting tuples to lists."""
        return {
            "layer": entry.layer.value,
            "passed_contracts": list(entry.passed_contracts),
            "failed_contracts": list(entry.failed_contracts),
            "failure_labels": list(entry.failure_labels),
            "score": entry.score,
        }

    def score(self, results: list[ValidationResult], expected_admissible: bool | None = None) -> AdmissibilityScore:
        """Score ``results`` per layer and overall.

        Args:
            results: Validation results to score.
            expected_admissible: Expected verdict; when ``None`` the observed
                verdict is used instead.

        Returns:
            An ``AdmissibilityScore`` whose overall score is the unweighted
            mean of the four layer scores, with sorted contract IDs and
            sorted, de-duplicated failure labels.
        """
        # Admissible exactly when no result failed (true for empty input).
        observed = not any(not item.passed for item in results)
        if expected_admissible is None:
            expected = observed
        else:
            expected = expected_admissible

        per_layer = tuple(self._score_layer(layer, results) for layer in self._LAYER_ORDER)
        by_layer = {entry.layer: entry.score for entry in per_layer}
        mean_score = sum(by_layer[layer] for layer in self._LAYER_ORDER) / len(self._LAYER_ORDER)

        all_passed = sorted(item.contract_id for item in results if item.passed)
        all_failed = sorted(item.contract_id for item in results if not item.passed)
        all_labels = sorted({item.failure_label for item in results if item.failure_label is not None})

        return AdmissibilityScore(
            structural_score=by_layer[Layer.STRUCTURAL],
            relational_score=by_layer[Layer.RELATIONAL],
            operational_score=by_layer[Layer.OPERATIONAL],
            governance_score=by_layer[Layer.GOVERNANCE],
            overall_admissibility_score=mean_score,
            expected_admissible=expected,
            observed_admissible=observed,
            passed_contracts=tuple(all_passed),
            failed_contracts=tuple(all_failed),
            failure_labels=tuple(all_labels),
            layer_scores=per_layer,
        )

    def to_dict(self, score: AdmissibilityScore) -> dict[str, object]:
        """Convert ``score`` into a plain dict with tuples emitted as lists."""
        serialized_layers = [self._layer_to_dict(entry) for entry in score.layer_scores]
        return {
            "structural_score": score.structural_score,
            "relational_score": score.relational_score,
            "operational_score": score.operational_score,
            "governance_score": score.governance_score,
            "overall_admissibility_score": score.overall_admissibility_score,
            "expected_admissible": score.expected_admissible,
            "observed_admissible": score.observed_admissible,
            "passed_contracts": list(score.passed_contracts),
            "failed_contracts": list(score.failed_contracts),
            "failure_labels": list(score.failure_labels),
            "layer_scores": serialized_layers,
        }

tests/test_admissibility_scorer.py

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
from __future__ import annotations
2+
3+
import json
4+
from pathlib import Path
5+
6+
from src.validation.admissibility_scorer import AdmissibilityScorer
7+
from src.validation.contract_validator import ContractType, ContractValidator, Layer, ValidationResult
8+
9+
10+
def _result(contract_id: str, layer: Layer, passed: bool, failure_label: str | None = None) -> ValidationResult:
    """Build a ``ValidationResult`` for scorer tests with fixed filler fields."""
    fields = {
        "contract_id": contract_id,
        "layer": layer,
        # Contract type, severity and evidence are constant across these tests.
        "contract_type": ContractType.ORDERING,
        "passed": passed,
        "severity": "high",
        "failure_label": failure_label,
        "deterministic_evidence": {},
    }
    return ValidationResult(**fields)
20+
21+
22+
def _load_json(path: Path) -> dict[str, object]:
    """Read the UTF-8 JSON document at ``path`` and return the parsed value."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
24+
25+
26+
def _fixture_validation_results(fixture_root: Path) -> list[ValidationResult]:
    """Run ``ContractValidator`` over the fixture stored under ``fixture_root``.

    The fixture directory must contain ``original/`` and ``reconstructed/``
    sub-directories, each with ``trace.json``, ``state.json`` and
    ``dependency_graph.json``, plus contract files in ``original/contracts``.

    Args:
        fixture_root: Fixture directory. A relative path is resolved against
            the repository root rather than the current working directory, so
            the tests do not depend on where pytest is invoked from.

    Returns:
        The validation results produced by ``validate_contracts``.
    """
    if not fixture_root.is_absolute():
        # Assumes this file lives in <repo>/tests/, making parents[1] the
        # repository root that relative fixture paths are written against.
        fixture_root = Path(__file__).resolve().parents[1] / fixture_root

    def _load_side(side: str) -> dict[str, object]:
        # Trace and state keys are merged flat (state wins on clashes); the
        # dependency graph is nested under its own key.
        side_dir = fixture_root / side
        return {
            **_load_json(side_dir / "trace.json"),
            **_load_json(side_dir / "state.json"),
            "dependency_graph": _load_json(side_dir / "dependency_graph.json"),
        }

    original = _load_side("original")
    reconstructed = _load_side("reconstructed")
    # Sorted so contracts are always validated in the same order.
    contract_paths = sorted((fixture_root / "original/contracts").glob("*.json"))
    contracts = [_load_json(path) for path in contract_paths]
    return ContractValidator().validate_contracts(original=original, reconstructed=reconstructed, contracts=contracts)
39+
40+
41+
def test_all_contracts_pass_score_is_one() -> None:
    """Passing contracts in two layers give a perfect overall score."""
    all_passing = [
        _result("op_a", Layer.OPERATIONAL, True),
        _result("rel_a", Layer.RELATIONAL, True),
    ]

    outcome = AdmissibilityScorer().score(all_passing)

    assert outcome.overall_admissibility_score == 1.0
    assert outcome.observed_admissible is True
    assert outcome.failed_contracts == ()
52+
53+
54+
def test_failed_relational_contract_reduces_relational_and_overall_score() -> None:
    """One of two relational contracts failing halves that layer's score."""
    mixed_relational = [
        _result("rel_a", Layer.RELATIONAL, True),
        _result("rel_b", Layer.RELATIONAL, False, "REL_FAIL"),
    ]

    outcome = AdmissibilityScorer().score(mixed_relational)

    assert outcome.relational_score == 0.5
    # The three layers without contracts each score 1.0: (1 + 0.5 + 1 + 1) / 4.
    assert outcome.overall_admissibility_score == 0.875
    assert outcome.observed_admissible is False
65+
66+
67+
def test_failed_operational_contract_reduces_operational_score() -> None:
    """A lone failing operational contract zeroes that layer's score."""
    failing = _result("op_a", Layer.OPERATIONAL, False, "OP_FAIL")

    outcome = AdmissibilityScorer().score([failing])

    assert outcome.operational_score == 0.0
    # The other three layers have no contracts and score 1.0 each.
    assert outcome.overall_admissibility_score == 0.75
72+
73+
74+
def test_empty_results_are_admissible_with_all_scores_one() -> None:
    """With no results, every score is 1.0 and the outcome is admissible."""
    outcome = AdmissibilityScorer().score([])

    score_fields = (
        "structural_score",
        "relational_score",
        "operational_score",
        "governance_score",
        "overall_admissibility_score",
    )
    for field_name in score_fields:
        assert getattr(outcome, field_name) == 1.0
    assert outcome.observed_admissible is True
83+
84+
85+
def test_failure_labels_are_sorted_unique() -> None:
    """Duplicate labels collapse and the remainder come back sorted."""
    failures = [
        _result("a", Layer.RELATIONAL, False, "Z_LABEL"),
        _result("b", Layer.OPERATIONAL, False, "A_LABEL"),
        _result("c", Layer.GOVERNANCE, False, "A_LABEL"),
    ]

    outcome = AdmissibilityScorer().score(failures)

    assert outcome.failure_labels == ("A_LABEL", "Z_LABEL")
95+
96+
97+
def test_passed_and_failed_contracts_are_sorted() -> None:
    """Contract IDs are reported in sorted order regardless of input order."""
    unordered = [
        _result("c", Layer.RELATIONAL, True),
        _result("a", Layer.RELATIONAL, False, "X"),
        _result("b", Layer.OPERATIONAL, True),
    ]

    outcome = AdmissibilityScorer().score(unordered)

    assert outcome.passed_contracts == ("b", "c")
    assert outcome.failed_contracts == ("a",)
108+
109+
110+
def test_to_dict_is_stable_and_json_compatible() -> None:
    """``to_dict`` is repeatable and its output survives a JSON round trip."""
    scorer = AdmissibilityScorer()
    score = scorer.score([_result("b", Layer.OPERATIONAL, True), _result("a", Layer.RELATIONAL, False, "REL_FAIL")])

    as_dict_first = scorer.to_dict(score)
    as_dict_second = scorer.to_dict(score)

    # Stability: serializing the same score twice gives equal output.
    assert as_dict_first == as_dict_second
    # Tuple fields must come out as lists.
    assert isinstance(as_dict_first["passed_contracts"], list)
    assert isinstance(as_dict_first["failed_contracts"], list)
    assert isinstance(as_dict_first["failure_labels"], list)
    assert isinstance(as_dict_first["layer_scores"], list)
    # JSON compatibility: the payload must actually serialize, and decoding it
    # must give back an equal structure (no tuples or enum objects left over).
    assert json.loads(json.dumps(as_dict_first)) == as_dict_first
122+
123+
124+
def test_expected_admissible_override() -> None:
    """An explicit expected verdict is kept even when it differs from the observed one."""
    passing = _result("rel_a", Layer.RELATIONAL, True)

    outcome = AdmissibilityScorer().score([passing], expected_admissible=False)

    assert outcome.expected_admissible is False
    assert outcome.observed_admissible is True
129+
130+
131+
def test_scores_positive_fixture_contract_results() -> None:
    """The positive fixture validates cleanly and scores 1.0 overall."""
    results = _fixture_validation_results(Path("fixtures/coding_workflow_pr_review_v1"))

    # An empty result set also scores 1.0 and is admissible, so without this
    # guard the test would pass even if no contract files were loaded.
    assert results, "positive fixture produced no validation results"

    score = AdmissibilityScorer().score(results)

    assert score.observed_admissible is True
    assert score.overall_admissibility_score == 1.0
137+
138+
139+
def test_scores_negative_fixture_contract_results() -> None:
    """The degraded fixture is inadmissible and reports the expected failure labels."""
    fixture_dir = Path("fixtures/coding_workflow_pr_review_degraded_v1")
    outcome = AdmissibilityScorer().score(_fixture_validation_results(fixture_dir))

    assert outcome.observed_admissible is False
    assert outcome.relational_score < 1.0
    assert outcome.operational_score < 1.0

    required_labels = (
        "POLICY_ORDER_BROKEN",
        "RECOVERY_PATH_INVALID",
        "CAUSAL_DEPENDENCY_LOSS",
        "INVARIANT_VIOLATION",
    )
    for label in required_labels:
        assert label in outcome.failure_labels

0 commit comments

Comments
 (0)