From 03c18bd48280e3209c72284e2dd3dd4902c03a7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20K=C3=B6lnberger?= <159939812+ProfRandom92@users.noreply.github.com> Date: Wed, 20 May 2026 03:43:19 -0700 Subject: [PATCH 1/3] Add replay semantic integrity artifact --- .../replay_semantic_integrity_results.json | 292 ++++++++++++++++++ package.json | 3 +- ...rate_replay_semantic_integrity_artifact.py | 154 +++++++++ ...test_replay_semantic_integrity_artifact.py | 114 +++++++ 4 files changed, 562 insertions(+), 1 deletion(-) create mode 100644 artifacts/replay_semantic_integrity_results.json create mode 100644 scripts/generate_replay_semantic_integrity_artifact.py create mode 100644 tests/test_replay_semantic_integrity_artifact.py diff --git a/artifacts/replay_semantic_integrity_results.json b/artifacts/replay_semantic_integrity_results.json new file mode 100644 index 0000000..abeb339 --- /dev/null +++ b/artifacts/replay_semantic_integrity_results.json @@ -0,0 +1,292 @@ +{ + "artifact_id": "replay_semantic_integrity_results_v1", + "generated_by": "ReplaySemanticIntegrityArtifactGenerator", + "version": "1.0", + "evaluation_mode": "deterministic", + "llm_judges": "none", + "external_apis": "none", + "families": [ + { + "family": "coding_workflow_pr_review", + "fixture_count": 4, + "levels": [ + "baseline", + "mild", + "moderate", + "severe" + ], + "commitment_classes": { + "evidence": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "constraints": { + "passed": 3, + "failed": 1, + "failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS", + "INVARIANT_VIOLATION", + "POLICY_ORDER_BROKEN", + "RECOVERY_PATH_INVALID" + ] + }, + "dependencies": { + "passed": 2, + "failed": 2, + "failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS", + "INVARIANT_VIOLATION", + "POLICY_ORDER_BROKEN", + "RECOVERY_PATH_INVALID" + ] + }, + "recovery_paths": { + "passed": 1, + "failed": 3, + "failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS", + "INVARIANT_VIOLATION", + "POLICY_ORDER_BROKEN", + "RECOVERY_PATH_INVALID" + ] + }, + "tool_order": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "capability_boundaries": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "governance_or_policy": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "invariants": { + "passed": 3, + "failed": 1, + "failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS", + "INVARIANT_VIOLATION", + "POLICY_ORDER_BROKEN", + "RECOVERY_PATH_INVALID" + ] + } + } + }, + { + "family": "incident_response_page_triage", + "fixture_count": 4, + "levels": [ + "baseline", + "mild", + "moderate", + "severe" + ], + "commitment_classes": { + "evidence": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "constraints": { + "passed": 6, + "failed": 2, + "failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS", + "INVARIANT_VIOLATION", + "POLICY_ORDER_BROKEN", + "RECOVERY_PATH_INVALID" + ] + }, + "dependencies": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "recovery_paths": { + "passed": 1, + "failed": 3, + "failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS", + "INVARIANT_VIOLATION", + "POLICY_ORDER_BROKEN", + "RECOVERY_PATH_INVALID" + ] + }, + "tool_order": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "capability_boundaries": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "governance_or_policy": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "invariants": { + "passed": 2, + "failed": 2, + "failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS", + "INVARIANT_VIOLATION", + "POLICY_ORDER_BROKEN", + "RECOVERY_PATH_INVALID" + ] + } + } + }, + { + "family": "cross_domain_operational_dependency_workflow", + "fixture_count": 4, + "levels": [ + "baseline", + "mild", + "moderate", + "severe" + ], + "commitment_classes": { + "evidence": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "constraints": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "dependencies": { + "passed": 2, + "failed": 2, + "failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS", + "INVARIANT_VIOLATION", + "POLICY_ORDER_BROKEN", + "RECOVERY_PATH_INVALID" + ] + }, + "recovery_paths": { + "passed": 1, + "failed": 3, + "failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS", + "INVARIANT_VIOLATION", + "POLICY_ORDER_BROKEN", + "RECOVERY_PATH_INVALID" + ] + }, + "tool_order": { + "passed": 3, + "failed": 1, + "failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS", + "INVARIANT_VIOLATION", + "POLICY_ORDER_BROKEN", + "RECOVERY_PATH_INVALID" + ] + }, + "capability_boundaries": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "governance_or_policy": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "invariants": { + "passed": 1, + "failed": 3, + "failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS", + "INVARIANT_VIOLATION", + "POLICY_ORDER_BROKEN", + "RECOVERY_PATH_INVALID" + ] + } + } + }, + { + "family": "mcp_trace_replay", + "fixture_count": 4, + "levels": [ + "baseline", + "mild", + "moderate", + "severe" + ], + "commitment_classes": { + "evidence": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "constraints": { + "passed": 4, + "failed": 0, + "failure_labels": [] + }, + "dependencies": { + "passed": 2, + "failed": 2, + "failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS", + "INVARIANT_VIOLATION", + "RECOVERY_PATH_INVALID" + ] + }, + "recovery_paths": { + "passed": 2, + "failed": 2, + "failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS", + "INVARIANT_VIOLATION", + "RECOVERY_PATH_INVALID" + ] + }, + "tool_order": { + "passed": 4, + "failed": 0, + "failure_labels": [] + }, + "capability_boundaries": { + "passed": 1, + "failed": 3, + "failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS", + "INVARIANT_VIOLATION", + "RECOVERY_PATH_INVALID" + ] + }, + "governance_or_policy": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "invariants": { + "passed": 0, + "failed": 0, + "failure_labels": [] + } + } + } + ], + "global_summary": { + "family_count": 4, + "fixture_count": 16, + "deterministic_evaluation": true, + "llm_judges": "none", + "external_apis": "none" + } +} diff --git a/package.json b/package.json index 0607cf9..1a12cb3 100644 --- a/package.json +++ b/package.json @@ -15,6 +15,7 @@ "generate:layered-admissibility": "python scripts/generate_layered_admissibility_artifact.py", "generate:multi-family-admissibility": "python scripts/generate_multi_family_admissibility_artifact.py", "generate:multi-family-svg": "python scripts/render_multi_family_admissibility_svg.py", - "generate:mcp-trace-replay": "python scripts/generate_mcp_trace_replay_artifact.py" + "generate:mcp-trace-replay": "python scripts/generate_mcp_trace_replay_artifact.py", + "generate:replay-semantic-integrity": "python scripts/generate_replay_semantic_integrity_artifact.py" } } diff --git a/scripts/generate_replay_semantic_integrity_artifact.py b/scripts/generate_replay_semantic_integrity_artifact.py new file mode 100644 index 0000000..cf78362 --- /dev/null +++ b/scripts/generate_replay_semantic_integrity_artifact.py @@ -0,0 +1,154 @@ +"""Deterministic entrypoint for replay semantic integrity artifact regeneration.""" + +from __future__ import annotations + +import json +from collections import OrderedDict +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[1] +MANIFEST_PATH = REPO_ROOT / "fixtures" / "manifest.json" +MULTI_FAMILY_PATH = REPO_ROOT / "artifacts" / "multi_family_admissibility_results.json" +OUTPUT_PATH = REPO_ROOT / "artifacts" / "replay_semantic_integrity_results.json" + +ARTIFACT_ID = "replay_semantic_integrity_results_v1" +LEVELS = ("baseline", "mild", "moderate", "severe") +COMMITMENT_CLASS_ORDER = ( + "evidence", + "constraints", + "dependencies", + "recovery_paths", + "tool_order", + "capability_boundaries", + "governance_or_policy", + "invariants", +) + + +def _load_json(path: Path) -> dict[str, object]: + return json.loads(path.read_text(encoding="utf-8")) + + +def _family_order_and_counts() -> tuple[list[str], dict[str, int], dict[str, str]]: + manifest = _load_json(MANIFEST_PATH) + fixtures = manifest["fixtures"] + family_order: list[str] = [] + fixture_counts: dict[str, int] = {} + fixture_levels: dict[str, str] = {} + + for entry in fixtures: + family = entry["family"] + fixture_id = entry["fixture_id"] + level = entry["degradation_level"] + if family not in fixture_counts: + family_order.append(family) + fixture_counts[family] = 0 + fixture_counts[family] += 1 + fixture_levels[fixture_id] = level + + return family_order, fixture_counts, fixture_levels + + +def _class_for_contract(contract_id: str) -> str: + contract = contract_id.lower() + + if any(token in contract for token in ("evidence",)): + return "evidence" + if any(token in contract for token in ("constraint", "validation")): + return "constraints" + if any(token in contract for token in ("dependency", "causal", "chain")): + return "dependencies" + if any(token in contract for token in ("recovery", "rollback", "escalation")): + return "recovery_paths" + if any(token in contract for token in ("order", "ordering", "sequence", "tool_call_order")): + return "tool_order" + if any(token in contract for token in ("capability", "boundary")): + return "capability_boundaries" + if any(token in contract for token in ("policy", "governance", "approval")): + return "governance_or_policy" + if any(token in contract for token in ("invariant", "orphan")): + return "invariants" + + return "constraints" + + +def generate_replay_semantic_integrity_artifact(output_path: Path = OUTPUT_PATH) -> Path: + multi_family = _load_json(MULTI_FAMILY_PATH) + family_order, fixture_counts, fixture_levels = _family_order_and_counts() + family_curves = {entry["family"]: entry["curve"] for entry in multi_family["families"]} + + families_payload: list[dict[str, object]] = [] + total_fixture_count = 0 + + for family in family_order: + curve = family_curves[family] + points = sorted(curve["points"], key=lambda point: LEVELS.index(fixture_levels[point["fixture_id"]])) + + commitment_classes: OrderedDict[str, dict[str, object]] = OrderedDict() + for commitment_class in COMMITMENT_CLASS_ORDER: + commitment_classes[commitment_class] = { + "passed": 0, + "failed": 0, + "failure_labels": set(), + } + + for point in points: + failed_contracts = set(point["failed_contracts"]) + for contract_id in point["passed_contracts"] + point["failed_contracts"]: + commitment_class = _class_for_contract(contract_id) + if contract_id in failed_contracts: + commitment_classes[commitment_class]["failed"] += 1 + for failure_label in point["failure_labels"]: + commitment_classes[commitment_class]["failure_labels"].add(failure_label) + else: + commitment_classes[commitment_class]["passed"] += 1 + + serializable_classes: OrderedDict[str, dict[str, object]] = OrderedDict() + for commitment_class in COMMITMENT_CLASS_ORDER: + values = commitment_classes[commitment_class] + serializable_classes[commitment_class] = { + "passed": values["passed"], + "failed": values["failed"], + "failure_labels": sorted(values["failure_labels"]), + } + + families_payload.append( + { + "family": family, + "fixture_count": fixture_counts[family], + "levels": list(LEVELS), + "commitment_classes": serializable_classes, + } + ) + total_fixture_count += fixture_counts[family] + + payload = { + "artifact_id": ARTIFACT_ID, + "generated_by": "ReplaySemanticIntegrityArtifactGenerator", + "version": "1.0", + "evaluation_mode": "deterministic", + "llm_judges": "none", + "external_apis": "none", + "families": families_payload, + "global_summary": { + "family_count": len(families_payload), + "fixture_count": total_fixture_count, + "deterministic_evaluation": True, + "llm_judges": "none", + "external_apis": "none", + }, + } + + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8") + return output_path + + +def main() -> int: + output_path = generate_replay_semantic_integrity_artifact() + print(output_path.relative_to(REPO_ROOT).as_posix()) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/test_replay_semantic_integrity_artifact.py b/tests/test_replay_semantic_integrity_artifact.py new file mode 100644 index 0000000..719dcea --- /dev/null +++ b/tests/test_replay_semantic_integrity_artifact.py @@ -0,0 +1,114 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from scripts.generate_replay_semantic_integrity_artifact import ( + ARTIFACT_ID, + COMMITMENT_CLASS_ORDER, + LEVELS, + generate_replay_semantic_integrity_artifact, +) +from src.validation.failure_taxonomy import FAILURE_TAXONOMY + +ARTIFACT_PATH = Path("artifacts/replay_semantic_integrity_results.json") +MANIFEST_PATH = Path("fixtures/manifest.json") +EXPECTED_FAMILIES = [ + "coding_workflow_pr_review", + "incident_response_page_triage", + "cross_domain_operational_dependency_workflow", + "mcp_trace_replay", +] +FORBIDDEN_FIELDS = { + "timestamp", + "generated_at", + "environment", + "hostname", + "cwd", + "machine", + "semantic_similarity", + "embedding", + "llm", + "judge", +} + + +def _load_json(path: Path) -> dict[str, object]: + return json.loads(path.read_text(encoding="utf-8")) + + +def _walk_keys(value: object) -> set[str]: + keys: set[str] = set() + if isinstance(value, dict): + keys.update(value.keys()) + for nested in value.values(): + keys.update(_walk_keys(nested)) + elif isinstance(value, list): + for nested in value: + keys.update(_walk_keys(nested)) + return keys + + +def test_script_output_matches_committed_artifact(tmp_path: Path) -> None: + output_path = tmp_path / "replay_semantic_integrity_results.json" + generate_replay_semantic_integrity_artifact(output_path) + + assert _load_json(output_path) == _load_json(ARTIFACT_PATH) + + +def test_artifact_schema_has_no_time_or_environment_fields() -> None: + payload = _load_json(ARTIFACT_PATH) + assert payload["artifact_id"] == ARTIFACT_ID + + all_keys = _walk_keys(payload) + for forbidden in FORBIDDEN_FIELDS: + assert forbidden not in all_keys + + +def test_all_required_families_are_represented_in_manifest_order() -> None: + payload = _load_json(ARTIFACT_PATH) + families = [entry["family"] for entry in payload["families"]] + assert families == EXPECTED_FAMILIES + + +def test_fixture_count_matches_manifest_and_levels_are_deterministic() -> None: + payload = _load_json(ARTIFACT_PATH) + manifest = _load_json(MANIFEST_PATH) + + expected_fixture_count = len(manifest["fixtures"]) + assert payload["global_summary"]["fixture_count"] == expected_fixture_count + + for family_payload in payload["families"]: + assert family_payload["fixture_count"] == 4 + assert family_payload["levels"] == list(LEVELS) + + +def test_commitment_class_order_is_stable_and_complete() -> None: + payload = _load_json(ARTIFACT_PATH) + + for family_payload in payload["families"]: + class_keys = list(family_payload["commitment_classes"].keys()) + assert class_keys == list(COMMITMENT_CLASS_ORDER) + + +def test_failure_labels_are_registered_and_sorted() -> None: + payload = _load_json(ARTIFACT_PATH) + registered_labels = set(FAILURE_TAXONOMY.keys()) + + for family_payload in payload["families"]: + for class_payload in family_payload["commitment_classes"].values(): + labels = class_payload["failure_labels"] + assert labels == sorted(labels) + for label in labels: + assert label in registered_labels + + +def test_artifact_declares_deterministic_mode_and_no_external_evaluators() -> None: + payload = _load_json(ARTIFACT_PATH) + + assert payload["evaluation_mode"] == "deterministic" + assert payload["llm_judges"] == "none" + assert payload["external_apis"] == "none" + assert payload["global_summary"]["deterministic_evaluation"] is True + assert payload["global_summary"]["llm_judges"] == "none" + assert payload["global_summary"]["external_apis"] == "none" From 568a85480ec45cc0c4da5df38563a03697b238e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20K=C3=B6lnberger?= <159939812+ProfRandom92@users.noreply.github.com> Date: Wed, 20 May 2026 04:01:40 -0700 Subject: [PATCH 2/3] Fix contract-linked replay semantic integrity labels --- .../replay_semantic_integrity_results.json | 107 ++++++------------ ...rate_replay_semantic_integrity_artifact.py | 86 +++++++++----- ...test_replay_semantic_integrity_artifact.py | 74 ++++++++++++ 3 files changed, 166 insertions(+), 101 deletions(-) diff --git a/artifacts/replay_semantic_integrity_results.json b/artifacts/replay_semantic_integrity_results.json index abeb339..567be58 100644 --- a/artifacts/replay_semantic_integrity_results.json +++ b/artifacts/replay_semantic_integrity_results.json @@ -22,39 +22,30 @@ "failure_labels": [] }, "constraints": { - "passed": 3, - "failed": 1, - "failure_labels": [ - "CAUSAL_DEPENDENCY_LOSS", - "INVARIANT_VIOLATION", - "POLICY_ORDER_BROKEN", - "RECOVERY_PATH_INVALID" - ] + "passed": 0, + "failed": 0, + "failure_labels": [] }, "dependencies": { "passed": 2, "failed": 2, "failure_labels": [ - "CAUSAL_DEPENDENCY_LOSS", - "INVARIANT_VIOLATION", - "POLICY_ORDER_BROKEN", - "RECOVERY_PATH_INVALID" + "CAUSAL_DEPENDENCY_LOSS" ] }, "recovery_paths": { "passed": 1, "failed": 3, "failure_labels": [ - "CAUSAL_DEPENDENCY_LOSS", - "INVARIANT_VIOLATION", - "POLICY_ORDER_BROKEN", "RECOVERY_PATH_INVALID" ] }, "tool_order": { - "passed": 0, - "failed": 0, - "failure_labels": [] + "passed": 3, + "failed": 1, + "failure_labels": [ + "POLICY_ORDER_BROKEN" + ] }, "capability_boundaries": { "passed": 0, @@ -70,10 +61,7 @@ "passed": 3, "failed": 1, "failure_labels": [ - "CAUSAL_DEPENDENCY_LOSS", - "INVARIANT_VIOLATION", - "POLICY_ORDER_BROKEN", - "RECOVERY_PATH_INVALID" + "INVARIANT_VIOLATION" ] } } @@ -94,34 +82,30 @@ "failure_labels": [] }, "constraints": { - "passed": 6, - "failed": 2, - "failure_labels": [ - "CAUSAL_DEPENDENCY_LOSS", - "INVARIANT_VIOLATION", - "POLICY_ORDER_BROKEN", - "RECOVERY_PATH_INVALID" - ] - }, - "dependencies": { "passed": 0, "failed": 0, "failure_labels": [] }, + "dependencies": { + "passed": 3, + "failed": 1, + "failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS" + ] + }, "recovery_paths": { "passed": 1, "failed": 3, "failure_labels": [ - "CAUSAL_DEPENDENCY_LOSS", - "INVARIANT_VIOLATION", - "POLICY_ORDER_BROKEN", "RECOVERY_PATH_INVALID" ] }, "tool_order": { - "passed": 0, - "failed": 0, - "failure_labels": [] + "passed": 3, + "failed": 1, + "failure_labels": [ + "POLICY_ORDER_BROKEN" + ] }, "capability_boundaries": { "passed": 0, @@ -137,10 +121,7 @@ "passed": 2, "failed": 2, "failure_labels": [ - "CAUSAL_DEPENDENCY_LOSS", - "INVARIANT_VIOLATION", - "POLICY_ORDER_BROKEN", - "RECOVERY_PATH_INVALID" + "INVARIANT_VIOLATION" ] } } @@ -169,50 +150,38 @@ "passed": 2, "failed": 2, "failure_labels": [ - "CAUSAL_DEPENDENCY_LOSS", - "INVARIANT_VIOLATION", - "POLICY_ORDER_BROKEN", - "RECOVERY_PATH_INVALID" + "CAUSAL_DEPENDENCY_LOSS" ] }, "recovery_paths": { "passed": 1, "failed": 3, "failure_labels": [ - "CAUSAL_DEPENDENCY_LOSS", - "INVARIANT_VIOLATION", - "POLICY_ORDER_BROKEN", "RECOVERY_PATH_INVALID" ] }, "tool_order": { - "passed": 3, - "failed": 1, - "failure_labels": [ - "CAUSAL_DEPENDENCY_LOSS", - "INVARIANT_VIOLATION", - "POLICY_ORDER_BROKEN", - "RECOVERY_PATH_INVALID" - ] - }, - "capability_boundaries": { "passed": 0, "failed": 0, "failure_labels": [] }, - "governance_or_policy": { + "capability_boundaries": { "passed": 0, "failed": 0, "failure_labels": [] }, + "governance_or_policy": { + "passed": 3, + "failed": 1, + "failure_labels": [ + "POLICY_ORDER_BROKEN" + ] + }, "invariants": { "passed": 1, "failed": 3, "failure_labels": [ - "CAUSAL_DEPENDENCY_LOSS", - "INVARIANT_VIOLATION", - "POLICY_ORDER_BROKEN", - "RECOVERY_PATH_INVALID" + "INVARIANT_VIOLATION" ] } } @@ -241,17 +210,13 @@ "passed": 2, "failed": 2, "failure_labels": [ - "CAUSAL_DEPENDENCY_LOSS", - "INVARIANT_VIOLATION", - "RECOVERY_PATH_INVALID" + "CAUSAL_DEPENDENCY_LOSS" ] }, "recovery_paths": { "passed": 2, "failed": 2, "failure_labels": [ - "CAUSAL_DEPENDENCY_LOSS", - "INVARIANT_VIOLATION", "RECOVERY_PATH_INVALID" ] }, @@ -264,9 +229,7 @@ "passed": 1, "failed": 3, "failure_labels": [ - "CAUSAL_DEPENDENCY_LOSS", - "INVARIANT_VIOLATION", - "RECOVERY_PATH_INVALID" + "INVARIANT_VIOLATION" ] }, "governance_or_policy": { diff --git a/scripts/generate_replay_semantic_integrity_artifact.py b/scripts/generate_replay_semantic_integrity_artifact.py index cf78362..43e40bd 100644 --- a/scripts/generate_replay_semantic_integrity_artifact.py +++ b/scripts/generate_replay_semantic_integrity_artifact.py @@ -3,12 +3,18 @@ from __future__ import annotations import json +import sys from collections import OrderedDict from pathlib import Path +from typing import Any REPO_ROOT = Path(__file__).resolve().parents[1] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from src.validation.contract_validator import ContractType, ContractValidator, Layer + MANIFEST_PATH = REPO_ROOT / "fixtures" / "manifest.json" -MULTI_FAMILY_PATH = REPO_ROOT / "artifacts" / "multi_family_admissibility_results.json" OUTPUT_PATH = REPO_ROOT / "artifacts" / "replay_semantic_integrity_results.json" ARTIFACT_ID = "replay_semantic_integrity_results_v1" @@ -29,60 +35,65 @@ def _load_json(path: Path) -> dict[str, object]: return json.loads(path.read_text(encoding="utf-8")) -def _family_order_and_counts() -> tuple[list[str], dict[str, int], dict[str, str]]: +def _family_order_and_counts() -> tuple[list[str], dict[str, int]]: manifest = _load_json(MANIFEST_PATH) fixtures = manifest["fixtures"] family_order: list[str] = [] fixture_counts: dict[str, int] = {} - fixture_levels: dict[str, str] = {} for entry in fixtures: family = entry["family"] - fixture_id = entry["fixture_id"] - level = entry["degradation_level"] if family not in fixture_counts: family_order.append(family) fixture_counts[family] = 0 fixture_counts[family] += 1 - fixture_levels[fixture_id] = level - return family_order, fixture_counts, fixture_levels + return family_order, fixture_counts -def _class_for_contract(contract_id: str) -> str: +def _class_for_contract(contract_id: str, contract_type: ContractType, layer: Layer) -> str: contract = contract_id.lower() + if any(token in contract for token in ("capability", "boundary")): + return "capability_boundaries" + if any(token in contract for token in ("policy", "governance", "approval")): + return "governance_or_policy" + if any(token in contract for token in ("recovery", "rollback", "escalation")): + return "recovery_paths" + if any(token in contract for token in ("dependency", "causal", "chain")): + return "dependencies" + if any(token in contract for token in ("order", "ordering", "sequence", "tool_call_order")): + return "tool_order" + if any(token in contract for token in ("invariant", "orphan")): + return "invariants" if any(token in contract for token in ("evidence",)): return "evidence" if any(token in contract for token in ("constraint", "validation")): return "constraints" - if any(token in contract for token in ("dependency", "causal", "chain")): + + if contract_type == ContractType.CAUSALITY: return "dependencies" - if any(token in contract for token in ("recovery", "rollback", "escalation")): + if contract_type == ContractType.REACHABILITY: return "recovery_paths" - if any(token in contract for token in ("order", "ordering", "sequence", "tool_call_order")): - return "tool_order" - if any(token in contract for token in ("capability", "boundary")): - return "capability_boundaries" - if any(token in contract for token in ("policy", "governance", "approval")): - return "governance_or_policy" - if any(token in contract for token in ("invariant", "orphan")): + if contract_type == ContractType.ORDERING: + return "governance_or_policy" if layer == Layer.GOVERNANCE else "tool_order" + if contract_type == ContractType.INVARIANT: return "invariants" return "constraints" def generate_replay_semantic_integrity_artifact(output_path: Path = OUTPUT_PATH) -> Path: - multi_family = _load_json(MULTI_FAMILY_PATH) - family_order, fixture_counts, fixture_levels = _family_order_and_counts() - family_curves = {entry["family"]: entry["curve"] for entry in multi_family["families"]} + manifest = _load_json(MANIFEST_PATH) + fixtures = manifest["fixtures"] + family_order, fixture_counts = _family_order_and_counts() families_payload: list[dict[str, object]] = [] total_fixture_count = 0 for family in family_order: - curve = family_curves[family] - points = sorted(curve["points"], key=lambda point: LEVELS.index(fixture_levels[point["fixture_id"]])) + family_fixtures = [entry for entry in fixtures if entry["family"] == family] + points = sorted(family_fixtures, key=lambda entry: LEVELS.index(entry["degradation_level"])) commitment_classes: OrderedDict[str, dict[str, object]] = OrderedDict() for commitment_class in COMMITMENT_CLASS_ORDER: @@ -92,14 +103,31 @@ def generate_replay_semantic_integrity_artifact(output_path: Path = OUTPUT_PATH) "failure_labels": set(), } - for point in points: - failed_contracts = set(point["failed_contracts"]) - for contract_id in point["passed_contracts"] + point["failed_contracts"]: - commitment_class = _class_for_contract(contract_id) - if contract_id in failed_contracts: + for fixture_entry in points: + fixture_path = REPO_ROOT / str(fixture_entry["path"]) + original: dict[str, Any] = { + **_load_json(fixture_path / "original/trace.json"), + **_load_json(fixture_path / "original/state.json"), + "dependency_graph": _load_json(fixture_path / "original/dependency_graph.json"), + } + reconstructed: dict[str, Any] = { + **_load_json(fixture_path / "reconstructed/trace.json"), + **_load_json(fixture_path / "reconstructed/state.json"), + "dependency_graph": _load_json(fixture_path / "reconstructed/dependency_graph.json"), + } + contracts_dir = fixture_path / "original/contracts" + contracts_by_id = { + contract["contract_id"]: contract for contract in (_load_json(path) for path in sorted(contracts_dir.glob("*.json"))) + } + contracts = [contracts_by_id[contract_id] for contract_id in fixture_entry["contracts"]] + results = ContractValidator().validate_contracts(original=original, reconstructed=reconstructed, contracts=contracts) + + for result in results: + commitment_class = _class_for_contract(result.contract_id, result.contract_type, result.layer) + if not result.passed: commitment_classes[commitment_class]["failed"] += 1 - for failure_label in point["failure_labels"]: - commitment_classes[commitment_class]["failure_labels"].add(failure_label) + if result.failure_label is not None: + commitment_classes[commitment_class]["failure_labels"].add(result.failure_label) else: commitment_classes[commitment_class]["passed"] += 1 diff --git a/tests/test_replay_semantic_integrity_artifact.py b/tests/test_replay_semantic_integrity_artifact.py index 719dcea..4320044 100644 --- a/tests/test_replay_semantic_integrity_artifact.py +++ b/tests/test_replay_semantic_integrity_artifact.py @@ -1,14 +1,18 @@ from __future__ import annotations import json +from collections import defaultdict from pathlib import Path +from typing import Any from scripts.generate_replay_semantic_integrity_artifact import ( ARTIFACT_ID, COMMITMENT_CLASS_ORDER, LEVELS, + _class_for_contract, generate_replay_semantic_integrity_artifact, ) +from src.validation.contract_validator import ContractValidator from src.validation.failure_taxonomy import FAILURE_TAXONOMY ARTIFACT_PATH = Path("artifacts/replay_semantic_integrity_results.json") @@ -49,6 +53,39 @@ def _walk_keys(value: object) -> set[str]: return keys +def _validation_label_union_by_family_and_class() -> dict[str, dict[str, set[str]]]: + manifest = _load_json(MANIFEST_PATH) + validator = ContractValidator() + output: dict[str, dict[str, set[str]]] = defaultdict(lambda: defaultdict(set)) + + for fixture_entry in manifest["fixtures"]: + fixture_path = Path(fixture_entry["path"]) + original: dict[str, Any] = { + **_load_json(fixture_path / "original/trace.json"), + **_load_json(fixture_path / "original/state.json"), + "dependency_graph": _load_json(fixture_path / "original/dependency_graph.json"), + } + reconstructed: dict[str, Any] = { + **_load_json(fixture_path / "reconstructed/trace.json"), + **_load_json(fixture_path / "reconstructed/state.json"), + "dependency_graph": _load_json(fixture_path / "reconstructed/dependency_graph.json"), + } + contracts_dir = fixture_path / "original/contracts" + contracts_by_id = { + contract["contract_id"]: contract for contract in (_load_json(path) for path in sorted(contracts_dir.glob("*.json"))) + } + contracts = [contracts_by_id[contract_id] for contract_id in fixture_entry["contracts"]] + results = validator.validate_contracts(original=original, reconstructed=reconstructed, contracts=contracts) + + family = fixture_entry["family"] + for result in results: + commitment_class = _class_for_contract(result.contract_id, result.contract_type, result.layer) + if not result.passed and result.failure_label is not None: + output[family][commitment_class].add(result.failure_label) + + return output + + def test_script_output_matches_committed_artifact(tmp_path: Path) -> None: output_path = tmp_path / "replay_semantic_integrity_results.json" generate_replay_semantic_integrity_artifact(output_path) @@ -112,3 +149,40 @@ def test_artifact_declares_deterministic_mode_and_no_external_evaluators() -> No assert payload["global_summary"]["deterministic_evaluation"] is True assert payload["global_summary"]["llm_judges"] == "none" assert payload["global_summary"]["external_apis"] == "none" + + +def test_contract_linked_label_behavior_recovery_and_ordering() -> None: + payload = _load_json(ARTIFACT_PATH) + families = {entry["family"]: entry for entry in payload["families"]} + + coding_recovery_labels = set(families["coding_workflow_pr_review"]["commitment_classes"]["recovery_paths"]["failure_labels"]) + assert "POLICY_ORDER_BROKEN" not in coding_recovery_labels + assert "CAUSAL_DEPENDENCY_LOSS" not in coding_recovery_labels + assert coding_recovery_labels == {"RECOVERY_PATH_INVALID"} + + cross_domain_order_labels = set( + families["cross_domain_operational_dependency_workflow"]["commitment_classes"]["governance_or_policy"]["failure_labels"] + ) + assert cross_domain_order_labels == {"POLICY_ORDER_BROKEN"} + + +def test_no_class_gets_full_fixture_label_set_without_contract_support() -> None: + payload = _load_json(ARTIFACT_PATH) + validation_union = _validation_label_union_by_family_and_class() + + for family_payload in payload["families"]: + family = family_payload["family"] + for commitment_class, class_payload in family_payload["commitment_classes"].items(): + artifact_labels = set(class_payload["failure_labels"]) + expected_labels = validation_union.get(family, {}).get(commitment_class, set()) + assert artifact_labels == expected_labels + + +def test_direct_validation_consistency_for_labels() -> None: + payload = _load_json(ARTIFACT_PATH) + validation_union = _validation_label_union_by_family_and_class() + + for family_payload in payload["families"]: + family = family_payload["family"] + for commitment_class, class_payload in family_payload["commitment_classes"].items(): + assert set(class_payload["failure_labels"]) == validation_union.get(family, {}).get(commitment_class, set()) From 28f80841e3cc249c244d863a34e23f47cb137b9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20K=C3=B6lnberger?= <159939812+ProfRandom92@users.noreply.github.com> Date: Wed, 20 May 2026 04:07:19 -0700 Subject: [PATCH 3/3] Simplify replay semantic integrity dictionaries --- .../generate_replay_semantic_integrity_artifact.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/scripts/generate_replay_semantic_integrity_artifact.py b/scripts/generate_replay_semantic_integrity_artifact.py index 43e40bd..cb746a7 100644 --- a/scripts/generate_replay_semantic_integrity_artifact.py +++ b/scripts/generate_replay_semantic_integrity_artifact.py @@ -4,7 +4,6 @@ import json import sys -from collections import OrderedDict from pathlib import Path from typing import Any @@ -95,13 +94,10 @@ def generate_replay_semantic_integrity_artifact(output_path: Path = OUTPUT_PATH) family_fixtures = [entry for entry in fixtures if entry["family"] == family] points = sorted(family_fixtures, key=lambda entry: LEVELS.index(entry["degradation_level"])) - commitment_classes: OrderedDict[str, dict[str, object]] = OrderedDict() - for commitment_class in COMMITMENT_CLASS_ORDER: - commitment_classes[commitment_class] = { - "passed": 0, - "failed": 0, - "failure_labels": set(), - } + commitment_classes: dict[str, dict[str, object]] = { + commitment_class: {"passed": 0, "failed": 0, "failure_labels": set()} + for commitment_class in COMMITMENT_CLASS_ORDER + } for fixture_entry in points: fixture_path = REPO_ROOT / str(fixture_entry["path"]) @@ -131,7 +127,7 @@ def generate_replay_semantic_integrity_artifact(output_path: Path = OUTPUT_PATH) else: commitment_classes[commitment_class]["passed"] += 1 - serializable_classes: OrderedDict[str, dict[str, object]] = OrderedDict() + serializable_classes: dict[str, dict[str, object]] = {} for commitment_class in COMMITMENT_CLASS_ORDER: values = commitment_classes[commitment_class] serializable_classes[commitment_class] = {