diff --git a/artifacts/replay_semantic_integrity_results.json b/artifacts/replay_semantic_integrity_results.json new file mode 100644 index 0000000..567be58 --- /dev/null +++ b/artifacts/replay_semantic_integrity_results.json @@ -0,0 +1,255 @@ +{ + "artifact_id": "replay_semantic_integrity_results_v1", + "generated_by": "ReplaySemanticIntegrityArtifactGenerator", + "version": "1.0", + "evaluation_mode": "deterministic", + "llm_judges": "none", + "external_apis": "none", + "families": [ + { + "family": "coding_workflow_pr_review", + "fixture_count": 4, + "levels": [ + "baseline", + "mild", + "moderate", + "severe" + ], + "commitment_classes": { + "evidence": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "constraints": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "dependencies": { + "passed": 2, + "failed": 2, + "failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS" + ] + }, + "recovery_paths": { + "passed": 1, + "failed": 3, + "failure_labels": [ + "RECOVERY_PATH_INVALID" + ] + }, + "tool_order": { + "passed": 3, + "failed": 1, + "failure_labels": [ + "POLICY_ORDER_BROKEN" + ] + }, + "capability_boundaries": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "governance_or_policy": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "invariants": { + "passed": 3, + "failed": 1, + "failure_labels": [ + "INVARIANT_VIOLATION" + ] + } + } + }, + { + "family": "incident_response_page_triage", + "fixture_count": 4, + "levels": [ + "baseline", + "mild", + "moderate", + "severe" + ], + "commitment_classes": { + "evidence": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "constraints": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "dependencies": { + "passed": 3, + "failed": 1, + "failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS" + ] + }, + "recovery_paths": { + "passed": 1, + "failed": 3, + "failure_labels": [ + "RECOVERY_PATH_INVALID" + ] + }, + "tool_order": { + "passed": 3, + "failed": 1, + 
"failure_labels": [ + "POLICY_ORDER_BROKEN" + ] + }, + "capability_boundaries": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "governance_or_policy": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "invariants": { + "passed": 2, + "failed": 2, + "failure_labels": [ + "INVARIANT_VIOLATION" + ] + } + } + }, + { + "family": "cross_domain_operational_dependency_workflow", + "fixture_count": 4, + "levels": [ + "baseline", + "mild", + "moderate", + "severe" + ], + "commitment_classes": { + "evidence": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "constraints": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "dependencies": { + "passed": 2, + "failed": 2, + "failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS" + ] + }, + "recovery_paths": { + "passed": 1, + "failed": 3, + "failure_labels": [ + "RECOVERY_PATH_INVALID" + ] + }, + "tool_order": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "capability_boundaries": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "governance_or_policy": { + "passed": 3, + "failed": 1, + "failure_labels": [ + "POLICY_ORDER_BROKEN" + ] + }, + "invariants": { + "passed": 1, + "failed": 3, + "failure_labels": [ + "INVARIANT_VIOLATION" + ] + } + } + }, + { + "family": "mcp_trace_replay", + "fixture_count": 4, + "levels": [ + "baseline", + "mild", + "moderate", + "severe" + ], + "commitment_classes": { + "evidence": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "constraints": { + "passed": 4, + "failed": 0, + "failure_labels": [] + }, + "dependencies": { + "passed": 2, + "failed": 2, + "failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS" + ] + }, + "recovery_paths": { + "passed": 2, + "failed": 2, + "failure_labels": [ + "RECOVERY_PATH_INVALID" + ] + }, + "tool_order": { + "passed": 4, + "failed": 0, + "failure_labels": [] + }, + "capability_boundaries": { + "passed": 1, + "failed": 3, + "failure_labels": [ + "INVARIANT_VIOLATION" + ] + }, + 
"governance_or_policy": { + "passed": 0, + "failed": 0, + "failure_labels": [] + }, + "invariants": { + "passed": 0, + "failed": 0, + "failure_labels": [] + } + } + } + ], + "global_summary": { + "family_count": 4, + "fixture_count": 16, + "deterministic_evaluation": true, + "llm_judges": "none", + "external_apis": "none" + } +} diff --git a/package.json b/package.json index 0607cf9..1a12cb3 100644 --- a/package.json +++ b/package.json @@ -15,6 +15,7 @@ "generate:layered-admissibility": "python scripts/generate_layered_admissibility_artifact.py", "generate:multi-family-admissibility": "python scripts/generate_multi_family_admissibility_artifact.py", "generate:multi-family-svg": "python scripts/render_multi_family_admissibility_svg.py", - "generate:mcp-trace-replay": "python scripts/generate_mcp_trace_replay_artifact.py" + "generate:mcp-trace-replay": "python scripts/generate_mcp_trace_replay_artifact.py", + "generate:replay-semantic-integrity": "python scripts/generate_replay_semantic_integrity_artifact.py" } } diff --git a/scripts/generate_replay_semantic_integrity_artifact.py b/scripts/generate_replay_semantic_integrity_artifact.py new file mode 100644 index 0000000..cb746a7 --- /dev/null +++ b/scripts/generate_replay_semantic_integrity_artifact.py @@ -0,0 +1,178 @@ +"""Deterministic entrypoint for replay semantic integrity artifact regeneration.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Any + +REPO_ROOT = Path(__file__).resolve().parents[1] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from src.validation.contract_validator import ContractType, ContractValidator, Layer + +MANIFEST_PATH = REPO_ROOT / "fixtures" / "manifest.json" +OUTPUT_PATH = REPO_ROOT / "artifacts" / "replay_semantic_integrity_results.json" + +ARTIFACT_ID = "replay_semantic_integrity_results_v1" +LEVELS = ("baseline", "mild", "moderate", "severe") +COMMITMENT_CLASS_ORDER = ( + "evidence", + 
def _load_json(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 text and return the decoded JSON document.

    The values are annotated as ``Any`` because callers index into and iterate
    over nested members of the result (for example ``manifest["fixtures"]``).
    """
    return json.loads(path.read_text(encoding="utf-8"))


def _family_order_and_counts(manifest_path: Path | None = None) -> tuple[list[str], dict[str, int]]:
    """Return fixture families in first-appearance order with per-family counts.

    Args:
        manifest_path: Manifest file to read. When omitted, the module-level
            ``MANIFEST_PATH`` is used, so zero-argument callers behave as before.

    Returns:
        A ``(family_order, fixture_counts)`` pair: the family names in the order
        they first occur in the manifest's ``"fixtures"`` list, and a mapping
        from family name to the number of fixture entries carrying that name.
    """
    manifest = _load_json(MANIFEST_PATH if manifest_path is None else manifest_path)

    fixture_counts: dict[str, int] = {}
    for entry in manifest["fixtures"]:
        family = entry["family"]
        fixture_counts[family] = fixture_counts.get(family, 0) + 1

    # Dicts keep insertion order, so the keys are already in first-appearance order.
    return list(fixture_counts), fixture_counts


# Ordered (tokens, commitment class) rules applied to the lower-cased contract id.
# The first rule with any token occurring as a substring wins, so the order of
# the rows is significant (e.g. "policy" is tested before "order").
_CONTRACT_NAME_RULES: tuple[tuple[tuple[str, ...], str], ...] = (
    (("capability", "boundary"), "capability_boundaries"),
    (("policy", "governance", "approval"), "governance_or_policy"),
    (("recovery", "rollback", "escalation"), "recovery_paths"),
    (("dependency", "causal", "chain"), "dependencies"),
    # "order" also matches ids containing "ordering" or "tool_call_order".
    (("order", "sequence"), "tool_order"),
    (("invariant", "orphan"), "invariants"),
    (("evidence",), "evidence"),
    (("constraint", "validation"), "constraints"),
)


def _class_for_contract(contract_id: str, contract_type: ContractType, layer: Layer) -> str:
    """Map a validated contract onto one of the commitment classes.

    The contract id is checked first, case-insensitively, against the substring
    rules in ``_CONTRACT_NAME_RULES``. Only when no rule matches is the contract
    type consulted; ordering contracts are additionally split by layer.

    Args:
        contract_id: Identifier of the contract.
        contract_type: Contract type, used as a fallback when the id is not
            recognised.
        layer: Layer of the contract; only consulted for ordering contracts.

    Returns:
        The commitment class name. ``"constraints"`` is the final fallback.
    """
    contract = contract_id.lower()

    for tokens, commitment_class in _CONTRACT_NAME_RULES:
        if any(token in contract for token in tokens):
            return commitment_class

    if contract_type == ContractType.CAUSALITY:
        return "dependencies"
    if contract_type == ContractType.REACHABILITY:
        return "recovery_paths"
    if contract_type == ContractType.ORDERING:
        return "governance_or_policy" if layer == Layer.GOVERNANCE else "tool_order"
    if contract_type == ContractType.INVARIANT:
        return "invariants"

    return "constraints"
def _load_fixture_side(fixture_path: Path, side: str) -> dict[str, Any]:
    """Merge trace, state and dependency graph for one side of a fixture.

    ``side`` is the sub-directory name (``"original"`` or ``"reconstructed"``).
    Keys from ``state.json`` override equal keys from ``trace.json``; the
    dependency graph is stored under ``"dependency_graph"``.
    """
    side_dir = fixture_path / side
    return {
        **_load_json(side_dir / "trace.json"),
        **_load_json(side_dir / "state.json"),
        "dependency_graph": _load_json(side_dir / "dependency_graph.json"),
    }


def _load_fixture_contracts(fixture_path: Path, contract_ids: list[str]) -> list[dict[str, Any]]:
    """Return the fixture's contracts in the order given by *contract_ids*.

    Every ``*.json`` file under ``original/contracts`` is read and indexed by
    its ``"contract_id"``. A listed id without a matching file raises ``KeyError``.
    """
    contracts_by_id: dict[str, dict[str, Any]] = {}
    for contract_file in sorted((fixture_path / "original/contracts").glob("*.json")):
        contract = _load_json(contract_file)
        contracts_by_id[contract["contract_id"]] = contract
    return [contracts_by_id[contract_id] for contract_id in contract_ids]


def generate_replay_semantic_integrity_artifact(output_path: Path = OUTPUT_PATH) -> Path:
    """Validate every manifest fixture and write the per-family summary artifact.

    The manifest is read once. Fixtures are grouped by family in
    first-appearance order and, inside a family, processed in ``LEVELS`` order.
    Each validation result is attributed to a commitment class through
    ``_class_for_contract`` and counted as passed or failed; non-``None``
    failure labels are collected per class and emitted sorted.

    Args:
        output_path: Destination file. Parent directories are created if needed.

    Returns:
        The path that was written.

    Raises:
        ValueError: If a fixture's ``"degradation_level"`` is not in ``LEVELS``.
        KeyError: If a fixture lists a contract id that has no contract file.
    """
    manifest = _load_json(MANIFEST_PATH)

    # One pass over the manifest: insertion order of the dict is the family order.
    fixtures_by_family: dict[str, list[dict[str, Any]]] = {}
    for entry in manifest["fixtures"]:
        fixtures_by_family.setdefault(entry["family"], []).append(entry)

    families_payload: list[dict[str, object]] = []

    for family, family_fixtures in fixtures_by_family.items():
        points = sorted(family_fixtures, key=lambda entry: LEVELS.index(entry["degradation_level"]))

        passed: dict[str, int] = dict.fromkeys(COMMITMENT_CLASS_ORDER, 0)
        failed: dict[str, int] = dict.fromkeys(COMMITMENT_CLASS_ORDER, 0)
        failure_labels: dict[str, set[str]] = {
            commitment_class: set() for commitment_class in COMMITMENT_CLASS_ORDER
        }

        for fixture_entry in points:
            fixture_path = REPO_ROOT / str(fixture_entry["path"])
            results = ContractValidator().validate_contracts(
                original=_load_fixture_side(fixture_path, "original"),
                reconstructed=_load_fixture_side(fixture_path, "reconstructed"),
                contracts=_load_fixture_contracts(fixture_path, fixture_entry["contracts"]),
            )

            for result in results:
                commitment_class = _class_for_contract(result.contract_id, result.contract_type, result.layer)
                if result.passed:
                    passed[commitment_class] += 1
                    continue
                failed[commitment_class] += 1
                if result.failure_label is not None:
                    failure_labels[commitment_class].add(result.failure_label)

        families_payload.append(
            {
                "family": family,
                "fixture_count": len(family_fixtures),
                "levels": list(LEVELS),
                "commitment_classes": {
                    commitment_class: {
                        "passed": passed[commitment_class],
                        "failed": failed[commitment_class],
                        # Sets are unordered; sort so the artifact is reproducible.
                        "failure_labels": sorted(failure_labels[commitment_class]),
                    }
                    for commitment_class in COMMITMENT_CLASS_ORDER
                },
            }
        )

    payload = {
        "artifact_id": ARTIFACT_ID,
        "generated_by": "ReplaySemanticIntegrityArtifactGenerator",
        "version": "1.0",
        "evaluation_mode": "deterministic",
        "llm_judges": "none",
        "external_apis": "none",
        "families": families_payload,
        "global_summary": {
            "family_count": len(families_payload),
            "fixture_count": sum(len(group) for group in fixtures_by_family.values()),
            "deterministic_evaluation": True,
            "llm_judges": "none",
            "external_apis": "none",
        },
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
    return output_path


def main() -> int:
    """Regenerate the artifact, print its repo-relative POSIX path, return 0."""
    output_path = generate_replay_semantic_integrity_artifact()
    print(output_path.relative_to(REPO_ROOT).as_posix())
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
ARTIFACT_PATH = Path("artifacts/replay_semantic_integrity_results.json")
MANIFEST_PATH = Path("fixtures/manifest.json")
EXPECTED_FAMILIES = [
    "coding_workflow_pr_review",
    "incident_response_page_triage",
    "cross_domain_operational_dependency_workflow",
    "mcp_trace_replay",
]
FORBIDDEN_FIELDS = {
    "timestamp",
    "generated_at",
    "environment",
    "hostname",
    "cwd",
    "machine",
    "semantic_similarity",
    "embedding",
    "llm",
    "judge",
}


def _load_json(path: Path) -> dict[str, object]:
    """Decode the UTF-8 JSON document stored at *path*."""
    text = path.read_text(encoding="utf-8")
    return json.loads(text)


def _walk_keys(value: object) -> set[str]:
    """Collect every dict key found anywhere inside *value*.

    Dicts and lists are traversed to any depth; all other values contribute
    nothing. The traversal uses an explicit work list instead of recursion.
    """
    found: set[str] = set()
    pending: list[object] = [value]

    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            found.update(node)
            pending.extend(node.values())
        elif isinstance(node, list):
            pending.extend(node)

    return found
def _load_fixture_side(fixture_path: Path, side: str) -> dict[str, Any]:
    """Merge trace, state and dependency graph for one side of a fixture.

    ``side`` is the sub-directory name (``"original"`` or ``"reconstructed"``).
    """
    side_dir = fixture_path / side
    return {
        **_load_json(side_dir / "trace.json"),
        **_load_json(side_dir / "state.json"),
        "dependency_graph": _load_json(side_dir / "dependency_graph.json"),
    }


def _validation_label_union_by_family_and_class() -> dict[str, dict[str, set[str]]]:
    """Re-run validation for every manifest fixture and collect failure labels.

    Returns:
        ``family -> commitment class -> set of failure labels``. Only results
        that failed with a non-``None`` label contribute, so families or classes
        without such results are absent from the mapping.
    """
    manifest = _load_json(MANIFEST_PATH)
    validator = ContractValidator()
    output: dict[str, dict[str, set[str]]] = defaultdict(lambda: defaultdict(set))

    for fixture_entry in manifest["fixtures"]:
        fixture_path = Path(fixture_entry["path"])
        original = _load_fixture_side(fixture_path, "original")
        reconstructed = _load_fixture_side(fixture_path, "reconstructed")

        contracts_dir = fixture_path / "original/contracts"
        contracts_by_id = {
            contract["contract_id"]: contract
            for contract in (_load_json(path) for path in sorted(contracts_dir.glob("*.json")))
        }
        contracts = [contracts_by_id[contract_id] for contract_id in fixture_entry["contracts"]]
        results = validator.validate_contracts(original=original, reconstructed=reconstructed, contracts=contracts)

        family = fixture_entry["family"]
        for result in results:
            commitment_class = _class_for_contract(result.contract_id, result.contract_type, result.layer)
            if not result.passed and result.failure_label is not None:
                output[family][commitment_class].add(result.failure_label)

    return output


def test_script_output_matches_committed_artifact(tmp_path: Path) -> None:
    """Regenerating into a temp directory reproduces the committed artifact."""
    output_path = tmp_path / "replay_semantic_integrity_results.json"
    generate_replay_semantic_integrity_artifact(output_path)

    assert _load_json(output_path) == _load_json(ARTIFACT_PATH)


def test_artifact_schema_has_no_time_or_environment_fields() -> None:
    """No key at any depth of the artifact equals a forbidden field name."""
    payload = _load_json(ARTIFACT_PATH)
    assert payload["artifact_id"] == ARTIFACT_ID

    all_keys = _walk_keys(payload)
    for forbidden in FORBIDDEN_FIELDS:
        assert forbidden not in all_keys


def test_all_required_families_are_represented_in_manifest_order() -> None:
    """The artifact lists exactly the expected families, in the expected order."""
    payload = _load_json(ARTIFACT_PATH)
    families = [entry["family"] for entry in payload["families"]]
    assert families == EXPECTED_FAMILIES


def test_fixture_count_matches_manifest_and_levels_are_deterministic() -> None:
    """Global and per-family fixture counts agree with the manifest.

    The per-family expectation is derived from the manifest rather than being a
    fixed number, so the test keeps meaning what its name says when fixtures
    are added or removed.
    """
    payload = _load_json(ARTIFACT_PATH)
    manifest = _load_json(MANIFEST_PATH)

    expected_per_family: dict[str, int] = {}
    for fixture_entry in manifest["fixtures"]:
        family = fixture_entry["family"]
        expected_per_family[family] = expected_per_family.get(family, 0) + 1

    assert payload["global_summary"]["fixture_count"] == len(manifest["fixtures"])
    assert payload["global_summary"]["family_count"] == len(payload["families"])

    for family_payload in payload["families"]:
        assert family_payload["fixture_count"] == expected_per_family.get(family_payload["family"], 0)
        assert family_payload["levels"] == list(LEVELS)


def test_commitment_class_order_is_stable_and_complete() -> None:
    """Every family lists all commitment classes in ``COMMITMENT_CLASS_ORDER``."""
    payload = _load_json(ARTIFACT_PATH)

    for family_payload in payload["families"]:
        class_keys = list(family_payload["commitment_classes"].keys())
        assert class_keys == list(COMMITMENT_CLASS_ORDER)
def test_failure_labels_are_registered_and_sorted() -> None:
    """Each class's failure labels are sorted and present in ``FAILURE_TAXONOMY``."""
    payload = _load_json(ARTIFACT_PATH)
    registered_labels = set(FAILURE_TAXONOMY.keys())

    for family_payload in payload["families"]:
        for class_payload in family_payload["commitment_classes"].values():
            labels = class_payload["failure_labels"]
            assert labels == sorted(labels)
            for label in labels:
                assert label in registered_labels


def test_artifact_declares_deterministic_mode_and_no_external_evaluators() -> None:
    """Top level and global summary both declare deterministic, judge-free evaluation."""
    payload = _load_json(ARTIFACT_PATH)

    assert payload["evaluation_mode"] == "deterministic"
    assert payload["llm_judges"] == "none"
    assert payload["external_apis"] == "none"
    assert payload["global_summary"]["deterministic_evaluation"] is True
    assert payload["global_summary"]["llm_judges"] == "none"
    assert payload["global_summary"]["external_apis"] == "none"


def test_contract_linked_label_behavior_recovery_and_ordering() -> None:
    """Spot-check that labels land in the class of the contract that produced them."""
    payload = _load_json(ARTIFACT_PATH)
    families = {entry["family"]: entry for entry in payload["families"]}

    coding_recovery_labels = set(
        families["coding_workflow_pr_review"]["commitment_classes"]["recovery_paths"]["failure_labels"]
    )
    assert "POLICY_ORDER_BROKEN" not in coding_recovery_labels
    assert "CAUSAL_DEPENDENCY_LOSS" not in coding_recovery_labels
    assert coding_recovery_labels == {"RECOVERY_PATH_INVALID"}

    cross_domain_order_labels = set(
        families["cross_domain_operational_dependency_workflow"]["commitment_classes"]["governance_or_policy"][
            "failure_labels"
        ]
    )
    assert cross_domain_order_labels == {"POLICY_ORDER_BROKEN"}


def test_no_class_gets_full_fixture_label_set_without_contract_support() -> None:
    """Every class in the artifact carries exactly the labels validation yields for it."""
    payload = _load_json(ARTIFACT_PATH)
    validation_union = _validation_label_union_by_family_and_class()

    for family_payload in payload["families"]:
        family = family_payload["family"]
        for commitment_class, class_payload in family_payload["commitment_classes"].items():
            artifact_labels = set(class_payload["failure_labels"])
            expected_labels = validation_union.get(family, {}).get(commitment_class, set())
            assert artifact_labels == expected_labels


def test_direct_validation_consistency_for_labels() -> None:
    """Artifact labels and directly validated labels agree in both directions.

    Walking only the artifact cannot notice a family or class that validation
    reports but the artifact omits, so the comparison is also run starting from
    the validation results.
    """
    payload = _load_json(ARTIFACT_PATH)
    validation_union = _validation_label_union_by_family_and_class()

    artifact_labels: dict[str, dict[str, set[str]]] = {
        family_payload["family"]: {
            commitment_class: set(class_payload["failure_labels"])
            for commitment_class, class_payload in family_payload["commitment_classes"].items()
        }
        for family_payload in payload["families"]
    }

    # Artifact -> validation: nothing in the artifact lacks validation support.
    for family, classes in artifact_labels.items():
        for commitment_class, labels in classes.items():
            assert labels == validation_union.get(family, {}).get(commitment_class, set())

    # Validation -> artifact: nothing validation reports is missing from the artifact.
    for family, classes in validation_union.items():
        for commitment_class, labels in classes.items():
            assert artifact_labels.get(family, {}).get(commitment_class, set()) == labels