diff --git a/artifacts/mcp_trace_replay_results.json b/artifacts/mcp_trace_replay_results.json
new file mode 100644
index 0000000..59afd49
--- /dev/null
+++ b/artifacts/mcp_trace_replay_results.json
@@ -0,0 +1,93 @@
+{
+  "artifact_id": "mcp_trace_replay_results_v1",
+  "family": "mcp_trace_replay",
+  "fixtures": [
+    {
+      "degradation_level": "baseline",
+      "expected_admissible": true,
+      "failed_contracts": [],
+      "failure_labels": [],
+      "fixture_id": "mcp_trace_replay_v1",
+      "observed_admissible": true,
+      "overall_admissibility_score": "1.000000",
+      "passed_contracts": [
+        "capability_boundary_respected",
+        "dependency_chain_preserved",
+        "recovery_path_available",
+        "tool_call_order_preserved",
+        "validation_before_unsafe_action"
+      ]
+    },
+    {
+      "degradation_level": "mild",
+      "expected_admissible": false,
+      "failed_contracts": [
+        "capability_boundary_respected",
+        "recovery_path_available"
+      ],
+      "failure_labels": [
+        "INVARIANT_VIOLATION",
+        "RECOVERY_PATH_INVALID"
+      ],
+      "fixture_id": "mcp_trace_replay_mild_v1",
+      "observed_admissible": false,
+      "overall_admissibility_score": "0.833333",
+      "passed_contracts": [
+        "dependency_chain_preserved",
+        "tool_call_order_preserved",
+        "validation_before_unsafe_action"
+      ]
+    },
+    {
+      "degradation_level": "moderate",
+      "expected_admissible": false,
+      "failed_contracts": [
+        "capability_boundary_respected",
+        "dependency_chain_preserved"
+      ],
+      "failure_labels": [
+        "CAUSAL_DEPENDENCY_LOSS",
+        "INVARIANT_VIOLATION"
+      ],
+      "fixture_id": "mcp_trace_replay_moderate_v1",
+      "observed_admissible": false,
+      "overall_admissibility_score": "0.833333",
+      "passed_contracts": [
+        "recovery_path_available",
+        "tool_call_order_preserved",
+        "validation_before_unsafe_action"
+      ]
+    },
+    {
+      "degradation_level": "severe",
+      "expected_admissible": false,
+      "failed_contracts": [
+        "capability_boundary_respected",
+        "dependency_chain_preserved",
+        "recovery_path_available"
+      ],
+      "failure_labels": [
+        "CAUSAL_DEPENDENCY_LOSS",
+        "INVARIANT_VIOLATION",
+        "RECOVERY_PATH_INVALID"
+      ],
+      "fixture_id": "mcp_trace_replay_degraded_v1",
+      "observed_admissible": false,
+      "overall_admissibility_score": "0.750000",
+      "passed_contracts": [
+        "tool_call_order_preserved",
+        "validation_before_unsafe_action"
+      ]
+    }
+  ],
+  "generated_by": "McpTraceReplayArtifactGenerator",
+  "summary": {
+    "baseline_admissible": true,
+    "deterministic_evaluation": true,
+    "external_apis": "none",
+    "fixture_count": 4,
+    "llm_judges": "none",
+    "severe_admissible": false
+  },
+  "version": "1.0"
+}
diff --git a/package.json b/package.json
index 25894f4..0607cf9 100644
--- a/package.json
+++ b/package.json
@@ -14,6 +14,7 @@
     "check": "npm run layout && npm run typecheck && npm run validate && npm run build && npm run test",
     "generate:layered-admissibility": "python scripts/generate_layered_admissibility_artifact.py",
     "generate:multi-family-admissibility": "python scripts/generate_multi_family_admissibility_artifact.py",
-    "generate:multi-family-svg": "python scripts/render_multi_family_admissibility_svg.py"
+    "generate:multi-family-svg": "python scripts/render_multi_family_admissibility_svg.py",
+    "generate:mcp-trace-replay": "python scripts/generate_mcp_trace_replay_artifact.py"
   }
 }
diff --git a/scripts/generate_mcp_trace_replay_artifact.py b/scripts/generate_mcp_trace_replay_artifact.py
new file mode 100644
index 0000000..3cd5ec7
--- /dev/null
+++ b/scripts/generate_mcp_trace_replay_artifact.py
@@ -0,0 +1,102 @@
+"""Deterministic entrypoint for MCP trace replay artifact regeneration."""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+from typing import Any
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+if str(REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(REPO_ROOT))
+
+from src.validation.degradation_curve_generator import DegradationCurveGenerator
+
+ARTIFACT_ID = "mcp_trace_replay_results_v1"
+FAMILY = "mcp_trace_replay"
+CURVE_LEVELS = ("baseline", "mild", "moderate", "severe")
+OUTPUT_PATH = REPO_ROOT / "artifacts" / "mcp_trace_replay_results.json"
+MANIFEST_PATH = REPO_ROOT / "fixtures" / "manifest.json"
+
+
+def _fixture_payload(point: dict[str, Any], degradation_level: str) -> dict[str, Any]:
+    """Build the artifact entry for one curve point.
+
+    Copies the identifying and result fields from ``point``, tags the entry with
+    ``degradation_level``, and renders the score as a fixed six-decimal string.
+    """
+    return {
+        "fixture_id": point["fixture_id"],
+        "degradation_level": degradation_level,
+        "expected_admissible": point["expected_admissible"],
+        "observed_admissible": point["observed_admissible"],
+        "overall_admissibility_score": f"{point['overall_admissibility_score']:.6f}",
+        "passed_contracts": point["passed_contracts"],
+        "failed_contracts": point["failed_contracts"],
+        "failure_labels": point["failure_labels"],
+    }
+
+
+def _repo_rooted_fixture_paths(fixtures: tuple[Path, ...]) -> tuple[Path, ...]:
+    """Return ``fixtures`` with relative paths joined onto ``REPO_ROOT``; absolute paths pass through."""
+    return tuple(path if path.is_absolute() else REPO_ROOT / path for path in fixtures)
+
+
+def generate_mcp_trace_replay_artifact(output_path: Path = OUTPUT_PATH) -> Path:
+    """Generate the degradation curve for ``FAMILY`` and write it as the artifact JSON.
+
+    Args:
+        output_path: Destination file; parent directories are created if missing.
+
+    Returns:
+        ``output_path``, after the artifact has been written.
+
+    Raises:
+        ValueError: If the curve does not contain exactly one point per entry
+            in ``CURVE_LEVELS`` (raised by ``zip(..., strict=True)``).
+    """
+    generator = DegradationCurveGenerator()
+    fixtures = generator.fixtures_for_manifest_family(
+        FAMILY,
+        levels=CURVE_LEVELS,
+        manifest_path=MANIFEST_PATH,
+    )
+    curve = generator.generate(_repo_rooted_fixture_paths(fixtures), curve_id=f"{FAMILY}_curve_v1")
+    curve_dict = generator.to_dict(curve)
+
+    # Pairing is positional: assumes points come back in CURVE_LEVELS order (TODO confirm).
+    fixture_payload = [
+        _fixture_payload(point, level)
+        for point, level in zip(curve_dict["points"], CURVE_LEVELS, strict=True)
+    ]
+
+    payload = {
+        "artifact_id": ARTIFACT_ID,
+        "generated_by": "McpTraceReplayArtifactGenerator",
+        "version": "1.0",
+        "family": FAMILY,
+        "fixtures": fixture_payload,
+        "summary": {
+            "fixture_count": len(fixture_payload),
+            "baseline_admissible": fixture_payload[0]["observed_admissible"],
+            "severe_admissible": fixture_payload[-1]["observed_admissible"],
+            "deterministic_evaluation": True,
+            "llm_judges": "none",
+            "external_apis": "none",
+        },
+    }
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")
+    return output_path
+
+
+def main() -> int:
+    output_path = generate_mcp_trace_replay_artifact()
+    print(output_path.relative_to(REPO_ROOT).as_posix())
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tests/test_mcp_trace_replay_artifact.py b/tests/test_mcp_trace_replay_artifact.py
new file mode 100644
index 0000000..5d99481
--- /dev/null
+++ b/tests/test_mcp_trace_replay_artifact.py
@@ -0,0 +1,81 @@
+"""Regression tests for the committed MCP trace replay artifact."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+from scripts.generate_mcp_trace_replay_artifact import ARTIFACT_ID, FAMILY, generate_mcp_trace_replay_artifact
+
+# Anchor data files at the repository root so the tests pass from any working directory.
+REPO_ROOT = Path(__file__).resolve().parents[1]
+ARTIFACT_PATH = REPO_ROOT / "artifacts" / "mcp_trace_replay_results.json"
+MANIFEST_PATH = REPO_ROOT / "fixtures" / "manifest.json"
+EXPECTED_ORDER = [
+    "mcp_trace_replay_v1",
+    "mcp_trace_replay_mild_v1",
+    "mcp_trace_replay_moderate_v1",
+    "mcp_trace_replay_degraded_v1",
+]
+
+
+def _load_json(path: Path) -> dict[str, Any]:
+    """Parse the UTF-8 JSON document at ``path``."""
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def _manifest_fixture_index() -> dict[str, dict[str, Any]]:
+    """Index the manifest's fixture entries by ``fixture_id``."""
+    manifest = _load_json(MANIFEST_PATH)
+    return {entry["fixture_id"]: entry for entry in manifest["fixtures"]}
+
+
+def test_script_output_matches_committed_artifact(tmp_path: Path) -> None:
+    """Regenerating into a temp directory must reproduce the committed artifact."""
+    output_path = tmp_path / "mcp_trace_replay_results.json"
+    generate_mcp_trace_replay_artifact(output_path)
+
+    assert _load_json(output_path) == _load_json(ARTIFACT_PATH)
+
+
+def test_artifact_has_stable_schema_no_time_or_environment_fields() -> None:
+    """The top-level key set is pinned, so no extra fields can appear."""
+    payload = _load_json(ARTIFACT_PATH)
+
+    assert set(payload.keys()) == {"artifact_id", "generated_by", "version", "family", "fixtures", "summary"}
+    assert payload["artifact_id"] == ARTIFACT_ID
+    assert payload["family"] == FAMILY
+
+
+def test_fixture_order_is_deterministic() -> None:
+    """Fixtures appear in ``EXPECTED_ORDER``."""
+    payload = _load_json(ARTIFACT_PATH)
+    fixture_ids = [entry["fixture_id"] for entry in payload["fixtures"]]
+    assert fixture_ids == EXPECTED_ORDER
+
+
+def test_labels_and_admissibility_align_with_manifest_expectations() -> None:
+    """Each fixture's expectation and failure labels match its manifest entry."""
+    payload = _load_json(ARTIFACT_PATH)
+    manifest_index = _manifest_fixture_index()
+
+    for fixture in payload["fixtures"]:
+        fixture_id = fixture["fixture_id"]
+        expected = manifest_index[fixture_id]
+        assert fixture["expected_admissible"] == expected["expected_admissible"]
+        assert fixture["failure_labels"] == expected["expected_failure_labels"]
+
+
+def test_baseline_and_severe_admissibility_guarantee() -> None:
+    """Baseline is first and admissible; severe is last and inadmissible."""
+    payload = _load_json(ARTIFACT_PATH)
+    fixtures = payload["fixtures"]
+    summary = payload["summary"]
+
+    assert fixtures[0]["degradation_level"] == "baseline"
+    assert fixtures[0]["observed_admissible"] is True
+    assert fixtures[-1]["degradation_level"] == "severe"
+    assert fixtures[-1]["observed_admissible"] is False
+    assert summary["baseline_admissible"] is True
+    assert summary["severe_admissible"] is False