Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions artifacts/mcp_trace_replay_results.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
{
"artifact_id": "mcp_trace_replay_results_v1",
"family": "mcp_trace_replay",
"fixtures": [
{
"degradation_level": "baseline",
"expected_admissible": true,
"failed_contracts": [],
"failure_labels": [],
"fixture_id": "mcp_trace_replay_v1",
"observed_admissible": true,
"overall_admissibility_score": "1.000000",
"passed_contracts": [
"capability_boundary_respected",
"dependency_chain_preserved",
"recovery_path_available",
"tool_call_order_preserved",
"validation_before_unsafe_action"
]
},
{
"degradation_level": "mild",
"expected_admissible": false,
"failed_contracts": [
"capability_boundary_respected",
"recovery_path_available"
],
"failure_labels": [
"INVARIANT_VIOLATION",
"RECOVERY_PATH_INVALID"
],
"fixture_id": "mcp_trace_replay_mild_v1",
"observed_admissible": false,
"overall_admissibility_score": "0.833333",
"passed_contracts": [
"dependency_chain_preserved",
"tool_call_order_preserved",
"validation_before_unsafe_action"
]
},
{
"degradation_level": "moderate",
"expected_admissible": false,
"failed_contracts": [
"capability_boundary_respected",
"dependency_chain_preserved"
],
"failure_labels": [
"CAUSAL_DEPENDENCY_LOSS",
"INVARIANT_VIOLATION"
],
"fixture_id": "mcp_trace_replay_moderate_v1",
"observed_admissible": false,
"overall_admissibility_score": "0.833333",
"passed_contracts": [
"recovery_path_available",
"tool_call_order_preserved",
"validation_before_unsafe_action"
]
},
{
"degradation_level": "severe",
"expected_admissible": false,
"failed_contracts": [
"capability_boundary_respected",
"dependency_chain_preserved",
"recovery_path_available"
],
"failure_labels": [
"CAUSAL_DEPENDENCY_LOSS",
"INVARIANT_VIOLATION",
"RECOVERY_PATH_INVALID"
],
"fixture_id": "mcp_trace_replay_degraded_v1",
"observed_admissible": false,
"overall_admissibility_score": "0.750000",
"passed_contracts": [
"tool_call_order_preserved",
"validation_before_unsafe_action"
]
}
],
"generated_by": "McpTraceReplayArtifactGenerator",
"summary": {
"baseline_admissible": true,
"deterministic_evaluation": true,
"external_apis": "none",
"fixture_count": 4,
"llm_judges": "none",
"severe_admissible": false
},
"version": "1.0"
}
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"check": "npm run layout && npm run typecheck && npm run validate && npm run build && npm run test",
"generate:layered-admissibility": "python scripts/generate_layered_admissibility_artifact.py",
"generate:multi-family-admissibility": "python scripts/generate_multi_family_admissibility_artifact.py",
"generate:multi-family-svg": "python scripts/render_multi_family_admissibility_svg.py"
"generate:multi-family-svg": "python scripts/render_multi_family_admissibility_svg.py",
"generate:mcp-trace-replay": "python scripts/generate_mcp_trace_replay_artifact.py"
}
}
83 changes: 83 additions & 0 deletions scripts/generate_mcp_trace_replay_artifact.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""Deterministic entrypoint for MCP trace replay artifact regeneration."""

from __future__ import annotations

import json
import sys
from pathlib import Path
from typing import Any

# Repository root: the parent of the directory that contains this script
# (parents[1] of the resolved script path).
REPO_ROOT = Path(__file__).resolve().parents[1]
# Put the repository root at the front of sys.path (only if it is not already
# listed) before the repo-rooted `src.` import that follows.
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

from src.validation.degradation_curve_generator import DegradationCurveGenerator

# Value written to the artifact's "artifact_id" field.
ARTIFACT_ID = "mcp_trace_replay_results_v1"
# Manifest family to look up; also written to the artifact's "family" field and
# used as the prefix of the curve id.
FAMILY = "mcp_trace_replay"
# Degradation levels, in the order they are paired with the generated curve points.
CURVE_LEVELS = ("baseline", "mild", "moderate", "severe")
# Default destination of the generated artifact.
OUTPUT_PATH = REPO_ROOT / "artifacts" / "mcp_trace_replay_results.json"
# Fixture manifest handed to the curve generator's family lookup.
MANIFEST_PATH = REPO_ROOT / "fixtures" / "manifest.json"


def _fixture_payload(point: dict[str, Any], degradation_level: str) -> dict[str, Any]:
    """Build the artifact entry for a single curve point.

    Args:
        point: Curve point mapping; must provide ``fixture_id``,
            ``expected_admissible``, ``observed_admissible``,
            ``overall_admissibility_score``, ``passed_contracts``,
            ``failed_contracts`` and ``failure_labels``.
        degradation_level: Level label recorded alongside the point.

    Returns:
        A new dict holding the selected fields. The score is rendered as a
        string with exactly six decimal places; all other values are passed
        through as-is (lists are not copied).

    Raises:
        KeyError: If ``point`` lacks one of the required keys.
    """
    entry: dict[str, Any] = {}
    entry["fixture_id"] = point["fixture_id"]
    entry["degradation_level"] = degradation_level
    for key in ("expected_admissible", "observed_admissible"):
        entry[key] = point[key]
    # Fixed-width text keeps the serialized score stable across runs.
    entry["overall_admissibility_score"] = format(point["overall_admissibility_score"], ".6f")
    for key in ("passed_contracts", "failed_contracts", "failure_labels"):
        entry[key] = point[key]
    return entry


def _repo_rooted_fixture_paths(fixtures: tuple[Path, ...]) -> tuple[Path, ...]:
    """Anchor relative fixture paths at ``REPO_ROOT``.

    Absolute paths are returned unchanged; the input order is preserved.
    """

    def _anchor(candidate: Path) -> Path:
        if candidate.is_absolute():
            return candidate
        return REPO_ROOT / candidate

    return tuple(map(_anchor, fixtures))


def generate_mcp_trace_replay_artifact(output_path: Path = OUTPUT_PATH) -> Path:
    """Regenerate the MCP trace replay artifact and write it to ``output_path``.

    The fixtures for ``FAMILY`` are looked up through the manifest, evaluated
    by ``DegradationCurveGenerator`` and serialized as key-sorted, two-space
    indented JSON followed by a trailing newline.

    Args:
        output_path: Destination file; missing parent directories are created.

    Returns:
        The ``output_path`` that was written.

    Raises:
        ValueError: If the number of curve points differs from the number of
            entries in ``CURVE_LEVELS`` (enforced by ``zip(..., strict=True)``).
    """
    generator = DegradationCurveGenerator()
    manifest_fixtures = generator.fixtures_for_manifest_family(
        FAMILY,
        levels=CURVE_LEVELS,
        manifest_path=MANIFEST_PATH,
    )
    rooted_fixtures = _repo_rooted_fixture_paths(manifest_fixtures)
    curve = generator.generate(rooted_fixtures, curve_id=f"{FAMILY}_curve_v1")
    points = generator.to_dict(curve)["points"]

    # Points are paired with levels by position; strict=True turns a count
    # mismatch into an error instead of silently dropping entries.
    entries = []
    for point, level in zip(points, CURVE_LEVELS, strict=True):
        entries.append(_fixture_payload(point, level))

    # The first entry is paired with the first level and the last entry with
    # the last level of CURVE_LEVELS.
    first_entry = entries[0]
    last_entry = entries[-1]
    summary = {
        "fixture_count": len(entries),
        "baseline_admissible": first_entry["observed_admissible"],
        "severe_admissible": last_entry["observed_admissible"],
        "deterministic_evaluation": True,
        "llm_judges": "none",
        "external_apis": "none",
    }
    payload = {
        "artifact_id": ARTIFACT_ID,
        "generated_by": "McpTraceReplayArtifactGenerator",
        "version": "1.0",
        "family": FAMILY,
        "fixtures": entries,
        "summary": summary,
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2, sort_keys=True) + "\n"
    output_path.write_text(serialized, encoding="utf-8")
    return output_path


def main() -> int:
    """Regenerate the artifact at its default location and report where it went.

    Prints the written path relative to ``REPO_ROOT`` in POSIX form.

    Returns:
        Always ``0``, for use as the process exit status.
    """
    written = generate_mcp_trace_replay_artifact()
    relative = written.relative_to(REPO_ROOT)
    print(relative.as_posix())
    return 0


# Script entry point: exit with the status returned by main().
if __name__ == "__main__":
    sys.exit(main())
69 changes: 69 additions & 0 deletions tests/test_mcp_trace_replay_artifact.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from __future__ import annotations

import json
from pathlib import Path

from scripts.generate_mcp_trace_replay_artifact import ARTIFACT_ID, FAMILY, generate_mcp_trace_replay_artifact

# Anchor the committed artifact and the fixture manifest at the repository
# root (this file lives in <repo>/tests/) so the tests do not depend on the
# directory pytest happens to be started from.
_REPO_ROOT = Path(__file__).resolve().parents[1]
ARTIFACT_PATH = _REPO_ROOT / "artifacts" / "mcp_trace_replay_results.json"
MANIFEST_PATH = _REPO_ROOT / "fixtures" / "manifest.json"
# Fixture ids in the order the artifact is expected to list them.
EXPECTED_ORDER = [
    "mcp_trace_replay_v1",
    "mcp_trace_replay_mild_v1",
    "mcp_trace_replay_moderate_v1",
    "mcp_trace_replay_degraded_v1",
]


def _load_json(path: Path) -> dict[str, object]:
    """Read ``path`` as UTF-8 text and return the parsed JSON document."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)


def _manifest_fixture_index() -> dict[str, dict[str, object]]:
    """Map each manifest fixture id to its manifest entry.

    If an id occurs more than once, the last entry wins.
    """
    index: dict[str, dict[str, object]] = {}
    for entry in _load_json(MANIFEST_PATH)["fixtures"]:
        index[entry["fixture_id"]] = entry
    return index


def test_script_output_matches_committed_artifact(tmp_path: Path) -> None:
    """Regenerating into a temp directory reproduces the committed artifact's JSON content."""
    regenerated_path = tmp_path / "mcp_trace_replay_results.json"
    generate_mcp_trace_replay_artifact(regenerated_path)

    regenerated = _load_json(regenerated_path)
    committed = _load_json(ARTIFACT_PATH)
    assert regenerated == committed


def test_artifact_has_stable_schema_no_time_or_environment_fields() -> None:
    """The artifact exposes exactly the fixed top-level keys and the expected identifiers."""
    payload = _load_json(ARTIFACT_PATH)
    expected_keys = {"artifact_id", "generated_by", "version", "family", "fixtures", "summary"}

    # An exact key-set match rules out any extra top-level field.
    assert set(payload) == expected_keys
    assert payload["artifact_id"] == ARTIFACT_ID
    assert payload["family"] == FAMILY


def test_fixture_order_is_deterministic() -> None:
    """Fixtures appear in the artifact in exactly ``EXPECTED_ORDER``."""
    observed_order = []
    for fixture in _load_json(ARTIFACT_PATH)["fixtures"]:
        observed_order.append(fixture["fixture_id"])
    assert observed_order == EXPECTED_ORDER


def test_labels_and_admissibility_align_with_manifest_expectations() -> None:
    """Each artifact fixture agrees with its manifest entry on admissibility and failure labels."""
    artifact_fixtures = _load_json(ARTIFACT_PATH)["fixtures"]
    manifest_entries = _manifest_fixture_index()

    for artifact_entry in artifact_fixtures:
        manifest_entry = manifest_entries[artifact_entry["fixture_id"]]
        assert artifact_entry["expected_admissible"] == manifest_entry["expected_admissible"]
        assert artifact_entry["failure_labels"] == manifest_entry["expected_failure_labels"]


def test_baseline_and_severe_admissibility_guarantee() -> None:
    """The first fixture is the admissible baseline, the last the inadmissible severe case,
    and the summary flags say the same."""
    payload = _load_json(ARTIFACT_PATH)
    fixtures = payload["fixtures"]
    summary = payload["summary"]
    baseline = fixtures[0]
    severe = fixtures[-1]

    assert baseline["degradation_level"] == "baseline"
    assert baseline["observed_admissible"] is True
    assert severe["degradation_level"] == "severe"
    assert severe["observed_admissible"] is False
    assert summary["baseline_admissible"] is True
    assert summary["severe_admissible"] is False
Loading