diff --git a/artifacts/evidence_index.json b/artifacts/evidence_index.json new file mode 100644 index 0000000..f6f55b2 --- /dev/null +++ b/artifacts/evidence_index.json @@ -0,0 +1,200 @@ +{ + "artifact_id": "evidence_index_v1", + "generated_by": "EvidenceIndexGenerator", + "version": "1.0", + "evaluation_mode": "deterministic", + "llm_judges": "none", + "external_apis": "none", + "artifacts": [ + { + "path": "artifacts/capability_boundary_replay_results.json", + "format": "json", + "generator": "scripts/generate_capability_boundary_replay_artifact.py", + "evidence_category": "capability_boundary_replay", + "evidence_role": "capability boundary drift replay evidence", + "fixture_families": [ + "coding_workflow_pr_review", + "cross_domain_operational_dependency_workflow", + "incident_response_page_triage", + "mcp_trace_replay" + ], + "top_level_keys": [ + "artifact_id", + "evaluation_mode", + "external_apis", + "families", + "generated_by", + "global_summary", + "llm_judges", + "version" + ], + "deterministic_evaluation": true, + "llm_judges": "none", + "external_apis": "none", + "manifest_aligned": true, + "evidence_bearing": true, + "visualization_only": false + }, + { + "path": "artifacts/graph_diff_results.json", + "format": "json", + "generator": "scripts/generate_graph_diff_artifact.py", + "evidence_category": "graph_diff", + "evidence_role": "relational replay graph evidence", + "fixture_families": [ + "coding_workflow_pr_review", + "cross_domain_operational_dependency_workflow", + "incident_response_page_triage", + "mcp_trace_replay" + ], + "top_level_keys": [ + "artifact_id", + "evaluation_mode", + "external_apis", + "families", + "generated_by", + "global_summary", + "llm_judges", + "version" + ], + "deterministic_evaluation": true, + "llm_judges": "none", + "external_apis": "none", + "manifest_aligned": true, + "evidence_bearing": true, + "visualization_only": false + }, + { + "path": "artifacts/mcp_trace_replay_results.json", + "format": "json", + 
"generator": "scripts/generate_mcp_trace_replay_artifact.py", + "evidence_category": "mcp_trace_replay", + "evidence_role": "single-family MCP trace replay evidence", + "fixture_families": [ + "mcp_trace_replay" + ], + "top_level_keys": [ + "artifact_id", + "family", + "fixtures", + "generated_by", + "summary", + "version" + ], + "deterministic_evaluation": true, + "llm_judges": "none", + "external_apis": "none", + "manifest_aligned": false, + "evidence_bearing": true, + "visualization_only": false + }, + { + "path": "artifacts/multi_family_admissibility_curves.svg", + "format": "svg", + "generator": "scripts/render_multi_family_admissibility_svg.py", + "evidence_category": "multi_family_admissibility_visualization", + "evidence_role": "visualization of admissibility outcomes", + "fixture_families": [], + "top_level_keys": [], + "deterministic_evaluation": true, + "llm_judges": "none", + "external_apis": "none", + "manifest_aligned": false, + "evidence_bearing": false, + "visualization_only": true + }, + { + "path": "artifacts/multi_family_admissibility_results.json", + "format": "json", + "generator": "scripts/generate_multi_family_admissibility_artifact.py", + "evidence_category": "multi_family_admissibility", + "evidence_role": "cross-family admissibility evidence", + "fixture_families": [ + "coding_workflow_pr_review", + "cross_domain_operational_dependency_workflow", + "incident_response_page_triage", + "mcp_trace_replay" + ], + "top_level_keys": [ + "artifact_id", + "families", + "generated_by", + "version" + ], + "deterministic_evaluation": true, + "llm_judges": "none", + "external_apis": "none", + "manifest_aligned": true, + "evidence_bearing": true, + "visualization_only": false + }, + { + "path": "artifacts/replay_semantic_integrity_results.json", + "format": "json", + "generator": "scripts/generate_replay_semantic_integrity_artifact.py", + "evidence_category": "replay_semantic_integrity", + "evidence_role": "semantic replay integrity evidence", + 
"""Generate deterministic evidence index for committed artifacts."""

from __future__ import annotations

import json
from pathlib import Path
from typing import Any

REPO_ROOT = Path(__file__).resolve().parents[1]
MANIFEST_PATH = REPO_ROOT / "fixtures" / "manifest.json"
OUTPUT_PATH = REPO_ROOT / "artifacts" / "evidence_index.json"

# Static description of every artifact the index may list. Paths are relative
# to the repository root; the remaining keys are copied verbatim into the
# generated entry for that artifact.
ARTIFACT_SPECS: tuple[dict[str, Any], ...] = (
    {
        "path": "artifacts/capability_boundary_replay_results.json",
        "format": "json",
        "generator": "scripts/generate_capability_boundary_replay_artifact.py",
        "evidence_category": "capability_boundary_replay",
        "evidence_role": "capability boundary drift replay evidence",
        "evidence_bearing": True,
        "visualization_only": False,
    },
    {
        "path": "artifacts/graph_diff_results.json",
        "format": "json",
        "generator": "scripts/generate_graph_diff_artifact.py",
        "evidence_category": "graph_diff",
        "evidence_role": "relational replay graph evidence",
        "evidence_bearing": True,
        "visualization_only": False,
    },
    {
        "path": "artifacts/mcp_trace_replay_results.json",
        "format": "json",
        "generator": "scripts/generate_mcp_trace_replay_artifact.py",
        "evidence_category": "mcp_trace_replay",
        "evidence_role": "single-family MCP trace replay evidence",
        "evidence_bearing": True,
        "visualization_only": False,
    },
    {
        "path": "artifacts/multi_family_admissibility_curves.svg",
        "format": "svg",
        "generator": "scripts/render_multi_family_admissibility_svg.py",
        "evidence_category": "multi_family_admissibility_visualization",
        "evidence_role": "visualization of admissibility outcomes",
        "evidence_bearing": False,
        "visualization_only": True,
    },
    {
        "path": "artifacts/multi_family_admissibility_results.json",
        "format": "json",
        "generator": "scripts/generate_multi_family_admissibility_artifact.py",
        "evidence_category": "multi_family_admissibility",
        "evidence_role": "cross-family admissibility evidence",
        "evidence_bearing": True,
        "visualization_only": False,
    },
    {
        "path": "artifacts/replay_semantic_integrity_results.json",
        "format": "json",
        "generator": "scripts/generate_replay_semantic_integrity_artifact.py",
        "evidence_category": "replay_semantic_integrity",
        "evidence_role": "semantic replay integrity evidence",
        "evidence_bearing": True,
        "visualization_only": False,
    },
    {
        "path": "artifacts/tool_ordering_replay_results.json",
        "format": "json",
        "generator": "scripts/generate_tool_ordering_replay_artifact.py",
        "evidence_category": "tool_ordering_replay",
        "evidence_role": "tool-order replay drift evidence",
        "evidence_bearing": True,
        "visualization_only": False,
    },
)


def _load_json(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 JSON and return its top-level object.

    Raises:
        ValueError: If the content is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError`` subclass) or its top level is not an object.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    # Every caller immediately uses dict access on the result, so reject
    # lists/scalars here with a message that names the offending file.
    if not isinstance(payload, dict):
        raise ValueError(
            f"{path}: expected a JSON object at the top level, "
            f"got {type(payload).__name__}"
        )
    return payload


def _manifest_families(manifest_path: Path | None = None) -> set[str]:
    """Return the set of ``family`` values listed under ``fixtures`` in the manifest.

    Args:
        manifest_path: Manifest file to read; defaults to ``MANIFEST_PATH``.
    """
    manifest = _load_json(MANIFEST_PATH if manifest_path is None else manifest_path)
    return {str(fixture["family"]) for fixture in manifest["fixtures"]}


def _extract_fixture_families(payload: dict[str, Any]) -> list[str]:
    """Collect the fixture family names mentioned by an artifact payload.

    Two shapes are recognised: a ``families`` list whose dict items carry a
    string ``family`` key, and a single top-level string ``family``. Items of
    any other shape are ignored.

    Returns:
        The distinct family names, sorted.
    """
    families: set[str] = set()
    listed = payload.get("families")
    if isinstance(listed, list):
        for item in listed:
            if isinstance(item, dict) and isinstance(item.get("family"), str):
                families.add(item["family"])
    single = payload.get("family")
    if isinstance(single, str):
        families.add(single)
    return sorted(families)


def _build_artifact_entry(
    spec: dict[str, Any],
    manifest_families: set[str],
    repo_root: Path | None = None,
) -> dict[str, Any] | None:
    """Build the index entry for one artifact spec.

    Args:
        spec: One element of ``ARTIFACT_SPECS``.
        manifest_families: Family names declared by the fixture manifest.
        repo_root: Directory that ``spec["path"]`` is relative to; defaults to
            ``REPO_ROOT``.

    Returns:
        The entry dict, or ``None`` when the artifact file does not exist.

    Raises:
        ValueError: If a JSON artifact is malformed or is not a JSON object.
    """
    root = REPO_ROOT if repo_root is None else repo_root
    artifact_path = root / spec["path"]
    if not artifact_path.exists():
        return None

    # Key order here is the key order of the serialized entry.
    entry: dict[str, Any] = {
        "path": spec["path"],
        "format": spec["format"],
        "generator": spec["generator"],
        "evidence_category": spec["evidence_category"],
        "evidence_role": spec["evidence_role"],
        "fixture_families": [],
        "top_level_keys": [],
        "deterministic_evaluation": True,
        "llm_judges": "none",
        "external_apis": "none",
        "manifest_aligned": False,
        "evidence_bearing": spec["evidence_bearing"],
        "visualization_only": spec["visualization_only"],
    }

    # Non-JSON artifacts (the SVG) are never parsed and keep the defaults above.
    if spec["format"] != "json":
        return entry

    payload = _load_json(artifact_path)
    families = _extract_fixture_families(payload)
    entry["fixture_families"] = families
    entry["top_level_keys"] = sorted(payload.keys())
    # A payload that does not state these fields is treated as
    # deterministic / judge-free / API-free.
    entry["deterministic_evaluation"] = (
        payload.get("evaluation_mode", "deterministic") == "deterministic"
    )
    entry["llm_judges"] = payload.get("llm_judges", "none")
    entry["external_apis"] = payload.get("external_apis", "none")
    if families:
        # Aligned only when the artifact covers exactly the manifest's families.
        entry["manifest_aligned"] = set(families) == manifest_families

    return entry


def generate_evidence_index(
    output_path: Path = OUTPUT_PATH,
    *,
    repo_root: Path | None = None,
    manifest_path: Path | None = None,
) -> Path:
    """Write the evidence index JSON and return the path it was written to.

    Specs are processed in ascending ``path`` order, and specs whose artifact
    file is missing are left out of the index.

    Args:
        output_path: Destination file; parent directories are created.
        repo_root: Directory the spec paths are relative to; defaults to
            ``REPO_ROOT``.
        manifest_path: Fixture manifest to read. Defaults to
            ``fixtures/manifest.json`` under ``repo_root`` when that is given,
            otherwise ``MANIFEST_PATH``.

    Raises:
        ValueError: If the manifest or a JSON artifact is malformed or is not
            a JSON object.
    """
    if manifest_path is None and repo_root is not None:
        manifest_path = repo_root / "fixtures" / "manifest.json"
    manifest_families = _manifest_families(manifest_path)

    artifacts: list[dict[str, Any]] = []
    for spec in sorted(ARTIFACT_SPECS, key=lambda item: item["path"]):
        entry = _build_artifact_entry(spec, manifest_families, repo_root)
        if entry is not None:
            artifacts.append(entry)

    index = {
        "artifact_id": "evidence_index_v1",
        "generated_by": "EvidenceIndexGenerator",
        "version": "1.0",
        "evaluation_mode": "deterministic",
        "llm_judges": "none",
        "external_apis": "none",
        "artifacts": artifacts,
        "global_summary": {
            "artifact_count": len(artifacts),
            "json_artifact_count": sum(1 for item in artifacts if item["format"] == "json"),
            "svg_artifact_count": sum(1 for item in artifacts if item["format"] == "svg"),
            "evidence_bearing_count": sum(1 for item in artifacts if item["evidence_bearing"]),
            "visualization_only_count": sum(1 for item in artifacts if item["visualization_only"]),
            "deterministic_artifact_count": sum(1 for item in artifacts if item["deterministic_evaluation"]),
            "llm_free_artifact_count": sum(1 for item in artifacts if item["llm_judges"] == "none"),
            "external_api_free_artifact_count": sum(1 for item in artifacts if item["external_apis"] == "none"),
        },
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    # newline="\n" disables platform newline translation so the artifact is
    # byte-identical regardless of the OS that generated it.
    with output_path.open("w", encoding="utf-8", newline="\n") as handle:
        handle.write(json.dumps(index, indent=2) + "\n")
    return output_path


if __name__ == "__main__":
    path = generate_evidence_index()
    print(path.relative_to(REPO_ROOT).as_posix())
from __future__ import annotations

import json
from pathlib import Path

from scripts.generate_evidence_index import generate_evidence_index

REPO_ROOT = Path(__file__).resolve().parents[1]
ARTIFACT_PATH = REPO_ROOT / "artifacts" / "evidence_index.json"

# Substrings that must not occur anywhere in the lower-cased generated index.
_FORBIDDEN_TOKENS = (
    "timestamp",
    "generated_at",
    "environment",
    "user",
    "hostname",
    "hash",
    "digest",
)


def _load_json(path: Path) -> dict:
    """Read *path* as UTF-8 text and parse it as JSON."""
    return json.loads(path.read_text(encoding="utf-8"))


def test_artifact_exists() -> None:
    """The committed evidence index is present."""
    assert ARTIFACT_PATH.exists()


def test_generator_output_matches_committed_artifact(tmp_path: Path) -> None:
    """Regenerating the index reproduces the committed file text exactly."""
    output = tmp_path / "evidence_index.json"
    generate_evidence_index(output)
    assert output.read_text(encoding="utf-8") == ARTIFACT_PATH.read_text(encoding="utf-8")


def test_top_level_schema_is_stable() -> None:
    """Top-level keys appear in the expected order."""
    artifact = _load_json(ARTIFACT_PATH)
    assert list(artifact) == [
        "artifact_id",
        "generated_by",
        "version",
        "evaluation_mode",
        "llm_judges",
        "external_apis",
        "artifacts",
        "global_summary",
    ]


def test_determinism_and_sanitization(tmp_path: Path) -> None:
    """Two runs produce identical text that carries no forbidden tokens or local paths."""
    first = tmp_path / "a.json"
    second = tmp_path / "b.json"
    generate_evidence_index(first)
    generate_evidence_index(second)

    first_text = first.read_text(encoding="utf-8")
    second_text = second.read_text(encoding="utf-8")
    assert first_text == second_text

    blob = first_text.lower()
    for token in _FORBIDDEN_TOKENS:
        assert token not in blob, f"forbidden token {token!r} found in evidence index"
    assert "/workspace/" not in first_text

    # Path.home() raises RuntimeError when the home directory cannot be
    # resolved; in that case there is no home path that could have leaked.
    try:
        home = Path.home()
    except RuntimeError:
        home = None
    # A home directory that is just a filesystem root (e.g. "/") is a
    # substring of every artifact path, so the leak check would always fail;
    # only check when the home path has at least one component below the root.
    if home is not None and len(home.parts) > 1:
        assert str(home) not in first_text


def test_entries_are_sorted_and_files_exist() -> None:
    """Entries are ordered by path and each listed file exists in the repo."""
    artifact = _load_json(ARTIFACT_PATH)
    paths = [entry["path"] for entry in artifact["artifacts"]]
    assert paths == sorted(paths)
    for path in paths:
        assert (REPO_ROOT / path).exists()


def test_json_artifacts_parse_and_list_top_level_keys() -> None:
    """Each JSON entry lists its artifact's sorted top-level keys and clean flags."""
    artifact = _load_json(ARTIFACT_PATH)
    for entry in artifact["artifacts"]:
        if entry["format"] != "json":
            continue
        payload = _load_json(REPO_ROOT / entry["path"])
        assert entry["top_level_keys"] == sorted(payload.keys())
        assert entry["deterministic_evaluation"] is True
        assert entry["llm_judges"] == "none"
        assert entry["external_apis"] == "none"


def test_svg_artifacts_are_visualization_only() -> None:
    """SVG entries are flagged visualization-only and not evidence-bearing."""
    artifact = _load_json(ARTIFACT_PATH)
    svg_entries = [entry for entry in artifact["artifacts"] if entry["format"] == "svg"]
    for entry in svg_entries:
        assert entry["visualization_only"] is True
        assert entry["evidence_bearing"] is False


def test_global_summary_counts_match_entries() -> None:
    """Every global_summary counter equals the count recomputed from the entries."""
    artifact = _load_json(ARTIFACT_PATH)
    entries = artifact["artifacts"]
    summary = artifact["global_summary"]

    assert summary["artifact_count"] == len(entries)
    assert summary["json_artifact_count"] == sum(1 for item in entries if item["format"] == "json")
    assert summary["svg_artifact_count"] == sum(1 for item in entries if item["format"] == "svg")
    assert summary["evidence_bearing_count"] == sum(1 for item in entries if item["evidence_bearing"])
    assert summary["visualization_only_count"] == sum(1 for item in entries if item["visualization_only"])
    assert summary["deterministic_artifact_count"] == sum(1 for item in entries if item["deterministic_evaluation"])
    assert summary["llm_free_artifact_count"] == sum(1 for item in entries if item["llm_judges"] == "none")
    assert summary["external_api_free_artifact_count"] == sum(1 for item in entries if item["external_apis"] == "none")