From e18d6adb608c629d0af1dcd8c3b344aaae72b9f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20K=C3=B6lnberger?= <159939812+ProfRandom92@users.noreply.github.com> Date: Wed, 20 May 2026 08:55:28 -0700 Subject: [PATCH] Add deterministic graph diff artifact generation --- artifacts/graph_diff_results.json | 2029 +++++++++++++++++++++++ package.json | 3 +- scripts/generate_graph_diff_artifact.py | 201 +++ tests/test_graph_diff_artifact.py | 101 ++ 4 files changed, 2333 insertions(+), 1 deletion(-) create mode 100644 artifacts/graph_diff_results.json create mode 100644 scripts/generate_graph_diff_artifact.py create mode 100644 tests/test_graph_diff_artifact.py diff --git a/artifacts/graph_diff_results.json b/artifacts/graph_diff_results.json new file mode 100644 index 0000000..9b69f84 --- /dev/null +++ b/artifacts/graph_diff_results.json @@ -0,0 +1,2029 @@ +{ + "artifact_id": "graph_diff_results_v1", + "generated_by": "GraphDiffArtifactGenerator", + "version": "1.0", + "evaluation_mode": "deterministic", + "llm_judges": "none", + "external_apis": "none", + "families": [ + { + "family": "coding_workflow_pr_review", + "fixtures": [ + { + "fixture_id": "coding_workflow_pr_review_degraded_v1", + "degradation_level": "severe", + "expected_admissible": false, + "expected_failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS", + "INVARIANT_VIOLATION", + "POLICY_ORDER_BROKEN", + "RECOVERY_PATH_INVALID" + ], + "edge_categories": { + "capability_boundaries": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "causal_dependencies": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "dependencies": { + "original_edge_count": 12, + "replay_edge_count": 6, + "missing_edges": [ + [ + "escalate_to_human", 
+ "human_review" + ], + [ + "human_review", + "merge" + ], + [ + "rollback", + "human_review" + ], + [ + "security_scan_failed", + "deploy_blocked" + ], + [ + "test_failure", + "escalate_to_human" + ], + [ + "test_failure", + "rollback" + ] + ], + "added_edges": [], + "missing_nodes": [ + "escalate_to_human", + "rollback" + ], + "added_nodes": [], + "original_nodes": [ + "deploy_blocked", + "escalate_to_human", + "generate_patch", + "human_review", + "merge", + "rollback", + "run_tests", + "security_scan_failed", + "test_failure" + ], + "replay_nodes": [ + "deploy_blocked", + "generate_patch", + "human_review", + "merge", + "run_tests", + "security_scan_failed", + "test_failure" + ] + }, + "dependency_chain": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "policy_steps": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "recovery_paths": { + "original_edge_count": 1, + "replay_edge_count": 1, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [ + "escalate_to_human", + "rollback" + ], + "replay_nodes": [ + "escalate_to_human", + "rollback" + ] + }, + "tool_call_order": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "tool_calls": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + } + } + }, + { + "fixture_id": "coding_workflow_pr_review_mild_v1", + "degradation_level": "mild", + "expected_admissible": false, + "expected_failure_labels": 
[ + "RECOVERY_PATH_INVALID" + ], + "edge_categories": { + "capability_boundaries": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "causal_dependencies": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "dependencies": { + "original_edge_count": 12, + "replay_edge_count": 12, + "missing_edges": [ + [ + "test_failure", + "escalate_to_human" + ], + [ + "test_failure", + "rollback" + ] + ], + "added_edges": [ + [ + "run_tests", + "escalate_to_human" + ], + [ + "run_tests", + "rollback" + ] + ], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [ + "deploy_blocked", + "escalate_to_human", + "generate_patch", + "human_review", + "merge", + "rollback", + "run_tests", + "security_scan_failed", + "test_failure" + ], + "replay_nodes": [ + "deploy_blocked", + "escalate_to_human", + "generate_patch", + "human_review", + "merge", + "rollback", + "run_tests", + "security_scan_failed", + "test_failure" + ] + }, + "dependency_chain": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "policy_steps": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "recovery_paths": { + "original_edge_count": 1, + "replay_edge_count": 1, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [ + "escalate_to_human", + "rollback" + ], + "replay_nodes": [ + "escalate_to_human", + "rollback" + ] + }, + "tool_call_order": { + "original_edge_count": 0, + 
"replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "tool_calls": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + } + } + }, + { + "fixture_id": "coding_workflow_pr_review_moderate_v1", + "degradation_level": "moderate", + "expected_admissible": false, + "expected_failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS", + "RECOVERY_PATH_INVALID" + ], + "edge_categories": { + "capability_boundaries": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "causal_dependencies": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "dependencies": { + "original_edge_count": 12, + "replay_edge_count": 12, + "missing_edges": [ + [ + "security_scan_failed", + "deploy_blocked" + ], + [ + "test_failure", + "escalate_to_human" + ], + [ + "test_failure", + "rollback" + ] + ], + "added_edges": [ + [ + "run_tests", + "deploy_blocked" + ], + [ + "run_tests", + "escalate_to_human" + ], + [ + "run_tests", + "rollback" + ] + ], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [ + "deploy_blocked", + "escalate_to_human", + "generate_patch", + "human_review", + "merge", + "rollback", + "run_tests", + "security_scan_failed", + "test_failure" + ], + "replay_nodes": [ + "deploy_blocked", + "escalate_to_human", + "generate_patch", + "human_review", + "merge", + "rollback", + "run_tests", + "security_scan_failed", + "test_failure" + ] + }, + "dependency_chain": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": 
[], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "policy_steps": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "recovery_paths": { + "original_edge_count": 1, + "replay_edge_count": 1, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [ + "escalate_to_human", + "rollback" + ], + "replay_nodes": [ + "escalate_to_human", + "rollback" + ] + }, + "tool_call_order": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "tool_calls": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + } + } + }, + { + "fixture_id": "coding_workflow_pr_review_v1", + "degradation_level": "baseline", + "expected_admissible": true, + "expected_failure_labels": [], + "edge_categories": { + "capability_boundaries": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "causal_dependencies": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "dependencies": { + "original_edge_count": 12, + "replay_edge_count": 12, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [ + "deploy_blocked", + "escalate_to_human", + "generate_patch", + "human_review", + "merge", + "rollback", + "run_tests", + "security_scan_failed", + "test_failure" 
+ ], + "replay_nodes": [ + "deploy_blocked", + "escalate_to_human", + "generate_patch", + "human_review", + "merge", + "rollback", + "run_tests", + "security_scan_failed", + "test_failure" + ] + }, + "dependency_chain": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "policy_steps": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "recovery_paths": { + "original_edge_count": 1, + "replay_edge_count": 1, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [ + "escalate_to_human", + "rollback" + ], + "replay_nodes": [ + "escalate_to_human", + "rollback" + ] + }, + "tool_call_order": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "tool_calls": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + } + } + } + ] + }, + { + "family": "cross_domain_operational_dependency_workflow", + "fixtures": [ + { + "fixture_id": "cross_domain_operational_dependency_workflow_degraded_v1", + "degradation_level": "severe", + "expected_admissible": false, + "expected_failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS", + "INVARIANT_VIOLATION", + "POLICY_ORDER_BROKEN", + "RECOVERY_PATH_INVALID" + ], + "edge_categories": { + "capability_boundaries": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "causal_dependencies": { + 
"original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "dependencies": { + "original_edge_count": 7, + "replay_edge_count": 5, + "missing_edges": [ + [ + "deploy_readiness", + "deploy_execute" + ], + [ + "migration_validate", + "human_escalation" + ], + [ + "migration_validate", + "rollback_available" + ] + ], + "added_edges": [ + [ + "deploy_execute", + "security_approval" + ] + ], + "missing_nodes": [ + "human_escalation", + "rollback_available" + ], + "added_nodes": [], + "original_nodes": [ + "code_review_handoff", + "deploy_execute", + "deploy_readiness", + "human_escalation", + "migration_validate", + "rollback_available", + "security_approval" + ], + "replay_nodes": [ + "code_review_handoff", + "deploy_execute", + "deploy_readiness", + "migration_validate", + "security_approval" + ] + }, + "dependency_chain": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "policy_steps": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "recovery_paths": { + "original_edge_count": 1, + "replay_edge_count": 1, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [ + "human_escalation", + "rollback_available" + ], + "replay_nodes": [ + "human_escalation", + "rollback_available" + ] + }, + "tool_call_order": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "tool_calls": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + 
"added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + } + } + }, + { + "fixture_id": "cross_domain_operational_dependency_workflow_mild_v1", + "degradation_level": "mild", + "expected_admissible": false, + "expected_failure_labels": [ + "INVARIANT_VIOLATION", + "RECOVERY_PATH_INVALID" + ], + "edge_categories": { + "capability_boundaries": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "causal_dependencies": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "dependencies": { + "original_edge_count": 7, + "replay_edge_count": 5, + "missing_edges": [ + [ + "migration_validate", + "human_escalation" + ], + [ + "migration_validate", + "rollback_available" + ] + ], + "added_edges": [], + "missing_nodes": [ + "human_escalation", + "rollback_available" + ], + "added_nodes": [], + "original_nodes": [ + "code_review_handoff", + "deploy_execute", + "deploy_readiness", + "human_escalation", + "migration_validate", + "rollback_available", + "security_approval" + ], + "replay_nodes": [ + "code_review_handoff", + "deploy_execute", + "deploy_readiness", + "migration_validate", + "security_approval" + ] + }, + "dependency_chain": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "policy_steps": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "recovery_paths": { + "original_edge_count": 1, + "replay_edge_count": 1, + "missing_edges": [], + 
"added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [ + "human_escalation", + "rollback_available" + ], + "replay_nodes": [ + "human_escalation", + "rollback_available" + ] + }, + "tool_call_order": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "tool_calls": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + } + } + }, + { + "fixture_id": "cross_domain_operational_dependency_workflow_moderate_v1", + "degradation_level": "moderate", + "expected_admissible": false, + "expected_failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS", + "INVARIANT_VIOLATION", + "RECOVERY_PATH_INVALID" + ], + "edge_categories": { + "capability_boundaries": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "causal_dependencies": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "dependencies": { + "original_edge_count": 7, + "replay_edge_count": 5, + "missing_edges": [ + [ + "migration_validate", + "human_escalation" + ], + [ + "migration_validate", + "rollback_available" + ] + ], + "added_edges": [], + "missing_nodes": [ + "human_escalation", + "rollback_available" + ], + "added_nodes": [], + "original_nodes": [ + "code_review_handoff", + "deploy_execute", + "deploy_readiness", + "human_escalation", + "migration_validate", + "rollback_available", + "security_approval" + ], + "replay_nodes": [ + "code_review_handoff", + "deploy_execute", + "deploy_readiness", + "migration_validate", + 
"security_approval" + ] + }, + "dependency_chain": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "policy_steps": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "recovery_paths": { + "original_edge_count": 1, + "replay_edge_count": 1, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [ + "human_escalation", + "rollback_available" + ], + "replay_nodes": [ + "human_escalation", + "rollback_available" + ] + }, + "tool_call_order": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "tool_calls": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + } + } + }, + { + "fixture_id": "cross_domain_operational_dependency_workflow_v1", + "degradation_level": "baseline", + "expected_admissible": true, + "expected_failure_labels": [], + "edge_categories": { + "capability_boundaries": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "causal_dependencies": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "dependencies": { + "original_edge_count": 7, + "replay_edge_count": 7, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + 
"original_nodes": [ + "code_review_handoff", + "deploy_execute", + "deploy_readiness", + "human_escalation", + "migration_validate", + "rollback_available", + "security_approval" + ], + "replay_nodes": [ + "code_review_handoff", + "deploy_execute", + "deploy_readiness", + "human_escalation", + "migration_validate", + "rollback_available", + "security_approval" + ] + }, + "dependency_chain": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "policy_steps": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "recovery_paths": { + "original_edge_count": 1, + "replay_edge_count": 1, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [ + "human_escalation", + "rollback_available" + ], + "replay_nodes": [ + "human_escalation", + "rollback_available" + ] + }, + "tool_call_order": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "tool_calls": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + } + } + } + ] + }, + { + "family": "incident_response_page_triage", + "fixtures": [ + { + "fixture_id": "incident_response_page_triage_degraded_v1", + "degradation_level": "severe", + "expected_admissible": false, + "expected_failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS", + "INVARIANT_VIOLATION", + "POLICY_ORDER_BROKEN", + "RECOVERY_PATH_INVALID" + ], + "edge_categories": { + "capability_boundaries": { + "original_edge_count": 0, + "replay_edge_count": 0, + 
"missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "causal_dependencies": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "dependencies": { + "original_edge_count": 7, + "replay_edge_count": 4, + "missing_edges": [ + [ + "incident_classified", + "root_cause_identified" + ], + [ + "mitigation_started", + "rollback_available" + ], + [ + "root_cause_identified", + "mitigation_started" + ] + ], + "added_edges": [], + "missing_nodes": [ + "rollback_available" + ], + "added_nodes": [], + "original_nodes": [ + "alert_acknowledged", + "alert_received", + "escalate_to_human", + "incident_classified", + "mitigation_started", + "rollback_available", + "root_cause_identified" + ], + "replay_nodes": [ + "alert_acknowledged", + "alert_received", + "escalate_to_human", + "incident_classified", + "mitigation_started", + "root_cause_identified" + ] + }, + "dependency_chain": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "policy_steps": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "recovery_paths": { + "original_edge_count": 1, + "replay_edge_count": 1, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [ + "escalate_to_human", + "rollback_available" + ], + "replay_nodes": [ + "escalate_to_human", + "rollback_available" + ] + }, + "tool_call_order": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + 
"original_nodes": [], + "replay_nodes": [] + }, + "tool_calls": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + } + } + }, + { + "fixture_id": "incident_response_page_triage_mild_v1", + "degradation_level": "mild", + "expected_admissible": false, + "expected_failure_labels": [ + "RECOVERY_PATH_INVALID" + ], + "edge_categories": { + "capability_boundaries": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "causal_dependencies": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "dependencies": { + "original_edge_count": 7, + "replay_edge_count": 7, + "missing_edges": [ + [ + "mitigation_started", + "rollback_available" + ] + ], + "added_edges": [ + [ + "incident_classified", + "rollback_available" + ] + ], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [ + "alert_acknowledged", + "alert_received", + "escalate_to_human", + "incident_classified", + "mitigation_started", + "rollback_available", + "root_cause_identified" + ], + "replay_nodes": [ + "alert_acknowledged", + "alert_received", + "escalate_to_human", + "incident_classified", + "mitigation_started", + "rollback_available", + "root_cause_identified" + ] + }, + "dependency_chain": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "policy_steps": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + 
"replay_nodes": [] + }, + "recovery_paths": { + "original_edge_count": 1, + "replay_edge_count": 1, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [ + "escalate_to_human", + "rollback_available" + ], + "replay_nodes": [ + "escalate_to_human", + "rollback_available" + ] + }, + "tool_call_order": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "tool_calls": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + } + } + }, + { + "fixture_id": "incident_response_page_triage_moderate_v1", + "degradation_level": "moderate", + "expected_admissible": false, + "expected_failure_labels": [ + "RECOVERY_PATH_INVALID" + ], + "edge_categories": { + "capability_boundaries": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "causal_dependencies": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "dependencies": { + "original_edge_count": 7, + "replay_edge_count": 6, + "missing_edges": [ + [ + "mitigation_started", + "rollback_available" + ] + ], + "added_edges": [], + "missing_nodes": [ + "rollback_available" + ], + "added_nodes": [], + "original_nodes": [ + "alert_acknowledged", + "alert_received", + "escalate_to_human", + "incident_classified", + "mitigation_started", + "rollback_available", + "root_cause_identified" + ], + "replay_nodes": [ + "alert_acknowledged", + "alert_received", + "escalate_to_human", + "incident_classified", + 
"mitigation_started", + "root_cause_identified" + ] + }, + "dependency_chain": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "policy_steps": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "recovery_paths": { + "original_edge_count": 1, + "replay_edge_count": 1, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [ + "escalate_to_human", + "rollback_available" + ], + "replay_nodes": [ + "escalate_to_human", + "rollback_available" + ] + }, + "tool_call_order": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "tool_calls": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + } + } + }, + { + "fixture_id": "incident_response_page_triage_v1", + "degradation_level": "baseline", + "expected_admissible": true, + "expected_failure_labels": [], + "edge_categories": { + "capability_boundaries": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "causal_dependencies": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "dependencies": { + "original_edge_count": 7, + "replay_edge_count": 7, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + 
"added_nodes": [], + "original_nodes": [ + "alert_acknowledged", + "alert_received", + "escalate_to_human", + "incident_classified", + "mitigation_started", + "rollback_available", + "root_cause_identified" + ], + "replay_nodes": [ + "alert_acknowledged", + "alert_received", + "escalate_to_human", + "incident_classified", + "mitigation_started", + "rollback_available", + "root_cause_identified" + ] + }, + "dependency_chain": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "policy_steps": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "recovery_paths": { + "original_edge_count": 1, + "replay_edge_count": 1, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [ + "escalate_to_human", + "rollback_available" + ], + "replay_nodes": [ + "escalate_to_human", + "rollback_available" + ] + }, + "tool_call_order": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "tool_calls": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + } + } + } + ] + }, + { + "family": "mcp_trace_replay", + "fixtures": [ + { + "fixture_id": "mcp_trace_replay_degraded_v1", + "degradation_level": "severe", + "expected_admissible": false, + "expected_failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS", + "INVARIANT_VIOLATION", + "RECOVERY_PATH_INVALID" + ], + "edge_categories": { + "capability_boundaries": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": 
[], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "causal_dependencies": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "dependencies": { + "original_edge_count": 10, + "replay_edge_count": 5, + "missing_edges": [ + [ + "capability_scope_checked", + "execute_external_action" + ], + [ + "capability_scope_checked", + "tool_schema_validated" + ], + [ + "capability_scope_checked", + "validate_external_action" + ], + [ + "execute_external_action", + "recovery_path_registered" + ], + [ + "validate_external_action", + "execute_external_action" + ] + ], + "added_edges": [], + "missing_nodes": [ + "recovery_path_registered" + ], + "added_nodes": [], + "original_nodes": [ + "capability_scope_checked", + "execute_external_action", + "read_context", + "recovery_path_registered", + "system_start", + "tool_schema_validated", + "user_request_received", + "validate_external_action", + "verify_result" + ], + "replay_nodes": [ + "capability_scope_checked", + "execute_external_action", + "read_context", + "system_start", + "tool_schema_validated", + "user_request_received", + "validate_external_action", + "verify_result" + ] + }, + "dependency_chain": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "policy_steps": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "recovery_paths": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + 
"tool_call_order": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "tool_calls": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + } + } + }, + { + "fixture_id": "mcp_trace_replay_mild_v1", + "degradation_level": "mild", + "expected_admissible": false, + "expected_failure_labels": [ + "INVARIANT_VIOLATION", + "RECOVERY_PATH_INVALID" + ], + "edge_categories": { + "capability_boundaries": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "causal_dependencies": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "dependencies": { + "original_edge_count": 10, + "replay_edge_count": 9, + "missing_edges": [ + [ + "execute_external_action", + "recovery_path_registered" + ] + ], + "added_edges": [], + "missing_nodes": [ + "recovery_path_registered" + ], + "added_nodes": [], + "original_nodes": [ + "capability_scope_checked", + "execute_external_action", + "read_context", + "recovery_path_registered", + "system_start", + "tool_schema_validated", + "user_request_received", + "validate_external_action", + "verify_result" + ], + "replay_nodes": [ + "capability_scope_checked", + "execute_external_action", + "read_context", + "system_start", + "tool_schema_validated", + "user_request_received", + "validate_external_action", + "verify_result" + ] + }, + "dependency_chain": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": 
[], + "original_nodes": [], + "replay_nodes": [] + }, + "policy_steps": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "recovery_paths": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "tool_call_order": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "tool_calls": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + } + } + }, + { + "fixture_id": "mcp_trace_replay_moderate_v1", + "degradation_level": "moderate", + "expected_admissible": false, + "expected_failure_labels": [ + "CAUSAL_DEPENDENCY_LOSS", + "INVARIANT_VIOLATION" + ], + "edge_categories": { + "capability_boundaries": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "causal_dependencies": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "dependencies": { + "original_edge_count": 10, + "replay_edge_count": 7, + "missing_edges": [ + [ + "capability_scope_checked", + "validate_external_action" + ], + [ + "read_context", + "validate_external_action" + ], + [ + "validate_external_action", + "execute_external_action" + ] + ], + "added_edges": [], + "missing_nodes": [ + "validate_external_action" + ], + "added_nodes": [], + "original_nodes": [ + 
"capability_scope_checked", + "execute_external_action", + "read_context", + "recovery_path_registered", + "system_start", + "tool_schema_validated", + "user_request_received", + "validate_external_action", + "verify_result" + ], + "replay_nodes": [ + "capability_scope_checked", + "execute_external_action", + "read_context", + "recovery_path_registered", + "system_start", + "tool_schema_validated", + "user_request_received", + "verify_result" + ] + }, + "dependency_chain": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "policy_steps": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "recovery_paths": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "tool_call_order": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "tool_calls": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + } + } + }, + { + "fixture_id": "mcp_trace_replay_v1", + "degradation_level": "baseline", + "expected_admissible": true, + "expected_failure_labels": [], + "edge_categories": { + "capability_boundaries": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "causal_dependencies": { + "original_edge_count": 0, + "replay_edge_count": 0, + 
"missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "dependencies": { + "original_edge_count": 10, + "replay_edge_count": 10, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [ + "capability_scope_checked", + "execute_external_action", + "read_context", + "recovery_path_registered", + "system_start", + "tool_schema_validated", + "user_request_received", + "validate_external_action", + "verify_result" + ], + "replay_nodes": [ + "capability_scope_checked", + "execute_external_action", + "read_context", + "recovery_path_registered", + "system_start", + "tool_schema_validated", + "user_request_received", + "validate_external_action", + "verify_result" + ] + }, + "dependency_chain": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "policy_steps": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "recovery_paths": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "tool_call_order": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + }, + "tool_calls": { + "original_edge_count": 0, + "replay_edge_count": 0, + "missing_edges": [], + "added_edges": [], + "missing_nodes": [], + "added_nodes": [], + "original_nodes": [], + "replay_nodes": [] + } + } + } + ] + } + ], + "global_summary": { + "family_count": 4, + "fixture_count": 16, + "total_missing_edges": 32, + 
"total_added_edges": 7, + "deterministic_evaluation": true, + "llm_judges": "none", + "external_apis": "none" + } +} diff --git a/package.json b/package.json index 1a12cb3..cda5d7f 100644 --- a/package.json +++ b/package.json @@ -16,6 +16,7 @@ "generate:multi-family-admissibility": "python scripts/generate_multi_family_admissibility_artifact.py", "generate:multi-family-svg": "python scripts/render_multi_family_admissibility_svg.py", "generate:mcp-trace-replay": "python scripts/generate_mcp_trace_replay_artifact.py", - "generate:replay-semantic-integrity": "python scripts/generate_replay_semantic_integrity_artifact.py" + "generate:replay-semantic-integrity": "python scripts/generate_replay_semantic_integrity_artifact.py", + "generate:graph-diff": "python scripts/generate_graph_diff_artifact.py" } } diff --git a/scripts/generate_graph_diff_artifact.py b/scripts/generate_graph_diff_artifact.py new file mode 100644 index 0000000..b8b4bd8 --- /dev/null +++ b/scripts/generate_graph_diff_artifact.py @@ -0,0 +1,201 @@ +"""Generate deterministic graph-diff artifact from manifest fixtures.""" + +from __future__ import annotations + +import json +import sys +from collections import defaultdict +from pathlib import Path +from typing import Any + +REPO_ROOT = Path(__file__).resolve().parents[1] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from src.comptext_v7.graph import compare_edges, normalize_edges, nodes_from_edges +MANIFEST_PATH = REPO_ROOT / "fixtures" / "manifest.json" +OUTPUT_PATH = REPO_ROOT / "artifacts" / "graph_diff_results.json" + +SUPPORTED_RELATION_KEYS = ( + "causal_dependencies", + "dependencies", + "dependency_chain", + "policy_steps", + "tool_calls", + "tool_call_order", + "recovery_paths", + "capability_boundaries", +) + + +def _load_json(path: Path) -> dict[str, Any]: + return json.loads(path.read_text(encoding="utf-8")) + + +def _discover_payload_files(base_dir: Path) -> list[Path]: + return sorted(path for path in 
base_dir.glob("*.json") if path.is_file()) + + +def _coerce_edge_pair(item: object) -> tuple[str, str] | None: + if not isinstance(item, (list, tuple)) or len(item) != 2: + return None + left, right = item + if not isinstance(left, str) or not isinstance(right, str): + return None + return (left, right) + + +def _coerce_node(item: object) -> str | None: + return item if isinstance(item, str) and item else None + + +def _extract_edges_from_relation_value(value: object) -> list[tuple[str, str]]: + edges: list[tuple[str, str]] = [] + if isinstance(value, dict): + for nested in value.values(): + if isinstance(nested, list): + edges.extend(_extract_edges_from_relation_value(nested)) + return edges + + if not isinstance(value, list): + return edges + + pair_like = [_coerce_edge_pair(item) for item in value] + if value and all(pair is not None for pair in pair_like): + return [pair for pair in pair_like if pair is not None] + + node_list = [_coerce_node(item) for item in value] + if value and all(node is not None for node in node_list): + ordered = [node for node in node_list if node is not None] + return [(ordered[idx], ordered[idx + 1]) for idx in range(len(ordered) - 1)] + + return edges + + +def _collect_relation_values(payload: object, relation_key: str) -> list[object]: + collected: list[object] = [] + if isinstance(payload, dict): + for key, value in payload.items(): + if key == relation_key: + collected.append(value) + collected.extend(_collect_relation_values(value, relation_key)) + elif isinstance(payload, list): + for item in payload: + collected.extend(_collect_relation_values(item, relation_key)) + return collected + + + + +def _collect_dependency_graph_edges(payload: object) -> list[tuple[str, str]]: + edges: list[tuple[str, str]] = [] + if isinstance(payload, dict): + if isinstance(payload.get("edges"), list): + for item in payload["edges"]: + if isinstance(item, dict): + source = item.get("source") + target = item.get("target") + if isinstance(source, str) 
and isinstance(target, str): + edges.append((source, target)) + for value in payload.values(): + edges.extend(_collect_dependency_graph_edges(value)) + elif isinstance(payload, list): + for item in payload: + edges.extend(_collect_dependency_graph_edges(item)) + return edges + +def _extract_edges_from_payloads(payloads: list[dict[str, Any]]) -> dict[str, tuple[tuple[str, str], ...]]: + extracted: dict[str, tuple[tuple[str, str], ...]] = {} + for relation_key in SUPPORTED_RELATION_KEYS: + edges: list[tuple[str, str]] = [] + for payload in payloads: + for relation_value in _collect_relation_values(payload, relation_key): + edges.extend(_extract_edges_from_relation_value(relation_value)) + if relation_key == "dependencies": + for payload in payloads: + edges.extend(_collect_dependency_graph_edges(payload)) + extracted[relation_key] = normalize_edges(edges) + return extracted + + +def generate_graph_diff_artifact(output_path: Path = OUTPUT_PATH) -> Path: + manifest = _load_json(MANIFEST_PATH) + fixtures: list[dict[str, Any]] = manifest["fixtures"] + + by_family: dict[str, list[dict[str, Any]]] = defaultdict(list) + for fixture in fixtures: + by_family[str(fixture["family"])].append(fixture) + + families_payload: list[dict[str, Any]] = [] + fixture_count = 0 + total_missing_edges = 0 + total_added_edges = 0 + + for family in sorted(by_family): + fixture_payloads: list[dict[str, Any]] = [] + for fixture in sorted(by_family[family], key=lambda item: str(item["fixture_id"])): + fixture_root = REPO_ROOT / str(fixture["path"]) + original_payloads = [_load_json(path) for path in _discover_payload_files(fixture_root / "original")] + replay_payloads = [_load_json(path) for path in _discover_payload_files(fixture_root / "reconstructed")] + + original_by_category = _extract_edges_from_payloads(original_payloads) + replay_by_category = _extract_edges_from_payloads(replay_payloads) + + category_payload: dict[str, Any] = {} + for category in sorted(SUPPORTED_RELATION_KEYS): + 
original_edges = original_by_category[category] + replay_edges = replay_by_category[category] + diff = compare_edges(original_edges, replay_edges) + total_missing_edges += len(diff.missing_edges) + total_added_edges += len(diff.added_edges) + category_payload[category] = { + "original_edge_count": len(original_edges), + "replay_edge_count": len(replay_edges), + "missing_edges": [list(edge) for edge in diff.missing_edges], + "added_edges": [list(edge) for edge in diff.added_edges], + "missing_nodes": list(diff.missing_nodes), + "added_nodes": list(diff.added_nodes), + "original_nodes": list(nodes_from_edges(original_edges)), + "replay_nodes": list(nodes_from_edges(replay_edges)), + } + + fixture_payloads.append( + { + "fixture_id": fixture["fixture_id"], + "degradation_level": fixture["degradation_level"], + "expected_admissible": fixture["expected_admissible"], + "expected_failure_labels": fixture["expected_failure_labels"], + "edge_categories": category_payload, + } + ) + fixture_count += 1 + + families_payload.append({"family": family, "fixtures": fixture_payloads}) + + artifact = { + "artifact_id": "graph_diff_results_v1", + "generated_by": "GraphDiffArtifactGenerator", + "version": "1.0", + "evaluation_mode": "deterministic", + "llm_judges": "none", + "external_apis": "none", + "families": families_payload, + "global_summary": { + "family_count": len(families_payload), + "fixture_count": fixture_count, + "total_missing_edges": total_missing_edges, + "total_added_edges": total_added_edges, + "deterministic_evaluation": True, + "llm_judges": "none", + "external_apis": "none", + }, + } + + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(artifact, indent=2) + "\n", encoding="utf-8") + return output_path + + +if __name__ == "__main__": + path = generate_graph_diff_artifact() + print(path.relative_to(REPO_ROOT).as_posix()) diff --git a/tests/test_graph_diff_artifact.py b/tests/test_graph_diff_artifact.py new file mode 100644 
index 0000000..5e54d04
--- /dev/null
+++ b/tests/test_graph_diff_artifact.py
@@ -0,0 +1,122 @@
+from __future__ import annotations
+
+import json
+from collections.abc import Iterator
+from pathlib import Path
+
+from scripts.generate_graph_diff_artifact import generate_graph_diff_artifact
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+ARTIFACT_PATH = REPO_ROOT / "artifacts" / "graph_diff_results.json"
+MANIFEST_PATH = REPO_ROOT / "fixtures" / "manifest.json"
+
+
+def _load_json(path: Path) -> dict:
+    """Read ``path`` as UTF-8 text and return the parsed JSON document."""
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def _iter_strings(node: object) -> Iterator[str]:
+    """Yield every dict key and string value found anywhere inside ``node``."""
+    if isinstance(node, str):
+        yield node
+    elif isinstance(node, dict):
+        for key, value in node.items():
+            yield str(key)
+            yield from _iter_strings(value)
+    elif isinstance(node, list):
+        for item in node:
+            yield from _iter_strings(item)
+
+
+def test_artifact_exists() -> None:
+    assert ARTIFACT_PATH.exists()
+
+
+def test_generator_output_matches_committed_artifact(tmp_path: Path) -> None:
+    output = tmp_path / "graph_diff_results.json"
+    generate_graph_diff_artifact(output)
+    assert output.read_text(encoding="utf-8") == ARTIFACT_PATH.read_text(encoding="utf-8")
+
+
+def test_top_level_schema_is_stable() -> None:
+    artifact = _load_json(ARTIFACT_PATH)
+    assert list(artifact) == [
+        "artifact_id",
+        "generated_by",
+        "version",
+        "evaluation_mode",
+        "llm_judges",
+        "external_apis",
+        "families",
+        "global_summary",
+    ]
+
+
+def test_artifact_deterministic_and_sanitized(tmp_path: Path) -> None:
+    first = tmp_path / "a.json"
+    second = tmp_path / "b.json"
+    generate_graph_diff_artifact(first)
+    generate_graph_diff_artifact(second)
+
+    first_text = first.read_text(encoding="utf-8")
+    second_text = second.read_text(encoding="utf-8")
+    assert first_text == second_text
+
+    # Check decoded keys and values, not the serialized text: JSON escapes
+    # backslashes, so a Windows-style home path could never match as a raw
+    # substring of the serialized document.
+    home = str(Path.home())
+    for text in _iter_strings(json.loads(first_text)):
+        assert "timestamp" not in text.lower()
+        assert home not in text
+        assert "/workspace/" not in text
+
+
+def test_manifest_alignment_and_fixture_ids() -> None:
+    manifest = _load_json(MANIFEST_PATH)
+    artifact = _load_json(ARTIFACT_PATH)
+
+    manifest_fixtures = manifest["fixtures"]
+    manifest_families = {item["family"] for item in manifest_fixtures}
+    manifest_ids = [item["fixture_id"] for item in manifest_fixtures]
+
+    artifact_families = [family["family"] for family in artifact["families"]]
+    artifact_fixtures = [fixture for family in artifact["families"] for fixture in family["fixtures"]]
+    artifact_ids = [fixture["fixture_id"] for fixture in artifact_fixtures]
+
+    assert artifact["global_summary"]["family_count"] == len(manifest_families)
+    assert artifact["global_summary"]["fixture_count"] == len(manifest_fixtures)
+    # Counts alone would not notice a renamed family, so compare the names too.
+    assert sorted(artifact_families) == sorted(manifest_families)
+    assert sorted(artifact_ids) == sorted(manifest_ids)
+
+
+def test_graph_diff_evidence_present_and_baseline_stable() -> None:
+    artifact = _load_json(ARTIFACT_PATH)
+    fixtures = [fixture for family in artifact["families"] for fixture in family["fixtures"]]
+
+    assert any(
+        category["missing_edges"] or category["missing_nodes"]
+        for fixture in fixtures
+        for category in fixture["edge_categories"].values()
+    )
+
+    baseline = [fixture for fixture in fixtures if fixture["degradation_level"] == "baseline"]
+    for fixture in baseline:
+        for category in fixture["edge_categories"].values():
+            assert category["missing_edges"] == []
+
+
+def test_failure_labels_manifest_scoped() -> None:
+    manifest = _load_json(MANIFEST_PATH)
+    artifact = _load_json(ARTIFACT_PATH)
+
+    expected_by_fixture = {
+        fixture["fixture_id"]: fixture["expected_failure_labels"]
+        for fixture in manifest["fixtures"]
+    }
+
+    for family in artifact["families"]:
+        for fixture in family["fixtures"]:
+            assert fixture["expected_failure_labels"] == expected_by_fixture[fixture["fixture_id"]]