diff --git a/artifacts/tool_ordering_replay_results.json b/artifacts/tool_ordering_replay_results.json
new file mode 100644
index 0000000..aa1fa6c
--- /dev/null
+++ b/artifacts/tool_ordering_replay_results.json
@@ -0,0 +1,371 @@
+{
+  "artifact_id": "tool_ordering_replay_results_v1",
+  "generated_by": "ToolOrderingReplayArtifactGenerator",
+  "version": "1.0",
+  "evaluation_mode": "deterministic",
+  "llm_judges": "none",
+  "external_apis": "none",
+  "families": [
+    {
+      "family": "coding_workflow_pr_review",
+      "fixtures": [
+        {
+          "fixture_id": "coding_workflow_pr_review_degraded_v1",
+          "degradation_level": "severe",
+          "expected_admissible": false,
+          "expected_failure_labels": [
+            "CAUSAL_DEPENDENCY_LOSS",
+            "INVARIANT_VIOLATION",
+            "POLICY_ORDER_BROKEN",
+            "RECOVERY_PATH_INVALID"
+          ],
+          "tool_ordering": {
+            "original_edge_count": 0,
+            "replay_edge_count": 0,
+            "missing_edges": [],
+            "added_edges": [],
+            "original_node_count": 0,
+            "replay_node_count": 0,
+            "missing_nodes": [],
+            "added_nodes": [],
+            "required_before_violations": [],
+            "drift_detected": false
+          }
+        },
+        {
+          "fixture_id": "coding_workflow_pr_review_mild_v1",
+          "degradation_level": "mild",
+          "expected_admissible": false,
+          "expected_failure_labels": [
+            "RECOVERY_PATH_INVALID"
+          ],
+          "tool_ordering": {
+            "original_edge_count": 0,
+            "replay_edge_count": 0,
+            "missing_edges": [],
+            "added_edges": [],
+            "original_node_count": 0,
+            "replay_node_count": 0,
+            "missing_nodes": [],
+            "added_nodes": [],
+            "required_before_violations": [],
+            "drift_detected": false
+          }
+        },
+        {
+          "fixture_id": "coding_workflow_pr_review_moderate_v1",
+          "degradation_level": "moderate",
+          "expected_admissible": false,
+          "expected_failure_labels": [
+            "CAUSAL_DEPENDENCY_LOSS",
+            "RECOVERY_PATH_INVALID"
+          ],
+          "tool_ordering": {
+            "original_edge_count": 0,
+            "replay_edge_count": 0,
+            "missing_edges": [],
+            "added_edges": [],
+            "original_node_count": 0,
+            "replay_node_count": 0,
+            "missing_nodes": [],
+            "added_nodes": [],
+            "required_before_violations": [],
+            "drift_detected": false
+          }
+        },
+        {
+          "fixture_id": "coding_workflow_pr_review_v1",
+          "degradation_level": "baseline",
+          "expected_admissible": true,
+          "expected_failure_labels": [],
+          "tool_ordering": {
+            "original_edge_count": 0,
+            "replay_edge_count": 0,
+            "missing_edges": [],
+            "added_edges": [],
+            "original_node_count": 0,
+            "replay_node_count": 0,
+            "missing_nodes": [],
+            "added_nodes": [],
+            "required_before_violations": [],
+            "drift_detected": false
+          }
+        }
+      ]
+    },
+    {
+      "family": "cross_domain_operational_dependency_workflow",
+      "fixtures": [
+        {
+          "fixture_id": "cross_domain_operational_dependency_workflow_degraded_v1",
+          "degradation_level": "severe",
+          "expected_admissible": false,
+          "expected_failure_labels": [
+            "CAUSAL_DEPENDENCY_LOSS",
+            "INVARIANT_VIOLATION",
+            "POLICY_ORDER_BROKEN",
+            "RECOVERY_PATH_INVALID"
+          ],
+          "tool_ordering": {
+            "original_edge_count": 0,
+            "replay_edge_count": 0,
+            "missing_edges": [],
+            "added_edges": [],
+            "original_node_count": 0,
+            "replay_node_count": 0,
+            "missing_nodes": [],
+            "added_nodes": [],
+            "required_before_violations": [],
+            "drift_detected": false
+          }
+        },
+        {
+          "fixture_id": "cross_domain_operational_dependency_workflow_mild_v1",
+          "degradation_level": "mild",
+          "expected_admissible": false,
+          "expected_failure_labels": [
+            "INVARIANT_VIOLATION",
+            "RECOVERY_PATH_INVALID"
+          ],
+          "tool_ordering": {
+            "original_edge_count": 0,
+            "replay_edge_count": 0,
+            "missing_edges": [],
+            "added_edges": [],
+            "original_node_count": 0,
+            "replay_node_count": 0,
+            "missing_nodes": [],
+            "added_nodes": [],
+            "required_before_violations": [],
+            "drift_detected": false
+          }
+        },
+        {
+          "fixture_id": "cross_domain_operational_dependency_workflow_moderate_v1",
+          "degradation_level": "moderate",
+          "expected_admissible": false,
+          "expected_failure_labels": [
+            "CAUSAL_DEPENDENCY_LOSS",
+            "INVARIANT_VIOLATION",
+            "RECOVERY_PATH_INVALID"
+          ],
+          "tool_ordering": {
+            "original_edge_count": 0,
+            "replay_edge_count": 0,
+            "missing_edges": [],
+            "added_edges": [],
+            "original_node_count": 0,
+            "replay_node_count": 0,
+            "missing_nodes": [],
+            "added_nodes": [],
+            "required_before_violations": [],
+            "drift_detected": false
+          }
+        },
+        {
+          "fixture_id": "cross_domain_operational_dependency_workflow_v1",
+          "degradation_level": "baseline",
+          "expected_admissible": true,
+          "expected_failure_labels": [],
+          "tool_ordering": {
+            "original_edge_count": 0,
+            "replay_edge_count": 0,
+            "missing_edges": [],
+            "added_edges": [],
+            "original_node_count": 0,
+            "replay_node_count": 0,
+            "missing_nodes": [],
+            "added_nodes": [],
+            "required_before_violations": [],
+            "drift_detected": false
+          }
+        }
+      ]
+    },
+    {
+      "family": "incident_response_page_triage",
+      "fixtures": [
+        {
+          "fixture_id": "incident_response_page_triage_degraded_v1",
+          "degradation_level": "severe",
+          "expected_admissible": false,
+          "expected_failure_labels": [
+            "CAUSAL_DEPENDENCY_LOSS",
+            "INVARIANT_VIOLATION",
+            "POLICY_ORDER_BROKEN",
+            "RECOVERY_PATH_INVALID"
+          ],
+          "tool_ordering": {
+            "original_edge_count": 0,
+            "replay_edge_count": 0,
+            "missing_edges": [],
+            "added_edges": [],
+            "original_node_count": 0,
+            "replay_node_count": 0,
+            "missing_nodes": [],
+            "added_nodes": [],
+            "required_before_violations": [],
+            "drift_detected": false
+          }
+        },
+        {
+          "fixture_id": "incident_response_page_triage_mild_v1",
+          "degradation_level": "mild",
+          "expected_admissible": false,
+          "expected_failure_labels": [
+            "RECOVERY_PATH_INVALID"
+          ],
+          "tool_ordering": {
+            "original_edge_count": 0,
+            "replay_edge_count": 0,
+            "missing_edges": [],
+            "added_edges": [],
+            "original_node_count": 0,
+            "replay_node_count": 0,
+            "missing_nodes": [],
+            "added_nodes": [],
+            "required_before_violations": [],
+            "drift_detected": false
+          }
+        },
+        {
+          "fixture_id": "incident_response_page_triage_moderate_v1",
+          "degradation_level": "moderate",
+          "expected_admissible": false,
+          "expected_failure_labels": [
+            "RECOVERY_PATH_INVALID"
+          ],
+          "tool_ordering": {
+            "original_edge_count": 0,
+            "replay_edge_count": 0,
+            "missing_edges": [],
+            "added_edges": [],
+            "original_node_count": 0,
+            "replay_node_count": 0,
+            "missing_nodes": [],
+            "added_nodes": [],
+            "required_before_violations": [],
+            "drift_detected": false
+          }
+        },
+        {
+          "fixture_id": "incident_response_page_triage_v1",
+          "degradation_level": "baseline",
+          "expected_admissible": true,
+          "expected_failure_labels": [],
+          "tool_ordering": {
+            "original_edge_count": 0,
+            "replay_edge_count": 0,
+            "missing_edges": [],
+            "added_edges": [],
+            "original_node_count": 0,
+            "replay_node_count": 0,
+            "missing_nodes": [],
+            "added_nodes": [],
+            "required_before_violations": [],
+            "drift_detected": false
+          }
+        }
+      ]
+    },
+    {
+      "family": "mcp_trace_replay",
+      "fixtures": [
+        {
+          "fixture_id": "mcp_trace_replay_degraded_v1",
+          "degradation_level": "severe",
+          "expected_admissible": false,
+          "expected_failure_labels": [
+            "CAUSAL_DEPENDENCY_LOSS",
+            "INVARIANT_VIOLATION",
+            "RECOVERY_PATH_INVALID"
+          ],
+          "tool_ordering": {
+            "original_edge_count": 0,
+            "replay_edge_count": 0,
+            "missing_edges": [],
+            "added_edges": [],
+            "original_node_count": 0,
+            "replay_node_count": 0,
+            "missing_nodes": [],
+            "added_nodes": [],
+            "required_before_violations": [],
+            "drift_detected": false
+          }
+        },
+        {
+          "fixture_id": "mcp_trace_replay_mild_v1",
+          "degradation_level": "mild",
+          "expected_admissible": false,
+          "expected_failure_labels": [
+            "INVARIANT_VIOLATION",
+            "RECOVERY_PATH_INVALID"
+          ],
+          "tool_ordering": {
+            "original_edge_count": 0,
+            "replay_edge_count": 0,
+            "missing_edges": [],
+            "added_edges": [],
+            "original_node_count": 0,
+            "replay_node_count": 0,
+            "missing_nodes": [],
+            "added_nodes": [],
+            "required_before_violations": [],
+            "drift_detected": false
+          }
+        },
+        {
+          "fixture_id": "mcp_trace_replay_moderate_v1",
+          "degradation_level": "moderate",
+          "expected_admissible": false,
+          "expected_failure_labels": [
+            "CAUSAL_DEPENDENCY_LOSS",
+            "INVARIANT_VIOLATION"
+          ],
+          "tool_ordering": {
+            "original_edge_count": 0,
+            "replay_edge_count": 0,
+            "missing_edges": [],
+            "added_edges": [],
+            "original_node_count": 0,
+            "replay_node_count": 0,
+            "missing_nodes": [],
+            "added_nodes": [],
+            "required_before_violations": [],
+            "drift_detected": false
+          }
+        },
+        {
+          "fixture_id": "mcp_trace_replay_v1",
+          "degradation_level": "baseline",
+          "expected_admissible": true,
+          "expected_failure_labels": [],
+          "tool_ordering": {
+            "original_edge_count": 0,
+            "replay_edge_count": 0,
+            "missing_edges": [],
+            "added_edges": [],
+            "original_node_count": 0,
+            "replay_node_count": 0,
+            "missing_nodes": [],
+            "added_nodes": [],
+            "required_before_violations": [],
+            "drift_detected": false
+          }
+        }
+      ]
+    }
+  ],
+  "global_summary": {
+    "family_count": 4,
+    "fixture_count": 16,
+    "fixtures_with_tool_ordering_data": 0,
+    "fixtures_with_tool_ordering_drift": 0,
+    "total_missing_tool_order_edges": 0,
+    "total_added_tool_order_edges": 0,
+    "total_required_before_violations": 0,
+    "deterministic_evaluation": true,
+    "llm_judges": "none",
+    "external_apis": "none"
+  }
+}
diff --git a/scripts/generate_tool_ordering_replay_artifact.py b/scripts/generate_tool_ordering_replay_artifact.py
new file mode 100644
index 0000000..d8a0b29
--- /dev/null
+++ b/scripts/generate_tool_ordering_replay_artifact.py
@@ -0,0 +1,273 @@
+"""Generate deterministic tool-ordering replay artifact from manifest fixtures."""
+
+from __future__ import annotations
+
+import json
+import sys
+from collections import defaultdict
+from pathlib import Path
+from typing import Any
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+if str(REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(REPO_ROOT))
+
+from src.comptext_v7.graph import compare_edges, find_order_violations, nodes_from_edges, normalize_edges
+
+MANIFEST_PATH = REPO_ROOT / "fixtures" / "manifest.json"
+OUTPUT_PATH = REPO_ROOT / "artifacts" / "tool_ordering_replay_results.json"
+
+# Payload keys whose list values are read as an ordered sequence of steps.
+ORDERING_KEYS = {
+    "tool_calls",
+    "tool_call_order",
+    "tools",
+    "actions",
+    "action_sequence",
+    "ordered_steps",
+    "policy_steps",
+}
+# Payload keys whose values declare "left must come before right" constraints.
+REQUIRED_BEFORE_KEYS = {
+    "required_before",
+    "before",
+    "must_precede",
+    "validation_before_unsafe_action",
+}
+# Keys probed, in priority order, to name a step that is given as an object.
+IDENTIFIER_KEYS = ("id", "name", "tool", "action", "step")
+
+
+def _load_json(path: Path) -> dict[str, Any]:
+    """Read ``path`` as UTF-8 and return the parsed JSON document."""
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def _discover_payload_files(base_dir: Path) -> list[Path]:
+    """Return the ``*.json`` files directly inside ``base_dir`` in sorted order."""
+    return sorted(path for path in base_dir.glob("*.json") if path.is_file())
+
+
+def _id_from_obj(item: object) -> str | None:
+    """Return the first non-empty string found under ``IDENTIFIER_KEYS``, else ``None``."""
+    if not isinstance(item, dict):
+        return None
+    for key in IDENTIFIER_KEYS:
+        value = item.get(key)
+        if isinstance(value, str) and value:
+            return value
+    return None
+
+
+def _pair(item: object) -> tuple[str, str] | None:
+    """Return ``item`` as a ``(left, right)`` tuple if it is two non-empty strings."""
+    if isinstance(item, (list, tuple)) and len(item) == 2 and all(isinstance(v, str) and v for v in item):
+        return (item[0], item[1])
+    return None
+
+
+def _edges_from_ordered_list(value: list[object]) -> list[tuple[str, str]]:
+    """Turn an ordered list of steps into edges between consecutive steps.
+
+    Items may be non-empty strings or objects named via ``_id_from_obj``. If any
+    item cannot be named, the whole list is ignored and ``[]`` is returned.
+    """
+    ordered: list[str] = []
+    for item in value:
+        if isinstance(item, str) and item:
+            ordered.append(item)
+            continue
+        obj_id = _id_from_obj(item)
+        if obj_id is not None:
+            ordered.append(obj_id)
+            continue
+        return []
+    return [(ordered[i], ordered[i + 1]) for i in range(len(ordered) - 1)]
+
+
+def _edges_from_required_before(value: object) -> list[tuple[str, str]]:
+    """Parse a required-before declaration into ``(left, right)`` pairs.
+
+    Accepted shapes: a single two-string pair, a non-empty list made only of such
+    pairs, or a dict mapping a string to a string or to a list of strings. Any
+    other shape yields ``[]``.
+    """
+    edges: list[tuple[str, str]] = []
+
+    one_pair = _pair(value)
+    if one_pair is not None:
+        return [one_pair]
+
+    if isinstance(value, list):
+        pair_list = [_pair(item) for item in value]
+        if value and all(item is not None for item in pair_list):
+            return [item for item in pair_list if item is not None]
+
+    if isinstance(value, dict):
+        for left, right in value.items():
+            if not isinstance(left, str) or not left:
+                continue
+            if isinstance(right, str) and right:
+                edges.append((left, right))
+            elif isinstance(right, list) and all(isinstance(v, str) and v for v in right):
+                edges.extend((left, v) for v in right)
+    return edges
+
+
+def _walk(payload: object, edges: list[tuple[str, str]], required_before: list[tuple[str, str]]) -> None:
+    """Recursively collect ordering edges and required-before pairs from ``payload``.
+
+    Results are appended to ``edges`` and ``required_before`` in traversal order.
+    """
+    if isinstance(payload, dict):
+        for key, value in payload.items():
+            if key in ORDERING_KEYS and isinstance(value, list):
+                edges.extend(_edges_from_ordered_list(value))
+            if key in REQUIRED_BEFORE_KEYS:
+                required_before.extend(_edges_from_required_before(value))
+            _walk(value, edges, required_before)
+    elif isinstance(payload, list):
+        for item in payload:
+            _walk(item, edges, required_before)
+
+
+def _extract_tool_ordering(payloads: list[dict[str, Any]]) -> tuple[tuple[tuple[str, str], ...], tuple[tuple[str, str], ...]]:
+    """Return ``(ordering_edges, required_before_pairs)`` for ``payloads``, both passed through ``normalize_edges``."""
+    edges: list[tuple[str, str]] = []
+    required_before: list[tuple[str, str]] = []
+    for payload in payloads:
+        _walk(payload, edges, required_before)
+    return normalize_edges(edges), normalize_edges(required_before)
+
+
+def _extract_sequence(payloads: list[dict[str, Any]]) -> tuple[str, ...]:
+    """Return the step names that take part in ordering edges, in first-appearance order.
+
+    The result is checked against required-before constraints, so it must keep
+    the order in which steps occur in the payloads; sorting the names would
+    instead test the constraints against alphabetical order.
+    NOTE(review): assumes ``find_order_violations`` compares positions within
+    the sequence it receives -- confirm against ``src.comptext_v7.graph``.
+    """
+    # dict keys give an insertion-ordered, duplicate-free collection.
+    first_seen: dict[str, None] = {}
+    for payload in payloads:
+        edges: list[tuple[str, str]] = []
+        required: list[tuple[str, str]] = []
+        _walk(payload, edges, required)
+        for left, right in edges:
+            first_seen.setdefault(left, None)
+            first_seen.setdefault(right, None)
+    return tuple(first_seen)
+
+
+def generate_tool_ordering_replay_artifact(output_path: Path = OUTPUT_PATH) -> Path:
+    """Compare original and reconstructed tool ordering per fixture and write the JSON artifact.
+
+    Fixtures come from ``MANIFEST_PATH``; for each one, the ``*.json`` payloads in
+    its ``original`` and ``reconstructed`` directories are compared. Families and
+    fixtures are emitted in sorted order.
+
+    Args:
+        output_path: Destination file; parent directories are created if needed.
+
+    Returns:
+        ``output_path``.
+    """
+    manifest = _load_json(MANIFEST_PATH)
+    fixtures: list[dict[str, Any]] = manifest["fixtures"]
+
+    by_family: dict[str, list[dict[str, Any]]] = defaultdict(list)
+    for fixture in fixtures:
+        by_family[str(fixture["family"])].append(fixture)
+
+    families_payload: list[dict[str, Any]] = []
+    fixture_count = 0
+    fixtures_with_tool_ordering_data = 0
+    fixtures_with_tool_ordering_drift = 0
+    total_missing_tool_order_edges = 0
+    total_added_tool_order_edges = 0
+    total_required_before_violations = 0
+
+    for family in sorted(by_family):
+        fixture_payloads: list[dict[str, Any]] = []
+        for fixture in sorted(by_family[family], key=lambda item: str(item["fixture_id"])):
+            fixture_root = REPO_ROOT / str(fixture["path"])
+            original_payloads = [_load_json(path) for path in _discover_payload_files(fixture_root / "original")]
+            replay_payloads = [_load_json(path) for path in _discover_payload_files(fixture_root / "reconstructed")]
+
+            # Constraints are taken from the original only; the replay's own are discarded.
+            original_edges, original_required = _extract_tool_ordering(original_payloads)
+            replay_edges, _ = _extract_tool_ordering(replay_payloads)
+            replay_sequence = _extract_sequence(replay_payloads)
+
+            diff = compare_edges(original_edges, replay_edges)
+            original_nodes = nodes_from_edges(original_edges) if original_edges else tuple()
+            replay_nodes = nodes_from_edges(replay_edges) if replay_edges else tuple()
+            missing_nodes = tuple(sorted(set(original_nodes) - set(replay_nodes)))
+            added_nodes = tuple(sorted(set(replay_nodes) - set(original_nodes)))
+            violations = find_order_violations(replay_sequence, original_required)
+
+            if original_edges or replay_edges or original_required:
+                fixtures_with_tool_ordering_data += 1
+
+            drift_detected = bool(diff.missing_edges or diff.added_edges or missing_nodes or added_nodes or violations)
+            if drift_detected:
+                fixtures_with_tool_ordering_drift += 1
+
+            total_missing_tool_order_edges += len(diff.missing_edges)
+            total_added_tool_order_edges += len(diff.added_edges)
+            total_required_before_violations += len(violations)
+
+            fixture_payloads.append({
+                "fixture_id": fixture["fixture_id"],
+                "degradation_level": fixture["degradation_level"],
+                "expected_admissible": fixture["expected_admissible"],
+                "expected_failure_labels": fixture["expected_failure_labels"],
+                "tool_ordering": {
+                    "original_edge_count": len(original_edges),
+                    "replay_edge_count": len(replay_edges),
+                    "missing_edges": [list(edge) for edge in diff.missing_edges],
+                    "added_edges": [list(edge) for edge in diff.added_edges],
+                    "original_node_count": len(original_nodes),
+                    "replay_node_count": len(replay_nodes),
+                    "missing_nodes": list(missing_nodes),
+                    "added_nodes": list(added_nodes),
+                    "required_before_violations": [list(edge) for edge in violations],
+                    "drift_detected": drift_detected,
+                },
+            })
+            fixture_count += 1
+
+        families_payload.append({"family": family, "fixtures": fixture_payloads})
+
+    artifact = {
+        "artifact_id": "tool_ordering_replay_results_v1",
+        "generated_by": "ToolOrderingReplayArtifactGenerator",
+        "version": "1.0",
+        "evaluation_mode": "deterministic",
+        "llm_judges": "none",
+        "external_apis": "none",
+        "families": families_payload,
+        "global_summary": {
+            "family_count": len(families_payload),
+            "fixture_count": fixture_count,
+            "fixtures_with_tool_ordering_data": fixtures_with_tool_ordering_data,
+            "fixtures_with_tool_ordering_drift": fixtures_with_tool_ordering_drift,
+            "total_missing_tool_order_edges": total_missing_tool_order_edges,
+            "total_added_tool_order_edges": total_added_tool_order_edges,
+            "total_required_before_violations": total_required_before_violations,
+            "deterministic_evaluation": True,
+            "llm_judges": "none",
+            "external_apis": "none",
+        },
+    }
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(json.dumps(artifact, indent=2) + "\n", encoding="utf-8")
+    return output_path
+
+
+if __name__ == "__main__":
+    path = generate_tool_ordering_replay_artifact()
+    print(path.relative_to(REPO_ROOT).as_posix())
diff --git a/tests/test_tool_ordering_replay_artifact.py b/tests/test_tool_ordering_replay_artifact.py
new file mode 100644
index 0000000..4605851
--- /dev/null
+++ b/tests/test_tool_ordering_replay_artifact.py
@@ -0,0 +1,154 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from scripts.generate_tool_ordering_replay_artifact import (
+    _extract_sequence,
+    generate_tool_ordering_replay_artifact,
+)
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+ARTIFACT_PATH = REPO_ROOT / "artifacts" / "tool_ordering_replay_results.json"
+MANIFEST_PATH = REPO_ROOT / "fixtures" / "manifest.json"
+ALLOWED_FAILURE_LABELS = {
+    "INVARIANT_VIOLATION",
+    "CAUSAL_DEPENDENCY_LOSS",
+    "RECOVERY_PATH_INVALID",
+    "POLICY_ORDER_BROKEN",
+}
+
+
+def _load_json(path: Path) -> dict:
+    """Read ``path`` as UTF-8 and return the parsed JSON document."""
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def test_artifact_exists() -> None:
+    """The committed artifact file is present."""
+    assert ARTIFACT_PATH.exists()
+
+
+def test_generator_output_matches_committed_artifact(tmp_path: Path) -> None:
+    """Regenerating the artifact reproduces the committed file byte for byte."""
+    output = tmp_path / "tool_ordering_replay_results.json"
+    generate_tool_ordering_replay_artifact(output)
+    assert output.read_text(encoding="utf-8") == ARTIFACT_PATH.read_text(encoding="utf-8")
+
+
+def test_top_level_schema_is_stable() -> None:
+    """Top-level keys appear in the expected order."""
+    artifact = _load_json(ARTIFACT_PATH)
+    assert list(artifact) == [
+        "artifact_id",
+        "generated_by",
+        "version",
+        "evaluation_mode",
+        "llm_judges",
+        "external_apis",
+        "families",
+        "global_summary",
+    ]
+
+
+def test_determinism_and_sanitization(tmp_path: Path) -> None:
+    """Two runs produce identical text that avoids the forbidden substrings checked below."""
+    a_path = tmp_path / "a.json"
+    b_path = tmp_path / "b.json"
+    generate_tool_ordering_replay_artifact(a_path)
+    generate_tool_ordering_replay_artifact(b_path)
+
+    a_text = a_path.read_text(encoding="utf-8")
+    b_text = b_path.read_text(encoding="utf-8")
+    assert a_text == b_text
+
+    blob = a_text.lower()
+    assert "timestamp" not in blob
+    assert "generated_at" not in blob
+    assert "environment" not in blob
+    assert "hostname" not in blob
+    assert "cwd" not in blob
+    assert "score" not in blob
+    assert "average" not in blob
+    assert "/workspace/" not in a_text
+    assert str(Path.home()) not in a_text
+
+
+def test_manifest_alignment() -> None:
+    """Family count, fixture count and fixture ids match the manifest."""
+    manifest = _load_json(MANIFEST_PATH)
+    artifact = _load_json(ARTIFACT_PATH)
+
+    manifest_fixtures = manifest["fixtures"]
+    manifest_family_count = len({item["family"] for item in manifest_fixtures})
+    manifest_fixture_ids = sorted(item["fixture_id"] for item in manifest_fixtures)
+
+    artifact_fixtures = [fixture for family in artifact["families"] for fixture in family["fixtures"]]
+    artifact_fixture_ids = sorted(fixture["fixture_id"] for fixture in artifact_fixtures)
+
+    assert artifact["global_summary"]["family_count"] == manifest_family_count
+    assert artifact["global_summary"]["fixture_count"] == len(manifest_fixtures)
+    assert artifact_fixture_ids == manifest_fixture_ids
+
+
+def test_tool_ordering_evidence_behavior() -> None:
+    """The global summary agrees with the per-fixture tool-ordering records."""
+    artifact = _load_json(ARTIFACT_PATH)
+    summary = artifact["global_summary"]
+    orderings = [fixture["tool_ordering"] for family in artifact["families"] for fixture in family["fixtures"]]
+
+    # The generator also counts fixtures that only carry required-before
+    # constraints, which the artifact does not expose, so the edge-based count
+    # is a lower bound rather than an exact match.
+    with_edges = sum(1 for item in orderings if item["original_edge_count"] > 0 or item["replay_edge_count"] > 0)
+    assert with_edges <= summary["fixtures_with_tool_ordering_data"] <= summary["fixture_count"]
+
+    assert summary["fixtures_with_tool_ordering_drift"] == sum(1 for item in orderings if item["drift_detected"])
+    assert summary["total_missing_tool_order_edges"] == sum(len(item["missing_edges"]) for item in orderings)
+    assert summary["total_added_tool_order_edges"] == sum(len(item["added_edges"]) for item in orderings)
+    assert summary["total_required_before_violations"] == sum(
+        len(item["required_before_violations"]) for item in orderings
+    )
+
+    # drift_detected must be exactly "any difference was recorded".
+    for item in orderings:
+        recorded_difference = bool(
+            item["missing_edges"]
+            or item["added_edges"]
+            or item["missing_nodes"]
+            or item["added_nodes"]
+            or item["required_before_violations"]
+        )
+        assert item["drift_detected"] is recorded_difference
+
+
+def test_replay_sequence_preserves_observed_order() -> None:
+    """Step names come back in payload order, not sorted."""
+    payloads = [{"tool_calls": ["validate", "deploy", "audit", "deploy"]}]
+    assert _extract_sequence(payloads) == ("validate", "deploy", "audit")
+
+
+def test_label_discipline() -> None:
+    """Labels equal the manifest's and stay within the allowed set."""
+    manifest = _load_json(MANIFEST_PATH)
+    artifact = _load_json(ARTIFACT_PATH)
+
+    expected_by_fixture = {fixture["fixture_id"]: fixture["expected_failure_labels"] for fixture in manifest["fixtures"]}
+
+    for family in artifact["families"]:
+        for fixture in family["fixtures"]:
+            labels = fixture["expected_failure_labels"]
+            assert labels == expected_by_fixture[fixture["fixture_id"]]
+            for label in labels:
+                assert label in ALLOWED_FAILURE_LABELS
+
+
+def test_no_runtime_behavior_fields() -> None:
+    """The artifact declares deterministic evaluation with no LLM judges or external APIs."""
+    artifact = _load_json(ARTIFACT_PATH)
+    assert artifact["evaluation_mode"] == "deterministic"
+    assert artifact["llm_judges"] == "none"
+    assert artifact["external_apis"] == "none"
+    assert artifact["global_summary"]["deterministic_evaluation"] is True
+    assert artifact["global_summary"]["llm_judges"] == "none"
+    assert artifact["global_summary"]["external_apis"] == "none"