Harden MCP capability boundary fixtures#155
Conversation
There was a problem hiding this comment.
Code Review
This pull request introduces a more granular failure taxonomy for contract validation, replacing generic labels like CAUSAL_DEPENDENCY_LOSS and INVARIANT_VIOLATION with specific ones such as APPROVAL_GATE_LOSS, CAPABILITY_BOUNDARY_LOSS, and POLICY_ENFORCEMENT_GAP. The changes span across artifact results, test fixtures, and the core validation logic in src/validation/contract_validator.py, where a new failure_label_on_violation field is now supported in contract definitions. Feedback focuses on improving the robustness of this new logic by suggesting early normalization of the failure label to a string, which would eliminate redundant type conversions and allow for more precise type hints across the internal validation methods.
| failure_label_on_violation = contract.get("failure_label_on_violation") | ||
| if failure_label_on_violation is not None and str(failure_label_on_violation) not in FAILURE_TAXONOMY: | ||
| raise ContractValidationError( | ||
| f"contract '{contract_id}' has unregistered failure_label_on_violation: {failure_label_on_violation}" | ||
| ) | ||
|
|
||
| if contract_type == ContractType.ORDERING: | ||
| return self._validate_ordering(contract_id, layer, severity, definition, reconstructed) | ||
| return self._validate_ordering(contract_id, layer, severity, definition, reconstructed, failure_label_on_violation) | ||
| if contract_type == ContractType.REACHABILITY: | ||
| return self._validate_reachability(contract_id, layer, severity, definition, original, reconstructed) | ||
| return self._validate_reachability(contract_id, layer, severity, definition, original, reconstructed, failure_label_on_violation) | ||
| if contract_type == ContractType.CAUSALITY: | ||
| return self._validate_causality(contract_id, layer, severity, definition, original, reconstructed) | ||
| return self._validate_causality(contract_id, layer, severity, definition, original, reconstructed, failure_label_on_violation) | ||
| if contract_type == ContractType.INVARIANT: | ||
| return self._validate_invariant(contract_id, layer, severity, definition, original, reconstructed) | ||
| return self._validate_invariant(contract_id, layer, severity, definition, original, reconstructed, failure_label_on_violation) |
There was a problem hiding this comment.
Normalizing failure_label_on_violation to a string early ensures consistent type handling and allows for more precise type hints in the internal validation methods.
| failure_label_on_violation = contract.get("failure_label_on_violation") | |
| if failure_label_on_violation is not None and str(failure_label_on_violation) not in FAILURE_TAXONOMY: | |
| raise ContractValidationError( | |
| f"contract '{contract_id}' has unregistered failure_label_on_violation: {failure_label_on_violation}" | |
| ) | |
| if contract_type == ContractType.ORDERING: | |
| return self._validate_ordering(contract_id, layer, severity, definition, reconstructed) | |
| return self._validate_ordering(contract_id, layer, severity, definition, reconstructed, failure_label_on_violation) | |
| if contract_type == ContractType.REACHABILITY: | |
| return self._validate_reachability(contract_id, layer, severity, definition, original, reconstructed) | |
| return self._validate_reachability(contract_id, layer, severity, definition, original, reconstructed, failure_label_on_violation) | |
| if contract_type == ContractType.CAUSALITY: | |
| return self._validate_causality(contract_id, layer, severity, definition, original, reconstructed) | |
| return self._validate_causality(contract_id, layer, severity, definition, original, reconstructed, failure_label_on_violation) | |
| if contract_type == ContractType.INVARIANT: | |
| return self._validate_invariant(contract_id, layer, severity, definition, original, reconstructed) | |
| return self._validate_invariant(contract_id, layer, severity, definition, original, reconstructed, failure_label_on_violation) | |
| failure_label_on_violation = contract.get("failure_label_on_violation") | |
| if failure_label_on_violation is not None: | |
| failure_label_on_violation = str(failure_label_on_violation) | |
| if failure_label_on_violation not in FAILURE_TAXONOMY: | |
| raise ContractValidationError( | |
| f"contract '{contract_id}' has unregistered failure_label_on_violation: {failure_label_on_violation}" | |
| ) | |
| if contract_type == ContractType.ORDERING: | |
| return self._validate_ordering(contract_id, layer, severity, definition, reconstructed, failure_label_on_violation) | |
| if contract_type == ContractType.REACHABILITY: | |
| return self._validate_reachability(contract_id, layer, severity, definition, original, reconstructed, failure_label_on_violation) | |
| if contract_type == ContractType.CAUSALITY: | |
| return self._validate_causality(contract_id, layer, severity, definition, original, reconstructed, failure_label_on_violation) | |
| if contract_type == ContractType.INVARIANT: | |
| return self._validate_invariant(contract_id, layer, severity, definition, original, reconstructed, failure_label_on_violation) |
| return False | ||
|
|
||
| def _validate_ordering(self, contract_id: str, layer: Layer, severity: str, definition: dict[str, Any], reconstructed: dict[str, Any]) -> ValidationResult: | ||
| def _validate_ordering(self, contract_id: str, layer: Layer, severity: str, definition: dict[str, Any], reconstructed: dict[str, Any], failure_label_on_violation: Any) -> ValidationResult: |
There was a problem hiding this comment.
Use a specific type hint for failure_label_on_violation instead of Any to improve type safety.
| def _validate_ordering(self, contract_id: str, layer: Layer, severity: str, definition: dict[str, Any], reconstructed: dict[str, Any], failure_label_on_violation: Any) -> ValidationResult: | |
| def _validate_ordering(self, contract_id: str, layer: Layer, severity: str, definition: dict[str, Any], reconstructed: dict[str, Any], failure_label_on_violation: str | None) -> ValidationResult: |
| passed=passed, | ||
| severity=severity, | ||
| failure_label=None if passed else "POLICY_ORDER_BROKEN", | ||
| failure_label=None if passed else str(failure_label_on_violation or "POLICY_ORDER_BROKEN"), |
There was a problem hiding this comment.
| return sorted(target for target in targets if target in visited) | ||
|
|
||
| def _validate_reachability(self, contract_id: str, layer: Layer, severity: str, definition: dict[str, Any], original: dict[str, Any], reconstructed: dict[str, Any]) -> ValidationResult: | ||
| def _validate_reachability(self, contract_id: str, layer: Layer, severity: str, definition: dict[str, Any], original: dict[str, Any], reconstructed: dict[str, Any], failure_label_on_violation: Any) -> ValidationResult: |
There was a problem hiding this comment.
Use a specific type hint for failure_label_on_violation instead of Any.
| def _validate_reachability(self, contract_id: str, layer: Layer, severity: str, definition: dict[str, Any], original: dict[str, Any], reconstructed: dict[str, Any], failure_label_on_violation: Any) -> ValidationResult: | |
| def _validate_reachability(self, contract_id: str, layer: Layer, severity: str, definition: dict[str, Any], original: dict[str, Any], reconstructed: dict[str, Any], failure_label_on_violation: str | None) -> ValidationResult: |
| passed=passed, | ||
| severity=severity, | ||
| failure_label=None if passed else "RECOVERY_PATH_INVALID", | ||
| failure_label=None if passed else str(failure_label_on_violation or "RECOVERY_PATH_INVALID"), |
There was a problem hiding this comment.
| ) | ||
|
|
||
| def _validate_causality(self, contract_id: str, layer: Layer, severity: str, definition: dict[str, Any], original: dict[str, Any], reconstructed: dict[str, Any]) -> ValidationResult: | ||
| def _validate_causality(self, contract_id: str, layer: Layer, severity: str, definition: dict[str, Any], original: dict[str, Any], reconstructed: dict[str, Any], failure_label_on_violation: Any) -> ValidationResult: |
There was a problem hiding this comment.
Use a specific type hint for failure_label_on_violation instead of Any.
| def _validate_causality(self, contract_id: str, layer: Layer, severity: str, definition: dict[str, Any], original: dict[str, Any], reconstructed: dict[str, Any], failure_label_on_violation: Any) -> ValidationResult: | |
| def _validate_causality(self, contract_id: str, layer: Layer, severity: str, definition: dict[str, Any], original: dict[str, Any], reconstructed: dict[str, Any], failure_label_on_violation: str | None) -> ValidationResult: |
| passed=passed, | ||
| severity=severity, | ||
| failure_label=None if passed else "CAUSAL_DEPENDENCY_LOSS", | ||
| failure_label=None if passed else str(failure_label_on_violation or "CAUSAL_DEPENDENCY_LOSS"), |
There was a problem hiding this comment.
| ) | ||
|
|
||
| def _validate_invariant(self, contract_id: str, layer: Layer, severity: str, definition: dict[str, Any], original: dict[str, Any], reconstructed: dict[str, Any]) -> ValidationResult: | ||
| def _validate_invariant(self, contract_id: str, layer: Layer, severity: str, definition: dict[str, Any], original: dict[str, Any], reconstructed: dict[str, Any], failure_label_on_violation: Any) -> ValidationResult: |
There was a problem hiding this comment.
Use a specific type hint for failure_label_on_violation instead of Any.
| def _validate_invariant(self, contract_id: str, layer: Layer, severity: str, definition: dict[str, Any], original: dict[str, Any], reconstructed: dict[str, Any], failure_label_on_violation: Any) -> ValidationResult: | |
| def _validate_invariant(self, contract_id: str, layer: Layer, severity: str, definition: dict[str, Any], original: dict[str, Any], reconstructed: dict[str, Any], failure_label_on_violation: str | None) -> ValidationResult: |
| passed=passed, | ||
| severity=severity, | ||
| failure_label=None if passed else "INVARIANT_VIOLATION", | ||
| failure_label=None if passed else str(failure_label_on_violation or "INVARIANT_VIOLATION"), |
There was a problem hiding this comment.
|
Review fixes addressed:
Follow-up commit: 239aa6a Only follow-up file changed: tests/test_contract_validator.py No README, workflow, package, taxonomy, fixture, or artifact changes were made by this follow-up fix. |
Summary:
mcp_trace_replayfixture family by adding explicit structured capability-boundary data to fixture states (capability_boundaries,allowed_tools,resource_boundaries,permission_scopes,capability_scope) and by introducing deterministic boundary drift in reconstructed degraded variants; add deterministic mapping from contract failures to capability/security labels via optionalfailure_label_on_violationin selected contracts; regenerate affected artifacts.Changed files:
originalandreconstructedstate.jsonfiles formcp_trace_replay_v1,mcp_trace_replay_mild_v1,mcp_trace_replay_moderate_v1,mcp_trace_replay_degraded_v1and updatedexpected/failures.jsonandexpected/admissibility.jsonwhere applicable.fixtures/manifest.json(alignedexpected_failure_labelsfor hardened fixtures).failure_label_on_violationto selected contracts infixtures/mcp_trace_replay_*/original/contracts/*.jsonto deterministically map contract violations to registered capability/security labels.src/validation/contract_validator.py(accept and validate optionalfailure_label_on_violationand use it when emitting failure labels).artifacts/mcp_trace_replay_results.json,artifacts/replay_semantic_integrity_results.json,artifacts/graph_diff_results.json,artifacts/capability_boundary_replay_results.json,artifacts/tool_ordering_replay_results.json,artifacts/multi_family_admissibility_results.json, andartifacts/multi_family_admissibility_curves.svg.tests/test_capability_boundary_replay_artifact.pyandtests/test_tool_ordering_replay_artifact.py.Testing:
npm run generate:mcp-trace-replay(Python generator used),npm run generate:replay-semantic-integrity,npm run generate:graph-diff.python scripts/generate_capability_boundary_replay_artifact.py(used directly when npm script missing).python scripts/generate_tool_ordering_replay_artifact.py.python scripts/generate_multi_family_admissibility_artifact.pyandpython scripts/render_multi_family_admissibility_svg.py.pytest tests/test_failure_taxonomy.py— passed.pytest tests/test_fixture_manifest.py— passed.pytest tests/test_mcp_trace_replay_artifact.py— passed.pytest tests/test_replay_semantic_integrity_artifact.py— passed.pytest tests/test_graph_diff_artifact.py— passed.pytest tests/test_capability_boundary_replay_artifact.py— passed.pytest tests/test_tool_ordering_replay_artifact.py— passed.pytest tests/test_multi_family_admissibility_artifact.py— passed.pytest tests/test_multi_family_svg_renderer.py— passed.npm run check(which runs layout/typecheck/validate/build/test) completed successfully and the repository test suite passed (271 tests).Scope note:
mcp_trace_replayfixture hardening only; no new fixture family was added.Risks:
Next:
failure_label_on_violationusage in contracts, and confirm the regenerated artifacts are acceptable for downstream validators.Codex Task