fix(mcp-proxy): harden verification integrity

oleg-bk · oleg-bk · commit e0659e079eaf · 2026-05-11T16:56:32.000+02:00
Align compound risk inference with policy risk rank so destructive matches win over financial or production terms.

Expand offline DecisionReceipt cross-checks to include risk class and policy context hash. Action and resource receipt fields remain intentionally out of scope because the backend sees privacy-filtered values and local export does not reconstruct that filter.

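Export evidence using the stored prev_event_hash values instead of rewriting them at export time, and fail with an EvidenceExportError when a stored chain link diverges from the expected chain. Unfiltered exports must start at the genesis hash; filtered exports (since/until timestamp or request IDs) start from the first exported record's stored prev_event_hash. The chain-break message tells operators to run doctor or inspect evidence DB integrity.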

Warn users of the verify command when no explicit --trusted-signer-did is provided and verification falls back to the bundle's embedded signer list.

Implemented with assistance from Codex.
diff --git a/agentveil_mcp_proxy/classification.py b/agentveil_mcp_proxy/classification.py
@@ -214,12 +214,13 @@ def infer_risk_class(
             text_parts.append(environment)
     text = " ".join(text_parts).lower()
     tokens = tuple(item for item in re.split(r"[^a-z0-9]+", text) if item)
+    # Keep compound-keyword inference aligned with policy._RISK_RANK.
+    if _has_prefix(tokens, _DESTRUCTIVE_PREFIXES):
+        return RiskClass.DESTRUCTIVE
     if _has_prefix(tokens, _FINANCIAL_WORDS):
         return RiskClass.FINANCIAL
     if _has_prefix(tokens, _PRODUCTION_WORDS):
         return RiskClass.PRODUCTION
-    if _has_prefix(tokens, _DESTRUCTIVE_PREFIXES):
-        return RiskClass.DESTRUCTIVE
     if _has_prefix(tokens, _WRITE_PREFIXES):
         return RiskClass.WRITE
     if _has_prefix(tokens, _READ_PREFIXES):
diff --git a/agentveil_mcp_proxy/cli.py b/agentveil_mcp_proxy/cli.py
@@ -35,6 +35,7 @@
 )
 from agentveil_mcp_proxy.classification import ToolCallClassifier
 from agentveil_mcp_proxy.evidence import (
+    ApprovalEvidenceError,
     ApprovalEvidenceStore,
     EvidenceExportError,
     EvidenceVerificationError,
@@ -71,6 +72,10 @@
 REISSUE_GRANT_FORCE_THRESHOLD_SECONDS = 24 * 60 * 60
 DEFAULT_ALLOWED_CATEGORIES = ("mcp_proxy",)
 DEFAULT_EVIDENCE_VACUUM_MAX_AGE_DAYS = 90
+DEFAULT_TRUST_FROM_BUNDLE_WARNING = (
+    "default_trust_from_bundle: trusting bundle's embedded signer list; "
+    "pass --trusted-signer-did to verify against your own pinned set"
+)
 AGENTVEIL_DEV_SIGNER_DIDS = (
     "did:key:z6MkkvQQ9SxaNX9eEVHd5NtEamVY3YiZSpHZE567Vxs5jQQ3",
     "did:key:z6Mkjw22249tpNN4LJGLyq1oGSq1Skh3ks94fiMrgi4oqveo",
@@ -881,17 +886,21 @@ def verify_evidence(
     """Verify an evidence bundle offline."""
 
     out = out or sys.stdout
+    explicit_trusted_signers = tuple(trusted_signer_dids or ())
     result = verify_evidence_bundle_file(
         bundle_path,
-        trusted_signer_dids=trusted_signer_dids,
+        trusted_signer_dids=explicit_trusted_signers,
     )
+    warnings = list(result.warnings)
+    if not explicit_trusted_signers:
+        warnings.append(DEFAULT_TRUST_FROM_BUNDLE_WARNING)
     if output_format == "json":
         print(json.dumps({
             "status": "ok",
             "record_count": result.record_count,
             "signed_receipt_count": result.signed_receipt_count,
             "unverified_receipt_count": result.unverified_receipt_count,
-            "warnings": list(result.warnings),
+            "warnings": warnings,
             "chain_root_hash": result.chain_root_hash,
         }, sort_keys=True), file=out)
     else:
@@ -907,7 +916,7 @@ def verify_evidence(
                 "but no matching signed receipt in bundle",
                 file=out,
             )
-        for warning in result.warnings:
+        for warning in warnings:
             print(f"WARN: {warning}", file=out)
     return 0
 
@@ -1187,7 +1196,7 @@ def main(argv: list[str] | None = None) -> int:
                 before=args.before,
             )
             return 0
-    except (ProxyCliError, EvidenceExportError, EvidenceVerificationError) as exc:
+    except (ProxyCliError, ApprovalEvidenceError, EvidenceExportError, EvidenceVerificationError) as exc:
         print(f"ERROR: {exc}", file=sys.stderr)
         return exc.exit_code if isinstance(exc, ProxyCliError) else 1
     raise AssertionError(f"unhandled command: {args.command}")
diff --git a/agentveil_mcp_proxy/evidence/proof.py b/agentveil_mcp_proxy/evidence/proof.py
@@ -21,6 +21,11 @@
 
 
 EVIDENCE_EXPORT_SCHEMA_VERSION = 1
+_RECEIPT_RECORD_CROSS_CHECK_FIELDS = (
+    ("payload_hash", "payload_hash"),
+    ("risk_class", "client_risk_class"),
+    ("policy_context_hash", "client_policy_context_hash"),
+)
 
 
 class EvidenceProofError(RuntimeError):
@@ -82,7 +87,8 @@ def build_evidence_bundle(
         request_ids=request_ids,
     )
     signed_receipts: dict[str, str] = {}
-    export_records = _bundle_records(records)
+    has_filter = since_timestamp is not None or until_timestamp is not None or request_ids is not None
+    export_records = _bundle_records(records, require_genesis=not has_filter)
     unverified_receipt_count = 0
     for record in records:
         if not record.decision_audit_id or not record.decision_receipt_sha256:
@@ -164,7 +170,12 @@ def verify_evidence_bundle(
     records = bundle.get("records")
     if not isinstance(records, list):
         raise EvidenceVerificationError("records must be a list")
-    expected_prev = GENESIS_PREV_EVENT_HASH
+    has_filter = _bundle_uses_filter(bundle.get("filter"))
+    expected_prev = (
+        records[0].get("prev_event_hash")
+        if records and has_filter and isinstance(records[0], dict)
+        else GENESIS_PREV_EVENT_HASH
+    )
     last_hash = GENESIS_PREV_EVENT_HASH
     for index, record in enumerate(records):
         if not isinstance(record, dict):
@@ -222,10 +233,15 @@ def verify_evidence_bundle(
         if receipt_digest not in verified_bodies:
             continue
         receipt_body = verified_bodies[receipt_digest]
-        expected_payload = record.get("payload_hash")
-        receipt_payload = receipt_body.get("payload_hash")
-        if receipt_payload is not None and receipt_payload != expected_payload:
-            raise EvidenceVerificationError("DecisionReceipt payload_hash mismatch")
+        for record_field, receipt_field in _RECEIPT_RECORD_CROSS_CHECK_FIELDS:
+            record_value = record.get(record_field)
+            receipt_value = receipt_body.get(receipt_field)
+            if record_value is None or receipt_value is None:
+                continue
+            if receipt_value != record_value:
+                raise EvidenceVerificationError(
+                    f"DecisionReceipt {receipt_field} mismatch with record {record_field}"
+                )
 
     return EvidenceVerificationResult(
         valid=True,
@@ -254,18 +270,36 @@ def verify_evidence_bundle_file(
     return verify_evidence_bundle(bundle, trusted_signer_dids=trusted_signer_dids)
 
 
-def _bundle_records(records: list[PendingApproval]) -> list[dict[str, Any]]:
+def _bundle_records(records: list[PendingApproval], *, require_genesis: bool = True) -> list[dict[str, Any]]:
     export_records: list[dict[str, Any]] = []
-    prev_hash = GENESIS_PREV_EVENT_HASH
+    expected_prev_hash = (
+        GENESIS_PREV_EVENT_HASH
+        if require_genesis or not records
+        else records[0].prev_event_hash
+    )
     for record in records:
+        if record.prev_event_hash != expected_prev_hash:
+            raise EvidenceExportError(
+                f"chain integrity broken at request_id {record.request_id}: "
+                "stored prev_event_hash diverges from expected chain link; "
+                "run doctor or inspect evidence DB integrity"
+            )
         data = asdict(record)
-        data["prev_event_hash"] = prev_hash
         data["record_hash"] = record_hash(data)
-        prev_hash = data["record_hash"]
+        expected_prev_hash = data["record_hash"]
         export_records.append(data)
     return export_records
 
 
+def _bundle_uses_filter(filter_value: Any) -> bool:
+    if not isinstance(filter_value, Mapping):
+        return False
+    return any(
+        filter_value.get(key) is not None
+        for key in ("since_timestamp", "until_timestamp", "request_ids")
+    )
+
+
 def _verify_receipt_with_pinned_signers(
     receipt_jcs: str,
     trusted_signer_dids: Iterable[str],
diff --git a/docs/MCP_PROXY_OPERATIONS.md b/docs/MCP_PROXY_OPERATIONS.md
@@ -82,9 +82,23 @@ verification can validate backend signatures. If receipt fetch fails during
 export, the bundle remains valid and reports the unverified receipt count.
 
 `verify` performs offline checks only: record hashes, chain linkage, signed
-receipt signatures against pinned trusted signer DIDs, and payload-hash binding
-between records and DecisionReceipts when those receipts are present. It does
-not call the AVP backend.
+receipt signatures against pinned trusted signer DIDs, and signed-field binding
+between records and DecisionReceipts when those receipts are present. It checks
+payload hash, client risk class, and client policy context hash. It does not
+call the AVP backend.
+
+Auditors should pass their own pinned signer DID set:
+
+```bash
+agentveil-mcp-proxy verify /secure/path/evidence-bundle.json \
+  --trusted-signer-did did:key:z6Mk...
+```
+
+Without `--trusted-signer-did`, verification falls back to the signer list
+embedded in the bundle and prints a warning. That mode confirms internal bundle
+consistency, but it does not prove the bundle's signer list is the auditor's
+trusted set. A malicious bundle can include an attacker-controlled signer DID
+and a matching attacker-signed receipt.
 
 To prune old terminal records and rebuild the local chain:
 
diff --git a/tests/test_mcp_proxy_classification.py b/tests/test_mcp_proxy_classification.py
@@ -214,6 +214,21 @@ def test_risk_inference_covers_core_vocab():
     assert infer_risk_class("custom.inspect", tool="custom_action") is RiskClass.UNKNOWN
 
 
+def test_risk_inference_destructive_wins_over_financial_compounds():
+    assert infer_risk_class("billing.delete_payment", tool="delete_payment") is RiskClass.DESTRUCTIVE
+    assert infer_risk_class("billing.drop_billing_table", tool="drop_billing_table") is RiskClass.DESTRUCTIVE
+    assert infer_risk_class("auth.revoke_payment_token", tool="revoke_payment_token") is RiskClass.DESTRUCTIVE
+    assert (
+        infer_risk_class("bank.transfer_to_destroy_account", tool="transfer_to_destroy_account")
+        is RiskClass.DESTRUCTIVE
+    )
+
+
+def test_risk_inference_destructive_wins_over_production_compounds():
+    assert infer_risk_class("deploy.drop_prod_db", tool="drop_prod_db") is RiskClass.DESTRUCTIVE
+    assert infer_risk_class("auth.revoke_prod_access", tool="revoke_prod_access") is RiskClass.DESTRUCTIVE
+
+
 def test_risk_inference_does_not_over_classify_substring_collisions():
     assert infer_risk_class("github.get_infrastructure", tool="get_infrastructure") is RiskClass.READ
     assert infer_risk_class("github.list_endpoints", tool="list_endpoints") is RiskClass.READ
diff --git a/tests/test_mcp_proxy_proof.py b/tests/test_mcp_proxy_proof.py