fix(governance): PR-118 review feedback + default mode DISABLED

viswa-uipath · claude · viswa-uipath · commit 9e041285c2b7 · 2026-06-10T19:08:07.000+05:30
Addresses all 7 Copilot review comments on PR #118 and switches the default enforcement mode so empty-policy tenants pay zero per-call audit overhead.

PR-118 review comments:
- policy_api_client docstring no longer claims "one retry on transient failures" — _get_once is and remains single-shot by design.
- Policy fetch GET drops Content-Type: application/json (was sent via json_body=True). Strict origin servers can 415 on unexpected Content-Type for GETs; the helper's own docstring recommends omitting it on reads.
- _extract_governable_text dumper loop now CONTINUES instead of BREAKS when model_dump() raises, so dict() is tried as documented ("fall through to other extractors").
- loader.get_policy_index distinguishes "prefetch did not complete in Xs" from "prefetch completed but produced no PolicyIndex" — prod triage can now tell a hung fetch from an auth / parse failure.
- disabled_guardrails defensively re-checks mapped_to_uipath=True AND policy_enabled=False on every guardrail_fallback condition. Matches the function's docstring and protects against multi-condition rules or any future code path that bypasses the evaluator gate.
- request_governance pre-checks UIPATH_ACCESS_TOKEN and skips when missing. Sending without a bearer guarantees a 401 per compensation call and pollutes logs; mirrors the org-id / tenant-id skip pattern already in place.
- AuditManager.flush(timeout=...) now honors its timeout via a time.monotonic() poll loop and warns if the drain doesn't complete. It previously called queue.Queue.join(), which takes no timeout argument and can block indefinitely — risky at process exit, where _cleanup_audit_manager supplies a 2-second timeout that was being silently ignored.

Default enforcement mode:
- get_enforcement_mode default fallback flipped from AUDIT to DISABLED. The server-supplied mode (applied by the policy loader on every successful fetch) still wins; the env-var override still works.
Empty-policy / failed-fetch / pre-fetch tenants now short-circuit at evaluator.py:332 with no _emit_audit call, no OTel spans, and no AuditManager queue traffic. Previously these scenarios silently fell through to AUDIT and produced ~40 empty governance spans per turn for an N=10 LLM-call agent.

Tests (245 passing, +7 new):
- test_enforcement_mode_default.py pins the resolution order (programmatic > env > DISABLED default) and the invalid-env-falls-back-to-DISABLED behavior.
- test_request_governance_skipped_when_token_missing pins the new bearer-token skip path.
- _govern_env fixture now sets UIPATH_ACCESS_TOKEN; the headers test now asserts that the Authorization header is present (it previously asserted the header's absence, doubling as the no-token check, which has moved to its own test).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/src/uipath/runtime/governance/audit/base.py b/src/uipath/runtime/governance/audit/base.py
@@ -532,18 +532,40 @@ def emit_session_end(
     def flush(self, timeout: float = 5.0) -> None:
         """Flush all pending events and sinks.
 
-        In async mode, waits for the event queue to drain before
-        flushing individual sinks.
+        In async mode, polls the queue until it drains or ``timeout``
+        seconds elapse, whichever comes first. ``queue.Queue.join`` has
+        no timeout argument — using it would block indefinitely on a
+        wedged sink, which defeats the bounded-shutdown contract that
+        :func:`_cleanup_audit_manager` relies on at process exit.
 
         Args:
             timeout: Maximum seconds to wait for queue to drain (default 5.0)
         """
         if self._async_mode:
-            # Wait for queue to drain
-            try:
-                self._queue.join()
-            except Exception:
-                pass
+            import time
+
+            deadline = time.monotonic() + max(0.0, timeout)
+            poll_interval = min(0.05, timeout) if timeout > 0 else 0.0
+            while time.monotonic() < deadline:
+                try:
+                    if self._queue.unfinished_tasks == 0:
+                        break
+                except Exception:  # noqa: BLE001 - queue introspection is best-effort
+                    break
+                time.sleep(poll_interval)
+            else:
+                # Loop didn't break — drain timed out. Log so a wedged
+                # sink is surfaced rather than swallowed.
+                try:
+                    pending = self._queue.unfinished_tasks
+                except Exception:  # noqa: BLE001
+                    pending = -1
+                if pending:
+                    logger.warning(
+                        "Audit queue did not drain within %.2fs "
+                        "(unfinished tasks=%s); sink may be wedged",
+                        timeout, pending,
+                    )
 
         with self._sinks_lock:
             sinks = list(self._sinks)
diff --git a/src/uipath/runtime/governance/config.py b/src/uipath/runtime/governance/config.py
@@ -37,19 +37,24 @@ def get_enforcement_mode() -> EnforcementMode:
     The mode is cached after first read. Resolution order:
 
     1. A value previously set via :func:`set_enforcement_mode` (the
-       policy loader calls this with the backend-supplied mode).
+       policy loader calls this with the backend-supplied mode on every
+       successful policy fetch — that's the canonical source).
     2. ``UIPATH_GOVERNANCE_MODE`` env var (developer override).
-    3. Default :attr:`EnforcementMode.AUDIT` — log but never block.
+    3. Default :attr:`EnforcementMode.DISABLED` — skip evaluation
+       entirely until the server explicitly opts the tenant in. This
+       keeps empty-policy / failed-fetch / pre-fetch scenarios free of
+       per-call audit overhead; a tenant with policies wins the cache
+       on the first ``set_enforcement_mode`` call from the loader.
     """
     global _enforcement_mode
     if _enforcement_mode is not None:
         return _enforcement_mode
 
-    mode_str = os.getenv(ENV_ENFORCEMENT_MODE, "audit").lower()
+    mode_str = os.getenv(ENV_ENFORCEMENT_MODE, "disabled").lower()
     try:
         _enforcement_mode = EnforcementMode(mode_str)
     except ValueError:
-        _enforcement_mode = EnforcementMode.AUDIT
+        _enforcement_mode = EnforcementMode.DISABLED
 
     return _enforcement_mode
 
diff --git a/src/uipath/runtime/governance/native/guardrail_compensation.py b/src/uipath/runtime/governance/native/guardrail_compensation.py
@@ -28,6 +28,7 @@
 import atexit
 import json
 import logging
+import os
 import threading
 import urllib.error
 import urllib.request
@@ -37,6 +38,7 @@
 from uipath.runtime.governance.native.backend_client import (
     BACKEND_REQUEST_TIMEOUT_SECONDS,
     COMPENSATION_MAX_WORKERS,
+    ENV_ACCESS_TOKEN,
     ENV_ORGANIZATION_ID,
     ENV_TENANT_ID,
     GOVERN_API_PATH,
@@ -132,19 +134,31 @@ def disabled_guardrails(audit: Any, policy_index: Any) -> list[FiredRule]:
             continue
         for check in rule.checks:
             for cond in check.conditions:
-                if cond.operator == "guardrail_fallback" and isinstance(
-                    cond.value, dict
-                ):
-                    validator = str(cond.value.get("validator", ""))
-                    if validator:
-                        out.append(
-                            {
-                                "ruleId": ev.rule_id,
-                                "ruleName": ev.rule_name,
-                                "packName": getattr(rule, "pack_name", "") or "",
-                                "validator": validator,
-                            }
-                        )
+                if cond.operator != "guardrail_fallback":
+                    continue
+                if not isinstance(cond.value, dict):
+                    continue
+                # The ``guardrail_fallback`` operator at evaluation time
+                # only matches when ``mapped_to_uipath=True`` AND
+                # ``policy_enabled=False``. We re-check here defensively
+                # so a future code path that bypasses the evaluator (or
+                # a multi-condition rule that fired on a sibling check)
+                # can't trigger a compensation call for a guardrail
+                # that isn't actually disabled.
+                if not bool(cond.value.get("mapped_to_uipath", False)):
+                    continue
+                if bool(cond.value.get("policy_enabled", True)):
+                    continue
+                validator = str(cond.value.get("validator", ""))
+                if validator:
+                    out.append(
+                        {
+                            "ruleId": ev.rule_id,
+                            "ruleName": ev.rule_name,
+                            "packName": getattr(rule, "pack_name", "") or "",
+                            "validator": validator,
+                        }
+                    )
     return out
 
 
@@ -303,6 +317,19 @@ def request_governance(
         )
         return
 
+    # Bearer token is required by the backend; sending without one
+    # produces a 401 per call and pollutes logs. Skip cleanly when the
+    # token isn't present (e.g. local dev, missing host bootstrap)
+    # rather than burning quota on guaranteed auth failures.
+    if not os.environ.get(ENV_ACCESS_TOKEN):
+        logger.warning(
+            "Govern call skipped: %s is not set in the environment; "
+            "compensation requires a bearer token. validators=[%s]",
+            ENV_ACCESS_TOKEN,
+            ", ".join(validators),
+        )
+        return
+
     try:
         payload = json.dumps(
             {
diff --git a/src/uipath/runtime/governance/native/loader.py b/src/uipath/runtime/governance/native/loader.py
@@ -142,11 +142,21 @@ def get_policy_index() -> PolicyIndex:
         completed = event.wait(timeout=_PREFETCH_WAIT_SECONDS)
         if completed and _policy_index is not None:
             return _policy_index
-        logger.warning(
-            "Policy prefetch did not complete in %.1fs; "
-            "agent will run without any policies",
-            _PREFETCH_WAIT_SECONDS,
-        )
+        if not completed:
+            logger.warning(
+                "Policy prefetch did not complete in %.1fs; "
+                "agent will run without any policies",
+                _PREFETCH_WAIT_SECONDS,
+            )
+        else:
+            # Distinguish from the timeout path so production triage
+            # can tell "prefetch hung" from "prefetch returned empty"
+            # (auth failure, server error, parse failure).
+            logger.warning(
+                "Policy prefetch completed but produced no PolicyIndex "
+                "(see prior WARN for the root cause); agent will run "
+                "without any policies"
+            )
         _policy_index = PolicyIndex()
         return _policy_index
 
diff --git a/src/uipath/runtime/governance/native/policy_api_client.py b/src/uipath/runtime/governance/native/policy_api_client.py
@@ -18,9 +18,12 @@
 is the YAML the evaluator compiles into a :class:`PolicyIndex`.
 
 Failure mode is fail-open: when the organization id is unknown, the
-access token is missing, the backend errors (one retry on transient
-failures), or the body can't be parsed, the caller falls back to an
-empty PolicyIndex. Nothing in this module ever raises to the caller.
+access token is missing, the backend errors, or the body can't be
+parsed, the caller falls back to an empty PolicyIndex. The fetch is
+single-shot (no retry by design — see :func:`_get_once`) so a slow
+backend can't extend agent startup beyond
+:data:`BACKEND_REQUEST_TIMEOUT_SECONDS`. Nothing in this module ever
+raises to the caller.
 """
 
 from __future__ import annotations
@@ -147,7 +150,10 @@ def _fetch_policy_response_inner() -> PolicyResponse | None:
         )
         return None
 
-    headers = governance_request_headers(json_body=True)
+    # Policy fetch is a GET; ``json_body=False`` so ``Content-Type`` is
+    # omitted. Strict origin servers may 415 on unexpected Content-Type
+    # for GETs (see :func:`governance_request_headers` docstring).
+    headers = governance_request_headers(json_body=False)
     headers[TENANT_HEADER] = tenant_id
     logger.info("Policy fetch starting (org=%s, tenant=%s)", org_id, tenant_id)
 
diff --git a/src/uipath/runtime/governance/wrapper.py b/src/uipath/runtime/governance/wrapper.py
@@ -138,7 +138,10 @@ def _extract_governable_text(
         return ""
 
     # Pydantic / dataclass-like shapes are easier to walk via their
-    # dict form than via attribute introspection.
+    # dict form than via attribute introspection. If the first dumper
+    # raises (e.g. ``model_dump`` blows up on a partial pydantic v1
+    # model), fall through to the next one rather than abandoning the
+    # whole pydantic/dataclass path.
     for dumper in ("model_dump", "dict"):
         fn = getattr(value, dumper, None)
         if callable(fn):
@@ -150,8 +153,8 @@ def _extract_governable_text(
                     depth=depth + 1,
                     latest_only=latest_only,
                 )
-            except Exception:  # noqa: BLE001 - fall through to other extractors
-                break
+            except Exception:  # noqa: BLE001 - try the next dumper
+                continue
 
     obj_id = id(value)
     if seen is None:
diff --git a/tests/test_enforcement_mode_default.py b/tests/test_enforcement_mode_default.py
@@ -0,0 +1,99 @@
+"""Tests for the default enforcement-mode resolution.
+
+The default is :attr:`EnforcementMode.DISABLED` — until the policy
+loader successfully fetches a backend response and calls
+``set_enforcement_mode`` with the server-supplied value, governance
+short-circuits cheaply with no per-call audit overhead.
+
+Resolution order (per :func:`get_enforcement_mode`):
+1. Previously-cached programmatic value (set via ``set_enforcement_mode``).
+2. ``UIPATH_GOVERNANCE_MODE`` env var.
+3. Default ``DISABLED``.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from uipath.runtime.governance.config import (
+    EnforcementMode,
+    get_enforcement_mode,
+    reset_enforcement_mode,
+    set_enforcement_mode,
+)
+
+
+@pytest.fixture(autouse=True)
+def _isolate_mode(monkeypatch: pytest.MonkeyPatch):
+    """Each test starts from a clean module-state slate."""
+    monkeypatch.delenv("UIPATH_GOVERNANCE_MODE", raising=False)
+    reset_enforcement_mode()
+    yield
+    reset_enforcement_mode()
+
+
+def test_default_mode_is_disabled() -> None:
+    """No programmatic mode + no env var → DISABLED.
+
+    Replaces the prior AUDIT default. Empty-policy / failed-fetch /
+    pre-fetch tenants pay zero audit overhead until the backend
+    explicitly enables governance on the next policy fetch.
+    """
+    assert get_enforcement_mode() is EnforcementMode.DISABLED
+
+
+def test_env_var_audit_wins_over_default(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Developer override via env var still works."""
+    monkeypatch.setenv("UIPATH_GOVERNANCE_MODE", "audit")
+    reset_enforcement_mode()  # clear cached default
+    assert get_enforcement_mode() is EnforcementMode.AUDIT
+
+
+def test_env_var_enforce_wins_over_default(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Env var ``enforce`` overrides the DISABLED default."""
+    monkeypatch.setenv("UIPATH_GOVERNANCE_MODE", "enforce")
+    reset_enforcement_mode()
+    assert get_enforcement_mode() is EnforcementMode.ENFORCE
+
+
+def test_invalid_env_var_falls_back_to_disabled(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """An unrecognised env value resolves to DISABLED rather than raising."""
+    monkeypatch.setenv("UIPATH_GOVERNANCE_MODE", "garbage-value")
+    reset_enforcement_mode()
+    assert get_enforcement_mode() is EnforcementMode.DISABLED
+
+
+def test_programmatic_set_wins_over_env_and_default(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """The policy loader's ``set_enforcement_mode`` call is canonical."""
+    monkeypatch.setenv("UIPATH_GOVERNANCE_MODE", "audit")
+    set_enforcement_mode(EnforcementMode.ENFORCE)
+    assert get_enforcement_mode() is EnforcementMode.ENFORCE
+
+
+def test_reset_returns_to_default() -> None:
+    """``reset_enforcement_mode`` clears the cache so the default re-applies."""
+    set_enforcement_mode(EnforcementMode.ENFORCE)
+    assert get_enforcement_mode() is EnforcementMode.ENFORCE
+    reset_enforcement_mode()
+    assert get_enforcement_mode() is EnforcementMode.DISABLED
+
+
+def test_disabled_mode_is_cached_after_first_read(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """First call computes; subsequent calls hit the cache.
+
+    Enum members are singletons, so asserting the same value twice
+    proves nothing about caching. Changing the env var between reads
+    does: a cached DISABLED must survive until an explicit reset.
+    """
+    assert get_enforcement_mode() is EnforcementMode.DISABLED
+    monkeypatch.setenv("UIPATH_GOVERNANCE_MODE", "enforce")
+    # The env var is only consulted on a cache miss.
+    assert get_enforcement_mode() is EnforcementMode.DISABLED
+    reset_enforcement_mode()
+    assert get_enforcement_mode() is EnforcementMode.ENFORCE
diff --git a/tests/test_guardrail_compensation.py b/tests/test_guardrail_compensation.py
@@ -89,15 +89,17 @@ def _reset_enforcement_mode():
 
 @pytest.fixture
 def _govern_env(monkeypatch):
-    """Provide the org/tenant env vars that request_governance requires.
+    """Provide the env vars that request_governance requires.
 
-    The compensating call now mirrors the policy fetch — it skips when
-    ``UIPATH_ORGANIZATION_ID`` / ``UIPATH_TENANT_ID`` are missing.
-    Tests that need the network path to actually fire must opt into
-    this fixture.
+    The compensating call mirrors the policy fetch — it skips when
+    ``UIPATH_ORGANIZATION_ID`` / ``UIPATH_TENANT_ID`` /
+    ``UIPATH_ACCESS_TOKEN`` are missing (sending without a bearer
+    token would generate a guaranteed 401 per call). Tests that need
+    the network path to actually fire must opt into this fixture.
     """
     monkeypatch.setenv("UIPATH_ORGANIZATION_ID", "appsdev")
     monkeypatch.setenv("UIPATH_TENANT_ID", "tenant-xyz")
+    monkeypatch.setenv("UIPATH_ACCESS_TOKEN", "test-token")
     yield
 
 
@@ -215,9 +217,8 @@ def test_request_governance_posts_expected_payload_and_returns_none(
     }
 
 
-def test_request_governance_sends_shared_headers(monkeypatch, _govern_env):
-    """Headers must come from the shared helper — UA + Accept + Content-Type."""
-    monkeypatch.delenv("UIPATH_ACCESS_TOKEN", raising=False)
+def test_request_governance_sends_shared_headers(_govern_env):
+    """Headers must come from the shared helper — UA + Accept + Content-Type + Auth."""
     with patch.object(
         guardrail_compensation.urllib.request,
         "urlopen",
@@ -232,8 +233,8 @@ def test_request_governance_sends_shared_headers(monkeypatch, _govern_env):
     assert request_arg.get_header("Accept") == "application/json"
     assert request_arg.get_header("Content-type") == "application/json"
     assert request_arg.get_header("User-agent") == USER_AGENT
-    # No token in env → no Authorization header.
-    assert request_arg.get_header("Authorization") is None
+    # Bearer is required (see ``test_request_governance_skipped_when_token_missing``).
+    assert request_arg.get_header("Authorization") == "Bearer test-token"
     # Tenant header must travel on the compensating POST (same as the
     # policy GET) — the agenticgovernance ingress validates it.
     assert request_arg.get_header("X-uipath-internal-tenantid") == "tenant-xyz"
@@ -252,6 +253,24 @@ def test_request_governance_includes_bearer_token_when_set(monkeypatch, _govern_
     assert request_arg.get_header("Authorization") == "Bearer the-token"
 
 
+def test_request_governance_skipped_when_token_missing(monkeypatch):
+    """Missing bearer → skip cleanly instead of sending a guaranteed-401 request.
+
+    Sending without a token would produce a 401 per compensation event
+    and pollute logs. Mirrors the org-id / tenant-id skip paths above.
+    """
+    monkeypatch.setenv("UIPATH_ORGANIZATION_ID", "appsdev")
+    monkeypatch.setenv("UIPATH_TENANT_ID", "tenant-xyz")
+    monkeypatch.delenv("UIPATH_ACCESS_TOKEN", raising=False)
+    with patch.object(
+        guardrail_compensation.urllib.request, "urlopen"
+    ) as mock_urlopen:
+        request_governance(_rules("x"), {}, "before_model", "t", "ts", "a", "r")
+    assert not mock_urlopen.called, (
+        "request_governance must NOT POST when bearer token is missing"
+    )
+
+
 def test_request_governance_skipped_when_org_id_missing(monkeypatch):
     """Without an org id, we cannot build the URL — skip the call entirely."""
     monkeypatch.delenv("UIPATH_ORGANIZATION_ID", raising=False)