deprecate target_role in guardian_check helper

psschwei · psschwei · commit b76390a2f6fb · 2026-05-07T17:48:31.000-04:00
Signed-off-by: Paul S. Schweigert <paul@paulschweigert.com>
diff --git a/mellea/stdlib/components/intrinsic/guardian.py b/mellea/stdlib/components/intrinsic/guardian.py
@@ -9,10 +9,21 @@
 resolved kwargs through.
 """
 
+import warnings
+
 from ....backends.adapters import AdapterMixin
+from ....core.utils import MelleaLogger
 from ...context import ChatContext
 from ._util import call_intrinsic
 
+_UNSET: object = object()
+"""Sentinel distinguishing 'caller omitted scoring_schema' from 'caller passed
+the default value explicitly'. Used only to detect conflicts with the
+deprecated ``target_role`` kwarg."""
+
+_TARGET_ROLE_TO_SCHEMA = {"user": "user_prompt", "assistant": "assistant_response"}
+"""Mapping used by the deprecated ``target_role`` path of :func:`guardian_check`."""
+
 
 def policy_guardrails(
     context: ChatContext, backend: AdapterMixin, policy_text: str
@@ -153,7 +164,8 @@ def guardian_check(
     context: ChatContext,
     backend: AdapterMixin,
     criteria: str,
-    scoring_schema: str = "assistant_response",
+    scoring_schema: str | object = _UNSET,
+    target_role: str | None = None,
 ) -> float:
     """Check whether text meets specified safety/quality criteria.
 
@@ -170,15 +182,55 @@ def guardian_check(
         scoring_schema: Sentence that tells the judge which span to
             evaluate and how to decide. Can be a key from
             :data:`SCORING_SCHEMA_BANK` (e.g. ``"user_prompt"``) or a
-            custom string. Must still resolve to a yes/no verdict —
-            the adapter's ``response_format`` constrains output to
-            ``"yes"``/``"no"``.
+            custom string. Defaults to ``"assistant_response"``. Must
+            still resolve to a yes/no verdict — the adapter's
+            ``response_format`` constrains output to ``"yes"``/``"no"``.
+        target_role: Deprecated. Role whose last message is being
+            evaluated (``"user"`` or ``"assistant"``). Prefer
+            ``scoring_schema`` with a key from
+            :data:`SCORING_SCHEMA_BANK`. Passing both
+            ``scoring_schema`` and ``target_role`` raises
+            :class:`TypeError`.
 
     Returns:
         Risk score as a float between 0.0 (no risk) and 1.0 (risk detected).
     """
+    if target_role is not None:
+        warnings.warn(
+            "`target_role` is deprecated; use `scoring_schema` instead "
+            "(e.g. scoring_schema='user_prompt'). Will be removed in a "
+            "future release.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        if scoring_schema is not _UNSET:
+            raise TypeError("Pass either `scoring_schema` or `target_role`, not both.")
+        if target_role not in _TARGET_ROLE_TO_SCHEMA:
+            raise ValueError(
+                f"target_role must be 'user' or 'assistant', got {target_role!r}"
+            )
+        resolved_schema = _TARGET_ROLE_TO_SCHEMA[target_role]
+    elif scoring_schema is _UNSET:
+        resolved_schema = "assistant_response"
+    else:
+        assert isinstance(scoring_schema, str)
+        if scoring_schema in _TARGET_ROLE_TO_SCHEMA:
+            # Looks like an old-style target_role value passed positionally.
+            suggested = _TARGET_ROLE_TO_SCHEMA[scoring_schema]
+            MelleaLogger.get_logger().warning(
+                "guardian_check(scoring_schema=%r) looks like an old-style "
+                "target_role value. It will be used as a literal "
+                "scoring-schema sentence, which is probably not what you "
+                "want. Did you mean scoring_schema=%r? (target_role is "
+                "deprecated; prefer SCORING_SCHEMA_BANK keys like "
+                "'user_prompt' or 'assistant_response'.)",
+                scoring_schema,
+                suggested,
+            )
+        resolved_schema = scoring_schema
+
     criteria_text = CRITERIA_BANK.get(criteria, criteria)
-    scoring_schema_text = SCORING_SCHEMA_BANK.get(scoring_schema, scoring_schema)
+    scoring_schema_text = SCORING_SCHEMA_BANK.get(resolved_schema, resolved_schema)
     result_json = call_intrinsic(
         "guardian-core",
         context,
diff --git a/test/stdlib/components/intrinsic/test_guardian_deprecation.py b/test/stdlib/components/intrinsic/test_guardian_deprecation.py
@@ -0,0 +1,109 @@
+"""Unit tests for the deprecated ``target_role`` path of ``guardian_check``.
+
+Exercises the sentinel/mapping logic without touching a model. We monkeypatch
+``call_intrinsic`` and assert on (a) the ``kwargs["scoring_schema"]`` that
+reaches the adapter boundary and (b) the warnings/errors the caller sees.
+"""
+
+import warnings
+
+import pytest
+
+from mellea.stdlib.components.intrinsic import guardian
+from mellea.stdlib.context import ChatContext
+
+
+@pytest.fixture
+def capture_kwargs(monkeypatch):
+    """Replace call_intrinsic with a spy that returns a stub yes=1.0 result."""
+    captured: dict = {}
+
+    def fake_call_intrinsic(name, context, backend, /, kwargs=None, model_options=None):
+        captured["name"] = name
+        captured["kwargs"] = kwargs
+        return {"guardian": {"score": 1.0}}
+
+    monkeypatch.setattr(guardian, "call_intrinsic", fake_call_intrinsic)
+    return captured
+
+
+def test_default_scoring_schema_resolves_to_assistant_response(capture_kwargs):
+    guardian.guardian_check(ChatContext(), object(), criteria="harm")
+    assert (
+        capture_kwargs["kwargs"]["scoring_schema"]
+        == guardian.SCORING_SCHEMA_BANK["assistant_response"]
+    )
+
+
+def test_target_role_user_maps_to_user_prompt_with_deprecation_warning(capture_kwargs):
+    with pytest.warns(DeprecationWarning, match="target_role"):
+        guardian.guardian_check(
+            ChatContext(), object(), criteria="harm", target_role="user"
+        )
+    assert (
+        capture_kwargs["kwargs"]["scoring_schema"]
+        == guardian.SCORING_SCHEMA_BANK["user_prompt"]
+    )
+
+
+def test_target_role_assistant_maps_to_assistant_response_with_warning(capture_kwargs):
+    with pytest.warns(DeprecationWarning, match="target_role"):
+        guardian.guardian_check(
+            ChatContext(), object(), criteria="harm", target_role="assistant"
+        )
+    assert (
+        capture_kwargs["kwargs"]["scoring_schema"]
+        == guardian.SCORING_SCHEMA_BANK["assistant_response"]
+    )
+
+
+def test_target_role_invalid_value_raises_value_error(capture_kwargs):
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", DeprecationWarning)
+        with pytest.raises(ValueError, match="target_role must be"):
+            guardian.guardian_check(
+                ChatContext(), object(), criteria="harm", target_role="system"
+            )
+
+
+def test_passing_both_scoring_schema_and_target_role_raises_type_error(capture_kwargs):
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", DeprecationWarning)
+        with pytest.raises(TypeError, match="not both"):
+            guardian.guardian_check(
+                ChatContext(),
+                object(),
+                criteria="harm",
+                scoring_schema="user_prompt",
+                target_role="user",
+            )
+
+
+def test_positional_user_logs_warning_and_sends_literal(capture_kwargs, caplog):
+    """Positional 'user' is NOT auto-remapped — it's sent as a literal schema
+    sentence, with a logger warning pointing the caller at the fix.
+    """
+    with caplog.at_level("WARNING"):
+        guardian.guardian_check(ChatContext(), object(), "harm", "user")
+    # The literal "user" flows to the adapter unchanged.
+    assert capture_kwargs["kwargs"]["scoring_schema"] == "user"
+    # The warning text nudges the caller toward the bank key.
+    assert any("user_prompt" in rec.message for rec in caplog.records)
+
+
+def test_scoring_schema_bank_key_resolves_to_full_sentence(capture_kwargs):
+    guardian.guardian_check(
+        ChatContext(), object(), criteria="harm", scoring_schema="tool_call"
+    )
+    assert (
+        capture_kwargs["kwargs"]["scoring_schema"]
+        == guardian.SCORING_SCHEMA_BANK["tool_call"]
+    )
+
+
+def test_custom_scoring_schema_passes_through(capture_kwargs):
+    custom = "If the previous turn mentions cats, return 'yes'; otherwise, return 'no'."
+    guardian.guardian_check(
+        ChatContext(), object(), criteria="harm", scoring_schema=custom
+    )
+    assert capture_kwargs["kwargs"]["scoring_schema"] == custom