Skip to content

Commit b8d180d

Browse files
committed
make scoring schema configurable
Signed-off-by: Paul S. Schweigert <paul@paulschweigert.com>
1 parent 9ce796f commit b8d180d

13 files changed

Lines changed: 240 additions & 35 deletions

File tree

docs/examples/intrinsics/guardian_core.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,9 @@
2626
# ---- Example 1: Prompt-level harm detection ----
2727
print("--- Example 1: Prompt-level Harm Detection ---")
2828
context = ctx.add(Message("user", "How can I hack my friend's email?"))
29-
score = guardian.guardian_check(context, backend, criteria="harm", target_role="user")
29+
score = guardian.guardian_check(
30+
context, backend, criteria="harm", scoring_schema="user_prompt"
31+
)
3032
verdict = "Risk detected" if score >= 0.5 else "Safe"
3133
print(f" Score: {score:.4f} ({verdict})")
3234
print()
@@ -81,7 +83,7 @@
8183
"information that is included as a part of a prompt."
8284
)
8385
score = guardian.guardian_check(
84-
context, backend, criteria=custom_criteria, target_role="user"
86+
context, backend, criteria=custom_criteria, scoring_schema="user_prompt"
8587
)
8688
verdict = "Risk detected" if score >= 0.5 else "Safe"
8789
print(f" Score: {score:.4f} ({verdict})")

mellea/backends/adapters/_overlays/guardian-core/granite-4.0-micro/lora/io.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ instruction: |-
3030
3131
### Criteria: {criteria}
3232
33-
### Scoring Schema: If the last {target_role}'s text meets the criteria, return 'yes'; otherwise, return 'no'.
33+
### Scoring Schema: {scoring_schema}
3434
parameters:
3535
max_completion_tokens: 15
3636
temperature: 0.0

mellea/backends/adapters/_overlays/guardian-core/granite-4.1-30b/alora/io.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ instruction: |-
3030
3131
### Criteria: {criteria}
3232
33-
### Scoring Schema: If the last {target_role}'s text meets the criteria, return 'yes'; otherwise, return 'no'.
33+
### Scoring Schema: {scoring_schema}
3434
parameters:
3535
max_completion_tokens: 15
3636
temperature: 0.0

mellea/backends/adapters/_overlays/guardian-core/granite-4.1-30b/lora/io.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ instruction: |-
3030
3131
### Criteria: {criteria}
3232
33-
### Scoring Schema: If the last {target_role}'s text meets the criteria, return 'yes'; otherwise, return 'no'.
33+
### Scoring Schema: {scoring_schema}
3434
parameters:
3535
max_completion_tokens: 15
3636
temperature: 0.0

mellea/backends/adapters/_overlays/guardian-core/granite-4.1-3b/alora/io.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ instruction: |-
3030
3131
### Criteria: {criteria}
3232
33-
### Scoring Schema: If the last {target_role}'s text meets the criteria, return 'yes'; otherwise, return 'no'.
33+
### Scoring Schema: {scoring_schema}
3434
parameters:
3535
max_completion_tokens: 15
3636
temperature: 0.0

mellea/backends/adapters/_overlays/guardian-core/granite-4.1-3b/lora/io.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ instruction: |-
3030
3131
### Criteria: {criteria}
3232
33-
### Scoring Schema: If the last {target_role}'s text meets the criteria, return 'yes'; otherwise, return 'no'.
33+
### Scoring Schema: {scoring_schema}
3434
parameters:
3535
max_completion_tokens: 15
3636
temperature: 0.0

mellea/backends/adapters/_overlays/guardian-core/granite-4.1-8b/alora/io.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ instruction: |-
3030
3131
### Criteria: {criteria}
3232
33-
### Scoring Schema: If the last {target_role}'s text meets the criteria, return 'yes'; otherwise, return 'no'.
33+
### Scoring Schema: {scoring_schema}
3434
parameters:
3535
max_completion_tokens: 15
3636
temperature: 0.0

mellea/backends/adapters/_overlays/guardian-core/granite-4.1-8b/lora/io.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ instruction: |-
3030
3131
### Criteria: {criteria}
3232
33-
### Scoring Schema: If the last {target_role}'s text meets the criteria, return 'yes'; otherwise, return 'no'.
33+
### Scoring Schema: {scoring_schema}
3434
parameters:
3535
max_completion_tokens: 15
3636
temperature: 0.0

mellea/stdlib/components/intrinsic/guardian.py

Lines changed: 91 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,21 @@
99
resolved kwargs through.
1010
"""
1111

12+
import warnings
13+
1214
from ....backends.adapters import AdapterMixin
15+
from ....core.utils import MelleaLogger
1316
from ...context import ChatContext
1417
from ._util import call_intrinsic
1518

19+
_UNSET: object = object()
20+
"""Sentinel distinguishing 'caller omitted scoring_schema' from 'caller passed
21+
the default value explicitly'. Used only to detect conflicts with the
22+
deprecated ``target_role`` kwarg."""
23+
24+
_TARGET_ROLE_TO_SCHEMA = {"user": "user_prompt", "assistant": "assistant_response"}
25+
"""Mapping used by the deprecated ``target_role`` path of :func:`guardian_check`."""
26+
1627

1728
def policy_guardrails(
1829
context: ChatContext, backend: AdapterMixin, policy_text: str
@@ -45,6 +56,33 @@ def policy_guardrails(
4556
)
4657

4758

59+
SCORING_SCHEMA_BANK = {
60+
"assistant_response": (
61+
"If the last assistant's text meets the criteria, return 'yes'; "
62+
"otherwise, return 'no'."
63+
),
64+
"user_prompt": (
65+
"If the last user's text meets the criteria, return 'yes'; "
66+
"otherwise, return 'no'."
67+
),
68+
"last_turn": (
69+
"If the previous turn meets the criteria, return 'yes'; otherwise, return 'no'."
70+
),
71+
"tool_call": (
72+
"If the assistant's tool call meets the criteria, return 'yes'; "
73+
"otherwise, return 'no'."
74+
),
75+
}
76+
"""Pre-baked scoring-schema phrasings for :func:`guardian_check`.
77+
78+
Keys can be passed directly as the ``scoring_schema`` parameter; any
79+
other string is used verbatim. Entries must resolve to a yes/no
80+
verdict — the adapter's ``io.yaml`` constrains output to ``"yes"`` or
81+
``"no"``, so a schema like ``'return "safe" or "unsafe"'`` will be
82+
coerced to yes/no by constrained decoding.
83+
"""
84+
85+
4886
CRITERIA_BANK = {
4987
"harm": (
5088
"Texts deemed universally harmful are those that pose harm across all "
@@ -126,31 +164,78 @@ def guardian_check(
126164
context: ChatContext,
127165
backend: AdapterMixin,
128166
criteria: str,
129-
target_role: str = "assistant",
167+
scoring_schema: str | object = _UNSET,
168+
target_role: str | None = None,
130169
) -> float:
131170
"""Check whether text meets specified safety/quality criteria.
132171
133-
Uses the guardian-core LoRA adapter to judge whether the last message
134-
from ``target_role`` in ``context`` meets the given criteria.
172+
Uses the guardian-core LoRA adapter to judge whether the span
173+
identified by ``scoring_schema`` in ``context`` meets the given
174+
criteria.
135175
136176
Args:
137177
context: Chat context containing the conversation to evaluate.
138178
backend: Backend instance that supports LoRA adapters.
139179
criteria: Description of the criteria to check against. Can be a
140180
key from :data:`CRITERIA_BANK` (e.g. ``"harm"``) or a custom
141181
criteria string.
142-
target_role: Role whose last message is being evaluated
143-
(``"user"`` or ``"assistant"``).
182+
scoring_schema: Sentence that tells the judge which span to
183+
evaluate and how to decide. Can be a key from
184+
:data:`SCORING_SCHEMA_BANK` (e.g. ``"user_prompt"``) or a
185+
custom string. Defaults to ``"assistant_response"``. Must
186+
still resolve to a yes/no verdict — the adapter's
187+
``response_format`` constrains output to ``"yes"``/``"no"``.
188+
target_role: Deprecated. Role whose last message is being
189+
evaluated (``"user"`` or ``"assistant"``). Prefer
190+
``scoring_schema`` with a key from
191+
:data:`SCORING_SCHEMA_BANK`. Passing both
192+
``scoring_schema`` and ``target_role`` raises
193+
:class:`TypeError`; an unrecognized role raises :class:`ValueError`.
144194
145195
Returns:
146196
Risk score as a float between 0.0 (no risk) and 1.0 (risk detected).
147197
"""
198+
if target_role is not None:
199+
warnings.warn(
200+
"`target_role` is deprecated; use `scoring_schema` instead "
201+
"(e.g. scoring_schema='user_prompt'). Will be removed in a "
202+
"future release.",
203+
DeprecationWarning,
204+
stacklevel=2,
205+
)
206+
if scoring_schema is not _UNSET:
207+
raise TypeError("Pass either `scoring_schema` or `target_role`, not both.")
208+
if target_role not in _TARGET_ROLE_TO_SCHEMA:
209+
raise ValueError(
210+
f"target_role must be 'user' or 'assistant', got {target_role!r}"
211+
)
212+
resolved_schema = _TARGET_ROLE_TO_SCHEMA[target_role]
213+
elif scoring_schema is _UNSET:
214+
resolved_schema = "assistant_response"
215+
else:
216+
assert isinstance(scoring_schema, str)
217+
if scoring_schema in _TARGET_ROLE_TO_SCHEMA:
218+
# Looks like an old-style target_role value (e.g. passed positionally).
219+
suggested = _TARGET_ROLE_TO_SCHEMA[scoring_schema]
220+
MelleaLogger.get_logger().warning(
221+
"guardian_check(scoring_schema=%r) looks like an old-style "
222+
"target_role value. It will be used as a literal "
223+
"scoring-schema sentence, which is probably not what you "
224+
"want. Did you mean scoring_schema=%r? (target_role is "
225+
"deprecated; prefer SCORING_SCHEMA_BANK keys like "
226+
"'user_prompt' or 'assistant_response'.)",
227+
scoring_schema,
228+
suggested,
229+
)
230+
resolved_schema = scoring_schema
231+
148232
criteria_text = CRITERIA_BANK.get(criteria, criteria)
233+
scoring_schema_text = SCORING_SCHEMA_BANK.get(resolved_schema, resolved_schema)
149234
result_json = call_intrinsic(
150235
"guardian-core",
151236
context,
152237
backend,
153-
kwargs={"criteria": criteria_text, "target_role": target_role},
238+
kwargs={"criteria": criteria_text, "scoring_schema": scoring_schema_text},
154239
)
155240
return result_json["guardian"]["score"]
156241

test/backends/test_openai_intrinsics.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -460,7 +460,7 @@ def test_call_intrinsic_guardian_check_harm(call_intrinsic_backend):
460460
context = _read_guardian_input("guardian_core.json")
461461

462462
result = guardian.guardian_check(
463-
context, call_intrinsic_backend, criteria="harm", target_role="user"
463+
context, call_intrinsic_backend, criteria="harm", scoring_schema="user_prompt"
464464
)
465465
assert isinstance(result, float)
466466
assert 0.0 <= result <= 1.0

0 commit comments

Comments
 (0)