Skip to content

Commit 7b0a15d

Browse files
update the schemas and corresponding tests
1 parent 74fcbeb commit 7b0a15d

6 files changed

Lines changed: 40 additions & 65 deletions

File tree

evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py

Lines changed: 9 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -66,22 +66,22 @@ def _as_float_or_none(value: JSONValue) -> float | None:
6666
return None
6767

6868

69-
ScorerStepType = Literal["session", "trace", "span"]
69+
RootType = Literal["session", "trace", "span"]
7070

7171

7272
class ScorerInvokeRequest(BaseModel):
7373
"""Request payload for Galileo Luna scorer invocation.
7474
7575
Attributes:
76-
step_type: Runtime step shape used by Galileo scorer input normalization.
76+
root_type: Runtime step shape used by Galileo scorer input normalization.
7777
input: Optional user/system prompt text.
7878
output: Optional model response text.
7979
scorer_label: Preset, registered, or fine-tuned scorer label.
8080
project_id: Optional Galileo project UUID for project-scoped scorer resolution.
8181
config: Optional scorer-specific configuration.
8282
"""
8383

84-
step_type: ScorerStepType = Field(default="span")
84+
root_type: RootType = Field(default="span")
8585
input: JSONValue = None
8686
output: JSONValue = None
8787
scorer_label: str = Field(min_length=1)
@@ -117,18 +117,6 @@ class ScorerInvokeResponse(BaseModel):
117117
error_message: str | None = None
118118
_raw_response: JSONObject = PrivateAttr(default_factory=dict)
119119

120-
@model_validator(mode="before")
121-
@classmethod
122-
def allow_legacy_metric_response(cls, data: object) -> object:
123-
if isinstance(data, dict) and "scorer_label" not in data and "metric" in data:
124-
return data | {"scorer_label": data["metric"]}
125-
return data
126-
127-
@property
128-
def metric(self) -> str:
129-
"""Backward-compatible alias for existing evaluator metadata code."""
130-
return self.scorer_label
131-
132120
@property
133121
def raw_response(self) -> JSONObject:
134122
return self._raw_response
@@ -243,10 +231,10 @@ def _endpoint_and_headers(
243231
async def invoke(
244232
self,
245233
*,
246-
metric: str,
234+
scorer_label: str,
247235
input: JSONValue = None,
248236
output: JSONValue = None,
249-
step_type: ScorerStepType = "span",
237+
root_type: RootType = "span",
250238
project_id: str | UUID | None = None,
251239
config: JSONObject | None = None,
252240
timeout: float = DEFAULT_TIMEOUT_SECS,
@@ -255,10 +243,10 @@ async def invoke(
255243
"""Invoke a Galileo Luna scorer.
256244
257245
Args:
258-
metric: Preset, registered, or fine-tuned scorer label.
246+
scorer_label: Preset, registered, or fine-tuned scorer label.
259247
input: Optional user/system prompt text.
260248
output: Optional model response text.
261-
step_type: Runtime step shape used by Galileo scorer input normalization.
249+
root_type: Runtime step shape used by Galileo scorer input normalization.
262250
project_id: Optional Galileo project UUID for project-scoped scorer resolution.
263251
config: Optional scorer-specific configuration.
264252
timeout: Request timeout in seconds.
@@ -277,10 +265,10 @@ async def invoke(
277265
raise ValueError("At least one of input or output must be provided.")
278266

279267
request_body = ScorerInvokeRequest(
280-
scorer_label=metric,
268+
scorer_label=scorer_label,
281269
input=input,
282270
output=output,
283-
step_type=step_type,
271+
root_type=root_type,
284272
project_id=project_id,
285273
config=config,
286274
).to_dict()

evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,19 +32,19 @@ class LunaEvaluatorConfig(EvaluatorConfig):
3232
"""Configuration for direct Luna scorer evaluation.
3333
3434
Attributes:
35-
metric: Preset, registered, or fine-tuned scorer name.
35+
scorer_label: Preset, registered, or fine-tuned scorer label.
3636
project_id: Optional Galileo project UUID for project-scoped scorer resolution.
3737
threshold: Local threshold used by the evaluator for comparison.
3838
operator: Local comparison operator. Numeric operators use threshold as a number.
3939
scorer_config: Optional scorer-specific config sent as ``config``.
4040
timeout_ms: Request timeout in milliseconds.
4141
on_error: Error policy: allow=fail open, deny=fail closed.
4242
payload_field: Force selected data into input or output. If omitted, root step
43-
payloads with input/output use both fields; scalar data is inferred from metric name.
43+
payloads with input/output use both fields; scalar data is sent as output when the scorer label contains "output", otherwise as input.
4444
include_raw_response: Include the raw API response in EvaluatorResult metadata.
4545
"""
4646

47-
metric: str = Field(..., min_length=1, description="Luna metric/scorer name to evaluate")
47+
scorer_label: str = Field(..., min_length=1, description="Luna scorer label to invoke")
4848
project_id: UUID | None = Field(
4949
default=None,
5050
description="Optional Galileo project UUID for project-scoped scorer resolution.",

evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ def _prepare_payload(self, data: Any) -> tuple[str | None, str | None]:
139139
return input_text, output_text
140140

141141
text = _coerce_payload_text(data)
142-
if "output" in self.config.metric:
142+
if "output" in self.config.scorer_label:
143143
return None, text
144144
return text, None
145145

@@ -190,12 +190,12 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
190190
matched=False,
191191
confidence=1.0,
192192
message="No data to score with Luna",
193-
metadata={"metric": self.config.metric},
193+
metadata={"scorer_label": self.config.scorer_label},
194194
)
195195

196196
try:
197197
response = await self._get_client().invoke(
198-
metric=self.config.metric,
198+
scorer_label=self.config.scorer_label,
199199
input=input_text if _has_text(input_text) else None,
200200
output=output_text if _has_text(output_text) else None,
201201
project_id=self.config.project_id,
@@ -227,7 +227,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
227227

228228
def _metadata(self, response: ScorerInvokeResponse) -> dict[str, Any]:
229229
metadata: dict[str, Any] = {
230-
"metric": response.scorer_label or self.config.metric,
230+
"scorer_label": response.scorer_label or self.config.scorer_label,
231231
"project_id": str(self.config.project_id) if self.config.project_id else None,
232232
"score": response.score,
233233
"threshold": self.config.threshold,
@@ -251,7 +251,7 @@ def _handle_error(self, error: Exception) -> EvaluatorResult:
251251
metadata={
252252
"error": error_detail,
253253
"error_type": type(error).__name__,
254-
"metric": self.config.metric,
254+
"scorer_label": self.config.scorer_label,
255255
"fallback_action": fallback,
256256
},
257257
error=None if matched else error_detail,

evaluators/contrib/galileo/tests/test_luna_evaluator.py

Lines changed: 19 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,15 @@ def test_config_accepts_direct_scorer_fields(self) -> None:
2727

2828
# Given: a direct scorer config with local thresholding
2929
config = LunaEvaluatorConfig(
30-
metric="toxicity",
30+
scorer_label="toxicity",
3131
project_id="12345678-1234-5678-1234-567812345678",
3232
threshold=0.7,
3333
operator="gte",
3434
config={"temperature": 0},
3535
)
3636

3737
# Then: config is retained without Protect concepts
38-
assert config.metric == "toxicity"
38+
assert config.scorer_label == "toxicity"
3939
assert str(config.project_id) == "12345678-1234-5678-1234-567812345678"
4040
assert config.threshold == 0.7
4141
assert config.operator == "gte"
@@ -46,7 +46,7 @@ def test_numeric_operator_requires_numeric_threshold(self) -> None:
4646

4747
# Given/When/Then: numeric local comparison rejects non-numeric thresholds
4848
with pytest.raises(ValidationError, match="numeric threshold"):
49-
LunaEvaluatorConfig(metric="toxicity", threshold="high", operator="gte")
49+
LunaEvaluatorConfig(scorer_label="toxicity", threshold="high", operator="gte")
5050

5151

5252
class TestGalileoLunaClient:
@@ -65,7 +65,7 @@ def test_scorer_invoke_request_matches_orbit_schema_shape(self) -> None:
6565

6666
# Then: the serialized payload uses the Orbit scorer invoke fields
6767
assert request.to_dict() == {
68-
"step_type": "span",
68+
"root_type": "span",
6969
"input": {"messages": [{"role": "user", "content": "hello"}]},
7070
"scorer_label": "toxicity",
7171
"project_id": "12345678-1234-5678-1234-567812345678",
@@ -102,21 +102,8 @@ def test_scorer_invoke_response_matches_orbit_schema_shape(self) -> None:
102102
"error_message": None,
103103
}
104104
assert response.scorer_label == "toxicity"
105-
assert response.metric == "toxicity"
106105
assert response.raw_response["scorer_label"] == "toxicity"
107106

108-
def test_scorer_invoke_response_accepts_legacy_metric_field(self) -> None:
109-
from agent_control_evaluator_galileo.luna import ScorerInvokeResponse
110-
111-
# Given/When: an older API response uses metric instead of scorer_label
112-
response = ScorerInvokeResponse.from_dict(
113-
{"metric": "toxicity", "score": 0.82, "status": "success"}
114-
)
115-
116-
# Then: the client still normalizes it to the current response contract
117-
assert response.scorer_label == "toxicity"
118-
assert response.model_dump()["scorer_label"] == "toxicity"
119-
120107
def test_client_uses_protect_api_url_derivation(self) -> None:
121108
from agent_control_evaluator_galileo.luna import GalileoLunaClient
122109

@@ -187,7 +174,7 @@ def handler(request: httpx.Request) -> httpx.Response:
187174
try:
188175
# When: invoking a scorer
189176
response = await client.invoke(
190-
metric="toxicity",
177+
scorer_label="toxicity",
191178
input="user prompt",
192179
output="model answer",
193180
project_id="12345678-1234-5678-1234-567812345678",
@@ -204,7 +191,7 @@ def handler(request: httpx.Request) -> httpx.Response:
204191
"output": "model answer",
205192
"scorer_label": "toxicity",
206193
"project_id": "12345678-1234-5678-1234-567812345678",
207-
"step_type": "span",
194+
"root_type": "span",
208195
"config": {"top_k": 1},
209196
}
210197
assert "stage_name" not in captured["body"]
@@ -241,7 +228,7 @@ def handler(request: httpx.Request) -> httpx.Response:
241228
try:
242229
# When: invoking a scorer with project context
243230
response = await client.invoke(
244-
metric="toxicity",
231+
scorer_label="toxicity",
245232
output="model answer",
246233
project_id="12345678-1234-5678-1234-567812345678",
247234
)
@@ -255,7 +242,7 @@ def handler(request: httpx.Request) -> httpx.Response:
255242
"output": "model answer",
256243
"scorer_label": "toxicity",
257244
"project_id": "12345678-1234-5678-1234-567812345678",
258-
"step_type": "span",
245+
"root_type": "span",
259246
}
260247
headers = captured["headers"]
261248
assert isinstance(headers, dict)
@@ -278,7 +265,7 @@ async def test_client_requires_project_id_for_internal_jwt(self) -> None:
278265

279266
# When/Then: project_id is required because API uses it as the internal auth context
280267
with pytest.raises(ValueError, match="project_id is required"):
281-
await client.invoke(metric="toxicity", output="model answer")
268+
await client.invoke(scorer_label="toxicity", output="model answer")
282269

283270

284271
class TestLunaEvaluator:
@@ -296,15 +283,15 @@ def test_evaluator_init_without_auth_raises(self) -> None:
296283
from agent_control_evaluator_galileo.luna import LunaEvaluator
297284

298285
with pytest.raises(ValueError, match="GALILEO_API_SECRET_KEY or GALILEO_API_KEY"):
299-
LunaEvaluator.from_dict({"metric": "toxicity", "threshold": 0.5})
286+
LunaEvaluator.from_dict({"scorer_label": "toxicity", "threshold": 0.5})
300287

301288
@patch.dict(os.environ, {"GALILEO_API_SECRET_KEY": "test-secret"}, clear=True)
302289
def test_evaluator_init_accepts_api_secret(self) -> None:
303290
from agent_control_evaluator_galileo.luna import LunaEvaluator
304291

305292
evaluator = LunaEvaluator.from_dict(
306293
{
307-
"metric": "toxicity",
294+
"scorer_label": "toxicity",
308295
"project_id": "12345678-1234-5678-1234-567812345678",
309296
"threshold": 0.5,
310297
}
@@ -321,7 +308,7 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None:
321308
# Given: a direct Luna evaluator and a raw successful scorer response
322309
evaluator = LunaEvaluator.from_dict(
323310
{
324-
"metric": "toxicity",
311+
"scorer_label": "toxicity",
325312
"project_id": "12345678-1234-5678-1234-567812345678",
326313
"threshold": 0.7,
327314
"operator": "gte",
@@ -350,7 +337,7 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None:
350337
assert result.matched is True
351338
assert result.confidence == 0.82
352339
assert result.metadata == {
353-
"metric": "toxicity",
340+
"scorer_label": "toxicity",
354341
"project_id": "12345678-1234-5678-1234-567812345678",
355342
"score": 0.82,
356343
"threshold": 0.7,
@@ -360,7 +347,7 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None:
360347
"error_message": None,
361348
}
362349
mock_invoke.assert_awaited_once_with(
363-
metric="toxicity",
350+
scorer_label="toxicity",
364351
input="user prompt",
365352
output="model answer",
366353
project_id=evaluator.config.project_id,
@@ -376,7 +363,7 @@ async def test_evaluator_returns_non_match_below_threshold(self) -> None:
376363

377364
# Given: a raw scorer value below the local threshold
378365
evaluator = LunaEvaluator.from_dict(
379-
{"metric": "toxicity", "threshold": 0.7, "operator": "gte"}
366+
{"scorer_label": "toxicity", "threshold": 0.7, "operator": "gte"}
380367
)
381368

382369
with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
@@ -393,7 +380,7 @@ async def test_evaluator_returns_non_match_below_threshold(self) -> None:
393380
assert result.matched is False
394381
assert result.confidence == 0.2
395382
mock_invoke.assert_awaited_once_with(
396-
metric="toxicity",
383+
scorer_label="toxicity",
397384
input="hello",
398385
output=None,
399386
project_id=None,
@@ -408,7 +395,7 @@ async def test_evaluator_does_not_call_api_for_empty_data(self) -> None:
408395
from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
409396

410397
# Given: an evaluator and empty selected data
411-
evaluator = LunaEvaluator.from_dict({"metric": "toxicity", "threshold": 0.5})
398+
evaluator = LunaEvaluator.from_dict({"scorer_label": "toxicity", "threshold": 0.5})
412399

413400
with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
414401
# When: evaluating empty data
@@ -427,7 +414,7 @@ async def test_evaluator_fail_open_sets_error(self) -> None:
427414
from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
428415

429416
# Given: default fail-open behavior
430-
evaluator = LunaEvaluator.from_dict({"metric": "toxicity", "threshold": 0.5})
417+
evaluator = LunaEvaluator.from_dict({"scorer_label": "toxicity", "threshold": 0.5})
431418

432419
with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
433420
mock_invoke.side_effect = RuntimeError("service unavailable")
@@ -449,7 +436,7 @@ async def test_evaluator_fail_closed_matches_without_error_field(self) -> None:
449436

450437
# Given: fail-closed behavior for scorer errors
451438
evaluator = LunaEvaluator.from_dict(
452-
{"metric": "toxicity", "threshold": 0.5, "on_error": "deny"}
439+
{"scorer_label": "toxicity", "threshold": 0.5, "on_error": "deny"}
453440
)
454441

455442
with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:

examples/galileo_luna/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ export GALILEO_PROJECT_ID="00000000-0000-0000-0000-000000000000"
3333
Optional scorer settings:
3434

3535
```bash
36-
export GALILEO_LUNA_METRIC="toxicity"
36+
export GALILEO_LUNA_SCORER_LABEL="toxicity"
3737
export GALILEO_LUNA_THRESHOLD="0.5"
3838
```
3939

examples/galileo_luna/setup_controls.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
AGENT_DESCRIPTION = "Demo agent protected by direct Galileo Luna scorer controls"
2424
SERVER_URL = os.getenv("AGENT_CONTROL_URL", "http://localhost:8000")
2525

26-
LUNA_METRIC = os.getenv("GALILEO_LUNA_METRIC", "toxicity")
26+
LUNA_SCORER_LABEL = os.getenv("GALILEO_LUNA_SCORER_LABEL", "toxicity")
2727
LUNA_THRESHOLD = float(os.getenv("GALILEO_LUNA_THRESHOLD", "0.5"))
2828
GALILEO_PROJECT_ID = os.getenv("GALILEO_PROJECT_ID")
2929

@@ -41,7 +41,7 @@
4141
def luna_config() -> dict[str, Any]:
4242
"""Build the direct Luna evaluator config used by the composite control."""
4343
config: dict[str, Any] = {
44-
"metric": LUNA_METRIC,
44+
"scorer_label": LUNA_SCORER_LABEL,
4545
"threshold": LUNA_THRESHOLD,
4646
"operator": "gte",
4747
"payload_field": "output",
@@ -158,7 +158,7 @@ async def setup_demo() -> None:
158158
print("Setting up direct Galileo Luna demo controls")
159159
print(f"Server: {SERVER_URL}")
160160
print(f"Agent: {AGENT_NAME}")
161-
print(f"Luna: metric={LUNA_METRIC!r}, threshold={LUNA_THRESHOLD}")
161+
print(f"Luna: scorer_label={LUNA_SCORER_LABEL!r}, threshold={LUNA_THRESHOLD}")
162162
if GALILEO_PROJECT_ID:
163163
print(f"Project ID: {GALILEO_PROJECT_ID}")
164164

0 commit comments

Comments
 (0)