@@ -27,15 +27,15 @@ def test_config_accepts_direct_scorer_fields(self) -> None:
2727
2828 # Given: a direct scorer config with local thresholding
2929 config = LunaEvaluatorConfig (
30- metric = "toxicity" ,
30+ scorer_label = "toxicity" ,
3131 project_id = "12345678-1234-5678-1234-567812345678" ,
3232 threshold = 0.7 ,
3333 operator = "gte" ,
3434 config = {"temperature" : 0 },
3535 )
3636
3737 # Then: config is retained without Protect concepts
38- assert config .metric == "toxicity"
38+ assert config .scorer_label == "toxicity"
3939 assert str (config .project_id ) == "12345678-1234-5678-1234-567812345678"
4040 assert config .threshold == 0.7
4141 assert config .operator == "gte"
@@ -46,7 +46,7 @@ def test_numeric_operator_requires_numeric_threshold(self) -> None:
4646
4747 # Given/When/Then: numeric local comparison rejects non-numeric thresholds
4848 with pytest .raises (ValidationError , match = "numeric threshold" ):
49- LunaEvaluatorConfig (metric = "toxicity" , threshold = "high" , operator = "gte" )
49+ LunaEvaluatorConfig (scorer_label = "toxicity" , threshold = "high" , operator = "gte" )
5050
5151
5252class TestGalileoLunaClient :
@@ -65,7 +65,7 @@ def test_scorer_invoke_request_matches_orbit_schema_shape(self) -> None:
6565
6666 # Then: the serialized payload uses the Orbit scorer invoke fields
6767 assert request .to_dict () == {
68- "step_type " : "span" ,
68+ "root_type " : "span" ,
6969 "input" : {"messages" : [{"role" : "user" , "content" : "hello" }]},
7070 "scorer_label" : "toxicity" ,
7171 "project_id" : "12345678-1234-5678-1234-567812345678" ,
@@ -102,21 +102,8 @@ def test_scorer_invoke_response_matches_orbit_schema_shape(self) -> None:
102102 "error_message" : None ,
103103 }
104104 assert response .scorer_label == "toxicity"
105- assert response .metric == "toxicity"
106105 assert response .raw_response ["scorer_label" ] == "toxicity"
107106
108- def test_scorer_invoke_response_accepts_legacy_metric_field (self ) -> None :
109- from agent_control_evaluator_galileo .luna import ScorerInvokeResponse
110-
111- # Given/When: an older API response uses metric instead of scorer_label
112- response = ScorerInvokeResponse .from_dict (
113- {"metric" : "toxicity" , "score" : 0.82 , "status" : "success" }
114- )
115-
116- # Then: the client still normalizes it to the current response contract
117- assert response .scorer_label == "toxicity"
118- assert response .model_dump ()["scorer_label" ] == "toxicity"
119-
120107 def test_client_uses_protect_api_url_derivation (self ) -> None :
121108 from agent_control_evaluator_galileo .luna import GalileoLunaClient
122109
@@ -187,7 +174,7 @@ def handler(request: httpx.Request) -> httpx.Response:
187174 try :
188175 # When: invoking a scorer
189176 response = await client .invoke (
190- metric = "toxicity" ,
177+ scorer_label = "toxicity" ,
191178 input = "user prompt" ,
192179 output = "model answer" ,
193180 project_id = "12345678-1234-5678-1234-567812345678" ,
@@ -204,7 +191,7 @@ def handler(request: httpx.Request) -> httpx.Response:
204191 "output" : "model answer" ,
205192 "scorer_label" : "toxicity" ,
206193 "project_id" : "12345678-1234-5678-1234-567812345678" ,
207- "step_type " : "span" ,
194+ "root_type " : "span" ,
208195 "config" : {"top_k" : 1 },
209196 }
210197 assert "stage_name" not in captured ["body" ]
@@ -241,7 +228,7 @@ def handler(request: httpx.Request) -> httpx.Response:
241228 try :
242229 # When: invoking a scorer with project context
243230 response = await client .invoke (
244- metric = "toxicity" ,
231+ scorer_label = "toxicity" ,
245232 output = "model answer" ,
246233 project_id = "12345678-1234-5678-1234-567812345678" ,
247234 )
@@ -255,7 +242,7 @@ def handler(request: httpx.Request) -> httpx.Response:
255242 "output" : "model answer" ,
256243 "scorer_label" : "toxicity" ,
257244 "project_id" : "12345678-1234-5678-1234-567812345678" ,
258- "step_type " : "span" ,
245+ "root_type " : "span" ,
259246 }
260247 headers = captured ["headers" ]
261248 assert isinstance (headers , dict )
@@ -278,7 +265,7 @@ async def test_client_requires_project_id_for_internal_jwt(self) -> None:
278265
279266 # When/Then: project_id is required because API uses it as the internal auth context
280267 with pytest .raises (ValueError , match = "project_id is required" ):
281- await client .invoke (metric = "toxicity" , output = "model answer" )
268+ await client .invoke (scorer_label = "toxicity" , output = "model answer" )
282269
283270
284271class TestLunaEvaluator :
@@ -296,15 +283,15 @@ def test_evaluator_init_without_auth_raises(self) -> None:
296283 from agent_control_evaluator_galileo .luna import LunaEvaluator
297284
298285 with pytest .raises (ValueError , match = "GALILEO_API_SECRET_KEY or GALILEO_API_KEY" ):
299- LunaEvaluator .from_dict ({"metric " : "toxicity" , "threshold" : 0.5 })
286+ LunaEvaluator .from_dict ({"scorer_label " : "toxicity" , "threshold" : 0.5 })
300287
301288 @patch .dict (os .environ , {"GALILEO_API_SECRET_KEY" : "test-secret" }, clear = True )
302289 def test_evaluator_init_accepts_api_secret (self ) -> None :
303290 from agent_control_evaluator_galileo .luna import LunaEvaluator
304291
305292 evaluator = LunaEvaluator .from_dict (
306293 {
307- "metric " : "toxicity" ,
294+ "scorer_label " : "toxicity" ,
308295 "project_id" : "12345678-1234-5678-1234-567812345678" ,
309296 "threshold" : 0.5 ,
310297 }
@@ -321,7 +308,7 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None:
321308 # Given: a direct Luna evaluator and a raw successful scorer response
322309 evaluator = LunaEvaluator .from_dict (
323310 {
324- "metric " : "toxicity" ,
311+ "scorer_label " : "toxicity" ,
325312 "project_id" : "12345678-1234-5678-1234-567812345678" ,
326313 "threshold" : 0.7 ,
327314 "operator" : "gte" ,
@@ -350,7 +337,7 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None:
350337 assert result .matched is True
351338 assert result .confidence == 0.82
352339 assert result .metadata == {
353- "metric " : "toxicity" ,
340+ "scorer_label " : "toxicity" ,
354341 "project_id" : "12345678-1234-5678-1234-567812345678" ,
355342 "score" : 0.82 ,
356343 "threshold" : 0.7 ,
@@ -360,7 +347,7 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None:
360347 "error_message" : None ,
361348 }
362349 mock_invoke .assert_awaited_once_with (
363- metric = "toxicity" ,
350+ scorer_label = "toxicity" ,
364351 input = "user prompt" ,
365352 output = "model answer" ,
366353 project_id = evaluator .config .project_id ,
@@ -376,7 +363,7 @@ async def test_evaluator_returns_non_match_below_threshold(self) -> None:
376363
377364 # Given: a raw scorer value below the local threshold
378365 evaluator = LunaEvaluator .from_dict (
379- {"metric " : "toxicity" , "threshold" : 0.7 , "operator" : "gte" }
366+ {"scorer_label " : "toxicity" , "threshold" : 0.7 , "operator" : "gte" }
380367 )
381368
382369 with patch .object (GalileoLunaClient , "invoke" , new_callable = AsyncMock ) as mock_invoke :
@@ -393,7 +380,7 @@ async def test_evaluator_returns_non_match_below_threshold(self) -> None:
393380 assert result .matched is False
394381 assert result .confidence == 0.2
395382 mock_invoke .assert_awaited_once_with (
396- metric = "toxicity" ,
383+ scorer_label = "toxicity" ,
397384 input = "hello" ,
398385 output = None ,
399386 project_id = None ,
@@ -408,7 +395,7 @@ async def test_evaluator_does_not_call_api_for_empty_data(self) -> None:
408395 from agent_control_evaluator_galileo .luna .client import GalileoLunaClient
409396
410397 # Given: an evaluator and empty selected data
411- evaluator = LunaEvaluator .from_dict ({"metric " : "toxicity" , "threshold" : 0.5 })
398+ evaluator = LunaEvaluator .from_dict ({"scorer_label " : "toxicity" , "threshold" : 0.5 })
412399
413400 with patch .object (GalileoLunaClient , "invoke" , new_callable = AsyncMock ) as mock_invoke :
414401 # When: evaluating empty data
@@ -427,7 +414,7 @@ async def test_evaluator_fail_open_sets_error(self) -> None:
427414 from agent_control_evaluator_galileo .luna .client import GalileoLunaClient
428415
429416 # Given: default fail-open behavior
430- evaluator = LunaEvaluator .from_dict ({"metric " : "toxicity" , "threshold" : 0.5 })
417+ evaluator = LunaEvaluator .from_dict ({"scorer_label " : "toxicity" , "threshold" : 0.5 })
431418
432419 with patch .object (GalileoLunaClient , "invoke" , new_callable = AsyncMock ) as mock_invoke :
433420 mock_invoke .side_effect = RuntimeError ("service unavailable" )
@@ -449,7 +436,7 @@ async def test_evaluator_fail_closed_matches_without_error_field(self) -> None:
449436
450437 # Given: fail-closed behavior for scorer errors
451438 evaluator = LunaEvaluator .from_dict (
452- {"metric " : "toxicity" , "threshold" : 0.5 , "on_error" : "deny" }
439+ {"scorer_label " : "toxicity" , "threshold" : 0.5 , "on_error" : "deny" }
453440 )
454441
455442 with patch .object (GalileoLunaClient , "invoke" , new_callable = AsyncMock ) as mock_invoke :
0 commit comments