Update Luna scorer schemas and tests: rename metric -> scorer_label and step_type -> root_type, drop legacy metric alias

namrataghadi-galileo · namrataghadi-galileo · commit 7b0a15d2b6d8 · 2026-05-13T12:01:56.000-07:00
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/client.py
@@ -66,22 +66,22 @@ def _as_float_or_none(value: JSONValue) -> float | None:
     return None
 
 
-ScorerStepType = Literal["session", "trace", "span"]
+RootType = Literal["session", "trace", "span"]
 
 
 class ScorerInvokeRequest(BaseModel):
     """Request payload for Galileo Luna scorer invocation.
 
     Attributes:
-        step_type: Runtime step shape used by Galileo scorer input normalization.
+        root_type: Runtime step shape used by Galileo scorer input normalization.
         input: Optional user/system prompt text.
         output: Optional model response text.
         scorer_label: Preset, registered, or fine-tuned scorer label.
         project_id: Optional Galileo project UUID for project-scoped scorer resolution.
         config: Optional scorer-specific configuration.
     """
 
-    step_type: ScorerStepType = Field(default="span")
+    root_type: RootType = Field(default="span")
     input: JSONValue = None
     output: JSONValue = None
     scorer_label: str = Field(min_length=1)
@@ -117,18 +117,6 @@ class ScorerInvokeResponse(BaseModel):
     error_message: str | None = None
     _raw_response: JSONObject = PrivateAttr(default_factory=dict)
 
-    @model_validator(mode="before")
-    @classmethod
-    def allow_legacy_metric_response(cls, data: object) -> object:
-        if isinstance(data, dict) and "scorer_label" not in data and "metric" in data:
-            return data | {"scorer_label": data["metric"]}
-        return data
-
-    @property
-    def metric(self) -> str:
-        """Backward-compatible alias for existing evaluator metadata code."""
-        return self.scorer_label
-
     @property
     def raw_response(self) -> JSONObject:
         return self._raw_response
@@ -243,10 +231,10 @@ def _endpoint_and_headers(
     async def invoke(
         self,
         *,
-        metric: str,
+        scorer_label: str,
         input: JSONValue = None,
         output: JSONValue = None,
-        step_type: ScorerStepType = "span",
+        root_type: RootType = "span",
         project_id: str | UUID | None = None,
         config: JSONObject | None = None,
         timeout: float = DEFAULT_TIMEOUT_SECS,
@@ -255,10 +243,10 @@ async def invoke(
         """Invoke a Galileo Luna scorer.
 
         Args:
-            metric: Preset, registered, or fine-tuned scorer label.
+            scorer_label: Preset, registered, or fine-tuned scorer label.
             input: Optional user/system prompt text.
             output: Optional model response text.
-            step_type: Runtime step shape used by Galileo scorer input normalization.
+            root_type: Runtime step shape used by Galileo scorer input normalization.
             project_id: Optional Galileo project UUID for project-scoped scorer resolution.
             config: Optional scorer-specific configuration.
             timeout: Request timeout in seconds.
@@ -277,10 +265,10 @@ async def invoke(
             raise ValueError("At least one of input or output must be provided.")
 
         request_body = ScorerInvokeRequest(
-            scorer_label=metric,
+            scorer_label=scorer_label,
             input=input,
             output=output,
-            step_type=step_type,
+            root_type=root_type,
             project_id=project_id,
             config=config,
         ).to_dict()
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/config.py
@@ -32,19 +32,19 @@ class LunaEvaluatorConfig(EvaluatorConfig):
     """Configuration for direct Luna scorer evaluation.
 
     Attributes:
-        metric: Preset, registered, or fine-tuned scorer name.
+        scorer_label: Preset, registered, or fine-tuned scorer label.
         project_id: Optional Galileo project UUID for project-scoped scorer resolution.
         threshold: Local threshold used by the evaluator for comparison.
         operator: Local comparison operator. Numeric operators use threshold as a number.
         scorer_config: Optional scorer-specific config sent as ``config``.
         timeout_ms: Request timeout in milliseconds.
         on_error: Error policy: allow=fail open, deny=fail closed.
         payload_field: Force selected data into input or output. If omitted, root step
-            payloads with input/output use both fields; scalar data is inferred from metric name.
+            payloads with input/output use both fields; scalar data is inferred from scorer label.
         include_raw_response: Include the raw API response in EvaluatorResult metadata.
     """
 
-    metric: str = Field(..., min_length=1, description="Luna metric/scorer name to evaluate")
+    scorer_label: str = Field(..., min_length=1, description="Luna scorer label to invoke")
     project_id: UUID | None = Field(
         default=None,
         description="Optional Galileo project UUID for project-scoped scorer resolution.",
diff --git a/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna/evaluator.py
@@ -139,7 +139,7 @@ def _prepare_payload(self, data: Any) -> tuple[str | None, str | None]:
                 return input_text, output_text
 
         text = _coerce_payload_text(data)
-        if "output" in self.config.metric:
+        if "output" in self.config.scorer_label:
             return None, text
         return text, None
 
@@ -190,12 +190,12 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
                 matched=False,
                 confidence=1.0,
                 message="No data to score with Luna",
-                metadata={"metric": self.config.metric},
+                metadata={"scorer_label": self.config.scorer_label},
             )
 
         try:
             response = await self._get_client().invoke(
-                metric=self.config.metric,
+                scorer_label=self.config.scorer_label,
                 input=input_text if _has_text(input_text) else None,
                 output=output_text if _has_text(output_text) else None,
                 project_id=self.config.project_id,
@@ -227,7 +227,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
 
     def _metadata(self, response: ScorerInvokeResponse) -> dict[str, Any]:
         metadata: dict[str, Any] = {
-            "metric": response.scorer_label or self.config.metric,
+            "scorer_label": response.scorer_label or self.config.scorer_label,
             "project_id": str(self.config.project_id) if self.config.project_id else None,
             "score": response.score,
             "threshold": self.config.threshold,
@@ -251,7 +251,7 @@ def _handle_error(self, error: Exception) -> EvaluatorResult:
             metadata={
                 "error": error_detail,
                 "error_type": type(error).__name__,
-                "metric": self.config.metric,
+                "scorer_label": self.config.scorer_label,
                 "fallback_action": fallback,
             },
             error=None if matched else error_detail,
diff --git a/evaluators/contrib/galileo/tests/test_luna_evaluator.py b/evaluators/contrib/galileo/tests/test_luna_evaluator.py
@@ -27,15 +27,15 @@ def test_config_accepts_direct_scorer_fields(self) -> None:
 
         # Given: a direct scorer config with local thresholding
         config = LunaEvaluatorConfig(
-            metric="toxicity",
+            scorer_label="toxicity",
             project_id="12345678-1234-5678-1234-567812345678",
             threshold=0.7,
             operator="gte",
             config={"temperature": 0},
         )
 
         # Then: config is retained without Protect concepts
-        assert config.metric == "toxicity"
+        assert config.scorer_label == "toxicity"
         assert str(config.project_id) == "12345678-1234-5678-1234-567812345678"
         assert config.threshold == 0.7
         assert config.operator == "gte"
@@ -46,7 +46,7 @@ def test_numeric_operator_requires_numeric_threshold(self) -> None:
 
         # Given/When/Then: numeric local comparison rejects non-numeric thresholds
         with pytest.raises(ValidationError, match="numeric threshold"):
-            LunaEvaluatorConfig(metric="toxicity", threshold="high", operator="gte")
+            LunaEvaluatorConfig(scorer_label="toxicity", threshold="high", operator="gte")
 
 
 class TestGalileoLunaClient:
@@ -65,7 +65,7 @@ def test_scorer_invoke_request_matches_orbit_schema_shape(self) -> None:
 
         # Then: the serialized payload uses the Orbit scorer invoke fields
         assert request.to_dict() == {
-            "step_type": "span",
+            "root_type": "span",
             "input": {"messages": [{"role": "user", "content": "hello"}]},
             "scorer_label": "toxicity",
             "project_id": "12345678-1234-5678-1234-567812345678",
@@ -102,21 +102,8 @@ def test_scorer_invoke_response_matches_orbit_schema_shape(self) -> None:
             "error_message": None,
         }
         assert response.scorer_label == "toxicity"
-        assert response.metric == "toxicity"
         assert response.raw_response["scorer_label"] == "toxicity"
 
-    def test_scorer_invoke_response_accepts_legacy_metric_field(self) -> None:
-        from agent_control_evaluator_galileo.luna import ScorerInvokeResponse
-
-        # Given/When: an older API response uses metric instead of scorer_label
-        response = ScorerInvokeResponse.from_dict(
-            {"metric": "toxicity", "score": 0.82, "status": "success"}
-        )
-
-        # Then: the client still normalizes it to the current response contract
-        assert response.scorer_label == "toxicity"
-        assert response.model_dump()["scorer_label"] == "toxicity"
-
     def test_client_uses_protect_api_url_derivation(self) -> None:
         from agent_control_evaluator_galileo.luna import GalileoLunaClient
 
@@ -187,7 +174,7 @@ def handler(request: httpx.Request) -> httpx.Response:
         try:
             # When: invoking a scorer
             response = await client.invoke(
-                metric="toxicity",
+                scorer_label="toxicity",
                 input="user prompt",
                 output="model answer",
                 project_id="12345678-1234-5678-1234-567812345678",
@@ -204,7 +191,7 @@ def handler(request: httpx.Request) -> httpx.Response:
             "output": "model answer",
             "scorer_label": "toxicity",
             "project_id": "12345678-1234-5678-1234-567812345678",
-            "step_type": "span",
+            "root_type": "span",
             "config": {"top_k": 1},
         }
         assert "stage_name" not in captured["body"]
@@ -241,7 +228,7 @@ def handler(request: httpx.Request) -> httpx.Response:
         try:
             # When: invoking a scorer with project context
             response = await client.invoke(
-                metric="toxicity",
+                scorer_label="toxicity",
                 output="model answer",
                 project_id="12345678-1234-5678-1234-567812345678",
             )
@@ -255,7 +242,7 @@ def handler(request: httpx.Request) -> httpx.Response:
             "output": "model answer",
             "scorer_label": "toxicity",
             "project_id": "12345678-1234-5678-1234-567812345678",
-            "step_type": "span",
+            "root_type": "span",
         }
         headers = captured["headers"]
         assert isinstance(headers, dict)
@@ -278,7 +265,7 @@ async def test_client_requires_project_id_for_internal_jwt(self) -> None:
 
         # When/Then: project_id is required because API uses it as the internal auth context
         with pytest.raises(ValueError, match="project_id is required"):
-            await client.invoke(metric="toxicity", output="model answer")
+            await client.invoke(scorer_label="toxicity", output="model answer")
 
 
 class TestLunaEvaluator:
@@ -296,15 +283,15 @@ def test_evaluator_init_without_auth_raises(self) -> None:
         from agent_control_evaluator_galileo.luna import LunaEvaluator
 
         with pytest.raises(ValueError, match="GALILEO_API_SECRET_KEY or GALILEO_API_KEY"):
-            LunaEvaluator.from_dict({"metric": "toxicity", "threshold": 0.5})
+            LunaEvaluator.from_dict({"scorer_label": "toxicity", "threshold": 0.5})
 
     @patch.dict(os.environ, {"GALILEO_API_SECRET_KEY": "test-secret"}, clear=True)
     def test_evaluator_init_accepts_api_secret(self) -> None:
         from agent_control_evaluator_galileo.luna import LunaEvaluator
 
         evaluator = LunaEvaluator.from_dict(
             {
-                "metric": "toxicity",
+                "scorer_label": "toxicity",
                 "project_id": "12345678-1234-5678-1234-567812345678",
                 "threshold": 0.5,
             }
@@ -321,7 +308,7 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None:
         # Given: a direct Luna evaluator and a raw successful scorer response
         evaluator = LunaEvaluator.from_dict(
             {
-                "metric": "toxicity",
+                "scorer_label": "toxicity",
                 "project_id": "12345678-1234-5678-1234-567812345678",
                 "threshold": 0.7,
                 "operator": "gte",
@@ -350,7 +337,7 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None:
         assert result.matched is True
         assert result.confidence == 0.82
         assert result.metadata == {
-            "metric": "toxicity",
+            "scorer_label": "toxicity",
             "project_id": "12345678-1234-5678-1234-567812345678",
             "score": 0.82,
             "threshold": 0.7,
@@ -360,7 +347,7 @@ async def test_evaluator_applies_threshold_locally_to_raw_score(self) -> None:
             "error_message": None,
         }
         mock_invoke.assert_awaited_once_with(
-            metric="toxicity",
+            scorer_label="toxicity",
             input="user prompt",
             output="model answer",
             project_id=evaluator.config.project_id,
@@ -376,7 +363,7 @@ async def test_evaluator_returns_non_match_below_threshold(self) -> None:
 
         # Given: a raw scorer value below the local threshold
         evaluator = LunaEvaluator.from_dict(
-            {"metric": "toxicity", "threshold": 0.7, "operator": "gte"}
+            {"scorer_label": "toxicity", "threshold": 0.7, "operator": "gte"}
         )
 
         with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
@@ -393,7 +380,7 @@ async def test_evaluator_returns_non_match_below_threshold(self) -> None:
         assert result.matched is False
         assert result.confidence == 0.2
         mock_invoke.assert_awaited_once_with(
-            metric="toxicity",
+            scorer_label="toxicity",
             input="hello",
             output=None,
             project_id=None,
@@ -408,7 +395,7 @@ async def test_evaluator_does_not_call_api_for_empty_data(self) -> None:
         from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
 
         # Given: an evaluator and empty selected data
-        evaluator = LunaEvaluator.from_dict({"metric": "toxicity", "threshold": 0.5})
+        evaluator = LunaEvaluator.from_dict({"scorer_label": "toxicity", "threshold": 0.5})
 
         with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
             # When: evaluating empty data
@@ -427,7 +414,7 @@ async def test_evaluator_fail_open_sets_error(self) -> None:
         from agent_control_evaluator_galileo.luna.client import GalileoLunaClient
 
         # Given: default fail-open behavior
-        evaluator = LunaEvaluator.from_dict({"metric": "toxicity", "threshold": 0.5})
+        evaluator = LunaEvaluator.from_dict({"scorer_label": "toxicity", "threshold": 0.5})
 
         with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
             mock_invoke.side_effect = RuntimeError("service unavailable")
@@ -449,7 +436,7 @@ async def test_evaluator_fail_closed_matches_without_error_field(self) -> None:
 
         # Given: fail-closed behavior for scorer errors
         evaluator = LunaEvaluator.from_dict(
-            {"metric": "toxicity", "threshold": 0.5, "on_error": "deny"}
+            {"scorer_label": "toxicity", "threshold": 0.5, "on_error": "deny"}
         )
 
         with patch.object(GalileoLunaClient, "invoke", new_callable=AsyncMock) as mock_invoke:
diff --git a/examples/galileo_luna/README.md b/examples/galileo_luna/README.md
@@ -33,7 +33,7 @@ export GALILEO_PROJECT_ID="00000000-0000-0000-0000-000000000000"
 Optional scorer settings:
 
 ```bash
-export GALILEO_LUNA_METRIC="toxicity"
+export GALILEO_LUNA_SCORER_LABEL="toxicity"
 export GALILEO_LUNA_THRESHOLD="0.5"
 ```
 
diff --git a/examples/galileo_luna/setup_controls.py b/examples/galileo_luna/setup_controls.py
@@ -23,7 +23,7 @@
 AGENT_DESCRIPTION = "Demo agent protected by direct Galileo Luna scorer controls"
 SERVER_URL = os.getenv("AGENT_CONTROL_URL", "http://localhost:8000")
 
-LUNA_METRIC = os.getenv("GALILEO_LUNA_METRIC", "toxicity")
+LUNA_SCORER_LABEL = os.getenv("GALILEO_LUNA_SCORER_LABEL", "toxicity")
 LUNA_THRESHOLD = float(os.getenv("GALILEO_LUNA_THRESHOLD", "0.5"))
 GALILEO_PROJECT_ID = os.getenv("GALILEO_PROJECT_ID")
 
@@ -41,7 +41,7 @@
 def luna_config() -> dict[str, Any]:
     """Build the direct Luna evaluator config used by the composite control."""
     config: dict[str, Any] = {
-        "metric": LUNA_METRIC,
+        "scorer_label": LUNA_SCORER_LABEL,
         "threshold": LUNA_THRESHOLD,
         "operator": "gte",
         "payload_field": "output",
@@ -158,7 +158,7 @@ async def setup_demo() -> None:
     print("Setting up direct Galileo Luna demo controls")
     print(f"Server: {SERVER_URL}")
     print(f"Agent:  {AGENT_NAME}")
-    print(f"Luna:   metric={LUNA_METRIC!r}, threshold={LUNA_THRESHOLD}")
+    print(f"Luna:   scorer_label={LUNA_SCORER_LABEL!r}, threshold={LUNA_THRESHOLD}")
     if GALILEO_PROJECT_ID:
         print(f"Project ID: {GALILEO_PROJECT_ID}")