fix(traceloop-sdk): Add evaluator config to the evaluator validator (traceloop#3706)

nina-kollman · web-flow · commit 4cd6a97d0f48 · 2026-02-23T17:20:07.000+02:00
diff --git a/packages/sample-app/sample_app/experiment/made_by_traceloop/agents_exp.py b/packages/sample-app/sample_app/experiment/made_by_traceloop/agents_exp.py
@@ -18,7 +18,7 @@
 from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop
 
 # Initialize Traceloop
-client = Traceloop.init()
+client = Traceloop.init(endpoint_is_traceloop=True)
 
 
 async def generate_agent_trace(task_description: str) -> dict:
@@ -137,9 +137,12 @@ async def run_agents_experiment():
     evaluators = [
         EvaluatorMadeByTraceloop.agent_goal_accuracy(),
         EvaluatorMadeByTraceloop.agent_tool_error_detector(),
-        EvaluatorMadeByTraceloop.agent_flow_quality(),
+        EvaluatorMadeByTraceloop.agent_flow_quality(
+            conditions=["create_itinerary tool should be called last"],
+            threshold=0.8,
+        ),
         EvaluatorMadeByTraceloop.agent_efficiency(),
-        EvaluatorMadeByTraceloop.agent_goal_completeness(),
+        EvaluatorMadeByTraceloop.agent_goal_completeness(threshold=0.8),
     ]
 
     print("Running experiment with evaluators:")
diff --git a/packages/traceloop-sdk/tests/evaluator/test_evaluator.py b/packages/traceloop-sdk/tests/evaluator/test_evaluator.py
@@ -1,5 +1,8 @@
 import pytest
-from traceloop.sdk.evaluator.evaluator import validate_and_normalize_task_output
+from traceloop.sdk.evaluator.evaluator import (
+    validate_and_normalize_task_output,
+    _validate_evaluator_input,
+)
 from traceloop.sdk.evaluator.config import EvaluatorDetails
 
 
@@ -221,3 +224,106 @@ def test_validate_task_output_duplicate_required_fields(self):
         assert "pii-detector requires:" in error_message
         assert "tone-analyzer requires:" in error_message
         assert "sentiment-analyzer requires:" in error_message
+
+
+class TestValidateEvaluatorInput:
+    """Tests for _validate_evaluator_input function"""
+
+    def test_validate_input_no_request_model(self):
+        """Validation passes for unknown slugs (no request model registered)."""
+        # Should not raise - unknown slug has no model to validate against
+        _validate_evaluator_input("unknown-evaluator", {"text": "hello"})
+
+    def test_validate_input_valid_input_only(self):
+        """Validation passes for evaluators that only require input (no config)."""
+        _validate_evaluator_input(
+            "pii-detector",
+            {"text": "Please contact John at john@email.com"},
+        )
+
+    def test_validate_input_missing_required_input_field(self):
+        """Validation fails when a required input field is missing."""
+        with pytest.raises(ValueError, match="Invalid input for 'pii-detector'"):
+            _validate_evaluator_input("pii-detector", {"wrong_field": "value"})
+
+    def test_validate_input_with_optional_config(self):
+        """Validation passes for evaluators with optional config when config is provided."""
+        _validate_evaluator_input(
+            "pii-detector",
+            {"text": "Some text"},
+            evaluator_config={"probability_threshold": 0.8},
+        )
+
+    def test_validate_input_with_optional_config_omitted(self):
+        """Validation passes for evaluators with optional config when config is omitted."""
+        _validate_evaluator_input(
+            "toxicity-detector",
+            {"text": "Some text"},
+        )
+
+    def test_validate_agent_flow_quality_with_required_config(self):
+        """Validation passes for agent-flow-quality when the required config is provided."""
+        _validate_evaluator_input(
+            "agent-flow-quality",
+            {
+                "trajectory_completions": '["Found 5 flights"]',
+                "trajectory_prompts": '["Search for flights"]',
+            },
+            evaluator_config={
+                "conditions": ["no tools called"],
+                "threshold": 0.5,
+            },
+        )
+
+    def test_validate_agent_flow_quality_missing_config_fails(self):
+        """Validation fails for agent-flow-quality when required config is missing."""
+        with pytest.raises(ValueError, match="Invalid input for 'agent-flow-quality'"):
+            _validate_evaluator_input(
+                "agent-flow-quality",
+                {
+                    "trajectory_completions": '["Found 5 flights"]',
+                    "trajectory_prompts": '["Search for flights"]',
+                },
+            )
+
+    def test_validate_agent_flow_quality_missing_input_fields(self):
+        """Validation fails for agent-flow-quality when required input fields are missing."""
+        with pytest.raises(ValueError, match="Invalid input for 'agent-flow-quality'"):
+            _validate_evaluator_input(
+                "agent-flow-quality",
+                {"wrong_field": "value"},
+                evaluator_config={
+                    "conditions": ["no tools called"],
+                    "threshold": 0.5,
+                },
+            )
+
+    def test_validate_agent_goal_completeness_optional_config(self):
+        """agent-goal-completeness has optional config - passes with or without it."""
+        input_data = {
+            "trajectory_completions": '["Account created"]',
+            "trajectory_prompts": '["Create new account"]',
+        }
+        # Without config
+        _validate_evaluator_input("agent-goal-completeness", input_data)
+        # With config
+        _validate_evaluator_input(
+            "agent-goal-completeness",
+            input_data,
+            evaluator_config={"threshold": 0.5},
+        )
+
+    def test_validate_agent_tool_trajectory_optional_config(self):
+        """agent-tool-trajectory has optional config - passes with or without it."""
+        input_data = {
+            "executed_tool_calls": '[{"name": "search", "input": {"query": "weather"}}]',
+            "expected_tool_calls": '[{"name": "search", "input": {"query": "weather"}}]',
+        }
+        # Without config
+        _validate_evaluator_input("agent-tool-trajectory", input_data)
+        # With config
+        _validate_evaluator_input(
+            "agent-tool-trajectory",
+            input_data,
+            evaluator_config={"order_sensitive": True, "threshold": 0.5},
+        )
diff --git a/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluator.py b/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluator.py
@@ -15,21 +15,29 @@
 from ..generated.evaluators import get_request_model
 
 
-def _validate_evaluator_input(slug: str, input: Dict[str, str]) -> None:
+def _validate_evaluator_input(
+    slug: str,
+    input: Dict[str, str],
+    evaluator_config: Optional[Dict[str, Any]] = None,
+) -> None:
     """Validate input against the evaluator's request model if available.
 
     Args:
         slug: The evaluator slug (e.g., "pii-detector")
         input: Dictionary of input field names to values
+        evaluator_config: Optional config, validated as the request model's 'config' field
 
     Raises:
         ValueError: If input fails validation against the request model
     """
     request_model = get_request_model(slug)
     if request_model:
         try:
-            # Request models expect data nested under 'input' field
-            request_model(input=input)
+            # Pass 'config' only when given, so the model decides whether it is required
+            kwargs: Dict[str, Any] = {"input": input}
+            if evaluator_config is not None:
+                kwargs["config"] = evaluator_config
+            request_model(**kwargs)
         except ValidationError as e:
             raise ValueError(f"Invalid input for '{slug}': {e}") from e
 
@@ -115,7 +123,7 @@ async def run_experiment_evaluator(
         Returns:
             ExecutionResponse: The evaluation result from SSE stream
         """
-        _validate_evaluator_input(evaluator_slug, input)
+        _validate_evaluator_input(evaluator_slug, input, evaluator_config)
 
         request = self._build_evaluator_request(
             task_id, experiment_id, experiment_run_id, input, evaluator_version, evaluator_config
@@ -159,7 +167,7 @@ async def trigger_experiment_evaluator(
         Returns:
             str: The execution_id that can be used to check results later
         """
-        _validate_evaluator_input(evaluator_slug, input)
+        _validate_evaluator_input(evaluator_slug, input, evaluator_config)
 
         request = self._build_evaluator_request(
             task_id, experiment_id, experiment_run_id, input, evaluator_version, evaluator_config