Skip to content

Commit 4cd6a97

Browse files
authored
fix(traceloop-sdk): Add evaluator config to the evaluator validator (traceloop#3706)
1 parent 4ee45c6 commit 4cd6a97

3 files changed

Lines changed: 126 additions & 9 deletions

File tree

packages/sample-app/sample_app/experiment/made_by_traceloop/agents_exp.py

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@
1818
from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop
1919

2020
# Initialize Traceloop
21-
client = Traceloop.init()
21+
client = Traceloop.init(endpoint_is_traceloop=True)
2222

2323

2424
async def generate_agent_trace(task_description: str) -> dict:
@@ -137,9 +137,12 @@ async def run_agents_experiment():
137137
evaluators = [
138138
EvaluatorMadeByTraceloop.agent_goal_accuracy(),
139139
EvaluatorMadeByTraceloop.agent_tool_error_detector(),
140-
EvaluatorMadeByTraceloop.agent_flow_quality(),
140+
EvaluatorMadeByTraceloop.agent_flow_quality(
141+
conditions=["create_itinerary tool should be called last"],
142+
threshold=0.8,
143+
),
141144
EvaluatorMadeByTraceloop.agent_efficiency(),
142-
EvaluatorMadeByTraceloop.agent_goal_completeness(),
145+
EvaluatorMadeByTraceloop.agent_goal_completeness(threshold=0.8),
143146
]
144147

145148
print("Running experiment with evaluators:")

packages/traceloop-sdk/tests/evaluator/test_evaluator.py

Lines changed: 107 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,8 @@
11
import pytest
2-
from traceloop.sdk.evaluator.evaluator import validate_and_normalize_task_output
2+
from traceloop.sdk.evaluator.evaluator import (
3+
validate_and_normalize_task_output,
4+
_validate_evaluator_input,
5+
)
36
from traceloop.sdk.evaluator.config import EvaluatorDetails
47

58

@@ -221,3 +224,106 @@ def test_validate_task_output_duplicate_required_fields(self):
221224
assert "pii-detector requires:" in error_message
222225
assert "tone-analyzer requires:" in error_message
223226
assert "sentiment-analyzer requires:" in error_message
227+
228+
229+
class TestValidateEvaluatorInput:
230+
"""Tests for _validate_evaluator_input function"""
231+
232+
def test_validate_input_no_request_model(self):
233+
"""Validation passes for unknown slugs (no request model registered)."""
234+
# Should not raise - unknown slug has no model to validate against
235+
_validate_evaluator_input("unknown-evaluator", {"text": "hello"})
236+
237+
def test_validate_input_valid_input_only(self):
238+
"""Validation passes for evaluators that only require input (no config)."""
239+
_validate_evaluator_input(
240+
"pii-detector",
241+
{"text": "Please contact John at john@email.com"},
242+
)
243+
244+
def test_validate_input_missing_required_input_field(self):
245+
"""Validation fails when a required input field is missing."""
246+
with pytest.raises(ValueError, match="Invalid input for 'pii-detector'"):
247+
_validate_evaluator_input("pii-detector", {"wrong_field": "value"})
248+
249+
def test_validate_input_with_optional_config(self):
250+
"""Validation passes for evaluators with optional config when config is provided."""
251+
_validate_evaluator_input(
252+
"pii-detector",
253+
{"text": "Some text"},
254+
evaluator_config={"probability_threshold": 0.8},
255+
)
256+
257+
def test_validate_input_with_optional_config_omitted(self):
258+
"""Validation passes for evaluators with optional config when config is omitted."""
259+
_validate_evaluator_input(
260+
"toxicity-detector",
261+
{"text": "Some text"},
262+
)
263+
264+
def test_validate_agent_flow_quality_with_required_config(self):
265+
"""Validation passes for agent-flow-quality when config is provided."""
266+
_validate_evaluator_input(
267+
"agent-flow-quality",
268+
{
269+
"trajectory_completions": '["Found 5 flights"]',
270+
"trajectory_prompts": '["Search for flights"]',
271+
},
272+
evaluator_config={
273+
"conditions": ["no tools called"],
274+
"threshold": 0.5,
275+
},
276+
)
277+
278+
def test_validate_agent_flow_quality_missing_config_fails(self):
279+
"""Validation fails for agent-flow-quality when required config is missing."""
280+
with pytest.raises(ValueError, match="Invalid input for 'agent-flow-quality'"):
281+
_validate_evaluator_input(
282+
"agent-flow-quality",
283+
{
284+
"trajectory_completions": '["Found 5 flights"]',
285+
"trajectory_prompts": '["Search for flights"]',
286+
},
287+
)
288+
289+
def test_validate_agent_flow_quality_missing_input_fields(self):
290+
"""Validation fails for agent-flow-quality when required input fields are missing."""
291+
with pytest.raises(ValueError, match="Invalid input for 'agent-flow-quality'"):
292+
_validate_evaluator_input(
293+
"agent-flow-quality",
294+
{"wrong_field": "value"},
295+
evaluator_config={
296+
"conditions": ["no tools called"],
297+
"threshold": 0.5,
298+
},
299+
)
300+
301+
def test_validate_agent_goal_completeness_optional_config(self):
302+
"""agent-goal-completeness has optional config - passes with or without it."""
303+
input_data = {
304+
"trajectory_completions": '["Account created"]',
305+
"trajectory_prompts": '["Create new account"]',
306+
}
307+
# Without config
308+
_validate_evaluator_input("agent-goal-completeness", input_data)
309+
# With config
310+
_validate_evaluator_input(
311+
"agent-goal-completeness",
312+
input_data,
313+
evaluator_config={"threshold": 0.5},
314+
)
315+
316+
def test_validate_agent_tool_trajectory_optional_config(self):
317+
"""agent-tool-trajectory has optional config - passes with or without it."""
318+
input_data = {
319+
"executed_tool_calls": '[{"name": "search", "input": {"query": "weather"}}]',
320+
"expected_tool_calls": '[{"name": "search", "input": {"query": "weather"}}]',
321+
}
322+
# Without config
323+
_validate_evaluator_input("agent-tool-trajectory", input_data)
324+
# With config
325+
_validate_evaluator_input(
326+
"agent-tool-trajectory",
327+
input_data,
328+
evaluator_config={"order_sensitive": True, "threshold": 0.5},
329+
)

packages/traceloop-sdk/traceloop/sdk/evaluator/evaluator.py

Lines changed: 13 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -15,21 +15,29 @@
1515
from ..generated.evaluators import get_request_model
1616

1717

18-
def _validate_evaluator_input(slug: str, input: Dict[str, str]) -> None:
18+
def _validate_evaluator_input(
19+
slug: str,
20+
input: Dict[str, str],
21+
evaluator_config: Optional[Dict[str, Any]] = None,
22+
) -> None:
1923
"""Validate input against the evaluator's request model if available.
2024
2125
Args:
2226
slug: The evaluator slug (e.g., "pii-detector")
2327
input: Dictionary of input field names to values
28+
evaluator_config: Optional configuration for the evaluator
2429
2530
Raises:
2631
ValueError: If input fails validation against the request model
2732
"""
2833
request_model = get_request_model(slug)
2934
if request_model:
3035
try:
31-
# Request models expect data nested under 'input' field
32-
request_model(input=input)
36+
# Build kwargs for request model validation
37+
kwargs: Dict[str, Any] = {"input": input}
38+
if evaluator_config is not None:
39+
kwargs["config"] = evaluator_config
40+
request_model(**kwargs)
3341
except ValidationError as e:
3442
raise ValueError(f"Invalid input for '{slug}': {e}") from e
3543

@@ -115,7 +123,7 @@ async def run_experiment_evaluator(
115123
Returns:
116124
ExecutionResponse: The evaluation result from SSE stream
117125
"""
118-
_validate_evaluator_input(evaluator_slug, input)
126+
_validate_evaluator_input(evaluator_slug, input, evaluator_config)
119127

120128
request = self._build_evaluator_request(
121129
task_id, experiment_id, experiment_run_id, input, evaluator_version, evaluator_config
@@ -159,7 +167,7 @@ async def trigger_experiment_evaluator(
159167
Returns:
160168
str: The execution_id that can be used to check results later
161169
"""
162-
_validate_evaluator_input(evaluator_slug, input)
170+
_validate_evaluator_input(evaluator_slug, input, evaluator_config)
163171

164172
request = self._build_evaluator_request(
165173
task_id, experiment_id, experiment_run_id, input, evaluator_version, evaluator_config

0 commit comments

Comments (0)