Skip to content

Commit 91c00a6

Browse files
Extend SDK with performance metrics
Signed-off-by: krisztianfekete <git@krisztianfekete.org>
1 parent dc4b9b1 commit 91c00a6

4 files changed

Lines changed: 29 additions & 7 deletions

File tree

packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ class ToolResponseData(BaseModel):
2525

2626
name: str
2727
output: str = ""
28+
status: Optional[str] = None
2829

2930

3031
class IntermediateStepData(BaseModel):
@@ -51,6 +52,7 @@ class InvocationData(BaseModel):
5152
user_content: str = ""
5253
final_response: Optional[str] = None
5354
intermediate_steps: IntermediateStepData = Field(default_factory=IntermediateStepData)
55+
performance_metrics: Optional[dict[str, Any]] = None
5456

5557

5658
class EvalInput(BaseModel):

src/agentevals/_protocol.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ class ToolResponseData(BaseModel):
3434

3535
name: str
3636
output: str = ""
37+
status: Optional[str] = None
3738

3839

3940
class IntermediateStepData(BaseModel):
@@ -50,6 +51,7 @@ class InvocationData(BaseModel):
5051
user_content: str = ""
5152
final_response: Optional[str] = None
5253
intermediate_steps: IntermediateStepData = Field(default_factory=IntermediateStepData)
54+
performance_metrics: Optional[dict[str, Any]] = None
5355

5456

5557
class EvalInput(BaseModel):

src/agentevals/custom_evaluators.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from agentevals._protocol import (
2626
EvalInput,
2727
EvalResult,
28+
IntermediateStepData,
2829
InvocationData,
2930
ToolCallData,
3031
ToolResponseData,
@@ -296,33 +297,45 @@ def _extract_tool_responses_from_invocation(inv: Invocation) -> list[ToolRespons
296297
for tr in inv.intermediate_data.tool_responses or []:
297298
name = ""
298299
output = ""
300+
status = None
299301
if hasattr(tr, "name"):
300302
name = tr.name or ""
301303
if hasattr(tr, "response"):
302304
output = str(tr.response) if tr.response else ""
303305
elif hasattr(tr, "output"):
304306
output = str(tr.output) if tr.output else ""
305-
responses.append(ToolResponseData(name=name, output=output))
307+
if hasattr(tr, "status") and tr.status:
308+
status = str(tr.status)
309+
responses.append(ToolResponseData(name=name, output=output, status=status))
306310

307311
return responses
308312

309313

310-
def invocation_to_data(inv: Invocation) -> InvocationData:
314+
def invocation_to_data(
315+
inv: Invocation,
316+
performance_metrics: dict[str, Any] | None = None,
317+
) -> InvocationData:
311318
"""Convert an ADK Invocation to a simplified InvocationData for the protocol."""
312319
return InvocationData(
313320
invocation_id=inv.invocation_id or "",
314321
user_content=_content_to_text(inv.user_content),
315322
final_response=_content_to_text(inv.final_response) or None,
316-
tool_calls=_extract_tool_calls_from_invocation(inv),
317-
tool_responses=_extract_tool_responses_from_invocation(inv),
323+
intermediate_steps=IntermediateStepData(
324+
tool_calls=_extract_tool_calls_from_invocation(inv),
325+
tool_responses=_extract_tool_responses_from_invocation(inv),
326+
),
327+
performance_metrics=performance_metrics,
318328
)
319329

320330

321-
def invocations_to_data(invocations: list[Invocation] | None) -> list[InvocationData] | None:
331+
def invocations_to_data(
332+
invocations: list[Invocation] | None,
333+
performance_metrics: dict[str, Any] | None = None,
334+
) -> list[InvocationData] | None:
322335
"""Convert a list of ADK Invocations, or return None."""
323336
if invocations is None:
324337
return None
325-
return [invocation_to_data(inv) for inv in invocations]
338+
return [invocation_to_data(inv, performance_metrics=performance_metrics) for inv in invocations]
326339

327340

328341
# ---------------------------------------------------------------------------
@@ -382,11 +395,13 @@ def __init__(
382395
metric_name: str,
383396
threshold: float = 0.5,
384397
config: dict[str, Any] | None = None,
398+
performance_metrics: dict[str, Any] | None = None,
385399
):
386400
self._backend = backend
387401
self._metric_name = metric_name
388402
self._threshold = threshold
389403
self._config = config or {}
404+
self._performance_metrics = performance_metrics
390405

391406
async def evaluate_invocations(
392407
self,
@@ -399,7 +414,7 @@ async def evaluate_invocations(
399414
metric_name=self._metric_name,
400415
threshold=self._threshold,
401416
config=self._config,
402-
invocations=invocations_to_data(actual_invocations) or [],
417+
invocations=invocations_to_data(actual_invocations, performance_metrics=self._performance_metrics) or [],
403418
expected_invocations=invocations_to_data(expected_invocations),
404419
)
405420

@@ -416,6 +431,7 @@ async def evaluate_custom_evaluator(
416431
evaluator_def,
417432
actual_invocations: list[Invocation],
418433
expected_invocations: list[Invocation] | None,
434+
performance_metrics: dict[str, Any] | None = None,
419435
):
420436
"""Evaluate a single custom evaluator and return a ``MetricResult``.
421437
@@ -468,6 +484,7 @@ async def evaluate_custom_evaluator(
468484
metric_name=evaluator_def.name,
469485
threshold=evaluator_def.threshold,
470486
config=evaluator_def.config,
487+
performance_metrics=performance_metrics,
471488
)
472489

473490
try:

src/agentevals/runner.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,7 @@ async def _eval_custom_with_semaphore(evaluator_def: CustomEvaluatorDef) -> Metr
261261
evaluator_def=evaluator_def,
262262
actual_invocations=actual_invocations,
263263
expected_invocations=expected_invocations,
264+
performance_metrics=performance_metrics,
264265
)
265266
result.duration_ms = (time.monotonic() - t0) * 1000
266267
return await _append_result(result)

0 commit comments

Comments
 (0)