2525from agentevals ._protocol import (
2626 EvalInput ,
2727 EvalResult ,
28+ IntermediateStepData ,
2829 InvocationData ,
2930 ToolCallData ,
3031 ToolResponseData ,
@@ -296,33 +297,45 @@ def _extract_tool_responses_from_invocation(inv: Invocation) -> list[ToolRespons
296297 for tr in inv .intermediate_data .tool_responses or []:
297298 name = ""
298299 output = ""
300+ status = None
299301 if hasattr (tr , "name" ):
300302 name = tr .name or ""
301303 if hasattr (tr , "response" ):
302304 output = str (tr .response ) if tr .response else ""
303305 elif hasattr (tr , "output" ):
304306 output = str (tr .output ) if tr .output else ""
305- responses .append (ToolResponseData (name = name , output = output ))
307+ if hasattr (tr , "status" ) and tr .status :
308+ status = str (tr .status )
309+ responses .append (ToolResponseData (name = name , output = output , status = status ))
306310
307311 return responses
308312
309313
def invocation_to_data(
    inv: Invocation,
    performance_metrics: dict[str, Any] | None = None,
) -> InvocationData:
    """Build the protocol-level ``InvocationData`` for one ADK ``Invocation``.

    Args:
        inv: The invocation to convert.
        performance_metrics: Optional metrics mapping; passed through
            unchanged as the ``performance_metrics`` field of the result.

    Returns:
        An ``InvocationData`` whose tool calls and tool responses are grouped
        under ``intermediate_steps``.
    """
    invocation_id = inv.invocation_id or ""
    user_text = _content_to_text(inv.user_content)
    # A falsy final-response text (e.g. "") is reported as None.
    final_text = _content_to_text(inv.final_response) or None

    calls = _extract_tool_calls_from_invocation(inv)
    tool_outputs = _extract_tool_responses_from_invocation(inv)
    steps = IntermediateStepData(tool_calls=calls, tool_responses=tool_outputs)

    return InvocationData(
        invocation_id=invocation_id,
        user_content=user_text,
        final_response=final_text,
        intermediate_steps=steps,
        performance_metrics=performance_metrics,
    )
319329
320330
def invocations_to_data(
    invocations: list[Invocation] | None,
    performance_metrics: dict[str, Any] | None = None,
) -> list[InvocationData] | None:
    """Convert each ADK ``Invocation`` in a list, passing ``None`` through.

    Args:
        invocations: Invocations to convert, or ``None``.
        performance_metrics: Optional metrics mapping; the same object is
            handed to ``invocation_to_data`` for every invocation.

    Returns:
        ``None`` when ``invocations`` is ``None``; otherwise a new list with
        one converted entry per input invocation, in the same order.
    """
    if invocations is None:
        return None

    converted: list[InvocationData] = []
    for invocation in invocations:
        converted.append(
            invocation_to_data(invocation, performance_metrics=performance_metrics)
        )
    return converted
326339
327340
328341# ---------------------------------------------------------------------------
@@ -382,11 +395,13 @@ def __init__(
382395 metric_name : str ,
383396 threshold : float = 0.5 ,
384397 config : dict [str , Any ] | None = None ,
398+ performance_metrics : dict [str , Any ] | None = None ,
385399 ):
386400 self ._backend = backend
387401 self ._metric_name = metric_name
388402 self ._threshold = threshold
389403 self ._config = config or {}
404+ self ._performance_metrics = performance_metrics
390405
391406 async def evaluate_invocations (
392407 self ,
@@ -399,7 +414,7 @@ async def evaluate_invocations(
399414 metric_name = self ._metric_name ,
400415 threshold = self ._threshold ,
401416 config = self ._config ,
402- invocations = invocations_to_data (actual_invocations ) or [],
417+ invocations = invocations_to_data (actual_invocations , performance_metrics = self . _performance_metrics ) or [],
403418 expected_invocations = invocations_to_data (expected_invocations ),
404419 )
405420
@@ -416,6 +431,7 @@ async def evaluate_custom_evaluator(
416431 evaluator_def ,
417432 actual_invocations : list [Invocation ],
418433 expected_invocations : list [Invocation ] | None ,
434+ performance_metrics : dict [str , Any ] | None = None ,
419435):
420436 """Evaluate a single custom evaluator and return a ``MetricResult``.
421437
@@ -468,6 +484,7 @@ async def evaluate_custom_evaluator(
468484 metric_name = evaluator_def .name ,
469485 threshold = evaluator_def .threshold ,
470486 config = evaluator_def .config ,
487+ performance_metrics = performance_metrics ,
471488 )
472489
473490 try :
0 commit comments