Skip to content

Commit 21eb92a

Browse files
committed
fix(evaluation): handle None inferences in LocalEvalService
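When inference fails (e.g. MCP session drop, timeout, API error), inference_result.inferences can be None, and _evaluate_single_inference_result() calls len(inference_result.inferences) without a None guard, causing a TypeError. Return early when inferences is None: EvalStatus.FAILED (with session details) if the inference status is InferenceStatus.FAILURE, otherwise EvalStatus.NOT_EVALUATED. Closes #6071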
1 parent 1b030dc commit 21eb92a

2 files changed

Lines changed: 61 additions & 19 deletions

File tree

src/google/adk/evaluation/local_eval_service.py

Lines changed: 30 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -278,26 +278,37 @@ async def _evaluate_single_inference_result(
278278
)
279279

280280
if inference_result.inferences is None:
281-
session_details = None
282-
if inference_result.session_id is not None:
283-
session_details = await self._session_service.get_session(
284-
app_name=inference_result.app_name,
285-
user_id=user_id,
286-
session_id=inference_result.session_id,
287-
)
288-
return (
289-
inference_result,
290-
EvalCaseResult(
291-
eval_set_file=inference_result.eval_set_id,
292-
eval_set_id=inference_result.eval_set_id,
293-
eval_id=inference_result.eval_case_id,
294-
final_eval_status=EvalStatus.FAILED,
295-
overall_eval_metric_results=[],
296-
eval_metric_result_per_invocation=[],
297-
session_id=inference_result.session_id or '',
298-
session_details=session_details,
281+
if inference_result.status == InferenceStatus.FAILURE:
282+
session_details = None
283+
if inference_result.session_id is not None:
284+
session_details = await self._session_service.get_session(
285+
app_name=inference_result.app_name,
299286
user_id=user_id,
300-
),
287+
session_id=inference_result.session_id,
288+
)
289+
return (
290+
inference_result,
291+
EvalCaseResult(
292+
eval_set_file=inference_result.eval_set_id,
293+
eval_set_id=inference_result.eval_set_id,
294+
eval_id=inference_result.eval_case_id,
295+
final_eval_status=EvalStatus.FAILED,
296+
overall_eval_metric_results=[],
297+
eval_metric_result_per_invocation=[],
298+
session_id=inference_result.session_id or '',
299+
session_details=session_details,
300+
user_id=user_id,
301+
),
302+
)
303+
304+
return inference_result, EvalCaseResult(
305+
eval_set_file=inference_result.eval_set_id,
306+
eval_set_id=inference_result.eval_set_id,
307+
eval_id=inference_result.eval_case_id,
308+
final_eval_status=EvalStatus.NOT_EVALUATED,
309+
overall_eval_metric_results=[],
310+
eval_metric_result_per_invocation=[],
311+
session_id=inference_result.session_id or "",
301312
)
302313

303314
if eval_case.conversation_scenario is None and len(

tests/unittests/evaluation/test_local_eval_service.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,37 @@ async def test_evaluate_single_inference_result_failed_without_inferences(
497497
assert result.eval_metric_result_per_invocation == []
498498

499499

500+
@pytest.mark.asyncio
501+
async def test_evaluate_single_inference_result_inferences_none(
502+
eval_service, mock_eval_sets_manager, mocker
503+
):
504+
inference_result = InferenceResult(
505+
app_name="test_app",
506+
eval_set_id="test_eval_set",
507+
eval_case_id="case1",
508+
inferences=None,
509+
session_id="session1",
510+
)
511+
eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
512+
evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)
513+
514+
mock_eval_case = mocker.MagicMock(spec=EvalCase)
515+
mock_eval_case.conversation = []
516+
mock_eval_case.conversation_scenario = None
517+
mock_eval_case.session_input = None
518+
mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
519+
520+
_, result = await eval_service._evaluate_single_inference_result(
521+
inference_result=inference_result, evaluate_config=evaluate_config
522+
)
523+
524+
assert isinstance(result, EvalCaseResult)
525+
assert result.eval_id == "case1"
526+
assert result.final_eval_status == EvalStatus.NOT_EVALUATED
527+
assert result.overall_eval_metric_results == []
528+
assert result.eval_metric_result_per_invocation == []
529+
530+
500531
@pytest.mark.asyncio
501532
async def test_evaluate_single_inference_result_for_conversation_scenario(
502533
eval_service, mock_eval_sets_manager, mocker

0 commit comments

Comments
 (0)