Skip to content

Commit 953304d

Browse files
authored
fix(eval): include intermediate text in final response match (#5698)
1 parent 6a53357 commit 953304d

4 files changed

Lines changed: 125 additions & 4 deletions

File tree

src/google/adk/evaluation/final_response_match_v2.py

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -159,13 +159,22 @@ def format_auto_rater_prompt(
159159
if expected_invocation is None:
160160
raise ValueError("expected_invocation is required for this metric.")
161161

162-
reference = get_text_from_content(expected_invocation.final_response)
163-
response = get_text_from_content(actual_invocation.final_response)
162+
include_intermediate = (
163+
self._criterion.include_intermediate_responses_in_final
164+
)
165+
reference = get_text_from_content(
166+
expected_invocation,
167+
include_intermediate_responses_in_final=include_intermediate,
168+
)
169+
response = get_text_from_content(
170+
actual_invocation,
171+
include_intermediate_responses_in_final=include_intermediate,
172+
)
164173
user_prompt = get_text_from_content(expected_invocation.user_content)
165174
return self._auto_rater_prompt_template.format(
166175
prompt=user_prompt,
167-
response=response,
168-
golden_response=reference,
176+
response=response or "",
177+
golden_response=reference or "",
169178
)
170179

171180
@override

src/google/adk/evaluation/llm_as_judge_utils.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@
2525
from .app_details import AppDetails
2626
from .common import EvalBaseModel
2727
from .eval_case import get_all_tool_calls_with_responses
28+
from .eval_case import IntermediateData
2829
from .eval_case import IntermediateDataType
2930
from .eval_case import Invocation
3031
from .eval_case import InvocationEvents
@@ -71,6 +72,12 @@ def get_text_from_content(
7172
text = get_text_from_content(event.content)
7273
if text:
7374
parts.append(text)
75+
elif isinstance(content.intermediate_data, IntermediateData):
76+
for _, response_parts in content.intermediate_data.intermediate_responses:
77+
text = get_text_from_content(genai_types.Content(parts=response_parts))
78+
if text:
79+
parts.append(text)
80+
7481
# Then fetch the final response text and append it to the end.
7582
final_text = get_text_from_content(content.final_response)
7683
if final_text:

tests/unittests/evaluation/test_final_response_match_v2.py

Lines changed: 75 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,8 @@
1515
from __future__ import annotations
1616

1717
from google.adk.evaluation.eval_case import Invocation
18+
from google.adk.evaluation.eval_case import InvocationEvent
19+
from google.adk.evaluation.eval_case import InvocationEvents
1820
from google.adk.evaluation.eval_metrics import BaseCriterion
1921
from google.adk.evaluation.eval_metrics import EvalMetric
2022
from google.adk.evaluation.eval_metrics import EvalStatus
@@ -127,13 +129,18 @@ def create_test_template() -> str:
127129

128130
def _create_test_evaluator_gemini(
129131
threshold: float,
132+
*,
133+
include_intermediate_responses_in_final: bool = False,
130134
) -> FinalResponseMatchV2Evaluator:
131135
evaluator = FinalResponseMatchV2Evaluator(
132136
EvalMetric(
133137
metric_name="final_response_match_v2",
134138
threshold=threshold,
135139
criterion=BaseCriterion(
136140
threshold=0.5,
141+
include_intermediate_responses_in_final=(
142+
include_intermediate_responses_in_final
143+
),
137144
),
138145
),
139146
)
@@ -168,6 +175,21 @@ def _create_test_invocations(
168175
return actual_invocation, expected_invocation
169176

170177

178+
def _add_intermediate_text(invocation: Invocation, text: str) -> Invocation:
  """Attaches one intermediate text event to `invocation` and returns it.

  The invocation is modified in place: its `intermediate_data` is replaced
  with an `InvocationEvents` holding a single event authored by "agent" whose
  content is a "model"-role message containing `text`.

  Args:
    invocation: The invocation whose intermediate data is overwritten.
    text: The text to place in the single intermediate event.

  Returns:
    The same `invocation` object that was passed in.
  """
  intermediate_content = genai_types.Content(
      parts=[genai_types.Part(text=text)],
      role="model",
  )
  intermediate_event = InvocationEvent(
      author="agent",
      content=intermediate_content,
  )
  invocation.intermediate_data = InvocationEvents(
      invocation_events=[intermediate_event]
  )
  return invocation
191+
192+
171193
def test_format_auto_rater_prompt():
172194
evaluator = _create_test_evaluator_gemini(threshold=0.8)
173195
actual_invocation, expected_invocation = _create_test_invocations(
@@ -193,6 +215,59 @@ def test_format_auto_rater_prompt():
193215
"""
194216

195217

218+
def test_format_auto_rater_prompt_uses_empty_text_for_missing_final_response():
  """Checks the prompt when both final responses are set to None.

  The rendered prompt must not contain the text "None", and both the agent
  and reference response slots must be rendered empty.
  """
  evaluator = _create_test_evaluator_gemini(threshold=0.8)
  actual, expected = _create_test_invocations(
      "candidate text", "reference text"
  )
  # Drop the final responses on both sides, actual first, then expected.
  for invocation in (actual, expected):
    invocation.final_response = None

  rendered = evaluator.format_auto_rater_prompt(actual, expected)

  assert "None" not in rendered
  for empty_slot in ('"Agent response": ,', '"Reference response": ,'):
    assert empty_slot in rendered
233+
234+
235+
def test_format_auto_rater_prompt_ignores_intermediate_by_default():
  """Checks that intermediate text is left out of the prompt by default.

  The evaluator is built without enabling
  `include_intermediate_responses_in_final`, so only the final response texts
  are expected in the rendered prompt.
  """
  evaluator = _create_test_evaluator_gemini(threshold=0.8)
  actual, expected = _create_test_invocations(
      "candidate final", "reference final"
  )
  _add_intermediate_text(actual, "candidate intro")
  _add_intermediate_text(expected, "reference intro")

  rendered = evaluator.format_auto_rater_prompt(actual, expected)

  for final_text in ("candidate final", "reference final"):
    assert final_text in rendered
  for intermediate_text in ("candidate intro", "reference intro"):
    assert intermediate_text not in rendered
251+
252+
253+
def test_format_auto_rater_prompt_includes_intermediate_when_enabled():
  """Checks that intermediate text precedes the final text when enabled.

  With `include_intermediate_responses_in_final=True`, each side of the prompt
  is expected to contain the intermediate text, a newline, then the final
  response text.
  """
  evaluator = _create_test_evaluator_gemini(
      threshold=0.8,
      include_intermediate_responses_in_final=True,
  )
  actual, expected = _create_test_invocations(
      "candidate final", "reference final"
  )
  _add_intermediate_text(actual, "candidate intro")
  _add_intermediate_text(expected, "reference intro")

  rendered = evaluator.format_auto_rater_prompt(actual, expected)

  expected_fragments = (
      "candidate intro\ncandidate final",
      "reference intro\nreference final",
  )
  for fragment in expected_fragments:
    assert fragment in rendered
269+
270+
196271
def test_convert_auto_rater_response_to_score_valid():
197272
evaluator = _create_test_evaluator_gemini(threshold=0.8)
198273
auto_rater_response = """```json

tests/unittests/evaluation/test_llm_as_judge_utils.py

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -132,6 +132,36 @@ def test_get_text_from_content_with_invocation_include_intermediate_responses_in
132132
)
133133

134134

135+
def test_get_text_from_content_with_intermediate_data_full_response():
  """Checks text extraction from an invocation carrying `IntermediateData`.

  The invocation holds two intermediate responses: one with a text part and
  one with only a function-call part. By default only the final response text
  is expected; with `include_intermediate_responses_in_final=True` the
  intermediate text is expected first, joined to the final text by a newline,
  and the function-call-only entry is expected to contribute nothing.
  """
  text_response = ("agent", [genai_types.Part(text="legacy intro")])
  tool_response = (
      "tool",
      [
          genai_types.Part(
              function_call=genai_types.FunctionCall(name="lookup")
          )
      ],
  )
  intermediate = IntermediateData(
      intermediate_responses=[text_response, tool_response]
  )
  invocation = Invocation(
      user_content=genai_types.Content(parts=[genai_types.Part(text="user")]),
      intermediate_data=intermediate,
      final_response=genai_types.Content(
          parts=[genai_types.Part(text="final answer")]
      ),
  )

  final_only = get_text_from_content(invocation)
  assert final_only == "final answer"

  with_intermediate = get_text_from_content(
      invocation, include_intermediate_responses_in_final=True
  )
  assert with_intermediate == "legacy intro\nfinal answer"
163+
164+
135165
def test_get_eval_status_with_none_score():
136166
"""Tests get_eval_status returns NOT_EVALUATED for a None score."""
137167
assert get_eval_status(score=None, threshold=0.5) == EvalStatus.NOT_EVALUATED

0 commit comments

Comments
 (0)