Skip to content

Commit fce1c73

Browse files
authored
Revert "fix(eval): include intermediate text in final response match" (#5887)
1 parent 953304d commit fce1c73

4 files changed

Lines changed: 4 additions & 125 deletions

File tree

src/google/adk/evaluation/final_response_match_v2.py

Lines changed: 4 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -159,22 +159,13 @@ def format_auto_rater_prompt(
159159
if expected_invocation is None:
160160
raise ValueError("expected_invocation is required for this metric.")
161161

162-
include_intermediate = (
163-
self._criterion.include_intermediate_responses_in_final
164-
)
165-
reference = get_text_from_content(
166-
expected_invocation,
167-
include_intermediate_responses_in_final=include_intermediate,
168-
)
169-
response = get_text_from_content(
170-
actual_invocation,
171-
include_intermediate_responses_in_final=include_intermediate,
172-
)
162+
reference = get_text_from_content(expected_invocation.final_response)
163+
response = get_text_from_content(actual_invocation.final_response)
173164
user_prompt = get_text_from_content(expected_invocation.user_content)
174165
return self._auto_rater_prompt_template.format(
175166
prompt=user_prompt,
176-
response=response or "",
177-
golden_response=reference or "",
167+
response=response,
168+
golden_response=reference,
178169
)
179170

180171
@override

src/google/adk/evaluation/llm_as_judge_utils.py

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,6 @@
2525
from .app_details import AppDetails
2626
from .common import EvalBaseModel
2727
from .eval_case import get_all_tool_calls_with_responses
28-
from .eval_case import IntermediateData
2928
from .eval_case import IntermediateDataType
3029
from .eval_case import Invocation
3130
from .eval_case import InvocationEvents
@@ -72,12 +71,6 @@ def get_text_from_content(
7271
text = get_text_from_content(event.content)
7372
if text:
7473
parts.append(text)
75-
elif isinstance(content.intermediate_data, IntermediateData):
76-
for _, response_parts in content.intermediate_data.intermediate_responses:
77-
text = get_text_from_content(genai_types.Content(parts=response_parts))
78-
if text:
79-
parts.append(text)
80-
8174
# Then fetch the final response text and append it to the end.
8275
final_text = get_text_from_content(content.final_response)
8376
if final_text:

tests/unittests/evaluation/test_final_response_match_v2.py

Lines changed: 0 additions & 75 deletions
Original file line number | Diff line number | Diff line change
@@ -15,8 +15,6 @@
1515
from __future__ import annotations
1616

1717
from google.adk.evaluation.eval_case import Invocation
18-
from google.adk.evaluation.eval_case import InvocationEvent
19-
from google.adk.evaluation.eval_case import InvocationEvents
2018
from google.adk.evaluation.eval_metrics import BaseCriterion
2119
from google.adk.evaluation.eval_metrics import EvalMetric
2220
from google.adk.evaluation.eval_metrics import EvalStatus
@@ -129,18 +127,13 @@ def create_test_template() -> str:
129127

130128
def _create_test_evaluator_gemini(
131129
threshold: float,
132-
*,
133-
include_intermediate_responses_in_final: bool = False,
134130
) -> FinalResponseMatchV2Evaluator:
135131
evaluator = FinalResponseMatchV2Evaluator(
136132
EvalMetric(
137133
metric_name="final_response_match_v2",
138134
threshold=threshold,
139135
criterion=BaseCriterion(
140136
threshold=0.5,
141-
include_intermediate_responses_in_final=(
142-
include_intermediate_responses_in_final
143-
),
144137
),
145138
),
146139
)
@@ -175,21 +168,6 @@ def _create_test_invocations(
175168
return actual_invocation, expected_invocation
176169

177170

178-
def _add_intermediate_text(invocation: Invocation, text: str) -> Invocation:
179-
invocation.intermediate_data = InvocationEvents(
180-
invocation_events=[
181-
InvocationEvent(
182-
author="agent",
183-
content=genai_types.Content(
184-
parts=[genai_types.Part(text=text)],
185-
role="model",
186-
),
187-
),
188-
]
189-
)
190-
return invocation
191-
192-
193171
def test_format_auto_rater_prompt():
194172
evaluator = _create_test_evaluator_gemini(threshold=0.8)
195173
actual_invocation, expected_invocation = _create_test_invocations(
@@ -215,59 +193,6 @@ def test_format_auto_rater_prompt():
215193
"""
216194

217195

218-
def test_format_auto_rater_prompt_uses_empty_text_for_missing_final_response():
219-
evaluator = _create_test_evaluator_gemini(threshold=0.8)
220-
actual_invocation, expected_invocation = _create_test_invocations(
221-
"candidate text", "reference text"
222-
)
223-
actual_invocation.final_response = None
224-
expected_invocation.final_response = None
225-
226-
prompt = evaluator.format_auto_rater_prompt(
227-
actual_invocation, expected_invocation
228-
)
229-
230-
assert "None" not in prompt
231-
assert '"Agent response": ,' in prompt
232-
assert '"Reference response": ,' in prompt
233-
234-
235-
def test_format_auto_rater_prompt_ignores_intermediate_by_default():
236-
evaluator = _create_test_evaluator_gemini(threshold=0.8)
237-
actual_invocation, expected_invocation = _create_test_invocations(
238-
"candidate final", "reference final"
239-
)
240-
_add_intermediate_text(actual_invocation, "candidate intro")
241-
_add_intermediate_text(expected_invocation, "reference intro")
242-
243-
prompt = evaluator.format_auto_rater_prompt(
244-
actual_invocation, expected_invocation
245-
)
246-
247-
assert "candidate final" in prompt
248-
assert "reference final" in prompt
249-
assert "candidate intro" not in prompt
250-
assert "reference intro" not in prompt
251-
252-
253-
def test_format_auto_rater_prompt_includes_intermediate_when_enabled():
254-
evaluator = _create_test_evaluator_gemini(
255-
threshold=0.8, include_intermediate_responses_in_final=True
256-
)
257-
actual_invocation, expected_invocation = _create_test_invocations(
258-
"candidate final", "reference final"
259-
)
260-
_add_intermediate_text(actual_invocation, "candidate intro")
261-
_add_intermediate_text(expected_invocation, "reference intro")
262-
263-
prompt = evaluator.format_auto_rater_prompt(
264-
actual_invocation, expected_invocation
265-
)
266-
267-
assert "candidate intro\ncandidate final" in prompt
268-
assert "reference intro\nreference final" in prompt
269-
270-
271196
def test_convert_auto_rater_response_to_score_valid():
272197
evaluator = _create_test_evaluator_gemini(threshold=0.8)
273198
auto_rater_response = """```json

tests/unittests/evaluation/test_llm_as_judge_utils.py

Lines changed: 0 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -132,36 +132,6 @@ def test_get_text_from_content_with_invocation_include_intermediate_responses_in
132132
)
133133

134134

135-
def test_get_text_from_content_with_intermediate_data_full_response():
136-
invocation = Invocation(
137-
user_content=genai_types.Content(parts=[genai_types.Part(text="user")]),
138-
intermediate_data=IntermediateData(
139-
intermediate_responses=[
140-
("agent", [genai_types.Part(text="legacy intro")]),
141-
(
142-
"tool",
143-
[
144-
genai_types.Part(
145-
function_call=genai_types.FunctionCall(name="lookup")
146-
)
147-
],
148-
),
149-
]
150-
),
151-
final_response=genai_types.Content(
152-
parts=[genai_types.Part(text="final answer")]
153-
),
154-
)
155-
156-
assert get_text_from_content(invocation) == "final answer"
157-
assert (
158-
get_text_from_content(
159-
invocation, include_intermediate_responses_in_final=True
160-
)
161-
== "legacy intro\nfinal answer"
162-
)
163-
164-
165135
def test_get_eval_status_with_none_score():
166136
"""Tests get_eval_status returns NOT_EVALUATED for a None score."""
167137
assert get_eval_status(score=None, threshold=0.5) == EvalStatus.NOT_EVALUATED

0 commit comments

Comments
 (0)