Skip to content

Commit 20ace4c

Browse files
jsondai authored and copybara-github committed
chore: GenAI Client(evals) - evaluate Gemini Agents API agents via interaction ids and agent scrape
PiperOrigin-RevId: 939996762
1 parent 1bbd635 commit 20ace4c

5 files changed

Lines changed: 272 additions & 10 deletions

File tree

agentplatform/_genai/_evals_common.py

Lines changed: 20 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -556,12 +556,31 @@ def _resolve_inference_configs(
556556
return inference_configs
557557

558558

559+
def _is_gemini_agent_resource(agent: Optional[str]) -> bool:
    """Returns True if `agent` is a Gemini Agent resource name.

    A Gemini Agent resource name has the format
    `projects/{project}/locations/{location}/agents/{agent}`, as opposed to an
    Agent Engine resource name which uses `.../reasoningEngines/{id}`.

    Args:
        agent: Candidate resource name. `None` and any other non-string value
            is reported as "not a Gemini Agent" instead of raising.

    Returns:
        True only when `agent` splits on `/` into exactly six segments, the
        collection segments are `projects`, `locations` and `agents` (in that
        order), and each of the three ID segments is non-empty.
    """
    # Callers hold an Optional[str]; treat a missing agent as a plain "no".
    if not isinstance(agent, str):
        return False
    parts = agent.split("/")
    if len(parts) != 6:
        return False
    # Even indices are the collection names, odd indices are the IDs.
    return parts[0::2] == ["projects", "locations", "agents"] and all(parts[1::2])
576+
577+
559578
def _add_evaluation_run_labels(
560579
labels: Optional[dict[str, str]] = None,
561580
agent: Optional[str] = None,
562581
) -> Optional[dict[str, str]]:
563582
"""Adds labels to the evaluation run."""
564-
if agent:
583+
if agent and not _is_gemini_agent_resource(agent):
565584
labels = labels or {}
566585
labels["vertex-ai-evaluation-agent-engine-id"] = agent.split(
567586
"reasoningEngines/"

agentplatform/_genai/_evals_metric_handlers.py

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -703,6 +703,16 @@ def _build_evaluation_instance(
703703
)
704704
)
705705

706+
# An interactions data source is mutually exclusive with agent_data: when
707+
# set, the backend fetches the interaction + Gemini Agent config and parses
708+
# them into agent data server-side, so we must not also send agent_data.
709+
interactions_data_source = getattr(eval_case, "interactions_data_source", None)
710+
agent_data = (
711+
None
712+
if interactions_data_source is not None
713+
else _eval_case_to_agent_data(eval_case, extracted_prompt, response_content)
714+
)
715+
706716
return types.EvaluationInstance(
707717
prompt=prompt_instance_data,
708718
response=_content_to_instance_data(response_content),
@@ -715,9 +725,8 @@ def _build_evaluation_instance(
715725
other_data=(
716726
types.MapInstance(map_instance=other_data_map) if other_data_map else None
717727
),
718-
agent_data=_eval_case_to_agent_data(
719-
eval_case, extracted_prompt, response_content
720-
),
728+
agent_data=agent_data,
729+
interactions_data_source=interactions_data_source,
721730
)
722731

723732

tests/unit/agentplatform/genai/replays/test_create_evaluation_run.py

Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -871,6 +871,33 @@ async def test_create_eval_run_async_with_inference_configs(client):
871871
assert evaluation_run.error is None
872872

873873

874+
def test_create_eval_run_with_gemini_agent(client):
    """Creates an evaluation run whose `agent` is a Gemini Agent resource.

    Asserts that the returned run's inference config for the "gemini-agent"
    candidate carries the same Gemini Agent resource name that was passed in.
    """
    candidate_name = "gemini-agent"
    agent_resource = (
        "projects/model-evaluation-dev/locations/global"
        "/agents/test-agent-eval"
    )
    dataset = types.EvaluationRunDataSource(
        evaluation_set=(
            "projects/model-evaluation-dev/locations/global"
            "/evaluationSets/7392342128979869696"
        )
    )

    run = client.evals.create_evaluation_run(
        name="test_gemini_agent",
        display_name="test_gemini_agent",
        dataset=dataset,
        dest=GCS_DEST,
        metrics=[GENERAL_QUALITY_METRIC],
        agent_info=types.evals.AgentInfo(name=candidate_name),
        agent=agent_resource,
        user_simulator_config=types.evals.UserSimulatorConfig(max_turn=3),
    )

    assert isinstance(run, types.EvaluationRun)
    agent_run_config = run.inference_configs[candidate_name].agent_run_config
    assert agent_run_config.gemini_agent_config.gemini_agent == agent_resource
899+
900+
874901
pytestmark = pytest_helper.setup(
875902
file=__file__,
876903
globals_for_file=globals(),

tests/unit/agentplatform/genai/replays/test_evaluate_instances.py

Lines changed: 57 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -184,6 +184,63 @@ def test_run_inference_with_agent(client):
184184
assert inference_result.gcs_source is None
185185

186186

187+
def test_evaluation_with_interaction(client):
    """Calls `evaluate_instances` on an instance backed by an interaction.

    The instance is built only from an interactions data source (an
    interaction resource name plus a Gemini Agent config); the test asserts
    that a response comes back.
    """
    data_source = types.InteractionsDataSource(
        interaction=(
            "projects/977012026409/locations/global"
            "/interactions/ChAzMDY5YjBkOGE5ODcwMDM0EAgaATAqBG1haW4"
        ),
        gemini_agent_config=types.GeminiAgentConfig(
            gemini_agent=(
                "projects/977012026409/locations/global"
                "/agents/test-agent-eval"
            ),
        ),
    )
    request = types._EvaluateInstancesRequestParameters(
        metrics=[types.Metric(name="multi_turn_task_success_v1")],
        instance=types.EvaluationInstance(interactions_data_source=data_source),
    )

    response = client.evals.evaluate_instances(metric_config=request)

    assert response is not None
209+
210+
def test_evaluate_method_with_interaction(client):
    """Runs `evals.evaluate` on a dataset holding one interaction-backed case.

    Asserts that summary metrics come back with a metric name and a mean
    score, and that exactly one eval case result is returned.
    """
    data_source = types.InteractionsDataSource(
        interaction=(
            "projects/model-evaluation-dev/locations/global"
            "/interactions/ChAzMDY5YjBkOGE5ODcwMDM0EAgaATAqBG1haW4"
        ),
        gemini_agent_config=types.GeminiAgentConfig(
            gemini_agent=(
                "projects/model-evaluation-dev/locations/global"
                "/agents/test-agent-eval"
            ),
        ),
    )
    dataset = types.EvaluationDataset(
        eval_cases=[types.EvalCase(interactions_data_source=data_source)]
    )

    result = client.evals.evaluate(
        dataset=dataset,
        metrics=[types.RubricMetric.MULTI_TURN_TASK_SUCCESS],
    )

    assert isinstance(result, types.EvaluationResult)
    summaries = result.summary_metrics
    assert summaries is not None
    assert len(summaries) > 0
    for aggregated in summaries:
        assert isinstance(aggregated, types.AggregatedMetricResult)
        assert aggregated.metric_name is not None
        assert aggregated.mean_score is not None

    case_results = result.eval_case_results
    assert case_results is not None
    assert len(case_results) == 1
242+
243+
187244
pytestmark = pytest_helper.setup(
188245
file=__file__,
189246
globals_for_file=globals(),

tests/unit/agentplatform/genai/test_evals.py

Lines changed: 156 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -5913,12 +5913,16 @@ def my_plain_tool(query: str) -> str:
59135913

59145914
assert len(agent_info.agents["mock_agent"].tools) == 2
59155915
# First tool: ADK tool with _get_declaration
5916-
adk_declarations = agent_info.agents["mock_agent"].tools[0].function_declarations
5916+
adk_declarations = (
5917+
agent_info.agents["mock_agent"].tools[0].function_declarations
5918+
)
59175919
assert len(adk_declarations) == 1
59185920
assert adk_declarations[0] is mock_adk_declaration
59195921
mock_adk_tool._get_declaration.assert_called_once()
59205922
# Second tool: plain callable converted to FunctionDeclaration
5921-
plain_declarations = agent_info.agents["mock_agent"].tools[1].function_declarations
5923+
plain_declarations = (
5924+
agent_info.agents["mock_agent"].tools[1].function_declarations
5925+
)
59225926
assert len(plain_declarations) == 1
59235927
assert isinstance(plain_declarations[0], genai_types.FunctionDeclaration)
59245928
assert plain_declarations[0].name == "my_plain_tool"
@@ -8335,7 +8339,7 @@ def custom_agg_fn_error(
83358339
with mock.patch(
83368340
"agentplatform._genai._evals_metric_handlers.LLMMetricHandler.get_metric_result"
83378341
) as mock_llm_process:
8338-
# fmt: on
8342+
# fmt: on
83398343
mock_llm_process.side_effect = [
83408344
agentplatform_genai_types.EvalCaseMetricResult(
83418345
metric_name="error_fallback_quality", score=0.9
@@ -8381,7 +8385,7 @@ def custom_agg_fn_invalid_type(
83818385
with mock.patch(
83828386
"agentplatform._genai._evals_metric_handlers.LLMMetricHandler.get_metric_result"
83838387
) as mock_llm_process:
8384-
# fmt: on
8388+
# fmt: on
83858389
mock_llm_process.return_value = (
83868390
agentplatform_genai_types.EvalCaseMetricResult(
83878391
metric_name="invalid_type_fallback", score=0.8
@@ -8415,7 +8419,7 @@ def test_execute_evaluation_lazy_loaded_prebuilt_metric_instance(
84158419
with mock.patch(
84168420
"agentplatform._genai.evals.Evals._evaluate_instances"
84178421
) as mock_evaluate_instances_unified:
8418-
# fmt: on
8422+
# fmt: on
84198423
mock_evaluate_instances_unified.return_value = (
84208424
agentplatform_genai_types.EvaluateInstancesResponse(
84218425
metric_results=[
@@ -8461,7 +8465,7 @@ def test_execute_evaluation_prebuilt_metric_via_loader(
84618465
with mock.patch(
84628466
"agentplatform._genai.evals.Evals._evaluate_instances"
84638467
) as mock_evaluate_instances_unified:
8464-
# fmt: on
8468+
# fmt: on
84658469
mock_evaluate_instances_unified.return_value = (
84668470
agentplatform_genai_types.EvaluateInstancesResponse(
84678471
metric_results=[
@@ -9839,3 +9843,149 @@ async def test_create_evaluation_run_async_passes_allow_cross_region_model(self)
98399843
request_body.get("evaluationConfig", {}).get("allowCrossRegionModel")
98409844
is True
98419845
)
9846+
9847+
9848+
# Resource names shared by the Gemini Agent / interactions tests below.
# All three live under the same project and location.
_TEST_INTERACTION = (
    "projects/test-project/locations/us-central1"
    "/interactions/test-interaction"
)
# Gemini Agent resource: `.../agents/{agent}`.
_TEST_GEMINI_AGENT = (
    "projects/test-project/locations/us-central1"
    "/agents/test-agent"
)
# Agent Engine resource: `.../reasoningEngines/{id}`.
_TEST_AGENT_ENGINE = (
    "projects/test-project/locations/us-central1"
    "/reasoningEngines/123"
)
9853+
9854+
9855+
class TestIsGeminiAgentResource:
    """Unit tests for `_evals_common._is_gemini_agent_resource`."""

    def test_gemini_agent_resource_is_detected(self):
        # A `.../agents/{agent}` resource name is the positive case.
        detected = _evals_common._is_gemini_agent_resource(_TEST_GEMINI_AGENT)
        assert detected is True

    def test_agent_engine_resource_is_not_gemini(self):
        # A `.../reasoningEngines/{id}` resource name must be rejected.
        detected = _evals_common._is_gemini_agent_resource(_TEST_AGENT_ENGINE)
        assert detected is False

    def test_non_resource_string_is_not_gemini(self):
        # A bare identifier with no path segments must be rejected.
        detected = _evals_common._is_gemini_agent_resource("test-agent")
        assert detected is False
9866+
9867+
9868+
class TestEvaluateInstancesInteractionsDataSource:
    """CUJ1: BYO interaction id evaluated via evaluate_instances."""

    def setup_method(self, method):
        """Installs a mocked API client whose requests return an empty JSON body."""
        response = mock.MagicMock()
        response.body = json.dumps({})
        api_client = mock.MagicMock()
        api_client.vertexai = True
        api_client.request.return_value = response
        self.mock_response = response
        self.mock_api_client = api_client

    def test_evaluate_instances_sends_interactions_data_source(self):
        """Checks the interactions data source is present in the request body."""
        service = evals.Evals(api_client_=self.mock_api_client)

        data_source = agentplatform_genai_types.InteractionsDataSource(
            interaction=_TEST_INTERACTION,
            gemini_agent_config=agentplatform_genai_types.GeminiAgentConfig(
                gemini_agent=_TEST_GEMINI_AGENT,
            ),
        )
        params = agentplatform_genai_types._EvaluateInstancesRequestParameters(
            metrics=[
                agentplatform_genai_types.Metric(name="multi_turn_task_success_v1")
            ],
            instance=agentplatform_genai_types.EvaluationInstance(
                interactions_data_source=data_source
            ),
        )

        service.evaluate_instances(metric_config=params)

        request_mock = self.mock_api_client.request
        request_mock.assert_called_once()
        # Positional args of the recorded call: index 1 is the path and
        # index 2 is the request body.
        positional = request_mock.call_args[0]
        path, request_body = positional[1], positional[2]
        assert path.endswith(":evaluateInstances")
        sent = request_body["instance"]["interactionsDataSource"]
        assert sent["interaction"] == _TEST_INTERACTION
        assert sent["gemini_agent_config"]["gemini_agent"] == _TEST_GEMINI_AGENT
9906+
9907+
9908+
class TestCreateEvaluationRunGeminiAgent:
    """CUJ2: scrape a Gemini agent via create_evaluation_run."""

    def setup_method(self, method):
        """Installs a mocked API client that answers with a pending run."""
        pending_run = {
            "name": "projects/123/locations/us-central1/evaluationRuns/456",
            "displayName": "test_run",
            "state": "PENDING",
        }
        response = mock.MagicMock()
        response.body = json.dumps(pending_run)
        api_client = mock.MagicMock()
        api_client.vertexai = True
        api_client.request.return_value = response
        self.mock_response = response
        self.mock_api_client = api_client

    def _get_create_run_body(self):
        """Returns the body of the first recorded `post evaluationRuns` request."""
        for recorded in self.mock_api_client.request.call_args_list:
            positional = recorded[0]
            if (positional[0], positional[1]) == ("post", "evaluationRuns"):
                return positional[2]
        raise AssertionError("evaluationRuns create call was not made")

    def _agent_run_config(self, request_body):
        """Returns `agentRunConfig` of the first candidate in `inferenceConfigs`."""
        first_candidate = next(iter(request_body["inferenceConfigs"].values()))
        return first_candidate["agentRunConfig"]

    def _create_run_and_capture_agent_run_config(self, candidate_name, agent):
        """Creates a run for `agent` and returns the agent run config that was sent."""
        metric_name = "multi_turn_task_success_v1"
        service = evals.Evals(api_client_=self.mock_api_client)

        service.create_evaluation_run(
            dataset=agentplatform_genai_types.EvaluationRunDataSource(
                evaluation_set="projects/123/locations/us-central1/evaluationSets/789"
            ),
            metrics=[
                agentplatform_genai_types.EvaluationRunMetric(
                    metric=metric_name,
                    metric_config=agentplatform_genai_types.UnifiedMetric(
                        predefined_metric_spec=genai_types.PredefinedMetricSpec(
                            metric_spec_name=metric_name,
                        )
                    ),
                )
            ],
            dest="gs://test-bucket/output",
            agent_info=agentplatform_genai_types.evals.AgentInfo(name=candidate_name),
            agent=agent,
        )

        return self._agent_run_config(self._get_create_run_body())

    def test_create_evaluation_run_builds_gemini_agent_config(self):
        agent_run_config = self._create_run_and_capture_agent_run_config(
            "gemini-agent", _TEST_GEMINI_AGENT
        )
        assert (
            agent_run_config["gemini_agent_config"]["gemini_agent"]
            == _TEST_GEMINI_AGENT
        )
        assert "agent_engine" not in agent_run_config

    def test_create_evaluation_run_agent_engine_does_not_set_gemini(self):
        agent_run_config = self._create_run_and_capture_agent_run_config(
            "ae-agent", _TEST_AGENT_ENGINE
        )
        assert "gemini_agent_config" not in agent_run_config
        assert agent_run_config["agent_engine"] == _TEST_AGENT_ENGINE

0 commit comments

Comments
 (0)