chore: GenAI Client(evals) - evaluate Gemini Agents API agents via interaction IDs and agent scrape

jsondai · copybara-github · commit 20ace4c48d6b · 2026-06-29T14:51:03.000-07:00
PiperOrigin-RevId: 939996762
diff --git a/agentplatform/_genai/_evals_common.py b/agentplatform/_genai/_evals_common.py
@@ -556,12 +556,31 @@ def _resolve_inference_configs(
     return inference_configs
 
 
+def _is_gemini_agent_resource(agent: str) -> bool:
+    """Returns True if `agent` is a Gemini Agent resource name.
+
+    A Gemini Agent resource name has the format
+    `projects/{project}/locations/{location}/agents/{agent}`, as opposed to an
+    Agent Engine resource name, which uses `.../reasoningEngines/{id}`.
+    """
+    parts = agent.split("/")
+    return (
+        len(parts) == 6
+        and parts[0] == "projects"
+        and parts[2] == "locations"
+        and parts[4] == "agents"
+        and bool(parts[1])
+        and bool(parts[3])
+        and bool(parts[5])
+    )
+
+
 def _add_evaluation_run_labels(
     labels: Optional[dict[str, str]] = None,
     agent: Optional[str] = None,
 ) -> Optional[dict[str, str]]:
     """Adds labels to the evaluation run."""
-    if agent:
+    if agent and not _is_gemini_agent_resource(agent):
         labels = labels or {}
         labels["vertex-ai-evaluation-agent-engine-id"] = agent.split(
             "reasoningEngines/"
diff --git a/agentplatform/_genai/_evals_metric_handlers.py b/agentplatform/_genai/_evals_metric_handlers.py
@@ -703,6 +703,16 @@ def _build_evaluation_instance(
                     )
                 )
 
+    # An interactions data source is mutually exclusive with agent_data: when
+    # set, the backend fetches the interaction + Gemini Agent config and parses
+    # them into agent data server-side, so we must not also send agent_data.
+    interactions_data_source = getattr(eval_case, "interactions_data_source", None)
+    agent_data = (
+        None
+        if interactions_data_source is not None
+        else _eval_case_to_agent_data(eval_case, extracted_prompt, response_content)
+    )
+
     return types.EvaluationInstance(
         prompt=prompt_instance_data,
         response=_content_to_instance_data(response_content),
@@ -715,9 +725,8 @@ def _build_evaluation_instance(
         other_data=(
             types.MapInstance(map_instance=other_data_map) if other_data_map else None
         ),
-        agent_data=_eval_case_to_agent_data(
-            eval_case, extracted_prompt, response_content
-        ),
+        agent_data=agent_data,
+        interactions_data_source=interactions_data_source,
     )
 
 
diff --git a/tests/unit/agentplatform/genai/replays/test_create_evaluation_run.py b/tests/unit/agentplatform/genai/replays/test_create_evaluation_run.py
@@ -871,6 +871,33 @@ async def test_create_eval_run_async_with_inference_configs(client):
     assert evaluation_run.error is None
 
 
+def test_create_eval_run_with_gemini_agent(client):
+    gemini_agent = (
+        "projects/model-evaluation-dev/locations/global/agents/"
+        "test-agent-eval"
+    )
+    eval_set = (
+        "projects/model-evaluation-dev/locations/global/evaluationSets/"
+        "7392342128979869696"
+    )
+    evaluation_run = client.evals.create_evaluation_run(
+        name="test_gemini_agent",
+        display_name="test_gemini_agent",
+        dataset=types.EvaluationRunDataSource(evaluation_set=eval_set),
+        dest=GCS_DEST,
+        metrics=[GENERAL_QUALITY_METRIC],
+        agent_info=types.evals.AgentInfo(name="gemini-agent"),
+        agent=gemini_agent,
+        user_simulator_config=types.evals.UserSimulatorConfig(max_turn=3),
+    )
+    assert isinstance(evaluation_run, types.EvaluationRun)
+    inference_config = evaluation_run.inference_configs["gemini-agent"]
+    assert (
+        inference_config.agent_run_config.gemini_agent_config.gemini_agent
+        == gemini_agent
+    )
+
+
 pytestmark = pytest_helper.setup(
     file=__file__,
     globals_for_file=globals(),
diff --git a/tests/unit/agentplatform/genai/replays/test_evaluate_instances.py b/tests/unit/agentplatform/genai/replays/test_evaluate_instances.py
@@ -184,6 +184,63 @@ def test_run_inference_with_agent(client):
     assert inference_result.gcs_source is None
 
 
+def test_evaluation_with_interaction(client):
+    instance = types.EvaluationInstance(
+        interactions_data_source=types.InteractionsDataSource(
+            interaction=(
+                "projects/977012026409/locations/global/interactions/"
+                "ChAzMDY5YjBkOGE5ODcwMDM0EAgaATAqBG1haW4"
+            ),
+            gemini_agent_config=types.GeminiAgentConfig(
+                gemini_agent=(
+                    "projects/977012026409/locations/global/agents/"
+                    "test-agent-eval"
+                ),
+            ),
+        )
+    )
+    response = client.evals.evaluate_instances(
+        metric_config=types._EvaluateInstancesRequestParameters(
+            metrics=[types.Metric(name="multi_turn_task_success_v1")],
+            instance=instance,
+        )
+    )
+    assert response is not None
+
+def test_evaluate_method_with_interaction(client):
+    eval_case = types.EvalCase(
+        interactions_data_source=types.InteractionsDataSource(
+            interaction=(
+                "projects/model-evaluation-dev/locations/global/interactions/"
+                "ChAzMDY5YjBkOGE5ODcwMDM0EAgaATAqBG1haW4"
+            ),
+            gemini_agent_config=types.GeminiAgentConfig(
+                gemini_agent=(
+                    "projects/model-evaluation-dev/locations/global/agents/"
+                    "test-agent-eval"
+                ),
+            ),
+        )
+    )
+    eval_dataset = types.EvaluationDataset(eval_cases=[eval_case])
+
+    evaluation_result = client.evals.evaluate(
+        dataset=eval_dataset,
+        metrics=[types.RubricMetric.MULTI_TURN_TASK_SUCCESS],
+    )
+
+    assert isinstance(evaluation_result, types.EvaluationResult)
+    assert evaluation_result.summary_metrics is not None
+    assert len(evaluation_result.summary_metrics) > 0
+    for summary in evaluation_result.summary_metrics:
+        assert isinstance(summary, types.AggregatedMetricResult)
+        assert summary.metric_name is not None
+        assert summary.mean_score is not None
+
+    assert evaluation_result.eval_case_results is not None
+    assert len(evaluation_result.eval_case_results) == 1
+
+
 pytestmark = pytest_helper.setup(
     file=__file__,
     globals_for_file=globals(),
diff --git a/tests/unit/agentplatform/genai/test_evals.py b/tests/unit/agentplatform/genai/test_evals.py
@@ -5913,12 +5913,16 @@ def my_plain_tool(query: str) -> str:
 
         assert len(agent_info.agents["mock_agent"].tools) == 2
         # First tool: ADK tool with _get_declaration
-        adk_declarations = agent_info.agents["mock_agent"].tools[0].function_declarations
+        adk_declarations = (
+            agent_info.agents["mock_agent"].tools[0].function_declarations
+        )
         assert len(adk_declarations) == 1
         assert adk_declarations[0] is mock_adk_declaration
         mock_adk_tool._get_declaration.assert_called_once()
         # Second tool: plain callable converted to FunctionDeclaration
-        plain_declarations = agent_info.agents["mock_agent"].tools[1].function_declarations
+        plain_declarations = (
+            agent_info.agents["mock_agent"].tools[1].function_declarations
+        )
         assert len(plain_declarations) == 1
         assert isinstance(plain_declarations[0], genai_types.FunctionDeclaration)
         assert plain_declarations[0].name == "my_plain_tool"
@@ -8335,7 +8339,7 @@ def custom_agg_fn_error(
         with mock.patch(
             "agentplatform._genai._evals_metric_handlers.LLMMetricHandler.get_metric_result"
         ) as mock_llm_process:
-        # fmt: on
+            # fmt: on
             mock_llm_process.side_effect = [
                 agentplatform_genai_types.EvalCaseMetricResult(
                     metric_name="error_fallback_quality", score=0.9
@@ -8381,7 +8385,7 @@ def custom_agg_fn_invalid_type(
         with mock.patch(
             "agentplatform._genai._evals_metric_handlers.LLMMetricHandler.get_metric_result"
         ) as mock_llm_process:
-        # fmt: on
+            # fmt: on
             mock_llm_process.return_value = (
                 agentplatform_genai_types.EvalCaseMetricResult(
                     metric_name="invalid_type_fallback", score=0.8
@@ -8415,7 +8419,7 @@ def test_execute_evaluation_lazy_loaded_prebuilt_metric_instance(
         with mock.patch(
             "agentplatform._genai.evals.Evals._evaluate_instances"
         ) as mock_evaluate_instances_unified:
-        # fmt: on
+            # fmt: on
             mock_evaluate_instances_unified.return_value = (
                 agentplatform_genai_types.EvaluateInstancesResponse(
                     metric_results=[
@@ -8461,7 +8465,7 @@ def test_execute_evaluation_prebuilt_metric_via_loader(
         with mock.patch(
             "agentplatform._genai.evals.Evals._evaluate_instances"
         ) as mock_evaluate_instances_unified:
-        # fmt: on
+            # fmt: on
             mock_evaluate_instances_unified.return_value = (
                 agentplatform_genai_types.EvaluateInstancesResponse(
                     metric_results=[
@@ -9839,3 +9843,149 @@ async def test_create_evaluation_run_async_passes_allow_cross_region_model(self)
             request_body.get("evaluationConfig", {}).get("allowCrossRegionModel")
             is True
         )
+
+
+_TEST_INTERACTION = (
+    "projects/test-project/locations/us-central1/interactions/test-interaction"
+)
+_TEST_GEMINI_AGENT = "projects/test-project/locations/us-central1/agents/test-agent"
+_TEST_AGENT_ENGINE = "projects/test-project/locations/us-central1/reasoningEngines/123"
+
+
+class TestIsGeminiAgentResource:
+    """Tests for the _is_gemini_agent_resource helper."""
+
+    def test_gemini_agent_resource_is_detected(self):
+        assert _evals_common._is_gemini_agent_resource(_TEST_GEMINI_AGENT) is True
+
+    def test_agent_engine_resource_is_not_gemini(self):
+        assert _evals_common._is_gemini_agent_resource(_TEST_AGENT_ENGINE) is False
+
+    def test_non_resource_string_is_not_gemini(self):
+        assert _evals_common._is_gemini_agent_resource("test-agent") is False
+
+
+class TestEvaluateInstancesInteractionsDataSource:
+    """CUJ1: BYO interaction id evaluated via evaluate_instances."""
+
+    def setup_method(self, method):
+        self.mock_api_client = mock.MagicMock()
+        self.mock_api_client.vertexai = True
+        self.mock_response = mock.MagicMock()
+        self.mock_response.body = json.dumps({})
+        self.mock_api_client.request.return_value = self.mock_response
+
+    def test_evaluate_instances_sends_interactions_data_source(self):
+        evals_module = evals.Evals(api_client_=self.mock_api_client)
+
+        instance = agentplatform_genai_types.EvaluationInstance(
+            interactions_data_source=agentplatform_genai_types.InteractionsDataSource(
+                interaction=_TEST_INTERACTION,
+                gemini_agent_config=agentplatform_genai_types.GeminiAgentConfig(
+                    gemini_agent=_TEST_GEMINI_AGENT,
+                ),
+            )
+        )
+        metric_config = agentplatform_genai_types._EvaluateInstancesRequestParameters(
+            metrics=[
+                agentplatform_genai_types.Metric(name="multi_turn_task_success_v1")
+            ],
+            instance=instance,
+        )
+
+        evals_module.evaluate_instances(metric_config=metric_config)
+
+        self.mock_api_client.request.assert_called_once()
+        call_args = self.mock_api_client.request.call_args
+        path = call_args[0][1]
+        request_body = call_args[0][2]
+        assert path.endswith(":evaluateInstances")
+        data_source = request_body["instance"]["interactionsDataSource"]
+        assert data_source["interaction"] == _TEST_INTERACTION
+        assert data_source["gemini_agent_config"]["gemini_agent"] == _TEST_GEMINI_AGENT
+
+
+class TestCreateEvaluationRunGeminiAgent:
+    """CUJ2: scrape a Gemini agent via create_evaluation_run."""
+
+    def setup_method(self, method):
+        self.mock_api_client = mock.MagicMock()
+        self.mock_api_client.vertexai = True
+        self.mock_response = mock.MagicMock()
+        self.mock_response.body = json.dumps(
+            {
+                "name": "projects/123/locations/us-central1/evaluationRuns/456",
+                "displayName": "test_run",
+                "state": "PENDING",
+            }
+        )
+        self.mock_api_client.request.return_value = self.mock_response
+
+    def _get_create_run_body(self):
+        for call_args in self.mock_api_client.request.call_args_list:
+            method, path = call_args[0][0], call_args[0][1]
+            if method == "post" and path == "evaluationRuns":
+                return call_args[0][2]
+        raise AssertionError("evaluationRuns create call was not made")
+
+    def _agent_run_config(self, request_body):
+        inference_configs = request_body["inferenceConfigs"]
+        candidate = next(iter(inference_configs.values()))
+        return candidate["agentRunConfig"]
+
+    def test_create_evaluation_run_builds_gemini_agent_config(self):
+        evals_module = evals.Evals(api_client_=self.mock_api_client)
+
+        evals_module.create_evaluation_run(
+            dataset=agentplatform_genai_types.EvaluationRunDataSource(
+                evaluation_set="projects/123/locations/us-central1/evaluationSets/789"
+            ),
+            metrics=[
+                agentplatform_genai_types.EvaluationRunMetric(
+                    metric="multi_turn_task_success_v1",
+                    metric_config=agentplatform_genai_types.UnifiedMetric(
+                        predefined_metric_spec=genai_types.PredefinedMetricSpec(
+                            metric_spec_name="multi_turn_task_success_v1",
+                        )
+                    ),
+                )
+            ],
+            dest="gs://test-bucket/output",
+            agent_info=agentplatform_genai_types.evals.AgentInfo(name="gemini-agent"),
+            agent=_TEST_GEMINI_AGENT,
+        )
+
+        request_body = self._get_create_run_body()
+        agent_run_config = self._agent_run_config(request_body)
+        assert (
+            agent_run_config["gemini_agent_config"]["gemini_agent"]
+            == _TEST_GEMINI_AGENT
+        )
+        assert "agent_engine" not in agent_run_config
+
+    def test_create_evaluation_run_agent_engine_does_not_set_gemini(self):
+        evals_module = evals.Evals(api_client_=self.mock_api_client)
+
+        evals_module.create_evaluation_run(
+            dataset=agentplatform_genai_types.EvaluationRunDataSource(
+                evaluation_set="projects/123/locations/us-central1/evaluationSets/789"
+            ),
+            metrics=[
+                agentplatform_genai_types.EvaluationRunMetric(
+                    metric="multi_turn_task_success_v1",
+                    metric_config=agentplatform_genai_types.UnifiedMetric(
+                        predefined_metric_spec=genai_types.PredefinedMetricSpec(
+                            metric_spec_name="multi_turn_task_success_v1",
+                        )
+                    ),
+                )
+            ],
+            dest="gs://test-bucket/output",
+            agent_info=agentplatform_genai_types.evals.AgentInfo(name="ae-agent"),
+            agent=_TEST_AGENT_ENGINE,
+        )
+
+        request_body = self._get_create_run_body()
+        agent_run_config = self._agent_run_config(request_body)
+        assert "gemini_agent_config" not in agent_run_config
+        assert agent_run_config["agent_engine"] == _TEST_AGENT_ENGINE