feat: GenAI Client(evals) - update SDK type definitions for Agent Data

jsondai · copybara-github · commit 6ac28a546218 · 2026-02-20T15:14:25.000-08:00
PiperOrigin-RevId: 873093421
diff --git a/tests/unit/vertexai/genai/replays/test_evaluate_instances.py b/tests/unit/vertexai/genai/replays/test_evaluate_instances.py
@@ -81,13 +81,19 @@ def test_rouge_metric(client):
 
 def test_pointwise_metric(client):
     """Tests the _evaluate_instances method with PointwiseMetricInput."""
-    instance_dict = {"prompt": "What is the capital of France?", "response": "Paris"}
+    instance_dict = {
+        "prompt": "What is the capital of France?",
+        "response": "Paris",
+    }
     json_instance = json.dumps(instance_dict)
 
     test_input = types.PointwiseMetricInput(
         instance=types.PointwiseMetricInstance(json_instance=json_instance),
         metric_spec=genai_types.PointwiseMetricSpec(
-            metric_prompt_template="Evaluate if the response '{response}' correctly answers the prompt '{prompt}'."
+            metric_prompt_template=(
+                "Evaluate if the response '{response}' correctly answers the"
+                " prompt '{prompt}'."
+            )
         ),
     )
     response = client.evals.evaluate_instances(
@@ -99,82 +105,37 @@ def test_pointwise_metric(client):
     assert response.pointwise_metric_result.score is not None
 
 
-def test_pointwise_metric_with_agent_data(client):
-    """Tests the _evaluate_instances method with PointwiseMetricInput and agent_data."""
-    instance_dict = {"prompt": "What is the capital of France?", "response": "Paris"}
-    json_instance = json.dumps(instance_dict)
-    agent_data = types.evals.AgentData(
-        agent_config=types.evals.AgentConfig(
-            tools=types.evals.Tools(
-                tool=[
-                    genai_types.Tool(
-                        function_declarations=[
-                            genai_types.FunctionDeclaration(name="search")
-                        ]
-                    )
-                ]
-            ),
-            developer_instruction=types.evals.InstanceData(text="instruction"),
-        ),
-        events=types.evals.Events(
-            event=[genai_types.Content(parts=[genai_types.Part(text="hello")])]
-        ),
-    )
-    instance = types.EvaluationInstance(
-        prompt=types.evals.InstanceData(text="What is the capital of France?"),
-        response=types.evals.InstanceData(text="Paris"),
-        agent_data=agent_data,
-    )
-
-    test_input = types.PointwiseMetricInput(
-        instance=types.PointwiseMetricInstance(json_instance=json_instance),
-        metric_spec=genai_types.PointwiseMetricSpec(
-            metric_prompt_template="Evaluate if the response '{response}' correctly answers the prompt '{prompt}'."
-        ),
-    )
-    response = client.evals.evaluate_instances(
-        metric_config=types._EvaluateInstancesRequestParameters(
-            pointwise_metric_input=test_input,
-            instance=instance,
-        )
-    )
-    assert response.pointwise_metric_result is not None
-    assert response.pointwise_metric_result.score is not None
-
-
-def test_predefined_metric_with_agent_data(client):
-    """Tests the _evaluate_instances method with predefined metric and agent_data."""
-    agent_data = types.evals.AgentData(
-        agent_config=types.evals.AgentConfig(
-            tools=types.evals.Tools(
-                tool=[
-                    genai_types.Tool(
-                        function_declarations=[
-                            genai_types.FunctionDeclaration(name="search")
-                        ]
-                    )
-                ]
-            ),
-            developer_instruction=types.evals.InstanceData(text="instruction"),
-        ),
-        events=types.evals.Events(
-            event=[genai_types.Content(parts=[genai_types.Part(text="hello")])]
-        ),
-    )
-    instance = types.EvaluationInstance(
-        prompt=types.evals.InstanceData(text="What is the capital of France?"),
-        response=types.evals.InstanceData(text="Paris"),
-        reference=types.evals.InstanceData(text="Paris"),
-        agent_data=agent_data,
-    )
-
-    response = client.evals.evaluate_instances(
-        metric_config=types._EvaluateInstancesRequestParameters(
-            metrics=[types.Metric(name="general_quality_v1")],
-            instance=instance,
-        )
-    )
-    assert response.metric_results[0].score is not None
+# def test_predefined_metric_with_agent_data(client):
+#     """Tests the _evaluate_instances method with predefined metric and agent_data."""
+#     agent_data = types.evals.AgentData(
+#         agent_config=types.evals.AgentConfig(
+#             tools=[
+#                 genai_types.Tool(
+#                     function_declarations=[
+#                         genai_types.FunctionDeclaration(name="search")
+#                     ]
+#                 )
+#             ],
+#             developer_instruction=types.evals.InstanceData(text="instruction"),
+#         ),
+#         events=types.evals.Events(
+#             event=[genai_types.Content(parts=[genai_types.Part(text="hello")])]
+#         ),
+#     )
+#     instance = types.EvaluationInstance(
+#         prompt=types.evals.InstanceData(text="What is the capital of France?"),
+#         response=types.evals.InstanceData(text="Paris"),
+#         reference=types.evals.InstanceData(text="Paris"),
+#         agent_data=agent_data,
+#     )
+
+#     response = client.evals.evaluate_instances(
+#         metric_config=types._EvaluateInstancesRequestParameters(
+#             metrics=[types.Metric(name="general_quality_v1")],
+#             instance=instance,
+#         )
+#     )
+#     assert response.metric_results[0].score is not None
 
 
 def test_pairwise_metric_with_autorater(client):
@@ -189,7 +150,10 @@ def test_pairwise_metric_with_autorater(client):
     test_input = types.PairwiseMetricInput(
         instance=types.PairwiseMetricInstance(json_instance=json_instance),
         metric_spec=genai_types.PairwiseMetricSpec(
-            metric_prompt_template="Which response is a better summary? Baseline: '{baseline_response}' or Candidate: '{candidate_response}'"
+            metric_prompt_template=(
+                "Which response is a better summary? Baseline:"
+                " '{baseline_response}' or Candidate: '{candidate_response}'"
+            )
         ),
     )
     autorater_config = genai_types.AutoraterConfig(sampling_count=2)
@@ -240,7 +204,10 @@ def test_inference_with_prompt_template(client):
 
 def test_run_inference_with_agent(client):
     test_df = pd.DataFrame(
-        {"prompt": ["agent prompt"], "session_inputs": ['{"user_id": "user_123"}']}
+        {
+            "prompt": ["agent prompt"],
+            "session_inputs": ['{"user_id": "user_123"}'],
+        }
     )
     inference_result = client.evals.run_inference(
         agent="projects/977012026409/locations/us-central1/reasoningEngines/7188347537655332864",
diff --git a/tests/unit/vertexai/genai/test_evals.py b/tests/unit/vertexai/genai/test_evals.py
@@ -4056,7 +4056,7 @@ def test_eval_case_to_agent_data(self):
         )
 
         assert agent_data.agent_config.developer_instruction.text == "instruction1"
-        assert agent_data.agent_config.tools.tool == [tool]
+        assert agent_data.agent_config.legacy_tools.tool == [tool]
         assert agent_data.events.event[0].parts[0].text == "intermediate event"
 
     def test_eval_case_to_agent_data_events_only(self):
@@ -4164,7 +4164,7 @@ def test_eval_case_to_agent_data_agent_info_empty_tools(self):
         )
 
         assert agent_data.agent_config.developer_instruction.text == "instruction1"
-        assert not agent_data.agent_config.tools.tool
+        assert not agent_data.agent_config.legacy_tools.tool
 
     def test_eval_case_to_agent_data_agent_info_empty(self):
         intermediate_events = [
diff --git a/vertexai/_genai/_evals_metric_handlers.py b/vertexai/_genai/_evals_metric_handlers.py
@@ -899,7 +899,7 @@ def _eval_case_to_agent_data(
 
             if tools or developer_instruction:
                 agent_config = types.evals.AgentConfig(
-                    tools=tools,
+                    legacy_tools=tools,
                     developer_instruction=developer_instruction,
                 )
 
diff --git a/vertexai/_genai/types/common.py b/vertexai/_genai/types/common.py
@@ -1492,6 +1492,10 @@ class EvalCase(_common.BaseModel):
         default=None,
         description="""This field is experimental and may change in future versions. The agent info of the agent under evaluation. This can be extended for multi-agent evaluation.""",
     )
+    agent_data: Optional[evals_types.AgentData] = Field(
+        default=None,
+        description="""This field is experimental and may change in future versions. The agent data of the agent under evaluation.""",
+    )
     # Allow extra fields to support custom metric prompts and stay backward compatible.
     model_config = ConfigDict(frozen=True, extra="allow")
 
@@ -1526,6 +1530,9 @@ class EvalCaseDict(TypedDict, total=False):
     agent_info: Optional[evals_types.AgentInfo]
     """This field is experimental and may change in future versions. The agent info of the agent under evaluation. This can be extended for multi-agent evaluation."""
 
+    agent_data: Optional[evals_types.AgentData]
+    """This field is experimental and may change in future versions. The agent data of the agent under evaluation."""
+
 
 EvalCaseOrDict = Union[EvalCase, EvalCaseDict]
 
diff --git a/vertexai/_genai/types/evals.py b/vertexai/_genai/types/evals.py

Original file line number	Diff line number	Diff line change
`@@ -4056,7 +4056,7 @@ def test_eval_case_to_agent_data(self):`
`4056`	`4056`	`)`
`4057`	`4057`
`4058`	`4058`	`assert agent_data.agent_config.developer_instruction.text == "instruction1"`
`4059`		`- assert agent_data.agent_config.tools.tool == [tool]`
	`4059`	`+ assert agent_data.agent_config.legacy_tools.tool == [tool]`
`4060`	`4060`	`assert agent_data.events.event[0].parts[0].text == "intermediate event"`
`4061`	`4061`
`4062`	`4062`	`def test_eval_case_to_agent_data_events_only(self):`
`@@ -4164,7 +4164,7 @@ def test_eval_case_to_agent_data_agent_info_empty_tools(self):`
`4164`	`4164`	`)`
`4165`	`4165`
`4166`	`4166`	`assert agent_data.agent_config.developer_instruction.text == "instruction1"`
`4167`		`- assert not agent_data.agent_config.tools.tool`
	`4167`	`+ assert not agent_data.agent_config.legacy_tools.tool`
`4168`	`4168`
`4169`	`4169`	`def test_eval_case_to_agent_data_agent_info_empty(self):`
`4170`	`4170`	`intermediate_events = [`
Original file line number	Diff line number	Diff line change
`@@ -899,7 +899,7 @@ def _eval_case_to_agent_data(`
`899`	`899`
`900`	`900`	`if tools or developer_instruction:`
`901`	`901`	`agent_config = types.evals.AgentConfig(`
`902`		`- tools=tools,`
	`902`	`+ legacy_tools=tools,`
`903`	`903`	`developer_instruction=developer_instruction,`
`904`	`904`	`)`
`905`	`905`