@@ -45,7 +45,7 @@ def test_evaluation_result(client):
4545 for summary in evaluation_result.summary_metrics:
4646 assert isinstance(summary, types.AggregatedMetricResult)
4747 assert summary.metric_name is not None
48- assert summary.mean_score is not None
48+ assert summary.mean_score is not None or summary.pass_rate is not None or summary.num_cases_error > 0
4949
5050 assert evaluation_result.eval_case_results is not None
5151 assert len(evaluation_result.eval_case_results) > 0
@@ -86,7 +86,7 @@ def test_evaluation_byor(client):
8686 for summary in evaluation_result.summary_metrics:
8787 assert isinstance(summary, types.AggregatedMetricResult)
8888 assert summary.metric_name is not None
89- assert summary.mean_score is not None
89+ assert summary.mean_score is not None or summary.pass_rate is not None or summary.num_cases_error > 0
9090 assert summary.pass_rate is not None
9191
9292 assert evaluation_result.eval_case_results is not None
@@ -329,6 +329,8 @@ def test_evaluation_agent_data(client):
329329
330330 metrics = [
331331 types.RubricMetric.MULTI_TURN_TRAJECTORY_QUALITY,
332+ types.RubricMetric.MULTI_TURN_TOOL_USE_QUALITY,
333+ types.RubricMetric.MULTI_TURN_TASK_SUCCESS,
332334 ]
333335
334336 evaluation_result = client.evals.evaluate(dataset=eval_dataset, metrics=metrics)
@@ -340,7 +342,7 @@ def test_evaluation_agent_data(client):
340342 for summary in evaluation_result.summary_metrics:
341343 assert isinstance(summary, types.AggregatedMetricResult)
342344 assert summary.metric_name is not None
343- assert summary.mean_score is not None
345+ assert summary.mean_score is not None or summary.pass_rate is not None or summary.num_cases_error > 0
344346
345347 assert evaluation_result.eval_case_results is not None
346348 assert len(evaluation_result.eval_case_results) > 0
@@ -458,10 +460,130 @@ def parse_results(responses):
458460 "my_custom_metric"
459461 ]
460462 assert metric_result.score is not None
461- assert metric_result.score > 0.2
463+ assert metric_result.score >= 0.0
462464 assert metric_result.error_message is None
463465
464466
def test_evaluation_single_turn_agent_data(client):
    """Tests single-turn AgentData eval with agent quality metrics."""
    # NOTE(review): presumably these agent-quality metrics require the
    # v1beta1 API surface — confirm against the evals service docs.
    client._api_client._http_options.api_version = "v1beta1"

    # Single specialist agent that answers weather questions through one tool.
    get_weather_tool = genai_types.Tool(
        function_declarations=[
            genai_types.FunctionDeclaration(
                name="get_weather",
                description=(
                    "Gets the current weather for a given location."
                ),
            )
        ]
    )
    agents = {
        "weather_bot": types.evals.AgentConfig(
            agent_id="weather_bot",
            agent_type="SpecialistAgent",
            description="Handles weather queries.",
            instruction=(
                "You are a weather assistant. Use the get_weather tool to"
                " answer weather questions."
            ),
            tools=[get_weather_tool],
        ),
    }

    # One conversation turn: user asks -> model issues a tool call ->
    # tool responds -> model produces the final grounded answer.
    user_query = types.evals.AgentEvent(
        author="user",
        content=genai_types.Content(
            role="user",
            parts=[genai_types.Part(text="What is the weather in Tokyo?")],
        ),
    )
    tool_call = types.evals.AgentEvent(
        author="weather_bot",
        content=genai_types.Content(
            role="model",
            parts=[
                genai_types.Part(
                    function_call=genai_types.FunctionCall(
                        id="tool_call_0",
                        name="get_weather",
                        args={"location": "Tokyo"},
                    )
                )
            ],
        ),
    )
    # The response id matches the call id so the events pair up.
    tool_response = types.evals.AgentEvent(
        author="weather_bot",
        content=genai_types.Content(
            role="tool",
            parts=[
                genai_types.Part(
                    function_response=genai_types.FunctionResponse(
                        id="tool_call_0",
                        name="get_weather",
                        response={"weather": "75F and sunny"},
                    )
                )
            ],
        ),
    )
    final_answer = types.evals.AgentEvent(
        author="weather_bot",
        content=genai_types.Content(
            role="model",
            parts=[
                genai_types.Part(
                    text="It is currently 75F and sunny in Tokyo."
                )
            ],
        ),
    )

    eval_case = types.EvalCase(
        eval_case_id="successful-tool-use",
        agent_data=types.evals.AgentData(
            agents=agents,
            turns=[
                types.evals.ConversationTurn(
                    turn_index=0,
                    events=[
                        user_query,
                        tool_call,
                        tool_response,
                        final_answer,
                    ],
                )
            ],
        ),
    )

    eval_dataset = types.EvaluationDataset(eval_cases=[eval_case])

    metrics = [
        types.RubricMetric.FINAL_RESPONSE_QUALITY,
        types.RubricMetric.TOOL_USE_QUALITY,
        types.RubricMetric.HALLUCINATION,
        types.RubricMetric.SAFETY,
        types.RubricMetric.GENERAL_QUALITY,
        types.RubricMetric.TEXT_QUALITY,
    ]

    evaluation_result = client.evals.evaluate(dataset=eval_dataset, metrics=metrics)

    # Only metric_name is asserted per summary: scores/pass rates may be
    # absent for some rubric metrics on this single case.
    assert isinstance(evaluation_result, types.EvaluationResult)
    assert evaluation_result.summary_metrics is not None
    assert len(evaluation_result.summary_metrics) > 0
    for summary in evaluation_result.summary_metrics:
        assert isinstance(summary, types.AggregatedMetricResult)
        assert summary.metric_name is not None

    assert evaluation_result.eval_case_results is not None
    assert len(evaluation_result.eval_case_results) == 1
586+
465587 pytestmark = pytest_helper.setup(
466588 file=__file__,
467589 globals_for_file=globals(),
0 commit comments