@@ -329,6 +329,8 @@ def test_evaluation_agent_data(client):
329329
330330 metrics = [
331331 types .RubricMetric .MULTI_TURN_TRAJECTORY_QUALITY ,
332+ types .RubricMetric .MULTI_TURN_TOOL_USE_QUALITY ,
333+ types .RubricMetric .MULTI_TURN_TASK_SUCCESS ,
332334 ]
333335
334336 evaluation_result = client .evals .evaluate (dataset = eval_dataset , metrics = metrics )
@@ -458,10 +460,296 @@ def parse_results(responses):
458460 "my_custom_metric"
459461 ]
460462 assert metric_result .score is not None
461- assert metric_result .score > 0.2
463+ assert metric_result .score >= 0.0
462464 assert metric_result .error_message is None
463465
464466
def test_evaluation_agent_data_additional_metrics(client):
  """Tests AgentData eval with MULTI_TURN_TOOL_USE_QUALITY and MULTI_TURN_TASK_SUCCESS."""
  client._api_client._http_options.api_version = "v1beta1"

  # Agent topology: a router ("coordinator") that can delegate to two
  # specialist sub-agents, each exposing a single search tool.
  coordinator = types.evals.AgentConfig(
      agent_id="coordinator",
      agent_type="RouterAgent",
      description="Root agent that delegates to specialists.",
      instruction=(
          "You are a travel coordinator. Delegate flight tasks to"
          " 'flight_bot' and hotel tasks to 'hotel_bot'."
      ),
      sub_agents=["flight_bot", "hotel_bot"],
      tools=[
          genai_types.Tool(
              function_declarations=[
                  genai_types.FunctionDeclaration(
                      name="delegate_to_agent",
                      description="Delegates conversation to a sub-agent.",
                  )
              ]
          )
      ],
  )
  flight_bot = types.evals.AgentConfig(
      agent_id="flight_bot",
      agent_type="SpecialistAgent",
      description="Handles flight searches.",
      instruction="Search for flights using the available tools.",
      tools=[
          genai_types.Tool(
              function_declarations=[
                  genai_types.FunctionDeclaration(
                      name="search_flights",
                      description=(
                          "Finds flights based on origin and destination."
                      ),
                  )
              ]
          )
      ],
  )
  hotel_bot = types.evals.AgentConfig(
      agent_id="hotel_bot",
      agent_type="SpecialistAgent",
      description="Handles hotel searches.",
      instruction="Search for hotels using the available tools.",
      tools=[
          genai_types.Tool(
              function_declarations=[
                  genai_types.FunctionDeclaration(
                      name="search_hotels",
                      description="Finds hotels in a given location.",
                  )
              ]
          )
      ],
  )

  # One conversation turn: user request -> coordinator delegates ->
  # flight_bot calls its tool -> tool responds -> flight_bot answers.
  turn_events = [
      types.evals.AgentEvent(
          author="user",
          content=genai_types.Content(
              role="user",
              parts=[
                  genai_types.Part(
                      text=(
                          "I need to book a flight to NYC for next"
                          " Monday."
                      )
                  )
              ],
          ),
      ),
      types.evals.AgentEvent(
          author="coordinator",
          content=genai_types.Content(
              role="model",
              parts=[
                  genai_types.Part(
                      function_call=genai_types.FunctionCall(
                          name="delegate_to_agent",
                          args={"agent_name": "flight_bot"},
                      )
                  )
              ],
          ),
      ),
      types.evals.AgentEvent(
          author="flight_bot",
          content=genai_types.Content(
              role="model",
              parts=[
                  genai_types.Part(
                      function_call=genai_types.FunctionCall(
                          name="search_flights",
                          args={
                              "destination": "NYC",
                              "date": "next Monday",
                          },
                      )
                  )
              ],
          ),
      ),
      types.evals.AgentEvent(
          author="flight_bot",
          content=genai_types.Content(
              role="tool",
              parts=[
                  genai_types.Part(
                      function_response=genai_types.FunctionResponse(
                          name="search_flights",
                          response={
                              "flights": [
                                  {"id": "UA100", "price": "$300"}
                              ]
                          },
                      )
                  )
              ],
          ),
      ),
      types.evals.AgentEvent(
          author="flight_bot",
          content=genai_types.Content(
              role="model",
              parts=[
                  genai_types.Part(
                      text="I found flight UA100 to NYC for $300."
                  )
              ],
          ),
      ),
  ]

  agent_data = types.evals.AgentData(
      agents={
          "coordinator": coordinator,
          "flight_bot": flight_bot,
          "hotel_bot": hotel_bot,
      },
      turns=[
          types.evals.ConversationTurn(turn_index=0, events=turn_events)
      ],
  )

  eval_dataset = types.EvaluationDataset(
      eval_cases=[types.EvalCase(agent_data=agent_data)]
  )
  metrics = [
      types.RubricMetric.MULTI_TURN_TOOL_USE_QUALITY,
      types.RubricMetric.MULTI_TURN_TASK_SUCCESS,
  ]

  evaluation_result = client.evals.evaluate(dataset=eval_dataset, metrics=metrics)

  # Every summary metric must carry a name and an aggregated score.
  assert isinstance(evaluation_result, types.EvaluationResult)
  assert evaluation_result.summary_metrics is not None
  assert len(evaluation_result.summary_metrics) > 0
  for summary in evaluation_result.summary_metrics:
    assert isinstance(summary, types.AggregatedMetricResult)
    assert summary.metric_name is not None
    assert summary.mean_score is not None

  # Per-case results must be populated as well.
  assert evaluation_result.eval_case_results is not None
  assert len(evaluation_result.eval_case_results) > 0

632+
def test_evaluation_single_turn_agent_data(client):
  """Tests single-turn AgentData eval with agent quality metrics."""
  client._api_client._http_options.api_version = "v1beta1"

  # Single specialist agent with one weather-lookup tool.
  weather_agent = {
      "weather_bot": types.evals.AgentConfig(
          agent_id="weather_bot",
          agent_type="SpecialistAgent",
          description="Handles weather queries.",
          instruction=(
              "You are a weather assistant. Use the get_weather tool to"
              " answer weather questions."
          ),
          tools=[
              genai_types.Tool(
                  function_declarations=[
                      genai_types.FunctionDeclaration(
                          name="get_weather",
                          description=(
                              "Gets the current weather for a given location."
                          ),
                      )
                  ]
              )
          ],
      ),
  }

  # One turn: user question -> tool call -> tool response -> final answer.
  # The function_call/function_response pair share id "tool_call_0".
  turn_events = [
      types.evals.AgentEvent(
          author="user",
          content=genai_types.Content(
              role="user",
              parts=[
                  genai_types.Part(text="What is the weather in Tokyo?")
              ],
          ),
      ),
      types.evals.AgentEvent(
          author="weather_bot",
          content=genai_types.Content(
              role="model",
              parts=[
                  genai_types.Part(
                      function_call=genai_types.FunctionCall(
                          id="tool_call_0",
                          name="get_weather",
                          args={"location": "Tokyo"},
                      )
                  )
              ],
          ),
      ),
      types.evals.AgentEvent(
          author="weather_bot",
          content=genai_types.Content(
              role="tool",
              parts=[
                  genai_types.Part(
                      function_response=genai_types.FunctionResponse(
                          id="tool_call_0",
                          name="get_weather",
                          response={"weather": "75F and sunny"},
                      )
                  )
              ],
          ),
      ),
      types.evals.AgentEvent(
          author="weather_bot",
          content=genai_types.Content(
              role="model",
              parts=[
                  genai_types.Part(
                      text="It is currently 75F and sunny in Tokyo."
                  )
              ],
          ),
      ),
  ]

  eval_case = types.EvalCase(
      eval_case_id="successful-tool-use",
      agent_data=types.evals.AgentData(
          agents=weather_agent,
          turns=[
              types.evals.ConversationTurn(turn_index=0, events=turn_events)
          ],
      ),
  )

  eval_dataset = types.EvaluationDataset(eval_cases=[eval_case])

  metrics = [
      types.RubricMetric.FINAL_RESPONSE_QUALITY,
      types.RubricMetric.TOOL_USE_QUALITY,
      types.RubricMetric.HALLUCINATION,
      types.RubricMetric.SAFETY,
      types.RubricMetric.GENERAL_QUALITY,
      types.RubricMetric.TEXT_QUALITY,
  ]

  evaluation_result = client.evals.evaluate(dataset=eval_dataset, metrics=metrics)

  # Summary metrics must exist and each must carry a metric name.
  assert isinstance(evaluation_result, types.EvaluationResult)
  assert evaluation_result.summary_metrics is not None
  assert len(evaluation_result.summary_metrics) > 0
  for summary in evaluation_result.summary_metrics:
    assert isinstance(summary, types.AggregatedMetricResult)
    assert summary.metric_name is not None

  # Exactly one eval case was submitted, so exactly one result comes back.
  assert evaluation_result.eval_case_results is not None
  assert len(evaluation_result.eval_case_results) == 1

465753pytestmark = pytest_helper .setup (
466754 file = __file__ ,
467755 globals_for_file = globals (),
0 commit comments