
Commit a86278f

jsondai authored and copybara-github committed
chore: GenAI Client - Add replay tests for 17 RubricMetrics in evals SDK
FUTURE_COPYBARA_INTEGRATE_REVIEW=#6596 from googleapis:release-please--branches--main b82c8bd

PiperOrigin-RevId: 900984771
1 parent 3c55f26 commit a86278f

2 files changed: 420 additions & 1 deletion

File tree

tests/unit/vertexai/genai/replays/test_evaluate.py

Lines changed: 289 additions & 1 deletion
@@ -329,6 +329,8 @@ def test_evaluation_agent_data(client):
 
     metrics = [
         types.RubricMetric.MULTI_TURN_TRAJECTORY_QUALITY,
+        types.RubricMetric.MULTI_TURN_TOOL_USE_QUALITY,
+        types.RubricMetric.MULTI_TURN_TASK_SUCCESS,
     ]
 
     evaluation_result = client.evals.evaluate(dataset=eval_dataset, metrics=metrics)
@@ -458,10 +460,296 @@ def parse_results(responses):
             "my_custom_metric"
         ]
         assert metric_result.score is not None
-        assert metric_result.score > 0.2
+        assert metric_result.score >= 0.0
         assert metric_result.error_message is None
 
 
+def test_evaluation_agent_data_additional_metrics(client):
+    """Tests AgentData eval with MULTI_TURN_TOOL_USE_QUALITY and MULTI_TURN_TASK_SUCCESS."""
+    client._api_client._http_options.api_version = "v1beta1"
+
+    agent_data = types.evals.AgentData(
+        agents={
+            "coordinator": types.evals.AgentConfig(
+                agent_id="coordinator",
+                agent_type="RouterAgent",
+                description="Root agent that delegates to specialists.",
+                instruction=(
+                    "You are a travel coordinator. Delegate flight tasks to"
+                    " 'flight_bot' and hotel tasks to 'hotel_bot'."
+                ),
+                sub_agents=["flight_bot", "hotel_bot"],
+                tools=[
+                    genai_types.Tool(
+                        function_declarations=[
+                            genai_types.FunctionDeclaration(
+                                name="delegate_to_agent",
+                                description="Delegates conversation to a sub-agent.",
+                            )
+                        ]
+                    )
+                ],
+            ),
+            "flight_bot": types.evals.AgentConfig(
+                agent_id="flight_bot",
+                agent_type="SpecialistAgent",
+                description="Handles flight searches.",
+                instruction="Search for flights using the available tools.",
+                tools=[
+                    genai_types.Tool(
+                        function_declarations=[
+                            genai_types.FunctionDeclaration(
+                                name="search_flights",
+                                description=(
+                                    "Finds flights based on origin and destination."
+                                ),
+                            )
+                        ]
+                    )
+                ],
+            ),
+            "hotel_bot": types.evals.AgentConfig(
+                agent_id="hotel_bot",
+                agent_type="SpecialistAgent",
+                description="Handles hotel searches.",
+                instruction="Search for hotels using the available tools.",
+                tools=[
+                    genai_types.Tool(
+                        function_declarations=[
+                            genai_types.FunctionDeclaration(
+                                name="search_hotels",
+                                description="Finds hotels in a given location.",
+                            )
+                        ]
+                    )
+                ],
+            ),
+        },
+        turns=[
+            types.evals.ConversationTurn(
+                turn_index=0,
+                events=[
+                    types.evals.AgentEvent(
+                        author="user",
+                        content=genai_types.Content(
+                            role="user",
+                            parts=[
+                                genai_types.Part(
+                                    text=(
+                                        "I need to book a flight to NYC for next"
+                                        " Monday."
+                                    )
+                                )
+                            ],
+                        ),
+                    ),
+                    types.evals.AgentEvent(
+                        author="coordinator",
+                        content=genai_types.Content(
+                            role="model",
+                            parts=[
+                                genai_types.Part(
+                                    function_call=genai_types.FunctionCall(
+                                        name="delegate_to_agent",
+                                        args={"agent_name": "flight_bot"},
+                                    )
+                                )
+                            ],
+                        ),
+                    ),
+                    types.evals.AgentEvent(
+                        author="flight_bot",
+                        content=genai_types.Content(
+                            role="model",
+                            parts=[
+                                genai_types.Part(
+                                    function_call=genai_types.FunctionCall(
+                                        name="search_flights",
+                                        args={
+                                            "destination": "NYC",
+                                            "date": "next Monday",
+                                        },
+                                    )
+                                )
+                            ],
+                        ),
+                    ),
+                    types.evals.AgentEvent(
+                        author="flight_bot",
+                        content=genai_types.Content(
+                            role="tool",
+                            parts=[
+                                genai_types.Part(
+                                    function_response=genai_types.FunctionResponse(
+                                        name="search_flights",
+                                        response={
+                                            "flights": [
+                                                {"id": "UA100", "price": "$300"}
+                                            ]
+                                        },
+                                    )
+                                )
+                            ],
+                        ),
+                    ),
+                    types.evals.AgentEvent(
+                        author="flight_bot",
+                        content=genai_types.Content(
+                            role="model",
+                            parts=[
+                                genai_types.Part(
+                                    text="I found flight UA100 to NYC for $300."
+                                )
+                            ],
+                        ),
+                    ),
+                ],
+            ),
+        ],
+    )
+
+    eval_case = types.EvalCase(agent_data=agent_data)
+    eval_dataset = types.EvaluationDataset(eval_cases=[eval_case])
+
+    metrics = [
+        types.RubricMetric.MULTI_TURN_TOOL_USE_QUALITY,
+        types.RubricMetric.MULTI_TURN_TASK_SUCCESS,
+    ]
+
+    evaluation_result = client.evals.evaluate(dataset=eval_dataset, metrics=metrics)
+
+    assert isinstance(evaluation_result, types.EvaluationResult)
+    assert evaluation_result.summary_metrics is not None
+    assert len(evaluation_result.summary_metrics) > 0
+    for summary in evaluation_result.summary_metrics:
+        assert isinstance(summary, types.AggregatedMetricResult)
+        assert summary.metric_name is not None
+        assert summary.mean_score is not None
+
+    assert evaluation_result.eval_case_results is not None
+    assert len(evaluation_result.eval_case_results) > 0
+
+
+def test_evaluation_single_turn_agent_data(client):
+    """Tests single-turn AgentData eval with agent quality metrics."""
+    client._api_client._http_options.api_version = "v1beta1"
+
+    weather_agent = {
+        "weather_bot": types.evals.AgentConfig(
+            agent_id="weather_bot",
+            agent_type="SpecialistAgent",
+            description="Handles weather queries.",
+            instruction=(
+                "You are a weather assistant. Use the get_weather tool to"
+                " answer weather questions."
+            ),
+            tools=[
+                genai_types.Tool(
+                    function_declarations=[
+                        genai_types.FunctionDeclaration(
+                            name="get_weather",
+                            description=(
+                                "Gets the current weather for a given location."
+                            ),
+                        )
+                    ]
+                )
+            ],
+        ),
+    }
+
+    eval_case = types.EvalCase(
+        eval_case_id="successful-tool-use",
+        agent_data=types.evals.AgentData(
+            agents=weather_agent,
+            turns=[
+                types.evals.ConversationTurn(
+                    turn_index=0,
+                    events=[
+                        types.evals.AgentEvent(
+                            author="user",
+                            content=genai_types.Content(
+                                role="user",
+                                parts=[
+                                    genai_types.Part(
+                                        text="What is the weather in Tokyo?"
+                                    )
+                                ],
+                            ),
+                        ),
+                        types.evals.AgentEvent(
+                            author="weather_bot",
+                            content=genai_types.Content(
+                                role="model",
+                                parts=[
+                                    genai_types.Part(
+                                        function_call=genai_types.FunctionCall(
+                                            id="tool_call_0",
+                                            name="get_weather",
+                                            args={"location": "Tokyo"},
+                                        )
+                                    )
+                                ],
+                            ),
+                        ),
+                        types.evals.AgentEvent(
+                            author="weather_bot",
+                            content=genai_types.Content(
+                                role="tool",
+                                parts=[
+                                    genai_types.Part(
+                                        function_response=genai_types.FunctionResponse(
+                                            id="tool_call_0",
+                                            name="get_weather",
+                                            response={"weather": "75F and sunny"},
+                                        )
+                                    )
+                                ],
+                            ),
+                        ),
+                        types.evals.AgentEvent(
+                            author="weather_bot",
+                            content=genai_types.Content(
+                                role="model",
+                                parts=[
+                                    genai_types.Part(
+                                        text=(
+                                            "It is currently 75F and sunny in" " Tokyo."
+                                        )
+                                    )
+                                ],
+                            ),
+                        ),
+                    ],
+                )
+            ],
+        ),
+    )
+
+    eval_dataset = types.EvaluationDataset(eval_cases=[eval_case])
+
+    metrics = [
+        types.RubricMetric.FINAL_RESPONSE_QUALITY,
+        types.RubricMetric.TOOL_USE_QUALITY,
+        types.RubricMetric.HALLUCINATION,
+        types.RubricMetric.SAFETY,
+        types.RubricMetric.GENERAL_QUALITY,
+        types.RubricMetric.TEXT_QUALITY,
+    ]
+
+    evaluation_result = client.evals.evaluate(dataset=eval_dataset, metrics=metrics)
+
+    assert isinstance(evaluation_result, types.EvaluationResult)
+    assert evaluation_result.summary_metrics is not None
+    assert len(evaluation_result.summary_metrics) > 0
+    for summary in evaluation_result.summary_metrics:
+        assert isinstance(summary, types.AggregatedMetricResult)
+        assert summary.metric_name is not None
+
+    assert evaluation_result.eval_case_results is not None
+    assert len(evaluation_result.eval_case_results) == 1
+
+
 pytestmark = pytest_helper.setup(
     file=__file__,
     globals_for_file=globals(),
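
For readers who want to exercise these metrics outside the replay harness, below is a minimal standalone sketch of the same call pattern. The import paths, the vertexai.Client(...) constructor, and the project/location values are assumptions not taken from this commit; the types.evals.* payload, the RubricMetric constants, and client.evals.evaluate(...) mirror the test code above.

# Minimal sketch, assuming `vertexai.Client` and these import paths are available
# in the installed SDK; everything else follows the patterns in the tests above.
import vertexai
from vertexai import types
from google.genai import types as genai_types

client = vertexai.Client(project="my-project", location="us-central1")  # assumed constructor/values
client._api_client._http_options.api_version = "v1beta1"  # same override used in the tests

# A single-agent, single-turn conversation in the AgentData shape used above.
agent_data = types.evals.AgentData(
    agents={
        "weather_bot": types.evals.AgentConfig(
            agent_id="weather_bot",
            agent_type="SpecialistAgent",
            description="Handles weather queries.",
            instruction="Answer weather questions concisely.",
        ),
    },
    turns=[
        types.evals.ConversationTurn(
            turn_index=0,
            events=[
                types.evals.AgentEvent(
                    author="user",
                    content=genai_types.Content(
                        role="user",
                        parts=[genai_types.Part(text="What is the weather in Tokyo?")],
                    ),
                ),
                types.evals.AgentEvent(
                    author="weather_bot",
                    content=genai_types.Content(
                        role="model",
                        parts=[genai_types.Part(text="It is 75F and sunny in Tokyo.")],
                    ),
                ),
            ],
        )
    ],
)

eval_dataset = types.EvaluationDataset(
    eval_cases=[types.EvalCase(agent_data=agent_data)]
)

# Any of the RubricMetric constants covered by the new replay tests can be passed here.
metrics = [
    types.RubricMetric.FINAL_RESPONSE_QUALITY,
    types.RubricMetric.MULTI_TURN_TASK_SUCCESS,
]

evaluation_result = client.evals.evaluate(dataset=eval_dataset, metrics=metrics)

# Summary metrics carry one aggregated result per requested metric.
for summary in evaluation_result.summary_metrics or []:
    print(summary.metric_name, summary.mean_score)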
