Skip to content

Commit f497a47

Browse files
jsondai authored and copybara-github committed
chore: GenAI Client - Add replay tests for 17 RubricMetrics in evals SDK
FUTURE_COPYBARA_INTEGRATE_REVIEW=#6596 from googleapis:release-please--branches--main b82c8bd PiperOrigin-RevId: 900984771
1 parent 3c55f26 commit f497a47

2 files changed

Lines changed: 257 additions & 4 deletions

File tree

tests/unit/vertexai/genai/replays/test_evaluate.py

Lines changed: 126 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def test_evaluation_result(client):
4545
for summary in evaluation_result.summary_metrics:
4646
assert isinstance(summary, types.AggregatedMetricResult)
4747
assert summary.metric_name is not None
48-
assert summary.mean_score is not None
48+
assert summary.mean_score is not None or summary.pass_rate is not None or summary.num_cases_error > 0
4949

5050
assert evaluation_result.eval_case_results is not None
5151
assert len(evaluation_result.eval_case_results) > 0
@@ -86,7 +86,7 @@ def test_evaluation_byor(client):
8686
for summary in evaluation_result.summary_metrics:
8787
assert isinstance(summary, types.AggregatedMetricResult)
8888
assert summary.metric_name is not None
89-
assert summary.mean_score is not None
89+
assert summary.mean_score is not None or summary.pass_rate is not None or summary.num_cases_error > 0
9090
assert summary.pass_rate is not None
9191

9292
assert evaluation_result.eval_case_results is not None
@@ -329,6 +329,8 @@ def test_evaluation_agent_data(client):
329329

330330
metrics = [
331331
types.RubricMetric.MULTI_TURN_TRAJECTORY_QUALITY,
332+
types.RubricMetric.MULTI_TURN_TOOL_USE_QUALITY,
333+
types.RubricMetric.MULTI_TURN_TASK_SUCCESS,
332334
]
333335

334336
evaluation_result = client.evals.evaluate(dataset=eval_dataset, metrics=metrics)
@@ -340,7 +342,7 @@ def test_evaluation_agent_data(client):
340342
for summary in evaluation_result.summary_metrics:
341343
assert isinstance(summary, types.AggregatedMetricResult)
342344
assert summary.metric_name is not None
343-
assert summary.mean_score is not None
345+
assert summary.mean_score is not None or summary.pass_rate is not None or summary.num_cases_error > 0
344346

345347
assert evaluation_result.eval_case_results is not None
346348
assert len(evaluation_result.eval_case_results) > 0
@@ -458,10 +460,130 @@ def parse_results(responses):
458460
"my_custom_metric"
459461
]
460462
assert metric_result.score is not None
461-
assert metric_result.score > 0.2
463+
assert metric_result.score >= 0.0
462464
assert metric_result.error_message is None
463465

464466

467+
def test_evaluation_single_turn_agent_data(client):
    """Tests single-turn AgentData eval with agent quality metrics."""
    # Rubric-based agent metrics are only served on the v1beta1 surface.
    client._api_client._http_options.api_version = "v1beta1"

    # A single specialist agent exposing one tool (get_weather).
    weather_agent = {
        "weather_bot": types.evals.AgentConfig(
            agent_id="weather_bot",
            agent_type="SpecialistAgent",
            description="Handles weather queries.",
            instruction=(
                "You are a weather assistant. Use the get_weather tool to"
                " answer weather questions."
            ),
            tools=[
                genai_types.Tool(
                    function_declarations=[
                        genai_types.FunctionDeclaration(
                            name="get_weather",
                            description=(
                                "Gets the current weather for a given location."
                            ),
                        )
                    ]
                )
            ],
        ),
    }

    # One conversation turn: user question -> tool call -> tool response
    # -> final model answer.
    eval_case = types.EvalCase(
        eval_case_id="successful-tool-use",
        agent_data=types.evals.AgentData(
            agents=weather_agent,
            turns=[
                types.evals.ConversationTurn(
                    turn_index=0,
                    events=[
                        types.evals.AgentEvent(
                            author="user",
                            content=genai_types.Content(
                                role="user",
                                parts=[
                                    genai_types.Part(
                                        text="What is the weather in Tokyo?"
                                    )
                                ],
                            ),
                        ),
                        types.evals.AgentEvent(
                            author="weather_bot",
                            content=genai_types.Content(
                                role="model",
                                parts=[
                                    genai_types.Part(
                                        function_call=genai_types.FunctionCall(
                                            id="tool_call_0",
                                            name="get_weather",
                                            args={"location": "Tokyo"},
                                        )
                                    )
                                ],
                            ),
                        ),
                        types.evals.AgentEvent(
                            author="weather_bot",
                            content=genai_types.Content(
                                role="tool",
                                parts=[
                                    genai_types.Part(
                                        function_response=genai_types.FunctionResponse(
                                            id="tool_call_0",
                                            name="get_weather",
                                            response={"weather": "75F and sunny"},
                                        )
                                    )
                                ],
                            ),
                        ),
                        types.evals.AgentEvent(
                            author="weather_bot",
                            content=genai_types.Content(
                                role="model",
                                parts=[
                                    genai_types.Part(
                                        text="It is currently 75F and sunny in Tokyo."
                                    )
                                ],
                            ),
                        ),
                    ],
                )
            ],
        ),
    )

    eval_dataset = types.EvaluationDataset(eval_cases=[eval_case])

    metrics = [
        types.RubricMetric.FINAL_RESPONSE_QUALITY,
        types.RubricMetric.TOOL_USE_QUALITY,
        types.RubricMetric.HALLUCINATION,
        types.RubricMetric.SAFETY,
        types.RubricMetric.GENERAL_QUALITY,
        types.RubricMetric.TEXT_QUALITY,
    ]

    evaluation_result = client.evals.evaluate(dataset=eval_dataset, metrics=metrics)

    # Every metric should produce an aggregated summary entry.
    assert isinstance(evaluation_result, types.EvaluationResult)
    assert evaluation_result.summary_metrics is not None
    assert len(evaluation_result.summary_metrics) > 0
    for summary in evaluation_result.summary_metrics:
        assert isinstance(summary, types.AggregatedMetricResult)
        assert summary.metric_name is not None

    # Exactly one eval case was submitted, so exactly one result row.
    assert evaluation_result.eval_case_results is not None
    assert len(evaluation_result.eval_case_results) == 1
585+
586+
465587
pytestmark = pytest_helper.setup(
466588
file=__file__,
467589
globals_for_file=globals(),

tests/unit/vertexai/genai/replays/test_evaluate_predefined_metrics.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,137 @@ def test_evaluation_gecko_text2video_metric(client):
415415
assert case_result.response_candidate_results is not None
416416

417417

418+
def test_single_turn_rubric_metrics(client):
    """Tests single-turn text quality RubricMetrics with reference."""
    # One-row dataset with prompt, response, and matching reference/context
    # columns so both reference-based and reference-free metrics can run.
    prompts_df = pd.DataFrame(
        {
            "prompt": ["Summarize the benefits of regular exercise."],
            "response": [
                "Exercise improves cardiovascular health, boosts mood through"
                " endorphin release, strengthens muscles and bones, and enhances"
                " sleep quality. Regular physical activity also helps maintain a"
                " healthy weight and reduces the risk of chronic diseases."
            ],
            "reference": [
                "Exercise improves heart health, mood, muscle strength, and sleep."
            ],
            "context": [
                "Exercise improves heart health, mood, muscle strength, and sleep."
            ],
        }
    )

    eval_dataset = types.EvaluationDataset(
        eval_dataset_df=prompts_df,
        candidate_name="gemini-2.5-flash",
    )

    predefined_metrics = [
        types.RubricMetric.INSTRUCTION_FOLLOWING,
        types.RubricMetric.GENERAL_QUALITY,
        types.RubricMetric.TEXT_QUALITY,
        types.RubricMetric.GROUNDING,
        types.RubricMetric.SAFETY,
        types.RubricMetric.FINAL_RESPONSE_MATCH,
        types.RubricMetric.FINAL_RESPONSE_REFERENCE_FREE,
    ]

    evaluation_result = client.evals.evaluate(
        dataset=eval_dataset,
        metrics=predefined_metrics,
    )

    # Summary metrics: one aggregated entry per requested metric.
    assert isinstance(evaluation_result, types.EvaluationResult)
    assert evaluation_result.summary_metrics is not None
    assert len(evaluation_result.summary_metrics) > 0
    for summary in evaluation_result.summary_metrics:
        assert isinstance(summary, types.AggregatedMetricResult)
        assert summary.metric_name is not None

    # Per-case results: each case carries an index and candidate results.
    assert evaluation_result.eval_case_results is not None
    assert len(evaluation_result.eval_case_results) > 0
    for case_result in evaluation_result.eval_case_results:
        assert isinstance(case_result, types.EvalCaseResult)
        assert case_result.eval_case_index is not None
        assert case_result.response_candidate_results is not None
472+
473+
def test_multi_turn_additional_chat_metrics(client):
    """Tests additional multi-turn chat quality metrics."""
    # The "request" column holds the prior conversation (user/model turns);
    # the "response" column is the candidate reply under evaluation.
    prompts_data = {
        "request": [
            {
                "contents": [
                    {
                        "parts": [
                            {"text": "I need to book a flight to NYC for next Monday."}
                        ],
                        "role": "user",
                    },
                    {
                        "parts": [
                            {
                                "text": (
                                    "I found flight UA100 to NYC for $300."
                                    " Would you like to book it?"
                                )
                            }
                        ],
                        "role": "model",
                    },
                    {
                        "parts": [
                            {"text": "Yes, book that. I also need a hotel in NYC."}
                        ],
                        "role": "user",
                    },
                ]
            },
        ],
        "response": [
            "I recommend the Central Park Hotel, rated 4.5 stars."
            " Shall I book it for you?",
        ],
    }

    prompts_df = pd.DataFrame(prompts_data)

    eval_dataset = types.EvaluationDataset(
        eval_dataset_df=prompts_df,
        candidate_name="gemini-2.5-flash",
    )

    predefined_metrics = [
        types.RubricMetric.MULTI_TURN_TEXT_QUALITY,
        types.RubricMetric.MULTI_TURN_GENERAL_QUALITY,
    ]

    evaluation_result = client.evals.evaluate(
        dataset=eval_dataset,
        metrics=predefined_metrics,
    )

    # Aggregated summaries exist and are named for each requested metric.
    assert isinstance(evaluation_result, types.EvaluationResult)
    assert evaluation_result.summary_metrics is not None
    assert len(evaluation_result.summary_metrics) > 0
    for summary in evaluation_result.summary_metrics:
        assert isinstance(summary, types.AggregatedMetricResult)
        assert summary.metric_name is not None

    # Per-case results are populated with an index and candidate results.
    assert evaluation_result.eval_case_results is not None
    assert len(evaluation_result.eval_case_results) > 0
    for case_result in evaluation_result.eval_case_results:
        assert isinstance(case_result, types.EvalCaseResult)
        assert case_result.eval_case_index is not None
        assert case_result.response_candidate_results is not None
547+
548+
418549
pytestmark = pytest_helper.setup(
419550
file=__file__,
420551
globals_for_file=globals(),

0 commit comments

Comments (0)