
Commit 5dffb22

jsondai authored and copybara-github committed
chore: GenAI Client - Add replay tests for 17 RubricMetrics in evals SDK
FUTURE_COPYBARA_INTEGRATE_REVIEW=#6596 from googleapis:release-please--branches--main b82c8bd PiperOrigin-RevId: 900984771
1 parent e5e6346 commit 5dffb22

2 files changed

Lines changed: 186 additions & 3 deletions


tests/unit/vertexai/genai/replays/test_evaluate.py

Lines changed: 123 additions & 1 deletion
@@ -329,6 +329,8 @@ def test_evaluation_agent_data(client):
 
     metrics = [
         types.RubricMetric.MULTI_TURN_TRAJECTORY_QUALITY,
+        types.RubricMetric.MULTI_TURN_TOOL_USE_QUALITY,
+        types.RubricMetric.MULTI_TURN_TASK_SUCCESS,
     ]
 
     evaluation_result = client.evals.evaluate(dataset=eval_dataset, metrics=metrics)
@@ -458,10 +460,130 @@ def parse_results(responses):
             "my_custom_metric"
         ]
         assert metric_result.score is not None
-        assert metric_result.score > 0.2
+        assert metric_result.score >= 0.0
         assert metric_result.error_message is None
 
 
+def test_evaluation_single_turn_agent_data(client):
+    """Tests single-turn AgentData eval with agent quality metrics."""
+    client._api_client._http_options.api_version = "v1beta1"
+
+    weather_agent = {
+        "weather_bot": types.evals.AgentConfig(
+            agent_id="weather_bot",
+            agent_type="SpecialistAgent",
+            description="Handles weather queries.",
+            instruction=(
+                "You are a weather assistant. Use the get_weather tool to"
+                " answer weather questions."
+            ),
+            tools=[
+                genai_types.Tool(
+                    function_declarations=[
+                        genai_types.FunctionDeclaration(
+                            name="get_weather",
+                            description=(
+                                "Gets the current weather for a given location."
+                            ),
+                        )
+                    ]
+                )
+            ],
+        ),
+    }
+
+    eval_case = types.EvalCase(
+        eval_case_id="successful-tool-use",
+        agent_data=types.evals.AgentData(
+            agents=weather_agent,
+            turns=[
+                types.evals.ConversationTurn(
+                    turn_index=0,
+                    events=[
+                        types.evals.AgentEvent(
+                            author="user",
+                            content=genai_types.Content(
+                                role="user",
+                                parts=[
+                                    genai_types.Part(
+                                        text="What is the weather in Tokyo?"
+                                    )
+                                ],
+                            ),
+                        ),
+                        types.evals.AgentEvent(
+                            author="weather_bot",
+                            content=genai_types.Content(
+                                role="model",
+                                parts=[
+                                    genai_types.Part(
+                                        function_call=genai_types.FunctionCall(
+                                            id="tool_call_0",
+                                            name="get_weather",
+                                            args={"location": "Tokyo"},
+                                        )
+                                    )
+                                ],
+                            ),
+                        ),
+                        types.evals.AgentEvent(
+                            author="weather_bot",
+                            content=genai_types.Content(
+                                role="tool",
+                                parts=[
+                                    genai_types.Part(
+                                        function_response=genai_types.FunctionResponse(
+                                            id="tool_call_0",
+                                            name="get_weather",
+                                            response={"weather": "75F and sunny"},
+                                        )
+                                    )
+                                ],
+                            ),
+                        ),
+                        types.evals.AgentEvent(
+                            author="weather_bot",
+                            content=genai_types.Content(
+                                role="model",
+                                parts=[
+                                    genai_types.Part(
+                                        text=(
+                                            "It is currently 75F and sunny in" " Tokyo."
+                                        )
+                                    )
+                                ],
+                            ),
+                        ),
+                    ],
+                )
+            ],
+        ),
+    )
+
+    eval_dataset = types.EvaluationDataset(eval_cases=[eval_case])
+
+    metrics = [
+        types.RubricMetric.FINAL_RESPONSE_QUALITY,
+        types.RubricMetric.TOOL_USE_QUALITY,
+        types.RubricMetric.HALLUCINATION,
+        types.RubricMetric.SAFETY,
+        types.RubricMetric.GENERAL_QUALITY,
+        types.RubricMetric.TEXT_QUALITY,
+    ]
+
+    evaluation_result = client.evals.evaluate(dataset=eval_dataset, metrics=metrics)
+
+    assert isinstance(evaluation_result, types.EvaluationResult)
+    assert evaluation_result.summary_metrics is not None
+    assert len(evaluation_result.summary_metrics) > 0
+    for summary in evaluation_result.summary_metrics:
+        assert isinstance(summary, types.AggregatedMetricResult)
+        assert summary.metric_name is not None
+
+    assert evaluation_result.eval_case_results is not None
+    assert len(evaluation_result.eval_case_results) == 1
+
+
 pytestmark = pytest_helper.setup(
     file=__file__,
     globals_for_file=globals(),
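
The assertions in the new test_evaluation_single_turn_agent_data only check that results exist and are well-typed. For readers of the diff, the short sketch below is not part of the commit; it assumes a configured client plus the eval_dataset and metrics objects built exactly as in the test above, and simply shows how the same result object could be inspected by hand.

# Sketch only, not part of the commit. Assumes `client`, `eval_dataset`, and
# `metrics` are constructed as in test_evaluation_single_turn_agent_data above.
evaluation_result = client.evals.evaluate(dataset=eval_dataset, metrics=metrics)

# Aggregated per-metric view: the fields the test asserts on.
for summary in evaluation_result.summary_metrics:
    print(summary.metric_name, summary.mean_score)

# Per-case view: a single entry is expected, since the dataset holds one EvalCase.
for case_result in evaluation_result.eval_case_results:
    print(case_result.eval_case_index, case_result.response_candidate_results)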

tests/unit/vertexai/genai/replays/test_evaluate_predefined_metrics.py

Lines changed: 63 additions & 2 deletions
@@ -224,6 +224,7 @@ def test_multi_turn_predefined_metric(client):
 
     predefined_metrics = [
        types.RubricMetric.MULTI_TURN_GENERAL_QUALITY,
+        types.RubricMetric.MULTI_TURN_TEXT_QUALITY,
     ]
 
     evaluation_result = client.evals.evaluate(
@@ -233,11 +234,16 @@ def test_multi_turn_predefined_metric(client):
 
     assert isinstance(evaluation_result, types.EvaluationResult)
     assert evaluation_result.summary_metrics is not None
-    assert len(evaluation_result.summary_metrics) > 0
+    assert len(evaluation_result.summary_metrics) == 2
+    metric_names = set()
     for summary in evaluation_result.summary_metrics:
         assert isinstance(summary, types.AggregatedMetricResult)
-        assert summary.metric_name == "multi_turn_general_quality_v1"
+        metric_names.add(summary.metric_name)
         assert isinstance(summary.mean_score, float)
+    assert metric_names == {
+        "multi_turn_general_quality_v1",
+        "multi_turn_text_quality_v1",
+    }
 
     assert evaluation_result.eval_case_results is not None
     assert len(evaluation_result.eval_case_results) > 0
@@ -415,6 +421,61 @@ def test_evaluation_gecko_text2video_metric(client):
         assert case_result.response_candidate_results is not None
 
 
+def test_single_turn_rubric_metrics(client):
+    """Tests single-turn text quality RubricMetrics with reference."""
+    prompts_df = pd.DataFrame(
+        {
+            "prompt": ["Summarize the benefits of regular exercise."],
+            "response": [
+                "Exercise improves cardiovascular health, boosts mood through"
+                " endorphin release, strengthens muscles and bones, and enhances"
+                " sleep quality. Regular physical activity also helps maintain a"
+                " healthy weight and reduces the risk of chronic diseases."
+            ],
+            "reference": [
+                "Exercise improves heart health, mood, muscle strength," " and sleep."
+            ],
+            "context": [
+                "Exercise improves heart health, mood, muscle strength," " and sleep."
+            ],
+        }
+    )
+
+    eval_dataset = types.EvaluationDataset(
+        eval_dataset_df=prompts_df,
+        candidate_name="gemini-2.5-flash",
+    )
+
+    predefined_metrics = [
+        types.RubricMetric.INSTRUCTION_FOLLOWING,
+        types.RubricMetric.GENERAL_QUALITY,
+        types.RubricMetric.TEXT_QUALITY,
+        types.RubricMetric.GROUNDING,
+        types.RubricMetric.SAFETY,
+        types.RubricMetric.FINAL_RESPONSE_MATCH,
+        types.RubricMetric.FINAL_RESPONSE_REFERENCE_FREE,
+    ]
+
+    evaluation_result = client.evals.evaluate(
+        dataset=eval_dataset,
+        metrics=predefined_metrics,
+    )
+
+    assert isinstance(evaluation_result, types.EvaluationResult)
+    assert evaluation_result.summary_metrics is not None
+    assert len(evaluation_result.summary_metrics) > 0
+    for summary in evaluation_result.summary_metrics:
+        assert isinstance(summary, types.AggregatedMetricResult)
+        assert summary.metric_name is not None
+
+    assert evaluation_result.eval_case_results is not None
+    assert len(evaluation_result.eval_case_results) > 0
+    for case_result in evaluation_result.eval_case_results:
+        assert isinstance(case_result, types.EvalCaseResult)
+        assert case_result.eval_case_index is not None
+        assert case_result.response_candidate_results is not None
+
+
 pytestmark = pytest_helper.setup(
     file=__file__,
     globals_for_file=globals(),
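
The new test_single_turn_rubric_metrics runs seven rubric metrics against a DataFrame-backed dataset. As orientation for readers of the diff, the condensed sketch below is not part of the commit: it assumes the same client fixture and the pd / types aliases the test module appears to import, keeps only two of the metrics, and which metric consumes the reference column versus the context column is an assumption the diff itself does not spell out.

# Sketch only, not part of the commit. `client`, `pd`, and `types` are assumed to
# be available exactly as in the test module above.
df = pd.DataFrame(
    {
        # prompt/response are the core columns; reference and context are extra
        # columns some rubric metrics can draw on (assumed, not stated in the diff).
        "prompt": ["Summarize the benefits of regular exercise."],
        "response": ["Exercise improves heart health, mood, and sleep."],
        "reference": ["Exercise improves heart health, mood, and sleep."],
        "context": ["Exercise improves heart health, mood, and sleep."],
    }
)

dataset = types.EvaluationDataset(
    eval_dataset_df=df,
    candidate_name="gemini-2.5-flash",
)

result = client.evals.evaluate(
    dataset=dataset,
    metrics=[
        types.RubricMetric.FINAL_RESPONSE_MATCH,
        types.RubricMetric.GROUNDING,
    ],
)
print([s.metric_name for s in result.summary_metrics])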
