Skip to content

Commit 73d9067

Browse files
vertex-sdk-bot authored and copybara-github committed
docs: Add a warning that the autorater config is not applicable for predefined metrics in SDK
PiperOrigin-RevId: 910847685
1 parent 3bc2f58 commit 73d9067

5 files changed

Lines changed: 77 additions & 19 deletions

File tree

agentplatform/_genai/_evals_metric_handlers.py

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -972,6 +972,17 @@ def __init__(self, module: "evals.Evals", metric: types.Metric):
972972
raise ValueError(
973973
f"Metric '{self.metric.name}' is not a supported predefined metric."
974974
)
975+
if (
976+
self.metric.judge_model
977+
or self.metric.judge_model_generation_config
978+
or self.metric.judge_model_sampling_count
979+
):
980+
logger.warning(
981+
"Autorater config settings (judge_model, "
982+
"judge_model_generation_config, judge_model_sampling_count) "
983+
"are ignored for predefined metric '%s'.",
984+
self.metric.name,
985+
)
975986

976987
def _build_request_payload(
977988
self, eval_case: types.EvalCase, response_index: int
@@ -1031,6 +1042,7 @@ def _build_request_payload(
10311042
request_payload["autorater_config"] = genai_types.AutoraterConfig(
10321043
**autorater_config
10331044
)
1045+
10341046
return request_payload
10351047

10361048
@override

agentplatform/_genai/types/common.py

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2457,7 +2457,8 @@ class EvaluationRunConfig(_common.BaseModel):
24572457
default=None, description="""The output config for the evaluation run."""
24582458
)
24592459
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
2460-
default=None, description="""The autorater config for the evaluation run."""
2460+
default=None,
2461+
description="""The autorater config for the evaluation run. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
24612462
)
24622463
prompt_template: Optional[EvaluationRunPromptTemplate] = Field(
24632464
default=None, description="""The prompt template used for inference."""
@@ -2487,7 +2488,7 @@ class EvaluationRunConfigDict(TypedDict, total=False):
24872488
"""The output config for the evaluation run."""
24882489

24892490
autorater_config: Optional[genai_types.AutoraterConfigDict]
2490-
"""The autorater config for the evaluation run."""
2491+
"""The autorater config for the evaluation run. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
24912492

24922493
prompt_template: Optional[EvaluationRunPromptTemplateDict]
24932494
"""The prompt template used for inference."""
@@ -4794,7 +4795,8 @@ class _EvaluateInstancesRequestParameters(_common.BaseModel):
47944795
default=None, description=""""""
47954796
)
47964797
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
4797-
default=None, description=""""""
4798+
default=None,
4799+
description="""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
47984800
)
47994801
metrics: Optional[list[Metric]] = Field(
48004802
default=None,
@@ -4845,7 +4847,7 @@ class _EvaluateInstancesRequestParametersDict(TypedDict, total=False):
48454847
""""""
48464848

48474849
autorater_config: Optional[genai_types.AutoraterConfigDict]
4848-
""""""
4850+
"""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
48494851

48504852
metrics: Optional[list[MetricDict]]
48514853
"""The metrics used for evaluation.
@@ -20828,7 +20830,8 @@ class EvaluateDatasetRequestParameters(_common.BaseModel):
2082820830
default=None, description=""""""
2082920831
)
2083020832
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
20831-
default=None, description=""""""
20833+
default=None,
20834+
description="""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
2083220835
)
2083320836
config: Optional[EvaluateDatasetConfig] = Field(default=None, description="""""")
2083420837

@@ -20846,7 +20849,7 @@ class EvaluateDatasetRequestParametersDict(TypedDict, total=False):
2084620849
""""""
2084720850

2084820851
autorater_config: Optional[genai_types.AutoraterConfigDict]
20849-
""""""
20852+
"""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
2085020853

2085120854
config: Optional[EvaluateDatasetConfigDict]
2085220855
""""""

tests/unit/agentplatform/genai/replays/test_evaluate_predefined_metrics.py

Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -106,6 +106,39 @@ def test_evaluation_result(client):
106106
# assert case_result.response_candidate_results is not None
107107

108108

109+
def test_predefined_metric_with_judge_model_ignores_autorater_config(client):
110+
"""Tests that autorater_config is ignored for predefined metrics in replays."""
111+
prompts_df = pd.DataFrame(
112+
{
113+
"prompt": ["Explain the concept of machine learning in simple terms."],
114+
"response": [
115+
"Machine learning is a type of artificial intelligence that allows"
116+
" computers to learn from data without being explicitly programmed."
117+
],
118+
}
119+
)
120+
121+
eval_dataset = types.EvaluationDataset(
122+
eval_dataset_df=prompts_df,
123+
candidate_name="gemini-2.5-flash",
124+
)
125+
126+
# Set judge_model, which should be ignored for predefined metrics
127+
metric = types.Metric(
128+
name="safety_v1",
129+
judge_model="projects/model-evaluation-dev/locations/us-central1/publishers/google/models/gemini-2.5-flash",
130+
)
131+
132+
evaluation_result = client.evals.evaluate(
133+
dataset=eval_dataset,
134+
metrics=[metric],
135+
)
136+
137+
assert isinstance(evaluation_result, types.EvaluationResult)
138+
assert evaluation_result.summary_metrics is not None
139+
assert evaluation_result.summary_metrics[0].metric_name == "safety_v1"
140+
141+
109142
def test_multi_turn_predefined_metric(client):
110143
"""Tests that evaluate works with multi-turn predefined metrics."""
111144
prompts_data = {

vertexai/_genai/_evals_metric_handlers.py

Lines changed: 14 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,6 @@
3636
from . import evals
3737
from . import types
3838

39-
4039
logger = logging.getLogger(__name__)
4140
_MAX_RETRIES = 5
4241
# HTTP status codes that are safe to retry with backoff.
@@ -972,6 +971,17 @@ def __init__(self, module: "evals.Evals", metric: types.Metric):
972971
raise ValueError(
973972
f"Metric '{self.metric.name}' is not a supported predefined metric."
974973
)
974+
if (
975+
self.metric.judge_model
976+
or self.metric.judge_model_generation_config
977+
or self.metric.judge_model_sampling_count
978+
):
979+
logger.warning(
980+
"Autorater config settings (judge_model, "
981+
"judge_model_generation_config, judge_model_sampling_count) "
982+
"are ignored for predefined metric '%s'.",
983+
self.metric.name,
984+
)
975985

976986
def _build_request_payload(
977987
self, eval_case: types.EvalCase, response_index: int
@@ -1026,11 +1036,9 @@ def _build_request_payload(
10261036
"instance": instance_payload,
10271037
}
10281038

1029-
autorater_config = _get_autorater_config(self.metric)
1030-
if autorater_config:
1031-
request_payload["autorater_config"] = genai_types.AutoraterConfig(
1032-
**autorater_config
1033-
)
1039+
# Note: autorater_config is intentionally not passed for predefined
1040+
# metrics. The server uses its own model configuration for predefined
1041+
# metrics and ignores the autorater_config field.
10341042
return request_payload
10351043

10361044
@override
@@ -1045,7 +1053,6 @@ def get_metric_result(
10451053
lambda: self.module._evaluate_instances(
10461054
metrics=[self.metric],
10471055
instance=payload.get("instance"),
1048-
autorater_config=payload.get("autorater_config"),
10491056
),
10501057
metric_name,
10511058
)

vertexai/_genai/types/common.py

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2420,7 +2420,8 @@ class EvaluationRunConfig(_common.BaseModel):
24202420
default=None, description="""The output config for the evaluation run."""
24212421
)
24222422
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
2423-
default=None, description="""The autorater config for the evaluation run."""
2423+
default=None,
2424+
description="""The autorater config for the evaluation run. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
24242425
)
24252426
prompt_template: Optional[EvaluationRunPromptTemplate] = Field(
24262427
default=None, description="""The prompt template used for inference."""
@@ -2450,7 +2451,7 @@ class EvaluationRunConfigDict(TypedDict, total=False):
24502451
"""The output config for the evaluation run."""
24512452

24522453
autorater_config: Optional[genai_types.AutoraterConfigDict]
2453-
"""The autorater config for the evaluation run."""
2454+
"""The autorater config for the evaluation run. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
24542455

24552456
prompt_template: Optional[EvaluationRunPromptTemplateDict]
24562457
"""The prompt template used for inference."""
@@ -4757,7 +4758,8 @@ class _EvaluateInstancesRequestParameters(_common.BaseModel):
47574758
default=None, description=""""""
47584759
)
47594760
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
4760-
default=None, description=""""""
4761+
default=None,
4762+
description="""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
47614763
)
47624764
metrics: Optional[list[Metric]] = Field(
47634765
default=None,
@@ -4808,7 +4810,7 @@ class _EvaluateInstancesRequestParametersDict(TypedDict, total=False):
48084810
""""""
48094811

48104812
autorater_config: Optional[genai_types.AutoraterConfigDict]
4811-
""""""
4813+
"""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
48124814

48134815
metrics: Optional[list[MetricDict]]
48144816
"""The metrics used for evaluation.
@@ -19050,7 +19052,8 @@ class EvaluateDatasetRequestParameters(_common.BaseModel):
1905019052
default=None, description=""""""
1905119053
)
1905219054
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
19053-
default=None, description=""""""
19055+
default=None,
19056+
description="""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
1905419057
)
1905519058
config: Optional[EvaluateDatasetConfig] = Field(default=None, description="""""")
1905619059

@@ -19068,7 +19071,7 @@ class EvaluateDatasetRequestParametersDict(TypedDict, total=False):
1906819071
""""""
1906919072

1907019073
autorater_config: Optional[genai_types.AutoraterConfigDict]
19071-
""""""
19074+
"""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
1907219075

1907319076
config: Optional[EvaluateDatasetConfigDict]
1907419077
""""""

0 commit comments

Comments
 (0)