Skip to content

Commit 5dacd6b

Browse files
vertex-sdk-bot and copybara-github
authored and committed
fix: Add a warning that the autorater config is not applicable for predefined metrics in SDK
PiperOrigin-RevId: 910847685
1 parent 6b7a529 commit 5dacd6b

4 files changed

Lines changed: 65 additions & 19 deletions

File tree

agentplatform/_genai/types/common.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2409,7 +2409,8 @@ class EvaluationRunConfig(_common.BaseModel):
24092409
default=None, description="""The output config for the evaluation run."""
24102410
)
24112411
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
2412-
default=None, description="""The autorater config for the evaluation run."""
2412+
default=None,
2413+
description="""The autorater config for the evaluation run. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
24132414
)
24142415
prompt_template: Optional[EvaluationRunPromptTemplate] = Field(
24152416
default=None, description="""The prompt template used for inference."""
@@ -2439,7 +2440,7 @@ class EvaluationRunConfigDict(TypedDict, total=False):
24392440
"""The output config for the evaluation run."""
24402441

24412442
autorater_config: Optional[genai_types.AutoraterConfigDict]
2442-
"""The autorater config for the evaluation run."""
2443+
"""The autorater config for the evaluation run. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
24432444

24442445
prompt_template: Optional[EvaluationRunPromptTemplateDict]
24452446
"""The prompt template used for inference."""
@@ -4746,7 +4747,8 @@ class _EvaluateInstancesRequestParameters(_common.BaseModel):
47464747
default=None, description=""""""
47474748
)
47484749
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
4749-
default=None, description=""""""
4750+
default=None,
4751+
description="""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
47504752
)
47514753
metrics: Optional[list[Metric]] = Field(
47524754
default=None,
@@ -4797,7 +4799,7 @@ class _EvaluateInstancesRequestParametersDict(TypedDict, total=False):
47974799
""""""
47984800

47994801
autorater_config: Optional[genai_types.AutoraterConfigDict]
4800-
""""""
4802+
"""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
48014803

48024804
metrics: Optional[list[MetricDict]]
48034805
"""The metrics used for evaluation.
@@ -19028,7 +19030,8 @@ class EvaluateDatasetRequestParameters(_common.BaseModel):
1902819030
default=None, description=""""""
1902919031
)
1903019032
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
19031-
default=None, description=""""""
19033+
default=None,
19034+
description="""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
1903219035
)
1903319036
config: Optional[EvaluateDatasetConfig] = Field(default=None, description="""""")
1903419037

@@ -19046,7 +19049,7 @@ class EvaluateDatasetRequestParametersDict(TypedDict, total=False):
1904619049
""""""
1904719050

1904819051
autorater_config: Optional[genai_types.AutoraterConfigDict]
19049-
""""""
19052+
"""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
1905019053

1905119054
config: Optional[EvaluateDatasetConfigDict]
1905219055
""""""

tests/unit/agentplatform/genai/replays/test_evaluate_predefined_metrics.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,39 @@ def test_evaluation_result(client):
106106
# assert case_result.response_candidate_results is not None
107107

108108

109+
def test_predefined_metric_with_judge_model_ignores_autorater_config(client):
    """Tests that autorater_config is ignored for predefined metrics in replays.

    A predefined metric ("safety_v1") is created with a judge_model set. The
    evaluation is expected to still return an EvaluationResult whose first
    summary metric is named after the predefined metric.
    """
    prompt_text = "Explain the concept of machine learning in simple terms."
    response_text = (
        "Machine learning is a type of artificial intelligence that allows"
        " computers to learn from data without being explicitly programmed."
    )
    dataset = types.EvaluationDataset(
        eval_dataset_df=pd.DataFrame(
            {"prompt": [prompt_text], "response": [response_text]}
        ),
        candidate_name="gemini-2.5-flash",
    )

    # judge_model is supplied on purpose; it should be ignored for predefined
    # metrics.
    judge_model_name = (
        "projects/model-evaluation-dev/locations/us-central1/publishers/google/models/gemini-2.5-flash"
    )
    safety_metric = types.Metric(name="safety_v1", judge_model=judge_model_name)

    result = client.evals.evaluate(dataset=dataset, metrics=[safety_metric])

    assert isinstance(result, types.EvaluationResult)
    summary_metrics = result.summary_metrics
    assert summary_metrics is not None
    assert summary_metrics[0].metric_name == "safety_v1"
141+
109142
def test_multi_turn_predefined_metric(client):
110143
"""Tests that evaluate works with multi-turn predefined metrics."""
111144
prompts_data = {

vertexai/_genai/_evals_metric_handlers.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636
from . import evals
3737
from . import types
3838

39-
4039
logger = logging.getLogger(__name__)
4140
_MAX_RETRIES = 5
4241
# HTTP status codes that are safe to retry with backoff.
@@ -972,6 +971,17 @@ def __init__(self, module: "evals.Evals", metric: types.Metric):
972971
raise ValueError(
973972
f"Metric '{self.metric.name}' is not a supported predefined metric."
974973
)
974+
if (
975+
self.metric.judge_model
976+
or self.metric.judge_model_generation_config
977+
or self.metric.judge_model_sampling_count
978+
):
979+
logger.warning(
980+
"Autorater config settings (judge_model, "
981+
"judge_model_generation_config, judge_model_sampling_count) "
982+
"are ignored for predefined metric '%s'.",
983+
self.metric.name,
984+
)
975985

976986
def _build_request_payload(
977987
self, eval_case: types.EvalCase, response_index: int
@@ -1026,11 +1036,9 @@ def _build_request_payload(
10261036
"instance": instance_payload,
10271037
}
10281038

1029-
autorater_config = _get_autorater_config(self.metric)
1030-
if autorater_config:
1031-
request_payload["autorater_config"] = genai_types.AutoraterConfig(
1032-
**autorater_config
1033-
)
1039+
# Note: autorater_config is intentionally not passed for predefined
1040+
# metrics. The server uses its own model configuration for predefined
1041+
# metrics and ignores the autorater_config field.
10341042
return request_payload
10351043

10361044
@override
@@ -1045,7 +1053,6 @@ def get_metric_result(
10451053
lambda: self.module._evaluate_instances(
10461054
metrics=[self.metric],
10471055
instance=payload.get("instance"),
1048-
autorater_config=payload.get("autorater_config"),
10491056
),
10501057
metric_name,
10511058
)

vertexai/_genai/types/common.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2409,7 +2409,8 @@ class EvaluationRunConfig(_common.BaseModel):
24092409
default=None, description="""The output config for the evaluation run."""
24102410
)
24112411
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
2412-
default=None, description="""The autorater config for the evaluation run."""
2412+
default=None,
2413+
description="""The autorater config for the evaluation run. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
24132414
)
24142415
prompt_template: Optional[EvaluationRunPromptTemplate] = Field(
24152416
default=None, description="""The prompt template used for inference."""
@@ -2439,7 +2440,7 @@ class EvaluationRunConfigDict(TypedDict, total=False):
24392440
"""The output config for the evaluation run."""
24402441

24412442
autorater_config: Optional[genai_types.AutoraterConfigDict]
2442-
"""The autorater config for the evaluation run."""
2443+
"""The autorater config for the evaluation run. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
24432444

24442445
prompt_template: Optional[EvaluationRunPromptTemplateDict]
24452446
"""The prompt template used for inference."""
@@ -4746,7 +4747,8 @@ class _EvaluateInstancesRequestParameters(_common.BaseModel):
47464747
default=None, description=""""""
47474748
)
47484749
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
4749-
default=None, description=""""""
4750+
default=None,
4751+
description="""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
47504752
)
47514753
metrics: Optional[list[Metric]] = Field(
47524754
default=None,
@@ -4797,7 +4799,7 @@ class _EvaluateInstancesRequestParametersDict(TypedDict, total=False):
47974799
""""""
47984800

47994801
autorater_config: Optional[genai_types.AutoraterConfigDict]
4800-
""""""
4802+
"""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
48014803

48024804
metrics: Optional[list[MetricDict]]
48034805
"""The metrics used for evaluation.
@@ -19028,7 +19030,8 @@ class EvaluateDatasetRequestParameters(_common.BaseModel):
1902819030
default=None, description=""""""
1902919031
)
1903019032
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
19031-
default=None, description=""""""
19033+
default=None,
19034+
description="""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
1903219035
)
1903319036
config: Optional[EvaluateDatasetConfig] = Field(default=None, description="""""")
1903419037

@@ -19046,7 +19049,7 @@ class EvaluateDatasetRequestParametersDict(TypedDict, total=False):
1904619049
""""""
1904719050

1904819051
autorater_config: Optional[genai_types.AutoraterConfigDict]
19049-
""""""
19052+
"""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
1905019053

1905119054
config: Optional[EvaluateDatasetConfigDict]
1905219055
""""""

0 commit comments

Comments
 (0)