Skip to content

Commit aa0d51d

Browse files
vertex-sdk-bot authored and
copybara-github committed
fix: Add a warning that the autorater config is not applicable for predefined metrics in SDK
PiperOrigin-RevId: 910847685
1 parent a99f340 commit aa0d51d

5 files changed

Lines changed: 77 additions & 19 deletions

File tree

agentplatform/_genai/_evals_metric_handlers.py

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -972,6 +972,17 @@ def __init__(self, module: "evals.Evals", metric: types.Metric):
972972
raise ValueError(
973973
f"Metric '{self.metric.name}' is not a supported predefined metric."
974974
)
975+
if (
976+
self.metric.judge_model
977+
or self.metric.judge_model_generation_config
978+
or self.metric.judge_model_sampling_count
979+
):
980+
logger.warning(
981+
"Autorater config settings (judge_model, "
982+
"judge_model_generation_config, judge_model_sampling_count) "
983+
"are ignored for predefined metric '%s'.",
984+
self.metric.name,
985+
)
975986

976987
def _build_request_payload(
977988
self, eval_case: types.EvalCase, response_index: int
@@ -1031,6 +1042,7 @@ def _build_request_payload(
10311042
request_payload["autorater_config"] = genai_types.AutoraterConfig(
10321043
**autorater_config
10331044
)
1045+
10341046
return request_payload
10351047

10361048
@override

agentplatform/_genai/types/common.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2435,7 +2435,8 @@ class EvaluationRunConfig(_common.BaseModel):
24352435
default=None, description="""The output config for the evaluation run."""
24362436
)
24372437
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
2438-
default=None, description="""The autorater config for the evaluation run."""
2438+
default=None,
2439+
description="""The autorater config for the evaluation run. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
24392440
)
24402441
prompt_template: Optional[EvaluationRunPromptTemplate] = Field(
24412442
default=None, description="""The prompt template used for inference."""
@@ -2465,7 +2466,7 @@ class EvaluationRunConfigDict(TypedDict, total=False):
24652466
"""The output config for the evaluation run."""
24662467

24672468
autorater_config: Optional[genai_types.AutoraterConfigDict]
2468-
"""The autorater config for the evaluation run."""
2469+
"""The autorater config for the evaluation run. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
24692470

24702471
prompt_template: Optional[EvaluationRunPromptTemplateDict]
24712472
"""The prompt template used for inference."""
@@ -4772,7 +4773,8 @@ class _EvaluateInstancesRequestParameters(_common.BaseModel):
47724773
default=None, description=""""""
47734774
)
47744775
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
4775-
default=None, description=""""""
4776+
default=None,
4777+
description="""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
47764778
)
47774779
metrics: Optional[list[Metric]] = Field(
47784780
default=None,
@@ -4823,7 +4825,7 @@ class _EvaluateInstancesRequestParametersDict(TypedDict, total=False):
48234825
""""""
48244826

48254827
autorater_config: Optional[genai_types.AutoraterConfigDict]
4826-
""""""
4828+
"""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
48274829

48284830
metrics: Optional[list[MetricDict]]
48294831
"""The metrics used for evaluation.
@@ -19065,7 +19067,8 @@ class EvaluateDatasetRequestParameters(_common.BaseModel):
1906519067
default=None, description=""""""
1906619068
)
1906719069
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
19068-
default=None, description=""""""
19070+
default=None,
19071+
description="""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
1906919072
)
1907019073
config: Optional[EvaluateDatasetConfig] = Field(default=None, description="""""")
1907119074

@@ -19083,7 +19086,7 @@ class EvaluateDatasetRequestParametersDict(TypedDict, total=False):
1908319086
""""""
1908419087

1908519088
autorater_config: Optional[genai_types.AutoraterConfigDict]
19086-
""""""
19089+
"""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
1908719090

1908819091
config: Optional[EvaluateDatasetConfigDict]
1908919092
""""""

tests/unit/agentplatform/genai/replays/test_evaluate_predefined_metrics.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,39 @@ def test_evaluation_result(client):
106106
# assert case_result.response_candidate_results is not None
107107

108108

109+
def test_predefined_metric_with_judge_model_ignores_autorater_config(client):
110+
"""Tests that autorater_config is ignored for predefined metrics in replays."""
111+
prompts_df = pd.DataFrame(
112+
{
113+
"prompt": ["Explain the concept of machine learning in simple terms."],
114+
"response": [
115+
"Machine learning is a type of artificial intelligence that allows"
116+
" computers to learn from data without being explicitly programmed."
117+
],
118+
}
119+
)
120+
121+
eval_dataset = types.EvaluationDataset(
122+
eval_dataset_df=prompts_df,
123+
candidate_name="gemini-2.5-flash",
124+
)
125+
126+
# Set judge_model, which should be ignored for predefined metrics
127+
metric = types.Metric(
128+
name="safety_v1",
129+
judge_model="projects/model-evaluation-dev/locations/us-central1/publishers/google/models/gemini-2.5-flash",
130+
)
131+
132+
evaluation_result = client.evals.evaluate(
133+
dataset=eval_dataset,
134+
metrics=[metric],
135+
)
136+
137+
assert isinstance(evaluation_result, types.EvaluationResult)
138+
assert evaluation_result.summary_metrics is not None
139+
assert evaluation_result.summary_metrics[0].metric_name == "safety_v1"
140+
141+
109142
def test_multi_turn_predefined_metric(client):
110143
"""Tests that evaluate works with multi-turn predefined metrics."""
111144
prompts_data = {

vertexai/_genai/_evals_metric_handlers.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636
from . import evals
3737
from . import types
3838

39-
4039
logger = logging.getLogger(__name__)
4140
_MAX_RETRIES = 5
4241
# HTTP status codes that are safe to retry with backoff.
@@ -972,6 +971,17 @@ def __init__(self, module: "evals.Evals", metric: types.Metric):
972971
raise ValueError(
973972
f"Metric '{self.metric.name}' is not a supported predefined metric."
974973
)
974+
if (
975+
self.metric.judge_model
976+
or self.metric.judge_model_generation_config
977+
or self.metric.judge_model_sampling_count
978+
):
979+
logger.warning(
980+
"Autorater config settings (judge_model, "
981+
"judge_model_generation_config, judge_model_sampling_count) "
982+
"are ignored for predefined metric '%s'.",
983+
self.metric.name,
984+
)
975985

976986
def _build_request_payload(
977987
self, eval_case: types.EvalCase, response_index: int
@@ -1026,11 +1036,9 @@ def _build_request_payload(
10261036
"instance": instance_payload,
10271037
}
10281038

1029-
autorater_config = _get_autorater_config(self.metric)
1030-
if autorater_config:
1031-
request_payload["autorater_config"] = genai_types.AutoraterConfig(
1032-
**autorater_config
1033-
)
1039+
# Note: autorater_config is intentionally not passed for predefined
1040+
# metrics. The server uses its own model configuration for predefined
1041+
# metrics and ignores the autorater_config field.
10341042
return request_payload
10351043

10361044
@override
@@ -1045,7 +1053,6 @@ def get_metric_result(
10451053
lambda: self.module._evaluate_instances(
10461054
metrics=[self.metric],
10471055
instance=payload.get("instance"),
1048-
autorater_config=payload.get("autorater_config"),
10491056
),
10501057
metric_name,
10511058
)

vertexai/_genai/types/common.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2420,7 +2420,8 @@ class EvaluationRunConfig(_common.BaseModel):
24202420
default=None, description="""The output config for the evaluation run."""
24212421
)
24222422
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
2423-
default=None, description="""The autorater config for the evaluation run."""
2423+
default=None,
2424+
description="""The autorater config for the evaluation run. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
24242425
)
24252426
prompt_template: Optional[EvaluationRunPromptTemplate] = Field(
24262427
default=None, description="""The prompt template used for inference."""
@@ -2450,7 +2451,7 @@ class EvaluationRunConfigDict(TypedDict, total=False):
24502451
"""The output config for the evaluation run."""
24512452

24522453
autorater_config: Optional[genai_types.AutoraterConfigDict]
2453-
"""The autorater config for the evaluation run."""
2454+
"""The autorater config for the evaluation run. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
24542455

24552456
prompt_template: Optional[EvaluationRunPromptTemplateDict]
24562457
"""The prompt template used for inference."""
@@ -4757,7 +4758,8 @@ class _EvaluateInstancesRequestParameters(_common.BaseModel):
47574758
default=None, description=""""""
47584759
)
47594760
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
4760-
default=None, description=""""""
4761+
default=None,
4762+
description="""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
47614763
)
47624764
metrics: Optional[list[Metric]] = Field(
47634765
default=None,
@@ -4808,7 +4810,7 @@ class _EvaluateInstancesRequestParametersDict(TypedDict, total=False):
48084810
""""""
48094811

48104812
autorater_config: Optional[genai_types.AutoraterConfigDict]
4811-
""""""
4813+
"""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
48124814

48134815
metrics: Optional[list[MetricDict]]
48144816
"""The metrics used for evaluation.
@@ -19050,7 +19052,8 @@ class EvaluateDatasetRequestParameters(_common.BaseModel):
1905019052
default=None, description=""""""
1905119053
)
1905219054
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
19053-
default=None, description=""""""
19055+
default=None,
19056+
description="""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
1905419057
)
1905519058
config: Optional[EvaluateDatasetConfig] = Field(default=None, description="""""")
1905619059

@@ -19068,7 +19071,7 @@ class EvaluateDatasetRequestParametersDict(TypedDict, total=False):
1906819071
""""""
1906919072

1907019073
autorater_config: Optional[genai_types.AutoraterConfigDict]
19071-
""""""
19074+
"""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
1907219075

1907319076
config: Optional[EvaluateDatasetConfigDict]
1907419077
""""""

0 commit comments

Comments
 (0)