Skip to content

Commit 7b6637e

Browse files
vertex-sdk-bot authored and copybara-github committed
fix: Add a warning that the autorater config is not applicable for predefined metrics in SDK
PiperOrigin-RevId: 910847685
1 parent 1585602 commit 7b6637e

5 files changed

Lines changed: 73 additions & 12 deletions

File tree

agentplatform/_genai/_evals_metric_handlers.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -972,6 +972,17 @@ def __init__(self, module: "evals.Evals", metric: types.Metric):
972972
raise ValueError(
973973
f"Metric '{self.metric.name}' is not a supported predefined metric."
974974
)
975+
if (
976+
self.metric.judge_model
977+
or self.metric.judge_model_generation_config
978+
or self.metric.judge_model_sampling_count
979+
):
980+
logger.warning(
981+
"Autorater config settings (judge_model, "
982+
"judge_model_generation_config, judge_model_sampling_count) "
983+
"are ignored for predefined metric '%s'.",
984+
self.metric.name,
985+
)
975986

976987
def _build_request_payload(
977988
self, eval_case: types.EvalCase, response_index: int

agentplatform/_genai/types/common.py

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2424,7 +2424,8 @@ class EvaluationRunConfig(_common.BaseModel):
24242424
default=None, description="""The output config for the evaluation run."""
24252425
)
24262426
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
2427-
default=None, description="""The autorater config for the evaluation run."""
2427+
default=None,
2428+
description="""The autorater config for the evaluation run. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
24282429
)
24292430
prompt_template: Optional[EvaluationRunPromptTemplate] = Field(
24302431
default=None, description="""The prompt template used for inference."""
@@ -2454,7 +2455,7 @@ class EvaluationRunConfigDict(TypedDict, total=False):
24542455
"""The output config for the evaluation run."""
24552456

24562457
autorater_config: Optional[genai_types.AutoraterConfigDict]
2457-
"""The autorater config for the evaluation run."""
2458+
"""The autorater config for the evaluation run. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
24582459

24592460
prompt_template: Optional[EvaluationRunPromptTemplateDict]
24602461
"""The prompt template used for inference."""
@@ -4761,7 +4762,8 @@ class _EvaluateInstancesRequestParameters(_common.BaseModel):
47614762
default=None, description=""""""
47624763
)
47634764
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
4764-
default=None, description=""""""
4765+
default=None,
4766+
description="""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
47654767
)
47664768
metrics: Optional[list[Metric]] = Field(
47674769
default=None,
@@ -4812,7 +4814,7 @@ class _EvaluateInstancesRequestParametersDict(TypedDict, total=False):
48124814
""""""
48134815

48144816
autorater_config: Optional[genai_types.AutoraterConfigDict]
4815-
""""""
4817+
"""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
48164818

48174819
metrics: Optional[list[MetricDict]]
48184820
"""The metrics used for evaluation.
@@ -19043,7 +19045,8 @@ class EvaluateDatasetRequestParameters(_common.BaseModel):
1904319045
default=None, description=""""""
1904419046
)
1904519047
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
19046-
default=None, description=""""""
19048+
default=None,
19049+
description="""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
1904719050
)
1904819051
config: Optional[EvaluateDatasetConfig] = Field(default=None, description="""""")
1904919052

@@ -19061,7 +19064,7 @@ class EvaluateDatasetRequestParametersDict(TypedDict, total=False):
1906119064
""""""
1906219065

1906319066
autorater_config: Optional[genai_types.AutoraterConfigDict]
19064-
""""""
19067+
"""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
1906519068

1906619069
config: Optional[EvaluateDatasetConfigDict]
1906719070
""""""

tests/unit/agentplatform/genai/replays/test_evaluate_predefined_metrics.py

Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -106,6 +106,39 @@ def test_evaluation_result(client):
106106
# assert case_result.response_candidate_results is not None
107107

108108

109+
def test_predefined_metric_with_judge_model_ignores_autorater_config(client):
    """Tests that a predefined metric still evaluates when judge_model is set.

    The metric below carries a `judge_model` override. The evaluation is
    expected to complete and report its summary under the predefined
    metric's own name regardless of that setting.
    """
    prompts_df = pd.DataFrame(
        {
            "prompt": ["Explain the concept of machine learning in simple terms."],
            "response": [
                "Machine learning is a type of artificial intelligence that allows"
                " computers to learn from data without being explicitly programmed."
            ],
        }
    )

    eval_dataset = types.EvaluationDataset(
        eval_dataset_df=prompts_df,
        candidate_name="gemini-2.5-flash",
    )

    # Set judge_model, which should be ignored for predefined metrics
    metric = types.Metric(
        name="safety_v1",
        judge_model="projects/model-evaluation-dev/locations/us-central1/publishers/google/models/gemini-2.5-flash",
    )

    evaluation_result = client.evals.evaluate(
        dataset=eval_dataset,
        metrics=[metric],
    )

    assert isinstance(evaluation_result, types.EvaluationResult)
    # Truthiness covers both None and an empty list, so the index below can
    # only fail on the metric-name comparison, not with an IndexError.
    assert evaluation_result.summary_metrics, "expected at least one summary metric"
    assert evaluation_result.summary_metrics[0].metric_name == "safety_v1"
140+
141+
109142
def test_multi_turn_predefined_metric(client):
110143
"""Tests that evaluate works with multi-turn predefined metrics."""
111144
prompts_data = {

vertexai/_genai/_evals_metric_handlers.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -972,6 +972,17 @@ def __init__(self, module: "evals.Evals", metric: types.Metric):
972972
raise ValueError(
973973
f"Metric '{self.metric.name}' is not a supported predefined metric."
974974
)
975+
if (
976+
self.metric.judge_model
977+
or self.metric.judge_model_generation_config
978+
or self.metric.judge_model_sampling_count
979+
):
980+
logger.warning(
981+
"Autorater config settings (judge_model, "
982+
"judge_model_generation_config, judge_model_sampling_count) "
983+
"are ignored for predefined metric '%s'.",
984+
self.metric.name,
985+
)
975986

976987
def _build_request_payload(
977988
self, eval_case: types.EvalCase, response_index: int

vertexai/_genai/types/common.py

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2409,7 +2409,8 @@ class EvaluationRunConfig(_common.BaseModel):
24092409
default=None, description="""The output config for the evaluation run."""
24102410
)
24112411
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
2412-
default=None, description="""The autorater config for the evaluation run."""
2412+
default=None,
2413+
description="""The autorater config for the evaluation run. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
24132414
)
24142415
prompt_template: Optional[EvaluationRunPromptTemplate] = Field(
24152416
default=None, description="""The prompt template used for inference."""
@@ -2439,7 +2440,7 @@ class EvaluationRunConfigDict(TypedDict, total=False):
24392440
"""The output config for the evaluation run."""
24402441

24412442
autorater_config: Optional[genai_types.AutoraterConfigDict]
2442-
"""The autorater config for the evaluation run."""
2443+
"""The autorater config for the evaluation run. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
24432444

24442445
prompt_template: Optional[EvaluationRunPromptTemplateDict]
24452446
"""The prompt template used for inference."""
@@ -4746,7 +4747,8 @@ class _EvaluateInstancesRequestParameters(_common.BaseModel):
47464747
default=None, description=""""""
47474748
)
47484749
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
4749-
default=None, description=""""""
4750+
default=None,
4751+
description="""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
47504752
)
47514753
metrics: Optional[list[Metric]] = Field(
47524754
default=None,
@@ -4797,7 +4799,7 @@ class _EvaluateInstancesRequestParametersDict(TypedDict, total=False):
47974799
""""""
47984800

47994801
autorater_config: Optional[genai_types.AutoraterConfigDict]
4800-
""""""
4802+
"""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
48014803

48024804
metrics: Optional[list[MetricDict]]
48034805
"""The metrics used for evaluation.
@@ -19028,7 +19030,8 @@ class EvaluateDatasetRequestParameters(_common.BaseModel):
1902819030
default=None, description=""""""
1902919031
)
1903019032
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
19031-
default=None, description=""""""
19033+
default=None,
19034+
description="""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored.""",
1903219035
)
1903319036
config: Optional[EvaluateDatasetConfig] = Field(default=None, description="""""")
1903419037

@@ -19046,7 +19049,7 @@ class EvaluateDatasetRequestParametersDict(TypedDict, total=False):
1904619049
""""""
1904719050

1904819051
autorater_config: Optional[genai_types.AutoraterConfigDict]
19049-
""""""
19052+
"""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""
1905019053

1905119054
config: Optional[EvaluateDatasetConfigDict]
1905219055
""""""

0 commit comments

Comments
 (0)