googleapis
diff --git a/agentplatform/_genai/_evals_metric_handlers.py b/agentplatform/_genai/_evals_metric_handlers.py
Lines changed: 76 additions & 64 deletions
@@ -960,88 +960,100 @@ def aggregate(
 
 
 class PredefinedMetricHandler(MetricHandler[types.Metric]):
-    """Metric handler for predefined metrics."""
+  """Metric handler for predefined metrics."""
 
-    @property
-    def metric_name(self) -> str:
-        return self.metric.name or "unknown_metric"
+  @property
+  def metric_name(self) -> str:
+    return self.metric.name or "unknown_metric"
 
-    def __init__(self, module: "evals.Evals", metric: types.Metric):
-        super().__init__(module=module, metric=metric)
-        if self.metric.name not in _evals_constant.SUPPORTED_PREDEFINED_METRICS:
-            raise ValueError(
+  def __init__(self, module: "evals.Evals", metric: types.Metric):
+    super().__init__(module=module, metric=metric)
+    if self.metric.name not in _evals_constant.SUPPORTED_PREDEFINED_METRICS:
+      raise ValueError(
                 f"Metric '{self.metric.name}' is not a supported predefined metric."
             )
-
-    def _build_request_payload(
-        self, eval_case: types.EvalCase, response_index: int
-    ) -> dict[str, Any]:
-        """Builds the request parameters for evaluate instances request."""
-        response_content = _get_response_from_eval_case(
+    if (
+        self.metric.judge_model
+        or self.metric.judge_model_generation_config
+        or self.metric.judge_model_sampling_count
+    ):
+      logger.warning(
+          "Autorater config settings (judge_model, "
+          "judge_model_generation_config, judge_model_sampling_count) "
+          "are ignored for predefined metric '%s'.",
+          self.metric.name,
+      )
+
+  def _build_request_payload(
+      self, eval_case: types.EvalCase, response_index: int
+  ) -> dict[str, Any]:
+    """Builds the request parameters for evaluate instances request."""
+    response_content = _get_response_from_eval_case(
             eval_case, response_index, self.metric.name
         )
 
-        if not response_content and not getattr(eval_case, "agent_data", None):
-            raise ValueError(
+    if not response_content and not getattr(eval_case, "agent_data", None):
+      raise ValueError(
                 f"Response content missing for candidate {response_index}."
             )
 
-        if self.metric.name == "tool_use_quality_v1":
-            has_tool_call = _has_tool_call(eval_case.intermediate_events)
+    if self.metric.name == "tool_use_quality_v1":
+      has_tool_call = _has_tool_call(eval_case.intermediate_events)
 
-            # Check agent_data for tool calls if intermediate_events is empty
-            agent_data = getattr(eval_case, "agent_data", None)
-            if not has_tool_call and agent_data:
-                for turn in agent_data.turns or []:
-                    if _has_tool_call(turn.events):
-                        has_tool_call = True
-                        break
+      # Check agent_data for tool calls if intermediate_events is empty
+      agent_data = getattr(eval_case, "agent_data", None)
+      if not has_tool_call and agent_data:
+        for turn in agent_data.turns or []:
+          if _has_tool_call(turn.events):
+            has_tool_call = True
+            break
 
-            if not has_tool_call:
-                logger.warning(
+      if not has_tool_call:
+        logger.warning(
                     "Metric 'tool_use_quality_v1' requires tool usage in "
                     "'intermediate_events' or 'agent_data', but no tool usage was found for case %s.",
                     eval_case.eval_case_id,
                 )
 
-        extracted_prompt = _get_prompt_from_eval_case(eval_case)
-        prompt_instance_data = None
-        if self.metric.name and self.metric.name.startswith("multi_turn"):
-            prompt_contents = [
+    extracted_prompt = _get_prompt_from_eval_case(eval_case)
+    prompt_instance_data = None
+    if self.metric.name and self.metric.name.startswith("multi_turn"):
+      prompt_contents = [
                 msg.content for msg in (eval_case.conversation_history or [])
             ]
-            if extracted_prompt:
-                prompt_contents.append(extracted_prompt)
-            prompt_instance_data = types.evals.InstanceData(
+      if extracted_prompt:
+        prompt_contents.append(extracted_prompt)
+      prompt_instance_data = types.evals.InstanceData(
                 contents=types.evals.InstanceDataContents(contents=prompt_contents)
             )
 
-        instance_payload = _build_evaluation_instance(
+    instance_payload = _build_evaluation_instance(
             eval_case=eval_case,
             response_content=response_content,
             prompt_instance_data=prompt_instance_data,
         )
 
-        request_payload: dict[str, Any] = {
+    request_payload: dict[str, Any] = {
             "instance": instance_payload,
         }
 
-        autorater_config = _get_autorater_config(self.metric)
-        if autorater_config:
-            request_payload["autorater_config"] = genai_types.AutoraterConfig(
+    autorater_config = _get_autorater_config(self.metric)
+    if autorater_config:
+      request_payload["autorater_config"] = genai_types.AutoraterConfig(
                 **autorater_config
             )
-        return request_payload
 
-    @override
-    def get_metric_result(
+    return request_payload
+
+  @override
+  def get_metric_result(
         self, eval_case: types.EvalCase, response_index: int
     ) -> types.EvalCaseMetricResult:
-        """Processes a single evaluation case for a specific predefined metric."""
-        metric_name = self.metric.name
-        try:
-            payload = self._build_request_payload(eval_case, response_index)
-            api_response = _call_with_retry(
+    """Processes a single evaluation case for a specific predefined metric."""
+    metric_name = self.metric.name
+    try:
+      payload = self._build_request_payload(eval_case, response_index)
+      api_response = _call_with_retry(
                 lambda: self.module._evaluate_instances(
                     metrics=[self.metric],
                     instance=payload.get("instance"),
@@ -1050,25 +1062,25 @@ def get_metric_result(
                 metric_name,
             )
 
-            if (
+      if (
                 api_response
                 and hasattr(api_response, "metric_results")
                 and api_response.metric_results
             ):
-                result_data = api_response.metric_results[0]
+        result_data = api_response.metric_results[0]
 
-                error_message = None
-                if result_data.error and getattr(result_data.error, "code"):
-                    error_message = f"Error in metric result: {result_data.error}"
-                return types.EvalCaseMetricResult(
+        error_message = None
+        if result_data.error and getattr(result_data.error, "code"):
+          error_message = f"Error in metric result: {result_data.error}"
+        return types.EvalCaseMetricResult(
                     metric_name=metric_name,
                     score=result_data.score,
                     explanation=result_data.explanation,
                     rubric_verdicts=result_data.rubric_verdicts,
                     error_message=error_message,
                 )
-            else:
-                logger.error(
+      else:
+        logger.error(
                     "Metric results missing in API response for predefined metric '%s'."
                     " API response: %s",
                     metric_name,
@@ -1078,29 +1090,29 @@ def get_metric_result(
                         else "None"
                     ),
                 )
-                return types.EvalCaseMetricResult(
+        return types.EvalCaseMetricResult(
                     metric_name=metric_name,
                     error_message="Metric results missing in API response.",
                 )
-        except Exception as e:  # pylint: disable=broad-exception-caught
-            logger.error(
+    except Exception as e:  # pylint: disable=broad-exception-caught
+      logger.error(
                 "Error processing metric %s for case %s: %s",
                 metric_name,
                 eval_case.eval_case_id,
                 e,
                 exc_info=True,
             )
-            return types.EvalCaseMetricResult(
+      return types.EvalCaseMetricResult(
                 metric_name=metric_name, error_message=str(e)
             )
 
-    @override
-    def aggregate(
+  @override
+  def aggregate(
         self, eval_case_metric_results: list[types.EvalCaseMetricResult]
     ) -> types.AggregatedMetricResult:
-        """Aggregates the metric results for a predefined metric."""
-        logger.debug("Aggregating results for predefined metric: %s", self.metric.name)
-        return _default_aggregate_scores(
+    """Aggregates the metric results for a predefined metric."""
+    logger.debug("Aggregating results for predefined metric: %s", self.metric.name)
+    return _default_aggregate_scores(
             self.metric.name, eval_case_metric_results, calculate_pass_rate=True
         )