Commit ad36123

vertex-sdk-bot authored and copybara-github committed

feat: Eval SDK: Migrate model call method to genai SDK usage in preview folder
PiperOrigin-RevId: 893099457
1 parent 3de2c1e commit ad36123

File tree: 9 files changed, +205 -14 lines

tests/unit/vertexai/test_evaluation.py

Lines changed: 23 additions & 0 deletions
@@ -2153,6 +2153,29 @@ def test_evaluate_invalid_metrics(self):
         )
         test_eval_task.evaluate()
 
+    @mock.patch("google.genai.Client")
+    def test_evaluate_model_genai(self, mock_client_class):
+        mock_client = mock.MagicMock()
+        mock_client.models.generate_content.return_value = mock.MagicMock(
+            text="test_response"
+        )
+        mock_client_class.return_value = mock_client
+        test_eval_task = EvalTaskPreview(
+            dataset=_TEST_EVAL_DATASET_WITHOUT_RESPONSE,
+            metrics=[PointwisePreview.SUMMARIZATION_QUALITY],
+        )
+        with mock.patch.object(
+            target=gapic_evaluation_services_preview.EvaluationServiceClient,
+            attribute="evaluate_instances",
+            side_effect=_MOCK_SUMMARIZATION_QUALITY_RESULT_PREVIEW,
+        ):
+            test_result = test_eval_task.evaluate(
+                model="gemini-2.5-pro",
+                prompt_template="{instruction} test prompt template {context}",
+            )
+        assert mock_client.models.generate_content.call_count == 2
+        assert "summarization_quality/score" in test_result.metrics_table.columns
+
     def test_evaluate_duplicate_string_metric(self):
         metrics = [
             "exact_match",

tests/unit/vertexai/test_rubric_based_eval.py

Lines changed: 29 additions & 0 deletions
@@ -215,6 +215,35 @@ def test_pointwise_instruction_following_metric(self):
             "rb_instruction_following/raw_outputs",
         ]
 
+    @mock.patch("google.genai.Client")
+    def test_pointwise_instruction_following_metric_genai(self, mock_client_class):
+        import copy
+
+        metric = copy.deepcopy(PredefinedRubricMetrics.Pointwise.INSTRUCTION_FOLLOWING)
+        metric.generation_config.model = "gemini-2.5-pro"
+        mock_client = mock.MagicMock()
+        mock_client.models.generate_content.return_value = mock.MagicMock(
+            text="""```json{"questions": ["test_rubric"]}```"""
+        )
+        mock_client_class.return_value = mock_client
+        with mock.patch.object(
+            target=gapic_evaluation_services.EvaluationServiceClient,
+            attribute="evaluate_instances",
+            side_effect=_MOCK_POINTWISE_RESPONSE,
+        ):
+            eval_result = EvalTask(
+                dataset=_TEST_EVAL_DATASET, metrics=[metric]
+            ).evaluate()
+        assert eval_result.metrics_table.columns.tolist() == [
+            "prompt",
+            "response",
+            "rubrics",
+            "rb_instruction_following/score",
+            "rb_instruction_following/rubric_verdict_pairs",
+            "rb_instruction_following/raw_outputs",
+        ]
+        assert mock_client.models.generate_content.call_count == 3
+
     def test_pairwise_instruction_following_metric(self):
         metric = PredefinedRubricMetrics.Pairwise.INSTRUCTION_FOLLOWING
         mock_model = mock.create_autospec(

vertexai/preview/evaluation/_evaluation.py

Lines changed: 15 additions & 1 deletion
@@ -74,7 +74,7 @@
 ]
 
 _RunnableType = Union[reasoning_engines.Queryable, Callable[[str], Dict[str, str]]]
-_ModelType = Union[generative_models.GenerativeModel, Callable[[str], str]]
+_ModelType = Union[str, generative_models.GenerativeModel, Callable[[str], str]]
 
 
 def _validate_metrics(metrics: List[Union[str, metrics_base._Metric]]) -> None:
@@ -399,6 +399,11 @@ def _run_model_inference(
     if constants.Dataset.PROMPT_COLUMN in evaluation_run_config.dataset.columns:
         t1 = time.perf_counter()
         if isinstance(model, generative_models.GenerativeModel):
+            _LOGGER.warning(
+                "vertexai.generative_models.GenerativeModel is deprecated for "
+                "evaluation and will be removed in June 2026. Please pass a "
+                "string model name instead."
+            )
             responses = _pre_eval_utils._generate_responses_from_gemini_model(
                 model, evaluation_run_config.dataset
             )
@@ -407,6 +412,15 @@ def _run_model_inference(
                 evaluation_run_config,
                 is_baseline_model,
             )
+        elif isinstance(model, str):
+            responses = _pre_eval_utils._generate_responses_from_genai_model(
+                model, evaluation_run_config.dataset
+            )
+            _pre_eval_utils.populate_eval_dataset_with_model_responses(
+                responses,
+                evaluation_run_config,
+                is_baseline_model,
+            )
         elif callable(model):
             responses = _pre_eval_utils._generate_response_from_custom_model_fn(
                 model, evaluation_run_config.dataset

vertexai/preview/evaluation/_pre_eval_utils.py

Lines changed: 103 additions & 0 deletions
@@ -21,6 +21,8 @@
 from concurrent import futures
 from typing import Callable, Optional, Set, TYPE_CHECKING, Union, List
 
+from google import genai
+from google.cloud import aiplatform
 from google.cloud.aiplatform import base
 from google.cloud.aiplatform_v1beta1.types import (
     content as gapic_content_types,
@@ -70,6 +72,107 @@ def _assemble_prompt(
     )
 
 
+def _generate_content_text_response_genai(
+    model: str, client: genai.Client, prompt: str, max_retries: int = 3
+) -> str:
+    """Generates a text response from a Gemini model for a text prompt, with retries, using the genai module.
+
+    Args:
+        model: The model name string.
+        client: The genai client instance.
+        prompt: The prompt to send to the model.
+        max_retries: Maximum number of retries for response generation.
+
+    Returns:
+        The text response from the model, or constants.RESPONSE_ERROR if
+        there is still an error after all retries.
+    """
+    for retry_attempt in range(max_retries):
+        try:
+            response = client.models.generate_content(
+                model=model,
+                contents=prompt,
+            )
+            # The new SDK raises exceptions on blocked content instead of returning
+            # block_reason directly, so if it succeeds, we can return the text.
+            if response.text:
+                return response.text
+            else:
+                _LOGGER.warning(
+                    "The model response was empty or blocked.\n"
+                    f"Prompt: {prompt}.\n"
+                    f"Retry attempt: {retry_attempt + 1}/{max_retries}"
+                )
+        except Exception as e:  # pylint: disable=broad-except
+            error_message = (
+                f"Failed to generate response candidates from GenAI model "
+                f"{model}.\n"
+                f"Error: {e}.\n"
+                f"Prompt: {prompt}.\n"
+                f"Retry attempt: {retry_attempt + 1}/{max_retries}"
+            )
+            _LOGGER.warning(error_message)
+        if retry_attempt < max_retries - 1:
+            _LOGGER.info(
+                f"Retrying response generation for prompt: {prompt}, attempt "
+                f"{retry_attempt + 1}/{max_retries}..."
+            )
+
+    final_error_message = (
+        f"Failed to generate response from GenAI model {model}.\n"
+        f"Prompt: {prompt}."
+    )
+    _LOGGER.warning(final_error_message)
+    return constants.RESPONSE_ERROR
+
+
+def _generate_responses_from_genai_model(
+    model: str,
+    df: "pd.DataFrame",
+    rubric_generation_prompt_template: Optional[str] = None,
+) -> List[str]:
+    """Generates responses via the Google GenAI SDK for the given evaluation dataset."""
+    _LOGGER.info(
+        f"Generating a total of {df.shape[0]} "
+        f"responses from Google GenAI model {model}."
+    )
+    tasks = []
+    client = genai.Client(
+        vertexai=True,
+        project=aiplatform.initializer.global_config.project,
+        location=aiplatform.initializer.global_config.location,
+    )
+
+    with tqdm(total=len(df)) as pbar:
+        with futures.ThreadPoolExecutor(max_workers=constants.MAX_WORKERS) as executor:
+            for idx, row in df.iterrows():
+                if rubric_generation_prompt_template:
+                    input_columns = prompt_template_base.PromptTemplate(
+                        rubric_generation_prompt_template
+                    ).variables
+                    if multimodal_utils.is_multimodal_instance(
+                        row[list(input_columns)].to_dict()
+                    ):
+                        prompt = multimodal_utils._assemble_multi_modal_prompt(
+                            rubric_generation_prompt_template, row, idx, input_columns
+                        )
+                    else:
+                        prompt = _assemble_prompt(
+                            row, rubric_generation_prompt_template
+                        )
+                else:
+                    prompt = row[constants.Dataset.PROMPT_COLUMN]
+                task = executor.submit(
+                    _generate_content_text_response_genai,
+                    prompt=prompt,
+                    model=model,
+                    client=client,
+                )
+                task.add_done_callback(lambda _: pbar.update(1))
+                tasks.append(task)
+            responses = [future.result() for future in tasks]
+    return responses
+
+
 def _generate_content_text_response(
     model: generative_models.GenerativeModel, prompt: str, max_attempts: int = 3
 ) -> str:
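The helper added above wraps the following google-genai call in a bounded retry loop and fans it out over a thread pool. A stripped-down sketch of that underlying call; the project, location, model, and prompt are illustrative:

from google import genai

# Assumes application-default credentials; project and location are placeholders.
client = genai.Client(vertexai=True, project="my-project", location="us-central1")

response = client.models.generate_content(
    model="gemini-2.5-pro",
    contents="Summarize: ...",
)
# Empty or blocked responses surface as falsy `response.text` or raised
# exceptions, which is what the retry loop above handles.
print(response.text)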

vertexai/preview/evaluation/eval_task.py

Lines changed: 7 additions & 1 deletion
@@ -63,7 +63,7 @@
 GenerativeModel = generative_models.GenerativeModel
 
 _RunnableType = Union[reasoning_engines.Queryable, Callable[[str], Dict[str, str]]]
-_ModelType = Union[generative_models.GenerativeModel, Callable[[str], str]]
+_ModelType = Union[str, generative_models.GenerativeModel, Callable[[str], str]]
 
 
 class EvalTask:
@@ -579,6 +579,12 @@ def _log_eval_experiment_param(
                 for category, threshold in safety_settings.items()
             }
             eval_metadata.update(safety_settings_as_str)
+        elif isinstance(model, str):
+            eval_metadata.update(
+                {
+                    "model_name": model,
+                }
+            )
 
         if runnable:
             if isinstance(runnable, reasoning_engines.LangchainAgent):

vertexai/preview/evaluation/metric_utils.py

Lines changed: 2 additions & 2 deletions
@@ -185,7 +185,7 @@ def _parse_required_inputs(
 
 def load(
     file_path: str,
-    baseline_model: Optional[Union[GenerativeModel, Callable[[str], str]]] = None,
+    baseline_model: Optional[Union[str, GenerativeModel, Callable[[str], str]]] = None,
 ) -> Union[PointwiseMetric, PairwiseMetric, RubricBasedMetric]:
     """Loads a metric object from a YAML file.
@@ -206,7 +206,7 @@ def loads(
 
 def loads(
     yaml_data: str,
-    baseline_model: Optional[Union[GenerativeModel, Callable[[str], str]]] = None,
+    baseline_model: Optional[Union[str, GenerativeModel, Callable[[str], str]]] = None,
 ) -> Union[PointwiseMetric, PairwiseMetric, RubricBasedMetric]:
     """Loads a metric object from YAML data.
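With the widened signatures, YAML-defined metrics can name a baseline model as a string. A sketch of loading one this way; the file name (and the YAML it would contain) is hypothetical:

from vertexai.preview.evaluation import metric_utils

# "my_pairwise_metric.yaml" is a hypothetical metric definition file.
metric = metric_utils.load(
    file_path="my_pairwise_metric.yaml",
    baseline_model="gemini-2.5-pro",  # a string is now accepted alongside GenerativeModel
)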

vertexai/preview/evaluation/metrics/_base.py

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@
 )
 
 
-_ModelType = Union[generative_models.GenerativeModel, Callable[[str], str]]
+_ModelType = Union[str, generative_models.GenerativeModel, Callable[[str], str]]
 
 
 class _Metric(abc.ABC):

vertexai/preview/evaluation/metrics/pairwise_metric.py

Lines changed: 12 additions & 3 deletions
@@ -26,10 +26,13 @@
 from vertexai.preview.evaluation.metrics import (
     custom_output_config as custom_output_config_class,
 )
+from google.cloud.aiplatform import base
 from vertexai.preview.evaluation.metrics import (
     metric_prompt_template as metric_prompt_template_base,
 )
 
+_LOGGER = base.Logger(__name__)
+
 
 class PairwiseMetric(_base._ModelBasedMetric):  # pylint: disable=protected-access
     """A Model-based Pairwise Metric.
@@ -64,8 +67,8 @@ class PairwiseMetric(_base._ModelBasedMetric):  # pylint: disable=protected-access
     Usage Examples:
 
     ```
-    baseline_model = GenerativeModel("gemini-1.0-pro")
-    candidate_model = GenerativeModel("gemini-1.5-pro")
+    baseline_model = GenerativeModel("gemini-2.5-pro")
+    candidate_model = GenerativeModel("gemini-2.5-flash")
 
     pairwise_groundedness = PairwiseMetric(
         metric_prompt_template=MetricPromptTemplateExamples.get_prompt_template(
@@ -96,7 +99,7 @@
             metric_prompt_template_base.PairwiseMetricPromptTemplate, str
         ],
         baseline_model: Optional[
-            Union[generative_models.GenerativeModel, Callable[[str], str]]
+            Union[str, generative_models.GenerativeModel, Callable[[str], str]]
         ] = None,
         system_instruction: Optional[str] = None,
         autorater_config: Optional[gapic_eval_service_types.AutoraterConfig] = None,
@@ -124,6 +127,12 @@
             autorater_config=autorater_config,
             custom_output_config=custom_output_config,
         )
+        if isinstance(baseline_model, generative_models.GenerativeModel):
+            _LOGGER.warning(
+                "vertexai.generative_models.GenerativeModel is deprecated for "
+                "evaluation and will be removed in June 2026. Please pass a "
+                "string model name instead."
+            )
         self._baseline_model = baseline_model
 
     @property
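Given the widened baseline_model type, the docstring's usage example can also be written without GenerativeModel. A sketch; the "pairwise_groundedness" example-template name and the model choice are illustrative assumptions:

from vertexai.preview.evaluation import MetricPromptTemplateExamples
from vertexai.preview.evaluation.metrics import PairwiseMetric

pairwise_groundedness = PairwiseMetric(
    metric_prompt_template=MetricPromptTemplateExamples.get_prompt_template(
        "pairwise_groundedness"  # illustrative example-template name
    ),
    baseline_model="gemini-2.5-pro",  # plain string; no deprecation warning logged
)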

vertexai/preview/evaluation/metrics/rubric_based_metric.py

Lines changed: 13 additions & 6 deletions
@@ -30,7 +30,7 @@
 if TYPE_CHECKING:
     import pandas as pd
 
-_DEFAULT_MODEL_NAME = "gemini-2.0-flash-001"
+_DEFAULT_MODEL_NAME = "gemini-2.5-pro"
 _LOGGER = base.Logger(__name__)
 
 
@@ -73,11 +73,18 @@ def generate_rubrics(
             )
             return eval_dataset
 
-        responses = _pre_eval_utils._generate_responses_from_gemini_model(
-            model,
-            eval_dataset,
-            self.generation_config.prompt_template,
-        )
+        if isinstance(model, str):
+            responses = _pre_eval_utils._generate_responses_from_genai_model(
+                model,
+                eval_dataset,
+                self.generation_config.prompt_template,
+            )
+        else:
+            responses = _pre_eval_utils._generate_responses_from_gemini_model(
+                model,
+                eval_dataset,
+                self.generation_config.prompt_template,
+            )
         if self.generation_config.parsing_fn:
             parsing_fn = self.generation_config.parsing_fn
         else:
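Because _DEFAULT_MODEL_NAME is now a string, predefined rubric metrics generate rubrics through the genai path by default. A brief sketch mirroring the unit test above; the import path for PredefinedRubricMetrics follows the tests and is assumed:

import copy

from vertexai.preview.evaluation.metrics import PredefinedRubricMetrics

# Deep-copy so the shared predefined metric object is not mutated.
metric = copy.deepcopy(PredefinedRubricMetrics.Pointwise.INSTRUCTION_FOLLOWING)
metric.generation_config.model = "gemini-2.5-pro"  # a string model routes rubric generation through genai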
