Skip to content

Commit 4dd2cf6

Browse files
authored
Replace Gemini 1.5 001 with 002, add example of Gemini 2.5 eval (#185)
1 parent 215bc81 commit 4dd2cf6

12 files changed

Lines changed: 43 additions & 63 deletions

File tree

genai-on-vertex-ai/gemini/model_upgrades/document_qna/promptfoo/dataset.jsonl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22
{"vars": {"document": "file://documents/document_02.txt", "question": "When did Dodge end production of their full size sedans?", "answer": "ANSWER_NOT_FOUND"}}
33
{"vars": {"document": "file://documents/document_03.txt", "question": "How many square miles was the region impacted by the 2010 drought?", "answer": "1160000"}}
44
{"vars": {"document": "file://documents/document_04.txt", "question": "What is the largest Mansion in the west coast?", "answer": "ANSWER_NOT_FOUND"}}
5-
{"vars": {"document": "file://documents/document_05.txt", "question": "Who founded Telnet?", "answer": "Larry Roberts"}}
5+
{"vars": {"document": "file://documents/document_05.txt", "question": "Who founded Telenet?", "answer": "Larry Roberts"}}
66
{"vars": {"document": "file://documents/document_06.txt", "question": "When did the Chinese famine begin?", "answer": "1331"}}

genai-on-vertex-ai/gemini/model_upgrades/document_qna/promptfoo/promptfooconfig.yaml

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,20 @@
22

33
providers: # one or more models, with optional temperature and system instructions
44
- id: "vertex:gemini-1.5-flash"
5-
- id: "vertex:gemini-2.0-flash-lite-001"
5+
- id: "vertex:gemini-2.5-flash-preview-05-20"
6+
label: 2.5-flash-thinking-disabled
7+
config:
8+
generationConfig:
9+
temperature: 0.0
10+
thinking_config:
11+
thinking_budget: 0
12+
- id: "vertex:gemini-2.5-flash-preview-05-20"
13+
label: 2.5-flash-thinking-enabled
14+
config:
15+
generationConfig:
16+
temperature: 0.0
17+
thinking_config:
18+
thinking_budget: 10000
619

720
prompts:
821
- file://prompt_template.txt

genai-on-vertex-ai/gemini/model_upgrades/document_qna/vertex_colab/document_qna_eval.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@
9393
"PROJECT_ID=your-project-id # Google Cloud Project ID\n",
9494
"LOCATION=us-central1 # Region for all required Google Cloud services\n",
9595
"EXPERIMENT_NAME=eval-document-qna # Creates Vertex AI Experiment to track the eval runs\n",
96-
"MODEL_BASELINE=gemini-1.5-flash-001 # Name of your current model\n",
96+
"MODEL_BASELINE=gemini-1.5-flash-002 # Name of your current model\n",
9797
"MODEL_CANDIDATE=gemini-2.0-flash-001 # This model will be compared to the baseline model\n",
9898
"DATASET_URI=\"gs://gemini_assets/document_qna/dataset.jsonl\" # Evaluation dataset in Google Cloud Storage\n",
9999
"PROMPT_TEMPLATE_URI=gs://gemini_assets/document_qna/prompt_template.txt # Text file in Google Cloud Storage"

genai-on-vertex-ai/gemini/model_upgrades/document_qna/vertex_script/eval.py

Lines changed: 1 addition & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -34,43 +34,14 @@ def run_eval(experiment_name: str, baseline_model: str, candidate_model: str, pr
3434
)
3535
print(f"Baseline model score: {baseline_results.summary_metrics['question_answering_quality/mean']*20:.1f}%")
3636
print(f"Candidate model score: {candidate_results.summary_metrics['question_answering_quality/mean']*20:.1f}%")
37-
export_results(baseline_model, baseline_results, candidate_model, candidate_results, f'eval_results_{timestamp}.json')
38-
39-
def export_results(baseline_model: str, baseline_results: EvalResult, candidate_model: str, candidate_results: EvalResult, file_name: str):
40-
'''Export combined results of the two eval runs to a single JSON file that can be visualized in LLM Comparator.'''
41-
with open(file_name, 'w') as f:
42-
f.write(json.dumps(dict(
43-
models=[dict(name=baseline_model), dict(name=candidate_model)],
44-
examples=combine_eval_runs(baseline_results, candidate_results),
45-
metadata={'custom_fields_schema':[]}
46-
)))
47-
print(f"Evaluation results saved to {file_name} in LLM Comparator format: https://pair-code.github.io/llm-comparator/")
48-
49-
def combine_eval_runs(baseline: EvalResult, candidate: EvalResult) -> list[dict]:
50-
'''Combine the evaluation results for the two models and calculate the pairwise score.'''
51-
if None in [baseline, candidate] or len(baseline.metrics_table.index) != len(candidate.metrics_table.index):
52-
raise ValueError(f'Invalid eval results!')
53-
examples = []
54-
for b, c in zip(baseline.metrics_table.to_dict(orient='records'), candidate.metrics_table.to_dict(orient='records')):
55-
score_b = b.get('question_answering_quality/score')
56-
score_c = c.get('question_answering_quality/score')
57-
examples.append(dict(
58-
input_text=b.get('prompt'),
59-
output_text_a=b.get('response').strip(),
60-
output_text_b=c.get('response').strip(),
61-
score = 0 if score_b == score_c else 1.0 if score_b > score_c else -1.0,
62-
tags=[],
63-
individual_rater_scores=[]
64-
))
65-
return examples
6637

6738
if __name__ == '__main__':
6839
if os.getenv('PROJECT_ID', 'your-project-id') == 'your-project-id':
6940
raise ValueError('Please configure your Google Cloud Project ID.')
7041
vertexai.init(project=os.getenv('PROJECT_ID'), location='us-central1')
7142
run_eval(
7243
experiment_name = 'eval-document-qna-demo',
73-
baseline_model = 'gemini-1.5-flash-001',
44+
baseline_model = 'gemini-1.5-flash-002',
7445
candidate_model = 'gemini-2.0-flash-001',
7546
prompt_template_local_path = 'prompt_template.txt',
7647
dataset_local_path = 'dataset.jsonl'

genai-on-vertex-ai/gemini/model_upgrades/instruction_following/promptfoo/dataset.jsonl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,4 @@
66
{"vars":{"prompt": "A colt is 5 feet tall. It will grow 6 inches every month. How tall will it be in 3 years? Think step-by-step, then give your answer. Separate your thinking and the final answer by a line with just three \"*\" symbols: ***\nAt the end of your response, please explicitly add a postscript starting with P.P.S"}}
77
{"vars":{"prompt": "I asked a friend about how to remove rust from my bike chain. He told me to pour coke on it and then scrub it with a steel wool. Is this a good way to remove rust? Respond with at least 20 sentences and have more than 4 words be in all capital letters."}}
88
{"vars":{"prompt": "Compose a startup pitch on a new app called Tipperary that helps people to find the average tip size for each restaurant. Please make the response strongly structured. Wrap your entire output in JSON format."}}
9-
{"vars":{"prompt": "What is the next number in this series: 1, 4, 7, 11, 17? Please answer with only mathematical notation without any commas."}}
109
{"vars":{"prompt": "Translate the following sentence into German and then criticize it: Werner was a good friend of mine, but not very smart.\nAvoid the word \"schlau\" throughout your response."}}

genai-on-vertex-ai/gemini/model_upgrades/instruction_following/vertex_script/eval.py

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -27,26 +27,23 @@ def load_dataset(dataset_local_path: str):
2727
def run_eval(experiment_name: str, baseline_model: str, candidate_model: str, prompt_template_local_path: str, dataset_local_path: str, metric_name: str):
2828
timestamp = f"{datetime.now().strftime('%b-%d-%H-%M-%S')}".lower()
2929
prompt_template = open(prompt_template_local_path).read()
30-
31-
metrics = EvalTask(
32-
dataset=load_dataset(dataset_local_path),
33-
metrics= [
34-
PairwiseMetric(
35-
metric=metric_name,
36-
metric_prompt_template=MetricPromptTemplateExamples.Pairwise.INSTRUCTION_FOLLOWING.metric_prompt_template,
37-
# Baseline model for pairwise comparison
38-
baseline_model=GenerativeModel(baseline_model),
39-
),
40-
],
41-
experiment=experiment_name
42-
).evaluate(
43-
model=GenerativeModel(candidate_model),
44-
prompt_template=prompt_template,
45-
experiment_run_name=f"{timestamp}-{candidate_model.replace('.', '-')}"
46-
)
47-
48-
print("Baseline model win rate:", round(metrics.summary_metrics[f'{metric_name}/baseline_model_win_rate'],3))
49-
print("Candidate model win rate:", round(metrics.summary_metrics[f'{metric_name}/candidate_model_win_rate'],3))
30+
metric = PairwiseMetric(
31+
metric=metric_name,
32+
metric_prompt_template=MetricPromptTemplateExamples.Pairwise.INSTRUCTION_FOLLOWING.metric_prompt_template,
33+
baseline_model=GenerativeModel(baseline_model),
34+
)
35+
task = EvalTask(
36+
dataset=load_dataset(dataset_local_path),
37+
metrics=[metric],
38+
experiment=experiment_name
39+
)
40+
results = task.evaluate(
41+
model=GenerativeModel(candidate_model),
42+
prompt_template=prompt_template,
43+
experiment_run_name=f"{timestamp}-{candidate_model.replace('.', '-')}"
44+
)
45+
print("Baseline model win rate:", round(results.summary_metrics[f'{metric_name}/baseline_model_win_rate'],3))
46+
print("Candidate model win rate:", round(results.summary_metrics[f'{metric_name}/candidate_model_win_rate'],3))
5047

5148
if __name__ == '__main__':
5249
if os.getenv("PROJECT_ID", "your-project-id") == "your-project-id":

genai-on-vertex-ai/gemini/model_upgrades/multiturn_chat/vertex_colab/multiturn_chat_eval.ipynb

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@
8989
"PROJECT_ID=your-project-id # Google Cloud Project ID\n",
9090
"LOCATION=us-central1 # Region for all required Google Cloud services\n",
9191
"EXPERIMENT_NAME=eval-multiturn-chat # Creates Vertex AI Experiment to track the eval runs\n",
92-
"MODEL_BASELINE=gemini-1.5-flash-001 # Name of your current model\n",
92+
"MODEL_BASELINE=gemini-1.5-flash-002 # Name of your current model\n",
9393
"MODEL_CANDIDATE=gemini-2.0-flash-001 # This model will be compared to the baseline model\n",
9494
"DATASET_URI=\"gs://gemini_assets/multiturn_chat/dataset.jsonl\" # Evaluation dataset in Google Cloud Storage\n",
9595
"PROMPT_TEMPLATE_URI=gs://gemini_assets/multiturn_chat/prompt_template.txt # Text file in Google Cloud Storage"
@@ -259,8 +259,8 @@
259259
"accelerator": "GPU",
260260
"colab": {
261261
"gpuType": "L4",
262-
"provenance": [],
263-
"name": "multiturn_chat_eval.ipynb"
262+
"name": "multiturn_chat_eval.ipynb",
263+
"provenance": []
264264
},
265265
"kernelspec": {
266266
"display_name": "Python 3",
@@ -281,4 +281,4 @@
281281
},
282282
"nbformat": 4,
283283
"nbformat_minor": 0
284-
}
284+
}

genai-on-vertex-ai/gemini/model_upgrades/multiturn_chat/vertex_script/eval.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def run_eval(project_id: str, location:str, experiment_name: str, baseline_model
6464
project_id=os.getenv('PROJECT_ID'),
6565
location=os.getenv('LOCATION') or 'us-central1',
6666
experiment_name = 'eval-multiturn-chat',
67-
baseline_model = 'gemini-1.5-flash-001',
67+
baseline_model = 'gemini-1.5-flash-002',
6868
candidate_model = 'gemini-2.0-flash-001',
6969
dataset_local_path = 'dataset.jsonl'
7070
)

genai-on-vertex-ai/gemini/model_upgrades/summarization/vertex_colab/summarization_eval.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@
9393
"PROJECT_ID=your-project-id # Google Cloud Project ID\n",
9494
"LOCATION=us-central1 # Region for all required Google Cloud services\n",
9595
"EXPERIMENT_NAME=eval-summarization # Creates Vertex AI Experiment to track the eval runs\n",
96-
"MODEL_BASELINE=gemini-1.5-flash-001 # Name of your current model\n",
96+
"MODEL_BASELINE=gemini-1.5-flash-002 # Name of your current model\n",
9797
"MODEL_CANDIDATE=gemini-2.0-flash-001 # This model will be compared to the baseline model\n",
9898
"DATASET_URI=\"gs://gemini_assets/summarization/dataset.jsonl\" # Evaluation dataset in Google Cloud Storage\n",
9999
"PROMPT_TEMPLATE_URI=gs://gemini_assets/summarization/prompt_template.txt # Text file in Google Cloud Storage"

genai-on-vertex-ai/gemini/model_upgrades/summarization/vertex_script/eval.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def run_eval(experiment_name: str, baseline_model: str, candidate_model: str, pr
4040
vertexai.init(project=os.getenv("PROJECT_ID"), location='us-central1')
4141
run_eval(
4242
experiment_name = 'eval-summarization-demo',
43-
baseline_model = 'gemini-1.5-flash-001',
43+
baseline_model = 'gemini-1.5-flash-002',
4444
candidate_model = 'gemini-2.0-flash-001',
4545
prompt_template_local_path = 'prompt_template.txt',
4646
dataset_local_path = 'dataset.jsonl'

0 commit comments

Comments (0)