Skip to content

Commit 4dd2cf6

Browse files
authored
Replace Gemini 1.5 001 with 002, add example of Gemini 2.5 eval (#185)
1 parent 215bc81 commit 4dd2cf6

12 files changed

Lines changed: 43 additions & 63 deletions

File tree

genai-on-vertex-ai/gemini/model_upgrades/document_qna/promptfoo/dataset.jsonl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22
{"vars": {"document": "file://documents/document_02.txt", "question": "When did Dodge end production of their full size sedans?", "answer": "ANSWER_NOT_FOUND"}}
33
{"vars": {"document": "file://documents/document_03.txt", "question": "How many square miles was the region impacted by the 2010 drought?", "answer": "1160000"}}
44
{"vars": {"document": "file://documents/document_04.txt", "question": "What is the largest Mansion in the west coast?", "answer": "ANSWER_NOT_FOUND"}}
5-
{"vars": {"document": "file://documents/document_05.txt", "question": "Who founded Telnet?", "answer": "Larry Roberts"}}
5+
{"vars": {"document": "file://documents/document_05.txt", "question": "Who founded Telenet?", "answer": "Larry Roberts"}}
66
{"vars": {"document": "file://documents/document_06.txt", "question": "When did the Chinese famine begin?", "answer": "1331"}}

genai-on-vertex-ai/gemini/model_upgrades/document_qna/promptfoo/promptfooconfig.yaml

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,20 @@
22

33
providers: # one or more models, with optional temperature and system instructions
44
- id: "vertex:gemini-1.5-flash"
5-
- id: "vertex:gemini-2.0-flash-lite-001"
5+
- id: "vertex:gemini-2.5-flash-preview-05-20"
6+
label: 2.5-flash-thinking-disabled
7+
config:
8+
generationConfig:
9+
temperature: 0.0
10+
thinking_config:
11+
thinking_budget: 0
12+
- id: "vertex:gemini-2.5-flash-preview-05-20"
13+
label: 2.5-flash-thinking-enabled
14+
config:
15+
generationConfig:
16+
temperature: 0.0
17+
thinking_config:
18+
thinking_budget: 10000
619

720
prompts:
821
- file://prompt_template.txt

genai-on-vertex-ai/gemini/model_upgrades/document_qna/vertex_colab/document_qna_eval.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@
9393
"PROJECT_ID=your-project-id # Google Cloud Project ID\n",
9494
"LOCATION=us-central1 # Region for all required Google Cloud services\n",
9595
"EXPERIMENT_NAME=eval-document-qna # Creates Vertex AI Experiment to track the eval runs\n",
96-
"MODEL_BASELINE=gemini-1.5-flash-001 # Name of your current model\n",
96+
"MODEL_BASELINE=gemini-1.5-flash-002 # Name of your current model\n",
9797
"MODEL_CANDIDATE=gemini-2.0-flash-001 # This model will be compared to the baseline model\n",
9898
"DATASET_URI=\"gs://gemini_assets/document_qna/dataset.jsonl\" # Evaluation dataset in Google Cloud Storage\n",
9999
"PROMPT_TEMPLATE_URI=gs://gemini_assets/document_qna/prompt_template.txt # Text file in Google Cloud Storage"

genai-on-vertex-ai/gemini/model_upgrades/document_qna/vertex_script/eval.py

Lines changed: 1 addition & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -34,43 +34,14 @@ def run_eval(experiment_name: str, baseline_model: str, candidate_model: str, pr
3434
)
3535
print(f"Baseline model score: {baseline_results.summary_metrics['question_answering_quality/mean']*20:.1f}%")
3636
print(f"Candidate model score: {candidate_results.summary_metrics['question_answering_quality/mean']*20:.1f}%")
37-
export_results(baseline_model, baseline_results, candidate_model, candidate_results, f'eval_results_{timestamp}.json')
38-
39-
def export_results(baseline_model: str, baseline_results: EvalResult, candidate_model: str, candidate_results: EvalResult, file_name: str):
40-
'''Export combined results of the two eval runs to a single JSON file that can be visualized in LLM Comparator.'''
41-
with open(file_name, 'w') as f:
42-
f.write(json.dumps(dict(
43-
models=[dict(name=baseline_model), dict(name=candidate_model)],
44-
examples=combine_eval_runs(baseline_results, candidate_results),
45-
metadata={'custom_fields_schema':[]}
46-
)))
47-
print(f"Evaluation results saved to {file_name} in LLM Comparator format: https://pair-code.github.io/llm-comparator/")
48-
49-
def combine_eval_runs(baseline: EvalResult, candidate: EvalResult) -> list[dict]:
50-
'''Combine the evaluation results for the two models and calculate the pairwise score.'''
51-
if None in [baseline, candidate] or len(baseline.metrics_table.index) != len(candidate.metrics_table.index):
52-
raise ValueError(f'Invalid eval results!')
53-
examples = []
54-
for b, c in zip(baseline.metrics_table.to_dict(orient='records'), candidate.metrics_table.to_dict(orient='records')):
55-
score_b = b.get('question_answering_quality/score')
56-
score_c = c.get('question_answering_quality/score')
57-
examples.append(dict(
58-
input_text=b.get('prompt'),
59-
output_text_a=b.get('response').strip(),
60-
output_text_b=c.get('response').strip(),
61-
score = 0 if score_b == score_c else 1.0 if score_b > score_c else -1.0,
62-
tags=[],
63-
individual_rater_scores=[]
64-
))
65-
return examples
6637

6738
if __name__ == '__main__':
6839
if os.getenv('PROJECT_ID', 'your-project-id') == 'your-project-id':
6940
raise ValueError('Please configure your Google Cloud Project ID.')
7041
vertexai.init(project=os.getenv('PROJECT_ID'), location='us-central1')
7142
run_eval(
7243
experiment_name = 'eval-document-qna-demo',
73-
baseline_model = 'gemini-1.5-flash-001',
44+
baseline_model = 'gemini-1.5-flash-002',
7445
candidate_model = 'gemini-2.0-flash-001',
7546
prompt_template_local_path = 'prompt_template.txt',
7647
dataset_local_path = 'dataset.jsonl'

genai-on-vertex-ai/gemini/model_upgrades/instruction_following/promptfoo/dataset.jsonl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,4 @@
66
{"vars":{"prompt": "A colt is 5 feet tall. It will grow 6 inches every month. How tall will it be in 3 years? Think step-by-step, then give your answer. Separate your thinking and the final answer by a line with just three \"*\" symbols: ***\nAt the end of your response, please explicitly add a postscript starting with P.P.S"}}
77
{"vars":{"prompt": "I asked a friend about how to remove rust from my bike chain. He told me to pour coke on it and then scrub it with a steel wool. Is this a good way to remove rust? Respond with at least 20 sentences and have more than 4 words be in all capital letters."}}
88
{"vars":{"prompt": "Compose a startup pitch on a new app called Tipperary that helps people to find the average tip size for each restaurant. Please make the response strongly structured. Wrap your entire output in JSON format."}}
9-
{"vars":{"prompt": "What is the next number in this series: 1, 4, 7, 11, 17? Please answer with only mathematical notation without any commas."}}
109
{"vars":{"prompt": "Translate the following sentence into German and then criticize it: Werner was a good friend of mine, but not very smart.\nAvoid the word \"schlau\" throughout your response."}}

genai-on-vertex-ai/gemini/model_upgrades/instruction_following/vertex_script/eval.py

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -27,26 +27,23 @@ def load_dataset(dataset_local_path: str):
2727
def run_eval(experiment_name: str, baseline_model: str, candidate_model: str, prompt_template_local_path: str, dataset_local_path: str, metric_name: str):
2828
timestamp = f"{datetime.now().strftime('%b-%d-%H-%M-%S')}".lower()
2929
prompt_template = open(prompt_template_local_path).read()
30-
31-
metrics = EvalTask(
32-
dataset=load_dataset(dataset_local_path),
33-
metrics= [
34-
PairwiseMetric(
35-
metric=metric_name,
36-
metric_prompt_template=MetricPromptTemplateExamples.Pairwise.INSTRUCTION_FOLLOWING.metric_prompt_template,
37-
# Baseline model for pairwise comparison
38-
baseline_model=GenerativeModel(baseline_model),
39-
),
40-
],
41-
experiment=experiment_name
42-
).evaluate(
43-
model=GenerativeModel(candidate_model),
44-
prompt_template=prompt_template,
45-
experiment_run_name=f"{timestamp}-{candidate_model.replace('.', '-')}"
46-
)
47-
48-
print("Baseline model win rate:", round(metrics.summary_metrics[f'{metric_name}/baseline_model_win_rate'],3))
49-
print("Candidate model win rate:", round(metrics.summary_metrics[f'{metric_name}/candidate_model_win_rate'],3))
30+
metric = PairwiseMetric(
31+
metric=metric_name,
32+
metric_prompt_template=MetricPromptTemplateExamples.Pairwise.INSTRUCTION_FOLLOWING.metric_prompt_template,
33+
baseline_model=GenerativeModel(baseline_model),
34+
)
35+
task = EvalTask(
36+
dataset=load_dataset(dataset_local_path),
37+
metrics=[metric],
38+
experiment=experiment_name
39+
)
40+
results = task.evaluate(
41+
model=GenerativeModel(candidate_model),
42+
prompt_template=prompt_template,
43+
experiment_run_name=f"{timestamp}-{candidate_model.replace('.', '-')}"
44+
)
45+
print("Baseline model win rate:", round(results.summary_metrics[f'{metric_name}/baseline_model_win_rate'],3))
46+
print("Candidate model win rate:", round(results.summary_metrics[f'{metric_name}/candidate_model_win_rate'],3))
5047

5148
if __name__ == '__main__':
5249
if os.getenv("PROJECT_ID", "your-project-id") == "your-project-id":

genai-on-vertex-ai/gemini/model_upgrades/multiturn_chat/vertex_colab/multiturn_chat_eval.ipynb

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@
8989
"PROJECT_ID=your-project-id # Google Cloud Project ID\n",
9090
"LOCATION=us-central1 # Region for all required Google Cloud services\n",
9191
"EXPERIMENT_NAME=eval-multiturn-chat # Creates Vertex AI Experiment to track the eval runs\n",
92-
"MODEL_BASELINE=gemini-1.5-flash-001 # Name of your current model\n",
92+
"MODEL_BASELINE=gemini-1.5-flash-002 # Name of your current model\n",
9393
"MODEL_CANDIDATE=gemini-2.0-flash-001 # This model will be compared to the baseline model\n",
9494
"DATASET_URI=\"gs://gemini_assets/multiturn_chat/dataset.jsonl\" # Evaluation dataset in Google Cloud Storage\n",
9595
"PROMPT_TEMPLATE_URI=gs://gemini_assets/multiturn_chat/prompt_template.txt # Text file in Google Cloud Storage"
@@ -259,8 +259,8 @@
259259
"accelerator": "GPU",
260260
"colab": {
261261
"gpuType": "L4",
262-
"provenance": [],
263-
"name": "multiturn_chat_eval.ipynb"
262+
"name": "multiturn_chat_eval.ipynb",
263+
"provenance": []
264264
},
265265
"kernelspec": {
266266
"display_name": "Python 3",
@@ -281,4 +281,4 @@
281281
},
282282
"nbformat": 4,
283283
"nbformat_minor": 0
284-
}
284+
}

genai-on-vertex-ai/gemini/model_upgrades/multiturn_chat/vertex_script/eval.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def run_eval(project_id: str, location:str, experiment_name: str, baseline_model
6464
project_id=os.getenv('PROJECT_ID'),
6565
location=os.getenv('LOCATION') or 'us-central1',
6666
experiment_name = 'eval-multiturn-chat',
67-
baseline_model = 'gemini-1.5-flash-001',
67+
baseline_model = 'gemini-1.5-flash-002',
6868
candidate_model = 'gemini-2.0-flash-001',
6969
dataset_local_path = 'dataset.jsonl'
7070
)

genai-on-vertex-ai/gemini/model_upgrades/summarization/vertex_colab/summarization_eval.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@
9393
"PROJECT_ID=your-project-id # Google Cloud Project ID\n",
9494
"LOCATION=us-central1 # Region for all required Google Cloud services\n",
9595
"EXPERIMENT_NAME=eval-summarization # Creates Vertex AI Experiment to track the eval runs\n",
96-
"MODEL_BASELINE=gemini-1.5-flash-001 # Name of your current model\n",
96+
"MODEL_BASELINE=gemini-1.5-flash-002 # Name of your current model\n",
9797
"MODEL_CANDIDATE=gemini-2.0-flash-001 # This model will be compared to the baseline model\n",
9898
"DATASET_URI=\"gs://gemini_assets/summarization/dataset.jsonl\" # Evaluation dataset in Google Cloud Storage\n",
9999
"PROMPT_TEMPLATE_URI=gs://gemini_assets/summarization/prompt_template.txt # Text file in Google Cloud Storage"

genai-on-vertex-ai/gemini/model_upgrades/summarization/vertex_script/eval.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def run_eval(experiment_name: str, baseline_model: str, candidate_model: str, pr
4040
vertexai.init(project=os.getenv("PROJECT_ID"), location='us-central1')
4141
run_eval(
4242
experiment_name = 'eval-summarization-demo',
43-
baseline_model = 'gemini-1.5-flash-001',
43+
baseline_model = 'gemini-1.5-flash-002',
4444
candidate_model = 'gemini-2.0-flash-001',
4545
prompt_template_local_path = 'prompt_template.txt',
4646
dataset_local_path = 'dataset.jsonl'

0 commit comments

Comments (0)