Skip to content

Commit 24b2db3

Browse files
ci: implement multi-model evaluation support (#169)
1 parent 69c0c82 commit 24b2db3

8 files changed

Lines changed: 238 additions & 21 deletions

cloudbuild.yaml

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,6 @@ steps:
7272
export GOOGLE_CLOUD_PROJECT=$PROJECT_ID
7373
export EVAL_REPORTING_PROJECT=$_EVAL_REPORTING_PROJECT
7474
75-
7675
# Set environment variables for extension
7776
export CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID
7877
export CLOUD_SQL_POSTGRES_INSTANCE=$_CLOUD_SQL_INSTANCE
@@ -84,18 +83,28 @@ steps:
8483
# Maps the decrypted DB_PASSWORD to the exact variable expected by gemini_cli and extension skills
8584
export CLOUD_SQL_POSTGRES_PASSWORD=$$DB_PASSWORD
8685
87-
# Combine CI metadata with run config
88-
cat /workspace/evals/ci_metadata.yaml >> /workspace/evals/run_config.yaml
86+
# Append CI metadata to every run config matching /workspace/evals/*run_config.yaml
87+
for config in /workspace/evals/*run_config.yaml; do
88+
if [ -f "$config" ]; then
89+
echo "Appending CI metadata to $config"
90+
cat /workspace/evals/ci_metadata.yaml >> "$config"
91+
fi
92+
done
8993
90-
# Substitute environment variables in model_config.yaml
94+
# Substitute environment variables in all configs
9195
python3 /workspace/evals/substitute_env.py
9296
9397
cd /evalbench
9498
export PYTHONPATH=./evalbench:./evalbench/evalproto
9599
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
96100
97-
echo "Launching Standalone Evaluation..."
98-
python3 evalbench/evalbench.py --experiment_config=/workspace/evals/run_config.yaml
101+
# Run evaluations for all available run configs
102+
for config in /workspace/evals/*run_config.yaml; do
103+
if [ -f "$config" ]; then
104+
echo "Launching Evaluation for config: $config"
105+
python3 evalbench/evalbench.py --experiment_config="$config"
106+
fi
107+
done
99108
100109
101110
availableSecrets:

evals/claude_code_model.yaml

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
claude_code_version: "@anthropic-ai/claude-code@2.1.119"
16+
generator: claude_code
17+
model: "claude-opus-4-7"
18+
19+
use_vertex: true
20+
vertex_project_id: "${GOOGLE_CLOUD_PROJECT}"
21+
vertex_region: "global"
22+
23+
env:
24+
# Global environment variables
25+
CLOUD_ML_REGION: "global"
26+
GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
27+
28+
# Cloud SQL PostgreSQL extension configuration
29+
CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}"
30+
CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}"
31+
CLOUD_SQL_POSTGRES_REGION: "${CLOUD_SQL_POSTGRES_REGION}"
32+
CLOUD_SQL_POSTGRES_DATABASE: "${CLOUD_SQL_POSTGRES_DATABASE}"
33+
CLOUD_SQL_POSTGRES_USER: "${CLOUD_SQL_POSTGRES_USER}"
34+
CLOUD_SQL_POSTGRES_PASSWORD: '${CLOUD_SQL_POSTGRES_PASSWORD}'
35+
CLOUD_SQL_POSTGRES_IP_TYPE: "${CLOUD_SQL_POSTGRES_IP_TYPE}"
36+
37+
setup:
38+
skills_dir: "/workspace/cloud-sql-postgresql"

evals/claude_dataset.json

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
{
2+
"scenarios": [
3+
{
4+
"id": "cloud-sql-debug-instance",
5+
"starting_prompt": "Check on my databases in project ${GOOGLE_CLOUD_PROJECT}.",
6+
"conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. Once all instances are listed, if '${CLOUD_SQL_POSTGRES_INSTANCE}' exists, get its details and validate it is RUNNABLE.",
7+
"expected_trajectory": [
8+
"list_instances.js",
9+
"get_instance.js"
10+
],
11+
"expected_skills": [
12+
"cloud-sql-postgres-admin"
13+
],
14+
"env": {
15+
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
16+
},
17+
"kind": "tools",
18+
"max_turns": 3
19+
},
20+
{
21+
"id": "cloud-sql-schema-tables-explore",
22+
"starting_prompt": "I want to understand the structure of my database.",
23+
"conversation_plan": "First, ask the agent to list the schemas in the database. After the agent provides the schemas, ask it to list the tables specifically for the 'public' schema.",
24+
"expected_trajectory": [
25+
"list_schemas.js",
26+
"list_tables.js"
27+
],
28+
"expected_skills": [
29+
"cloud-sql-postgres-data"
30+
],
31+
"env": {
32+
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
33+
},
34+
"kind": "tools",
35+
"max_turns": 3
36+
},
37+
{
38+
"id": "cloud-sql-performance-check",
39+
"starting_prompt": "Our database performance seems degraded.",
40+
"conversation_plan": "Start by asking the agent to check for any active queries that are running for a long time (e.g., more than 10 seconds). After the agent responds, follow up by asking if there are any database locks that might be causing issues.",
41+
"expected_trajectory": [
42+
"list_active_queries.js",
43+
"list_locks.js"
44+
],
45+
"expected_skills": [
46+
"cloud-sql-postgres-monitor"
47+
],
48+
"env": {
49+
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
50+
},
51+
"kind": "tools",
52+
"max_turns": 3
53+
},
54+
{
55+
"id": "cloud-sql-metrics-cpu-investigation",
56+
"starting_prompt": "I'm worried about the database load for ${CLOUD_SQL_POSTGRES_INSTANCE}.",
57+
"conversation_plan": "First, ask the agent to check the CPU utilization for the instance '${CLOUD_SQL_POSTGRES_INSTANCE}' for the last 5 minutes. After the agent provides the CPU data, ask it to check the overall database stats to see connection counts or transaction volume.",
58+
"expected_trajectory": [
59+
"get_system_metrics.js",
60+
"list_database_stats.js"
61+
],
62+
"expected_skills": [
63+
"cloud-sql-postgres-monitor"
64+
],
65+
"env": {
66+
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
67+
},
68+
"kind": "tools",
69+
"max_turns": 3
70+
}
71+
]
72+
}

evals/claude_run_config.yaml

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Dataset Related Configs
16+
dataset_config: /workspace/evals/claude_dataset.json
17+
dataset_format: agent-format
18+
19+
# Orchestrator Configuration
20+
orchestrator: agent
21+
model_config: /workspace/evals/claude_code_model.yaml
22+
simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml
23+
24+
# Runner Related Configs
25+
runners:
26+
agent_runners: 1
27+
28+
# Scorer Related Configs
29+
scorers:
30+
# Qualitative (Judge-based)
31+
goal_completion:
32+
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
33+
behavioral_metrics:
34+
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
35+
skills_best_practices:
36+
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
37+
skills_dir: /workspace/cloud-sql-postgresql/skills
38+
39+
# Performance
40+
turn_count: {}
41+
end_to_end_latency: {}
42+
tool_call_latency: {}
43+
token_consumption: {}
44+
skills_trajectory: {}
45+
46+
# Reporting Related Configs
47+
reporting:
48+
bigquery:
49+
gcp_project_id: "${EVAL_REPORTING_PROJECT}"

evals/gemini_cli_model.yaml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
gemini_cli_version: "@google/gemini-cli@latest"
16+
generator: gemini_cli
17+
env:
18+
GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
19+
GOOGLE_CLOUD_LOCATION: "global"
20+
GOOGLE_GENAI_USE_VERTEXAI: "true"
21+
GEMINI_CLI_TRUST_WORKSPACE: "true"
22+
setup:
23+
extensions:
24+
# Points to the symlink created in cloudbuild.yaml to match the extension ID
25+
"/workspace/cloud-sql-postgresql":
26+
settings:
27+
CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}"
28+
CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}"
29+
CLOUD_SQL_POSTGRES_REGION: "${CLOUD_SQL_POSTGRES_REGION}"
30+
CLOUD_SQL_POSTGRES_DATABASE: "${CLOUD_SQL_POSTGRES_DATABASE}"
31+
CLOUD_SQL_POSTGRES_USER: "${CLOUD_SQL_POSTGRES_USER}"
32+
CLOUD_SQL_POSTGRES_PASSWORD: '${CLOUD_SQL_POSTGRES_PASSWORD}'
33+
CLOUD_SQL_POSTGRES_IP_TYPE: "${CLOUD_SQL_POSTGRES_IP_TYPE}"
Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
"list_instances",
99
"get_instance"
1010
],
11+
"expected_skills": [
12+
"cloud-sql-postgres-admin"
13+
],
1114
"env": {
1215
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
1316
},
@@ -22,6 +25,9 @@
2225
"list_schemas",
2326
"list_tables"
2427
],
28+
"expected_skills": [
29+
"cloud-sql-postgres-data"
30+
],
2531
"env": {
2632
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
2733
},
@@ -36,6 +42,9 @@
3642
"list_active_queries",
3743
"list_locks"
3844
],
45+
"expected_skills": [
46+
"cloud-sql-postgres-monitor"
47+
],
3948
"env": {
4049
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
4150
},
@@ -50,11 +59,14 @@
5059
"get_system_metrics",
5160
"list_database_stats"
5261
],
62+
"expected_skills": [
63+
"cloud-sql-postgres-monitor"
64+
],
5365
"env": {
5466
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
5567
},
5668
"kind": "tools",
5769
"max_turns": 3
5870
}
5971
]
60-
}
72+
}
Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,11 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
dataset_config: /workspace/evals/dataset.json
15+
dataset_config: /workspace/evals/gemini_dataset.json
1616
dataset_format: gemini-cli-format
1717

1818
orchestrator: geminicli
19-
model_config: /workspace/evals/model_config.yaml
19+
model_config: /workspace/evals/gemini_cli_model.yaml
2020
simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml
2121

2222
scorers:
@@ -39,4 +39,3 @@ scorers:
3939
reporting:
4040
bigquery:
4141
gcp_project_id: "${EVAL_REPORTING_PROJECT}"
42-

evals/substitute_env.py

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,23 @@
11
import os
22
import re
3+
import glob
34

45
def main():
5-
yaml_paths = ['/workspace/evals/model_config.yaml', '/workspace/evals/run_config.yaml', '/workspace/evals/dataset.json']
6-
for yaml_path in yaml_paths:
7-
if os.path.exists(yaml_path):
8-
with open(yaml_path, 'r') as f:
9-
content = f.read()
10-
content = re.sub(r'\${(\w+)}', lambda m: os.environ.get(m.group(1), m.group(0)), content)
11-
with open(yaml_path, 'w') as f:
12-
f.write(content)
13-
print(f"Successfully substituted environment variables in {yaml_path}")
14-
else:
15-
print(f"File not found: {yaml_path}")
6+
# Find all .yaml and .json files under /workspace/evals, recursing into subdirectories
7+
paths = glob.glob('/workspace/evals/**/*.yaml', recursive=True) + glob.glob('/workspace/evals/**/*.json', recursive=True)
8+
9+
for path in paths:
10+
if os.path.isfile(path):
11+
try:
12+
with open(path, 'r') as f:
13+
content = f.read()
14+
# Replace each ${VAR} with the value of environment variable VAR; placeholders for unset variables are left unchanged
15+
content = re.sub(r'\${(\w+)}', lambda m: os.environ.get(m.group(1), m.group(0)), content)
16+
with open(path, 'w') as f:
17+
f.write(content)
18+
print(f"Successfully substituted environment variables in {path}")
19+
except Exception as e:
20+
print(f"Error processing {path}: {e}")
1621

1722
if __name__ == '__main__':
1823
main()

0 commit comments

Comments
 (0)