Skip to content

Commit a9394cc

Browse files
ci: implement multi-model evaluation support by replacing static run configs with dynamic discovery and adding Claude/Gemini-specific configurations.
1 parent 69c0c82 commit a9394cc

6 files changed

Lines changed: 126 additions & 21 deletions

File tree

cloudbuild.yaml

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,8 @@ steps:
4545
PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title')
4646
4747
# Check if execution labels are present using exact matching via jq
48-
if ! jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals")' pr_data.json > /dev/null; then
49-
echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution."
48+
if ! jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals" or .name == "ci:run-evals-gemini" or .name == "ci:run-evals-claude")' pr_data.json > /dev/null; then
49+
echo "PR does not have required labels. Skipping execution."
5050
exit 0
5151
fi
5252
echo "Execution label detected. Processing release version context..."
@@ -72,7 +72,6 @@ steps:
7272
export GOOGLE_CLOUD_PROJECT=$PROJECT_ID
7373
export EVAL_REPORTING_PROJECT=$_EVAL_REPORTING_PROJECT
7474
75-
7675
# Set environment variables for extension
7776
export CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID
7877
export CLOUD_SQL_POSTGRES_INSTANCE=$_CLOUD_SQL_INSTANCE
@@ -84,18 +83,28 @@ steps:
8483
# Maps the decrypted DB_PASSWORD to the exact variable expected by gemini_cli and extension skills
8584
export CLOUD_SQL_POSTGRES_PASSWORD=$$DB_PASSWORD
8685
87-
# Combine CI metadata with run config
88-
cat /workspace/evals/ci_metadata.yaml >> /workspace/evals/run_config.yaml
86+
# Combine CI metadata with all available run configs
87+
for config in /workspace/evals/*run_config.yaml; do
88+
if [ -f "$config" ]; then
89+
echo "Appending CI metadata to $config"
90+
cat /workspace/evals/ci_metadata.yaml >> "$config"
91+
fi
92+
done
8993
90-
# Substitute environment variables in model_config.yaml
94+
# Substitute environment variables in every .yaml and .json file under /workspace/evals
9195
python3 /workspace/evals/substitute_env.py
9296
9397
cd /evalbench
9498
export PYTHONPATH=./evalbench:./evalbench/evalproto
9599
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
96100
97-
echo "Launching Standalone Evaluation..."
98-
python3 evalbench/evalbench.py --experiment_config=/workspace/evals/run_config.yaml
101+
# Run evaluations for all available run configs
102+
for config in /workspace/evals/*run_config.yaml; do
103+
if [ -f "$config" ]; then
104+
echo "Launching Evaluation for config: $config"
105+
python3 evalbench/evalbench.py --experiment_config="$config"
106+
fi
107+
done
99108
100109
101110
availableSecrets:

evals/claude_code_model.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
claude_code_version: "@anthropic-ai/claude-code@2.1.85"
2+
generator: claude_code
3+
model: "claude-opus-4-6" # Use "claude-opus-4-20250514" for direct API
4+
5+
use_vertex: true
6+
vertex_project_id: "${GOOGLE_CLOUD_PROJECT}"
7+
vertex_region: "us-central1"
8+
9+
env:
10+
GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
11+
12+
setup:
13+
mcp_servers:
14+
"cloud-sql":
15+
httpUrl: "https://sqladmin.googleapis.com/mcp"
16+
authProviderType: google_credentials
17+
headers:
18+
X-Goog-User-Project: "${GOOGLE_CLOUD_PROJECT}"

evals/claude_run_config.yaml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
dataset_config: /workspace/evals/dataset.json
16+
dataset_format: gemini-cli-format
17+
18+
orchestrator: agent
19+
model_config: /workspace/evals/claude_code_model.yaml
20+
simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml
21+
22+
scorers:
23+
# Qualitative (Judge-based)
24+
goal_completion:
25+
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
26+
behavioral_metrics:
27+
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
28+
skills_best_practices:
29+
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
30+
skills_dir: /workspace/cloud-sql-postgresql/skills
31+
32+
# Performance
33+
turn_count: {}
34+
end_to_end_latency: {}
35+
tool_call_latency: {}
36+
token_consumption: {}
37+
skills_trajectory: {}
38+
39+
reporting:
40+
bigquery:
41+
gcp_project_id: "${EVAL_REPORTING_PROJECT}"

evals/gemini_cli_model.yaml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
gemini_cli_version: "@google/gemini-cli@latest"
16+
generator: gemini_cli
17+
env:
18+
GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
19+
GOOGLE_CLOUD_LOCATION: "global"
20+
GOOGLE_GENAI_USE_VERTEXAI: "true"
21+
GEMINI_CLI_TRUST_WORKSPACE: "true"
22+
setup:
23+
extensions:
24+
# Points to the symlink created in cloudbuild.yaml to match the extension ID
25+
"/workspace/cloud-sql-postgresql":
26+
settings:
27+
CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}"
28+
CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}"
29+
CLOUD_SQL_POSTGRES_REGION: "${CLOUD_SQL_POSTGRES_REGION}"
30+
CLOUD_SQL_POSTGRES_DATABASE: "${CLOUD_SQL_POSTGRES_DATABASE}"
31+
CLOUD_SQL_POSTGRES_USER: "${CLOUD_SQL_POSTGRES_USER}"
32+
CLOUD_SQL_POSTGRES_PASSWORD: '${CLOUD_SQL_POSTGRES_PASSWORD}'
33+
CLOUD_SQL_POSTGRES_IP_TYPE: "${CLOUD_SQL_POSTGRES_IP_TYPE}"
Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ dataset_config: /workspace/evals/dataset.json
1616
dataset_format: gemini-cli-format
1717

1818
orchestrator: geminicli
19-
model_config: /workspace/evals/model_config.yaml
19+
model_config: /workspace/evals/gemini_cli_model.yaml
2020
simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml
2121

2222
scorers:
@@ -39,4 +39,3 @@ scorers:
3939
reporting:
4040
bigquery:
4141
gcp_project_id: "${EVAL_REPORTING_PROJECT}"
42-

evals/substitute_env.py

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,23 @@
11
import os
22
import re
3+
import glob
34

45
def main():
5-
yaml_paths = ['/workspace/evals/model_config.yaml', '/workspace/evals/run_config.yaml', '/workspace/evals/dataset.json']
6-
for yaml_path in yaml_paths:
7-
if os.path.exists(yaml_path):
8-
with open(yaml_path, 'r') as f:
9-
content = f.read()
10-
content = re.sub(r'\${(\w+)}', lambda m: os.environ.get(m.group(1), m.group(0)), content)
11-
with open(yaml_path, 'w') as f:
12-
f.write(content)
13-
print(f"Successfully substituted environment variables in {yaml_path}")
14-
else:
15-
print(f"File not found: {yaml_path}")
6+
# Recursively find all .yaml and .json files under /workspace/evals (including subdirectories)
7+
paths = glob.glob('/workspace/evals/**/*.yaml', recursive=True) + glob.glob('/workspace/evals/**/*.json', recursive=True)
8+
9+
for path in paths:
10+
if os.path.isfile(path):
11+
try:
12+
with open(path, 'r') as f:
13+
content = f.read()
14+
# Substitute ${VAR} with environment variables
15+
content = re.sub(r'\${(\w+)}', lambda m: os.environ.get(m.group(1), m.group(0)), content)
16+
with open(path, 'w') as f:
17+
f.write(content)
18+
print(f"Successfully substituted environment variables in {path}")
19+
except Exception as e:
20+
print(f"Error processing {path}: {e}")
1621

1722
if __name__ == '__main__':
1823
main()

0 commit comments

Comments
 (0)