Skip to content
Merged
21 changes: 15 additions & 6 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ steps:
export GOOGLE_CLOUD_PROJECT=$PROJECT_ID
export EVAL_REPORTING_PROJECT=$_EVAL_REPORTING_PROJECT


# Set environment variables for extension
export CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID
export CLOUD_SQL_POSTGRES_INSTANCE=$_CLOUD_SQL_INSTANCE
Expand All @@ -84,18 +83,28 @@ steps:
# Maps the decrypted DB_PASSWORD to the exact variable expected by gemini_cli and extension skills
export CLOUD_SQL_POSTGRES_PASSWORD=$$DB_PASSWORD

# Combine CI metadata with run config
cat /workspace/evals/ci_metadata.yaml >> /workspace/evals/run_config.yaml
# Append the shared CI metadata to every run config in /workspace/evals.
for run_config in /workspace/evals/*run_config.yaml; do
  # Skip anything that is not a regular file (e.g. the unexpanded pattern
  # when no run config exists).
  [ -f "$run_config" ] || continue
  echo "Appending CI metadata to $run_config"
  cat /workspace/evals/ci_metadata.yaml >> "$run_config"
done

# Substitute environment variables in model_config.yaml
# Substitute environment variables in all configs
python3 /workspace/evals/substitute_env.py

cd /evalbench
export PYTHONPATH=./evalbench:./evalbench/evalproto
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

echo "Launching Standalone Evaluation..."
python3 evalbench/evalbench.py --experiment_config=/workspace/evals/run_config.yaml
# Run evaluations for all available run configs.
#
# Every config is attempted even when an earlier one fails; the first non-zero
# exit status is remembered and re-raised once all configs have run. Without
# this, the loop's status is that of the last evaluation only, so an earlier
# failure would be hidden by a later success.
# NOTE(review): variable names are kept lowercase like the existing `$config`;
# presumably uppercase `$NAME` would be treated as a Cloud Build substitution
# (cf. the `$$DB_PASSWORD` escape earlier in this step) - confirm.
eval_status=0
for config in /workspace/evals/*run_config.yaml; do
  # An unmatched glob is left as the literal pattern; skip it.
  [ -f "$config" ] || continue
  echo "Launching Evaluation for config: $config"
  python3 evalbench/evalbench.py --experiment_config="$config" || {
    rc=$?
    echo "Evaluation failed for config: $config (exit status $rc)" >&2
    # Keep the first failure so the reported status points at the root cause.
    if [ "$eval_status" -eq 0 ]; then
      eval_status=$rc
    fi
  }
done
# Fail the build step if any evaluation failed.
if [ "$eval_status" -ne 0 ]; then
  exit "$eval_status"
fi


availableSecrets:
Expand Down
35 changes: 35 additions & 0 deletions evals/claude_code_model.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Model config referenced by claude_run_config.yaml (`model_config`).
claude_code_version: "@anthropic-ai/claude-code@2.1.85"
generator: claude_code
model: "claude-opus-4-6"

# ${GOOGLE_CLOUD_PROJECT} is replaced with the environment variable of that
# name by evals/substitute_env.py; it is left as-is if the variable is unset.
use_vertex: true
vertex_project_id: "${GOOGLE_CLOUD_PROJECT}"
vertex_region: "us-east5"
Comment thread
omkargaikwad23 marked this conversation as resolved.
Outdated

# Every ${NAME} below is replaced textually with the environment variable NAME
# by evals/substitute_env.py; unset variables leave the placeholder untouched.
# NOTE(review): presumably these entries become the agent's process
# environment - confirm against the claude_code generator.
env:
# NOTE(review): same value as vertex_region above; presumably the two must be
# kept in sync - confirm.
CLOUD_ML_REGION: "us-east5"
GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}"
CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}"
CLOUD_SQL_POSTGRES_REGION: "${CLOUD_SQL_POSTGRES_REGION}"
CLOUD_SQL_POSTGRES_DATABASE: "${CLOUD_SQL_POSTGRES_DATABASE}"
CLOUD_SQL_POSTGRES_USER: "${CLOUD_SQL_POSTGRES_USER}"
# cloudbuild.yaml exports this variable from the decrypted DB_PASSWORD. The
# value is inserted verbatim between the single quotes, so a password that
# contains a single quote would presumably produce invalid YAML - TODO confirm.
CLOUD_SQL_POSTGRES_PASSWORD: '${CLOUD_SQL_POSTGRES_PASSWORD}'
CLOUD_SQL_POSTGRES_IP_TYPE: "${CLOUD_SQL_POSTGRES_IP_TYPE}"

setup:
# NOTE(review): claude_run_config.yaml points its skills_best_practices scorer
# at the `skills` subdirectory of this path - confirm which level is expected.
skills_dir: "/workspace/cloud-sql-postgresql"
41 changes: 41 additions & 0 deletions evals/claude_run_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Run config for the Claude Code evaluation. cloudbuild.yaml appends
# ci_metadata.yaml to every /workspace/evals/*run_config.yaml (this file
# included) and then launches evalbench once per such file.
dataset_config: /workspace/evals/dataset.json
dataset_format: gemini-cli-format

orchestrator: agent
# Model being evaluated.
model_config: /workspace/evals/claude_code_model.yaml
# Model config used for the simulated user.
simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml

scorers:
# Qualitative (Judge-based)
# Each judge-based scorer below is given gemini_2.5_pro_model.yaml as its
# model config.
goal_completion:
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
behavioral_metrics:
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
skills_best_practices:
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
skills_dir: /workspace/cloud-sql-postgresql/skills

# Performance
# These scorers take no options here (empty mappings).
turn_count: {}
end_to_end_latency: {}
tool_call_latency: {}
token_consumption: {}
skills_trajectory: {}

reporting:
bigquery:
# ${EVAL_REPORTING_PROJECT} is exported in cloudbuild.yaml and filled in by
# evals/substitute_env.py before evalbench runs.
gcp_project_id: "${EVAL_REPORTING_PROJECT}"
33 changes: 33 additions & 0 deletions evals/gemini_cli_model.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Model config referenced by gemini_run_config.yaml (`model_config`).
# NOTE(review): `@latest` is unpinned, whereas claude_code_model.yaml pins an
# exact claude-code version; results may differ between runs as new CLI
# releases ship - consider pinning.
gemini_cli_version: "@google/gemini-cli@latest"
generator: gemini_cli
# ${NAME} placeholders in this file are replaced textually with the matching
# environment variable by evals/substitute_env.py; unset variables leave the
# placeholder untouched.
env:
GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
GOOGLE_CLOUD_LOCATION: "global"
GOOGLE_GENAI_USE_VERTEXAI: "true"
GEMINI_CLI_TRUST_WORKSPACE: "true"
setup:
extensions:
# Points to the symlink created in cloudbuild.yaml to match the extension ID
"/workspace/cloud-sql-postgresql":
settings:
CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}"
CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}"
CLOUD_SQL_POSTGRES_REGION: "${CLOUD_SQL_POSTGRES_REGION}"
CLOUD_SQL_POSTGRES_DATABASE: "${CLOUD_SQL_POSTGRES_DATABASE}"
CLOUD_SQL_POSTGRES_USER: "${CLOUD_SQL_POSTGRES_USER}"
# cloudbuild.yaml exports this variable from the decrypted DB_PASSWORD. The
# value is inserted verbatim between the single quotes, so a password that
# contains a single quote would presumably produce invalid YAML - TODO confirm.
CLOUD_SQL_POSTGRES_PASSWORD: '${CLOUD_SQL_POSTGRES_PASSWORD}'
CLOUD_SQL_POSTGRES_IP_TYPE: "${CLOUD_SQL_POSTGRES_IP_TYPE}"
3 changes: 1 addition & 2 deletions evals/run_config.yaml → evals/gemini_run_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ dataset_config: /workspace/evals/dataset.json
dataset_format: gemini-cli-format

orchestrator: geminicli
model_config: /workspace/evals/model_config.yaml
model_config: /workspace/evals/gemini_cli_model.yaml
simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml

scorers:
Expand All @@ -39,4 +39,3 @@ scorers:
reporting:
bigquery:
gcp_project_id: "${EVAL_REPORTING_PROJECT}"

27 changes: 16 additions & 11 deletions evals/substitute_env.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,23 @@
import os
import re
import glob

def main():
    """Expand ``${VAR}`` placeholders in the eval configs under /workspace/evals.

    Every ``.yaml`` and ``.json`` file below /workspace/evals (searched
    recursively) is processed in place: each ``${NAME}`` is replaced by the
    value of the environment variable ``NAME``. Placeholders whose variable is
    not set are left untouched, and a file is only rewritten when its content
    actually changes.

    Processing is best-effort: a file that cannot be read, decoded or written
    is reported on stderr and skipped, and the remaining files are still
    processed.
    """
    import sys  # local import: only needed for error reporting

    # Compiled once; braces are escaped so the pattern is unambiguous.
    placeholder = re.compile(r'\$\{(\w+)\}')

    def expand(match):
        # Fall back to the original ``${NAME}`` text when NAME is not set.
        return os.environ.get(match.group(1), match.group(0))

    # Find all .yaml and .json files in /workspace/evals
    paths = (
        glob.glob('/workspace/evals/**/*.yaml', recursive=True)
        + glob.glob('/workspace/evals/**/*.json', recursive=True)
    )

    for path in paths:
        if not os.path.isfile(path):
            continue
        try:
            # Explicit UTF-8 so the result does not depend on the container's
            # locale settings.
            with open(path, 'r', encoding='utf-8') as f:
                original = f.read()
            # Substitute ${VAR} with environment variables
            content = placeholder.sub(expand, original)
            # Avoid truncating and rewriting files that have nothing to expand.
            if content != original:
                with open(path, 'w', encoding='utf-8') as f:
                    f.write(content)
            print(f"Successfully substituted environment variables in {path}")
        except Exception as e:
            print(f"Error processing {path}: {e}", file=sys.stderr)

if __name__ == '__main__':
main()
Loading