Skip to content

Commit 5d5fc1c

Browse files
ci: update evaluation configuration with skill tagging, model upgrade to claude-opus-4-7, and restructured run settings
1 parent abae0d3 commit 5d5fc1c

3 files changed

Lines changed: 27 additions & 4 deletions

File tree

evals/claude_code_model.yaml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,18 @@
1414

1515
claude_code_version: "@anthropic-ai/claude-code@2.1.85"
1616
generator: claude_code
17-
model: "claude-opus-4-6"
17+
model: "claude-opus-4-7"
1818

1919
use_vertex: true
2020
vertex_project_id: "${GOOGLE_CLOUD_PROJECT}"
21-
vertex_region: "us-east5"
21+
vertex_region: "global"
2222

2323
env:
24-
CLOUD_ML_REGION: "us-east5"
24+
# Global environment variables
25+
CLOUD_ML_REGION: "global"
2526
GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
27+
28+
# Cloud SQL PostgreSQL extension configuration
2629
CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}"
2730
CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}"
2831
CLOUD_SQL_POSTGRES_REGION: "${CLOUD_SQL_POSTGRES_REGION}"

evals/claude_run_config.yaml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,20 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
# Dataset-Related Configs
1516
dataset_config: /workspace/evals/dataset.json
16-
dataset_format: gemini-cli-format
17+
dataset_format: agent-format
1718

19+
# Orchestrator Configuration
1820
orchestrator: agent
1921
model_config: /workspace/evals/claude_code_model.yaml
2022
simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml
2123

24+
# Runner-Related Configs
25+
runners:
26+
agent_runners: 1
27+
28+
# Scorer-Related Configs
2229
scorers:
2330
# Qualitative (Judge-based)
2431
goal_completion:
@@ -36,6 +43,7 @@ scorers:
3643
token_consumption: {}
3744
skills_trajectory: {}
3845

46+
# Reporting-Related Configs
3947
reporting:
4048
bigquery:
4149
gcp_project_id: "${EVAL_REPORTING_PROJECT}"

evals/dataset.json

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
"list_instances",
99
"get_instance"
1010
],
11+
"expected_skills": [
12+
"cloud-sql-postgres-admin"
13+
],
1114
"env": {
1215
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
1316
},
@@ -22,6 +25,9 @@
2225
"list_schemas",
2326
"list_tables"
2427
],
28+
"expected_skills": [
29+
"cloud-sql-postgres-data"
30+
],
2531
"env": {
2632
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
2733
},
@@ -36,6 +42,9 @@
3642
"list_active_queries",
3743
"list_locks"
3844
],
45+
"expected_skills": [
46+
"cloud-sql-postgres-monitor"
47+
],
3948
"env": {
4049
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
4150
},
@@ -50,6 +59,9 @@
5059
"get_system_metrics",
5160
"list_database_stats"
5261
],
62+
"expected_skills": [
63+
"cloud-sql-postgres-monitor"
64+
],
5365
"env": {
5466
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
5567
},

0 commit comments

Comments (0)