Skip to content

Commit 9b88419

Browse files
ci: split shared dataset into model-specific configurations and update references
1 parent 9dfca58 commit 9b88419

4 files changed

Lines changed: 75 additions & 3 deletions

File tree

evals/claude_dataset.json

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
{
2+
"scenarios": [
3+
{
4+
"id": "cloud-sql-debug-instance",
5+
"starting_prompt": "Check on my databases in project ${GOOGLE_CLOUD_PROJECT}.",
6+
"conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. Once all instances are listed, if '${CLOUD_SQL_POSTGRES_INSTANCE}' exists, get its details and validate it is RUNNABLE.",
7+
"expected_trajectory": [
8+
"list_instances.js",
9+
"get_instance.js"
10+
],
11+
"expected_skills": [
12+
"cloud-sql-postgres-admin"
13+
],
14+
"env": {
15+
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
16+
},
17+
"kind": "tools",
18+
"max_turns": 3
19+
},
20+
{
21+
"id": "cloud-sql-schema-tables-explore",
22+
"starting_prompt": "I want to understand the structure of my database.",
23+
"conversation_plan": "First, ask the agent to list the schemas in the database. After the agent provides the schemas, ask it to list the tables specifically for the 'public' schema.",
24+
"expected_trajectory": [
25+
"list_schemas.js",
26+
"list_tables.js"
27+
],
28+
"expected_skills": [
29+
"cloud-sql-postgres-data"
30+
],
31+
"env": {
32+
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
33+
},
34+
"kind": "tools",
35+
"max_turns": 3
36+
},
37+
{
38+
"id": "cloud-sql-performance-check",
39+
"starting_prompt": "Our database performance seems degraded.",
40+
"conversation_plan": "Start by asking the agent to check for any active queries that are running for a long time (e.g., more than 10 seconds). After the agent responds, follow up by asking if there are any database locks that might be causing issues.",
41+
"expected_trajectory": [
42+
"list_active_queries.js",
43+
"list_locks.js"
44+
],
45+
"expected_skills": [
46+
"cloud-sql-postgres-monitor"
47+
],
48+
"env": {
49+
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
50+
},
51+
"kind": "tools",
52+
"max_turns": 3
53+
},
54+
{
55+
"id": "cloud-sql-metrics-cpu-investigation",
56+
"starting_prompt": "I'm worried about the database load for ${CLOUD_SQL_POSTGRES_INSTANCE}.",
57+
"conversation_plan": "First, ask the agent to check the CPU utilization for the instance '${CLOUD_SQL_POSTGRES_INSTANCE}' for the last 5 minutes. After the agent provides the CPU data, ask it to check the overall database stats to see connection counts or transaction volume.",
58+
"expected_trajectory": [
59+
"get_system_metrics.js",
60+
"list_database_stats.js"
61+
],
62+
"expected_skills": [
63+
"cloud-sql-postgres-monitor"
64+
],
65+
"env": {
66+
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
67+
},
68+
"kind": "tools",
69+
"max_turns": 3
70+
}
71+
]
72+
}

evals/claude_run_config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
# limitations under the License.
1414

1515
# Dataset Related Configs
16-
dataset_config: /workspace/evals/dataset.json
16+
dataset_config: /workspace/evals/claude_dataset.json
1717
dataset_format: agent-format
1818

1919
# Orchestrator Configuration
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,4 +69,4 @@
6969
"max_turns": 3
7070
}
7171
]
72-
}
72+
}

evals/gemini_run_config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
dataset_config: /workspace/evals/dataset.json
15+
dataset_config: /workspace/evals/gemini_dataset.json
1616
dataset_format: gemini-cli-format
1717

1818
orchestrator: geminicli

0 commit comments

Comments
 (0)