Skip to content

Commit c441803

Browse files
ci: update evalbench pipeline trigger, sync scorers, and bump gemini-cli (#126)
1 parent c6e37b1 commit c441803

6 files changed

Lines changed: 34 additions & 20 deletions

File tree

.github/labels.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,4 +83,8 @@
8383

8484
- name: 'release-please:force-run'
8585
color: bdca82
86-
description: Manually trigger the release please workflow on a PR.
86+
description: Manually trigger the release please workflow on a PR.
87+
88+
- name: 'ci:run-evals'
89+
color: 4285f4
90+
description: Manually trigger the evaluation CI pipeline on a PR.

cloudbuild.yaml

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,7 @@ steps:
2727
- |
2828
set -e
2929
30-
# Only run on release branches
31-
if [[ "$_HEAD_BRANCH" != release-please-* ]]; then
32-
echo "Not a release-please branch. Exiting."
33-
exit 0
34-
fi
35-
echo "Release branch detected. Fetching PR data from GitHub API..."
30+
echo "Fetching PR data from GitHub API..."
3631
3732
# Fetch PR data and status code
3833
HTTP_STATUS=$(curl -s -o pr_data.json -w "%{http_code}" -H "Authorization: token $$GITHUB_TOKEN" \
@@ -46,19 +41,25 @@ steps:
4641
4742
PR_DATA=$(cat pr_data.json)
4843
49-
# Extract labels and title from PR data (Use $$ to escape bash variables)
50-
PR_LABELS=$(echo "$$PR_DATA" | jq -r '[.labels[].name] | join(",")')
44+
# Extract title from PR data (Use $$ to escape bash variables)
5145
PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title')
5246
53-
# Determine Release Version (Use double quotes and $$ for bash variables)
54-
if [[ "$$PR_LABELS" == *"autorelease: pending"* ]]; then
47+
# Check if execution labels are present using exact matching via jq
48+
if ! jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals")' pr_data.json > /dev/null; then
49+
echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution."
50+
exit 0
51+
fi
52+
echo "Execution label detected. Processing release version context..."
53+
54+
# Determine Release Version based on branch name
55+
if [[ "$_HEAD_BRANCH" == release-please-* ]]; then
5556
if [[ "$$PR_TITLE" =~ release\ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then
5657
export RELEASE_VERSION="$${BASH_REMATCH[1]}"
5758
else
58-
export RELEASE_VERSION="unknown"
59+
export RELEASE_VERSION="pr-$_PR_NUMBER-release-unknown"
5960
fi
6061
else
61-
export RELEASE_VERSION="unknown"
62+
export RELEASE_VERSION="pr-$_PR_NUMBER-ci-run-evals"
6263
fi
6364
6465
# Workaround for evalbench bug: settings are only applied if path basename matches extension ID
@@ -68,6 +69,8 @@ steps:
6869
export EVAL_GCP_PROJECT_ID=$PROJECT_ID
6970
export GOOGLE_CLOUD_PROJECT=$PROJECT_ID
7071
export EVAL_GCP_PROJECT_REGION=$_EVAL_REGION
72+
export EVAL_REPORTING_PROJECT=$_EVAL_REPORTING_PROJECT
73+
7174
7275
# Combine CI metadata with run config
7376
cat /workspace/evals/ci_metadata.yaml >> /workspace/evals/run_config.yaml

evals/dataset.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,28 +2,28 @@
22
"scenarios": [
33
{
44
"id": "bq-search-and-insight",
5-
"starting_prompt": "Find tables related to sales in project ext-test-bigquery-analytics.",
5+
"starting_prompt": "Find tables related to sales in project ${GOOGLE_CLOUD_PROJECT}.",
66
"conversation_plan": "First, ask the agent to find tables related to sales. Once it lists the tables (which should include 'sales_data' in 'evalbench_ci'), ask it to identify the top product by sales in that table.",
77
"expected_trajectory": [
88
"search_catalog",
99
"ask_data_insights"
1010
],
1111
"env": {
12-
"GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics"
12+
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
1313
},
1414
"kind": "tools",
1515
"max_turns": 4
1616
},
1717
{
1818
"id": "bq-insight-and-forecast",
19-
"starting_prompt": "What are the top products by sales in the table 'sales_data' in dataset 'evalbench_ci' and project 'ext-test-bigquery-analytics'?",
19+
"starting_prompt": "What are the top products by sales in the table 'sales_data' in dataset 'evalbench_ci' and project '${GOOGLE_CLOUD_PROJECT}'?",
2020
"conversation_plan": "First, ask the agent to find the top products by sales in the sales_data table. After it identifies the top products, ask it to forecast the sales for the top product for the next 5 steps.",
2121
"expected_trajectory": [
2222
"ask_data_insights",
2323
"forecast"
2424
],
2525
"env": {
26-
"GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics"
26+
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
2727
},
2828
"kind": "tools",
2929
"max_turns": 4

evals/model_config.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,13 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
gemini_cli_version: "@google/gemini-cli@0.38.1"
15+
gemini_cli_version: "@google/gemini-cli@latest"
1616
generator: gemini_cli
1717
env:
1818
GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
1919
GOOGLE_CLOUD_LOCATION: "global"
2020
GOOGLE_GENAI_USE_VERTEXAI: "true"
21+
GEMINI_CLI_TRUST_WORKSPACE: "true"
2122
setup:
2223
extensions:
2324
# Points to the symlink created in cloudbuild.yaml to match the extension ID

evals/run_config.yaml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,18 @@ scorers:
2525
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
2626
behavioral_metrics:
2727
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
28+
skills_best_practices:
29+
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
30+
skills_dir: /workspace/bigquery-data-analytics/skills
2831

2932
# Performance
3033
turn_count: {}
3134
end_to_end_latency: {}
3235
tool_call_latency: {}
3336
token_consumption: {}
37+
skills_trajectory: {}
3438

3539
reporting:
3640
bigquery:
37-
gcp_project_id: cloud-db-nl2sql
41+
gcp_project_id: "${EVAL_REPORTING_PROJECT}"
42+

evals/substitute_env.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ def main():
66
workspace = os.environ.get('EVAL_WORKSPACE', '/workspace')
77
yaml_paths = [
88
os.path.join(workspace, 'evals/model_config.yaml'),
9-
os.path.join(workspace, 'evals/run_config.yaml')
9+
os.path.join(workspace, 'evals/run_config.yaml'),
10+
os.path.join(workspace, 'evals/dataset.json')
1011
]
1112

1213
for yaml_path in yaml_paths:

0 commit comments

Comments
 (0)