ci: update evalbench pipeline trigger, sync scorers, and bump gemini-cli (#126)

omkargaikwad23 · web-flow · commit c441803e7888 · 2026-05-05T14:58:52.000+05:30
diff --git a/.github/labels.yaml b/.github/labels.yaml
@@ -83,4 +83,8 @@
 
 - name: 'release-please:force-run'
   color: bdca82
-  description: Manually trigger the release please workflow on a PR.
+  description: Manually trigger the release please workflow on a PR.
+
+- name: 'ci:run-evals'
+  color: 4285f4
+  description: Manually trigger the evaluation CI pipeline on a PR.
diff --git a/cloudbuild.yaml b/cloudbuild.yaml
@@ -27,12 +27,7 @@ steps:
       - |
         set -e
 
-        # Only run on release branches
-        if [[ "$_HEAD_BRANCH" != release-please-* ]]; then
-          echo "Not a release-please branch. Exiting."
-          exit 0
-        fi
-        echo "Release branch detected. Fetching PR data from GitHub API..."
+        echo "Fetching PR data from GitHub API..."
 
         # Fetch PR data and status code
         HTTP_STATUS=$(curl -s -o pr_data.json -w "%{http_code}" -H "Authorization: token $$GITHUB_TOKEN" \
@@ -46,19 +41,25 @@ steps:
 
         PR_DATA=$(cat pr_data.json)
 
-        # Extract labels and title from PR data (Use $$ to escape bash variables)
-        PR_LABELS=$(echo "$$PR_DATA" | jq -r '[.labels[].name] | join(",")')
+        # Extract title from PR data (Use $$ to escape bash variables)
         PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title')
 
-        # Determine Release Version (Use double quotes and $$ for bash variables)
-        if [[ "$$PR_LABELS" == *"autorelease: pending"* ]]; then
+        # Check if execution labels are present using exact matching via jq
+        if ! jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals")' pr_data.json > /dev/null; then
+          echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution."
+          exit 0
+        fi
+        echo "Execution label detected. Processing release version context..."
+
+        # Determine Release Version based on branch name
+        if [[ "$_HEAD_BRANCH" == release-please-* ]]; then
           if [[ "$$PR_TITLE" =~ release\ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then
             export RELEASE_VERSION="$${BASH_REMATCH[1]}"
           else
-            export RELEASE_VERSION="unknown"
+            export RELEASE_VERSION="pr-$_PR_NUMBER-release-unknown"
           fi
         else
-          export RELEASE_VERSION="unknown"
+          export RELEASE_VERSION="pr-$_PR_NUMBER-ci-run-evals"
         fi
 
         # Workaround for evalbench bug: settings are only applied if path basename matches extension ID
@@ -68,6 +69,8 @@ steps:
         export EVAL_GCP_PROJECT_ID=$PROJECT_ID
         export GOOGLE_CLOUD_PROJECT=$PROJECT_ID
         export EVAL_GCP_PROJECT_REGION=$_EVAL_REGION
+        export EVAL_REPORTING_PROJECT=$_EVAL_REPORTING_PROJECT
+
 
         # Combine CI metadata with run config
         cat /workspace/evals/ci_metadata.yaml >> /workspace/evals/run_config.yaml
diff --git a/evals/dataset.json b/evals/dataset.json
@@ -2,28 +2,28 @@
   "scenarios": [
     {
       "id": "bq-search-and-insight",
-      "starting_prompt": "Find tables related to sales in project ext-test-bigquery-analytics.",
+      "starting_prompt": "Find tables related to sales in project ${GOOGLE_CLOUD_PROJECT}.",
       "conversation_plan": "First, ask the agent to find tables related to sales. Once it lists the tables (which should include 'sales_data' in 'evalbench_ci'), ask it to identify the top product by sales in that table.",
       "expected_trajectory": [
         "search_catalog",
         "ask_data_insights"
       ],
       "env": {
-        "GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics"
+        "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
       },
       "kind": "tools",
       "max_turns": 4
     },
     {
       "id": "bq-insight-and-forecast",
-      "starting_prompt": "What are the top products by sales in the table 'sales_data' in dataset 'evalbench_ci' and project 'ext-test-bigquery-analytics'?",
+      "starting_prompt": "What are the top products by sales in the table 'sales_data' in dataset 'evalbench_ci' and project '${GOOGLE_CLOUD_PROJECT}'?",
       "conversation_plan": "First, ask the agent to find the top products by sales in the sales_data table. After it identifies the top products, ask it to forecast the sales for the top product for the next 5 steps.",
       "expected_trajectory": [
         "ask_data_insights",
         "forecast"
       ],
       "env": {
-        "GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics"
+        "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
       },
       "kind": "tools",
       "max_turns": 4
diff --git a/evals/model_config.yaml b/evals/model_config.yaml
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-gemini_cli_version: "@google/gemini-cli@0.38.1"
+gemini_cli_version: "@google/gemini-cli@latest"
 generator: gemini_cli
 env:
   GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
   GOOGLE_CLOUD_LOCATION: "global"
   GOOGLE_GENAI_USE_VERTEXAI: "true"
+  GEMINI_CLI_TRUST_WORKSPACE: "true"
 setup:
   extensions:
     # Points to the symlink created in cloudbuild.yaml to match the extension ID
diff --git a/evals/run_config.yaml b/evals/run_config.yaml
@@ -25,13 +25,18 @@ scorers:
     model_config: /workspace/evals/gemini_2.5_pro_model.yaml
   behavioral_metrics:
     model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+  skills_best_practices:
+    model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+    skills_dir: /workspace/bigquery-data-analytics/skills
 
   # Performance
   turn_count: {}
   end_to_end_latency: {}
   tool_call_latency: {}
   token_consumption: {}
+  skills_trajectory: {}
 
 reporting:
   bigquery:
-    gcp_project_id: cloud-db-nl2sql
+    gcp_project_id: "${EVAL_REPORTING_PROJECT}"
+
diff --git a/evals/substitute_env.py b/evals/substitute_env.py
@@ -6,7 +6,8 @@ def main():
     workspace = os.environ.get('EVAL_WORKSPACE', '/workspace')
     yaml_paths = [
         os.path.join(workspace, 'evals/model_config.yaml'),
-        os.path.join(workspace, 'evals/run_config.yaml')
+        os.path.join(workspace, 'evals/run_config.yaml'),
+        os.path.join(workspace, 'evals/dataset.json')
     ]
     
     for yaml_path in yaml_paths:

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -6,7 +6,8 @@ def main():` |
| 6 | 6 | `workspace = os.environ.get('EVAL_WORKSPACE', '/workspace')` |
| 7 | 7 | `yaml_paths = [` |
| 8 | 8 | `os.path.join(workspace, 'evals/model_config.yaml'),` |
| 9 |  | `- os.path.join(workspace, 'evals/run_config.yaml')` |
|  | 9 | `+ os.path.join(workspace, 'evals/run_config.yaml'),` |
|  | 10 | `+ os.path.join(workspace, 'evals/dataset.json')` |
| 10 | 11 | `]` |
| 11 | 12 |  |
| 12 | 13 | `for yaml_path in yaml_paths:` |