chore: parallelize nightly evaluations and fix suite timeouts (#472)

cocosheng-g · web-flow · commit 52c365de8e9c · 2026-03-05T14:46:32.000-05:00
This PR overhauls the nightly evaluations suite to run significantly faster and eliminates several sources of non-deterministic timeouts and endless loops.

### 🚀 Enhancements

* **Parallel Execution**: Refactored `evals-nightly.yml` to dynamically generate a matrix of all `.eval.ts` files and run them concurrently, drastically reducing the total suite execution time.
* **Aggregated Reporting**: Enhanced `scripts/aggregate_evals.ts` to collect artifacts from parallel jobs and output a unified, structured Markdown table with pass rates, latencies, and detailed collapsible failure logs.

### 🛠️ Stability & Bug Fixes

* **Issue Fixer Search Loops**: Scaffolded missing mock source files (e.g., `src/index.js`), a `package.json` with a valid `test` script, and a mock `gh` CLI executable in the `TestRig`. This prevents the agent from infinitely searching for non-existent files or trying to repair a broken mock testing environment.
* **Assistant Polling Loops**: Injected the mock MCP server into `gemini-assistant.eval.ts` so it can successfully execute `add_issue_comment`. Updated the system prompt to explicitly instruct the agent to exit immediately after posting its plan, rather than infinitely polling the issue for an `@gemini-cli /approve` comment. Also explicitly defined the typo in `fix-typo` to stop the agent from hopelessly guessing.
* **JSON Quoting Flakes**: Updated `gemini-scheduled-triage.toml` to output its JSON array to `$GITHUB_ENV` using a heredoc (`cat << 'EOF'`) instead of `echo '...'`. This prevents bash syntax errors when the model's generated text naturally contains single quotes.
* **Global Timeout Boundaries**: Increased the `TestRig` hard-kill timeout from 3 to 10 minutes to safely accommodate complex, high-turn fixes (e.g., `fix-flaky-test`).
* **Security Warnings**: Added a top-level `permissions: contents: read` block to the nightly workflow to resolve CodeQL linting warnings.

Successful run: https://github.com/google-github-actions/run-gemini-cli/actions/runs/22689405186
diff --git a/.github/commands/gemini-invoke.toml b/.github/commands/gemini-invoke.toml
@@ -82,7 +82,7 @@ Begin every task by building a complete picture of the situation.
       Please review this plan. To approve, comment `@gemini-cli /approve` on this issue. To make changes, comment changes needed.
       ```
 
-3. **Post the Plan**: You MUST use `add_issue_comment` to post your plan. The workflow should end only after this tool call has been successfully formulated.
+3. **Post the Plan**: You MUST use `add_issue_comment` to post your plan. The workflow should end only after this tool call has been successfully formulated. Do not wait for human approval or check for comments; exit immediately after posting.
 
 -----
 
diff --git a/.github/commands/gemini-scheduled-triage.toml b/.github/commands/gemini-scheduled-triage.toml
@@ -85,9 +85,15 @@ Iterate through each issue object. For each issue:
 
 ### Step 5: Construct and Write Output
 
-Assemble the results into a single JSON array, formatted as a string, according to the **Output Specification** below. Finally, execute the command to write this string to the output file, ensuring the JSON is enclosed in single quotes to prevent shell interpretation.
-
-- Use the shell command to write: `echo 'TRIAGED_ISSUES=...' > "$GITHUB_ENV"` (Replace `...` with the final, minified JSON array string).
+Assemble the results into a single JSON array, formatted as a string, according to the **Output Specification** below. Finally, execute the command to write this string to the output file.
+
+- Use the shell command to write using a heredoc to prevent quote escaping issues:
+  ```bash
+  cat << 'EOF' >> "$GITHUB_ENV"
+  TRIAGED_ISSUES=...
+  EOF
+  ```
+  (Replace `...` with the final, minified JSON array string).
 
 ## Output Specification
 
diff --git a/.github/workflows/evals-nightly.yml b/.github/workflows/evals-nightly.yml
@@ -1,5 +1,8 @@
 name: 'Nightly Evaluations'
 
+permissions:
+  contents: 'read'
+
 on:
   schedule:
     - cron: '0 1 * * *' # 1 AM UTC
@@ -11,15 +14,27 @@ on:
         default: '1'
 
 jobs:
+  list-evals:
+    runs-on: 'ubuntu-22.04'
+    outputs:
+      matrix: '${{ steps.set-matrix.outputs.matrix }}'
+    steps:
+      - name: 'Checkout code'
+        uses: 'actions/checkout@v4' # ratchet:exclude
+      - id: 'set-matrix'
+        run: |
+          FILES=$(find evals -maxdepth 1 -name "*.eval.ts" | sort | jq -R -s -c 'split("\n")[:-1]')
+          echo "matrix=${FILES}" >> "$GITHUB_OUTPUT"
+
   evaluate:
+    needs: 'list-evals'
     runs-on: 'ubuntu-22.04'
-    permissions:
-      contents: 'read'
     strategy:
       fail-fast: false
       matrix:
-        model: ['gemini-3-pro-preview', 'gemini-3-flash-preview']
-    name: 'Evaluate ${{ matrix.model }}'
+        model: ['gemini-3-flash-preview']
+        eval-file: '${{ fromJson(needs.list-evals.outputs.matrix) }}'
+    name: 'Evaluate ${{ matrix.eval-file }} (${{ matrix.model }})'
 
     steps:
       - name: 'Checkout code'
@@ -32,12 +47,14 @@ jobs:
           cache: 'npm'
 
       - name: 'Install dependencies'
+        # Retry logic for transient network or package retrieval failures
         run: |
           npm ci || (sleep 10 && npm ci) || (sleep 30 && npm ci)
 
       - name: 'Install Gemini CLI'
+        # Retry logic for transient network or package retrieval failures
         run: |
-          npm install -g @google/gemini-cli@0.29.7 || (sleep 10 && npm install -g @google/gemini-cli@0.29.7) || (sleep 30 && npm install -g @google/gemini-cli@0.29.7)
+          npm install -g @google/gemini-cli@latest || (sleep 10 && npm install -g @google/gemini-cli@latest) || (sleep 30 && npm install -g @google/gemini-cli@latest)
 
       - name: 'Run Evaluations'
         id: 'run_evals'
@@ -46,16 +63,42 @@ jobs:
           GOOGLE_API_KEY: '${{ secrets.GOOGLE_API_KEY }}'
           GEMINI_MODEL: '${{ matrix.model }}'
         run: |
-          npm run test:evals -- --reporter=json --outputFile=eval-results-${{ matrix.model }}.json || true
+          BASE_NAME=$(basename "${{ matrix.eval-file }}" .eval.ts)
+          npm run test:evals -- "${{ matrix.eval-file }}" --reporter=json --outputFile="eval-results-${{ matrix.model }}-${BASE_NAME}.json"
 
       - name: 'Upload Results'
         if: 'always()'
         uses: 'actions/upload-artifact@v4' # ratchet:exclude
         with:
-          name: 'eval-results-${{ matrix.model }}'
-          path: 'eval-results-${{ matrix.model }}.json'
+          name: 'eval-results-${{ matrix.model }}-${{ strategy.job-index }}'
+          path: 'eval-results-${{ matrix.model }}-*.json'
 
-      - name: 'Job Summary'
-        if: 'always()'
+  report:
+    needs: 'evaluate'
+    if: 'always()'
+    runs-on: 'ubuntu-22.04'
+    steps:
+      - name: 'Checkout code'
+        uses: 'actions/checkout@v4' # ratchet:exclude
+
+      - name: 'Set up Node.js'
+        uses: 'actions/setup-node@v4' # ratchet:exclude
+        with:
+          node-version: '20'
+          cache: 'npm'
+
+      - name: 'Install dependencies'
+        # Retry logic for transient network or package retrieval failures
+        run: |
+          npm ci || (sleep 10 && npm ci) || (sleep 30 && npm ci)
+
+      - name: 'Download Results'
+        uses: 'actions/download-artifact@v4' # ratchet:exclude
+        with:
+          path: 'eval-results'
+          pattern: 'eval-results-*'
+          merge-multiple: true
+
+      - name: 'Aggregate All Results'
         run: |
-          npx tsx scripts/aggregate_evals.ts "eval-results-${{ matrix.model }}.json" >> "$GITHUB_STEP_SUMMARY"
+          npx tsx scripts/aggregate_evals.ts eval-results/*.json >> "$GITHUB_STEP_SUMMARY"
diff --git a/evals/data/gemini-assistant.json b/evals/data/gemini-assistant.json
@@ -3,7 +3,7 @@
     "id": "fix-typo",
     "inputs": {
       "TITLE": "Fix typo in utils.js",
-      "DESCRIPTION": "There is a typo in the helper function name.",
+      "DESCRIPTION": "There is a typo in the helper function name. It should be 'newName' instead of 'oldName'.",
       "EVENT_NAME": "issues",
       "IS_PULL_REQUEST": "false",
       "ISSUE_NUMBER": "10",
diff --git a/evals/gemini-assistant.eval.ts b/evals/gemini-assistant.eval.ts
@@ -18,6 +18,7 @@ describe('Gemini Assistant Workflow', () => {
     it.concurrent(`should propose a relevant plan: ${item.id}`, async () => {
       const rig = new TestRig(`assistant-${item.id}`);
       try {
+        rig.setupMockMcp();
         rig.initGit();
         rig.createFile(
           'utils.js',
diff --git a/evals/gemini-scheduled-triage.eval.ts b/evals/gemini-scheduled-triage.eval.ts
@@ -16,7 +16,7 @@ const dataset: ScheduledTriageCase[] = JSON.parse(
 
 describe('Scheduled Triage Workflow', () => {
   for (const item of dataset) {
-    it.concurrent(`should batch triage issues: ${item.id}`, async () => {
+    it(`should batch triage issues: ${item.id}`, async () => {
       const rig = new TestRig(`scheduled-triage-${item.id}`);
       try {
         mkdirSync(join(rig.testDir, '.gemini/commands'), { recursive: true });
@@ -37,6 +37,12 @@ describe('Scheduled Triage Workflow', () => {
         const triagedLine = content
           .split('\n')
           .find((l) => l.startsWith('TRIAGED_ISSUES='));
+
+        if (!triagedLine) {
+          console.error(
+            `Failed to find TRIAGED_ISSUES in env file. stdout: ${stdout}`,
+          );
+        }
         expect(triagedLine).toBeDefined();
 
         const jsonStr = triagedLine!.split('=', 2)[1];
diff --git a/evals/issue-fixer.eval.ts b/evals/issue-fixer.eval.ts
@@ -26,7 +26,48 @@ describe('Issue Fixer Workflow', () => {
         );
         rig.createFile(
           'package.json',
-          '{"name": "test", "dependencies": {"lodash": "4.17.0"}}',
+          '{"name": "test", "scripts": {"test": "echo \\"tests passed\\" && exit 0"}, "dependencies": {"lodash": "4.17.0"}}',
+        );
+        rig.createFile(
+          'src/db/search.js',
+          'export function searchUser(db, name) {\n  const query = "SELECT * FROM users WHERE name = \'" + name + "\'";\n  return db.query(query);\n}\n',
+        );
+        rig.createFile(
+          'src/index.js',
+          'function calculate(a, b) {\n  return a + b;\n}\n\nfunction login(username, password) {\n  if (password === "forgot password") throw new Error("crash");\n  return true;\n}\n',
+        );
+        rig.createFile(
+          'src/async.js',
+          "async function fetchData() {\n  return await api.get('/data');\n}\n",
+        );
+        rig.createFile(
+          'src/ui/Component.tsx',
+          "import React from 'react';\nexport const Component = () => {\n  return <div>UI</div>;\n}\n",
+        );
+        rig.createFile(
+          'src/utils/validation.ts',
+          'export const validate = () => true;\n',
+        );
+        rig.createFile(
+          'src/UserForm.tsx',
+          "import React from 'react';\nexport const UserForm = () => {\n  const isValid = true;\n  return <form>User</form>;\n}\n",
+        );
+        rig.createFile(
+          'src/OrderForm.tsx',
+          "import React from 'react';\nexport const OrderForm = () => {\n  const isValid = true;\n  return <form>Order</form>;\n}\n",
+        );
+        rig.createFile(
+          'test/UserProfile.test.js',
+          'describe("UserProfile", () => {\n  it("should load data", async () => {\n    // Flaky network call\n  });\n});\n',
+        );
+
+        rig.createFile(
+          'src/CheckoutWizard.tsx',
+          'import React, { useState } from "react";\nexport const CheckoutWizard = () => {\n  const [step, setStep] = useState(0);\n  const nextStep = async () => {\n    await new Promise(r => setTimeout(r, 100));\n    setStep(s => s + 1);\n  };\n  return <button onClick={nextStep}>Next</button>;\n};\n',
+        );
+        rig.createFile(
+          'scripts/deploy.js',
+          'const fs = require("fs");\nif (fs.exists("dist")) {\n  console.log("Deploying...");\n}\n',
         );
 
         mkdirSync(join(rig.testDir, '.gemini/commands'), { recursive: true });
diff --git a/evals/issue-triage.eval.ts b/evals/issue-triage.eval.ts
@@ -1,6 +1,6 @@
 import { describe, expect, it } from 'vitest';
 import { TestRig } from './test-rig';
-import { readFileSync, mkdirSync, copyFileSync } from 'node:fs';
+import { readFileSync, mkdirSync, copyFileSync, existsSync } from 'node:fs';
 import { join } from 'node:path';
 
 interface TriageCase {
@@ -18,7 +18,7 @@ const dataset: TriageCase[] = JSON.parse(readFileSync(datasetPath, 'utf-8'));
 
 describe('Issue Triage Workflow', () => {
   for (const item of dataset) {
-    it.concurrent(`should correctly triage: ${item.id}`, async () => {
+    it(`should correctly triage: ${item.id}`, async () => {
       const rig = new TestRig(`triage-${item.id}`);
       try {
         // Setup the command
@@ -36,7 +36,16 @@ describe('Issue Triage Workflow', () => {
           GITHUB_ENV: envFile,
         };
 
-        await rig.run(['--prompt', '/gemini-triage', '--yolo'], env);
+        const stdout = await rig.run(
+          ['--prompt', '/gemini-triage', '--yolo'],
+          env,
+        );
+
+        if (!existsSync(envFile)) {
+          throw new Error(
+            `envFile was not created at ${envFile}.\nStdout: ${stdout}\nStderr: ${rig.lastRunStderr}`,
+          );
+        }
 
         // Check the output in GITHUB_ENV
         const content = readFileSync(envFile, 'utf-8');
diff --git a/evals/mock-mcp-server.ts b/evals/mock-mcp-server.ts
@@ -287,6 +287,18 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
           },
         ],
       };
+    case 'issue_read':
+      return {
+        content: [
+          {
+            type: 'text',
+            text: JSON.stringify({
+              title: 'Mock Issue',
+              body: 'This is a mock issue body.',
+            }),
+          },
+        ],
+      };
     case 'issue_read.get_comments':
       return {
         content: [
diff --git a/evals/pr-review.eval.ts b/evals/pr-review.eval.ts
diff --git a/evals/test-rig.ts b/evals/test-rig.ts
diff --git a/evals/vitest.config.ts b/evals/vitest.config.ts
diff --git a/scripts/aggregate_evals.ts b/scripts/aggregate_evals.ts