# Nightly Evaluations #27
# Workflow file for this run
# This file contains hidden or bidirectional Unicode text that may be
# interpreted or compiled differently than what appears below. To review,
# open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow identity and triggers: runs on a daily schedule and can also be
# started by hand from the Actions tab.
name: 'Nightly Evaluations'

on:
  schedule:
    - cron: '0 1 * * *'  # 1 AM UTC
  workflow_dispatch:
    inputs:
      # NOTE(review): no step visible in this workflow reads this input —
      # confirm whether the eval runner is supposed to receive it.
      iterations:
        description: 'Number of iterations per test case'
        required: true
        default: '1'
# Runs the eval suite once per model in the matrix, uploads the JSON results
# as an artifact, and appends an aggregated report to the job summary.
jobs:
  evaluate:
    name: 'Evaluate ${{ matrix.model }}'
    runs-on: 'ubuntu-22.04'
    permissions:
      contents: 'read'
    strategy:
      # Do not cancel the other model's job when one of them fails.
      fail-fast: false
      matrix:
        model:
          - 'gemini-3-pro-preview'
          - 'gemini-3-flash-preview'
    steps:
      - name: 'Checkout code'
        uses: 'actions/checkout@v4'  # ratchet:exclude

      - name: 'Set up Node.js'
        uses: 'actions/setup-node@v4'  # ratchet:exclude
        with:
          node-version: '20'
          cache: 'npm'

      - name: 'Install dependencies'
        run: |
          # Up to three attempts: immediately, then after 10s, then after 30s.
          for delay in 0 10 30; do
            sleep "${delay}"
            if npm ci; then
              exit 0
            fi
          done
          exit 1

      - name: 'Install Gemini CLI'
        run: |
          # Same retry schedule as above; the pinned version is spelled once.
          for delay in 0 10 30; do
            sleep "${delay}"
            if npm install -g @google/gemini-cli@0.29.7; then
              exit 0
            fi
          done
          exit 1

      - name: 'Run Evaluations'
        id: run_evals
        env:
          GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
          GOOGLE_API_KEY: '${{ secrets.GOOGLE_API_KEY }}'
          GEMINI_MODEL: '${{ matrix.model }}'
        run: |
          # The model name is read from the environment instead of being
          # spliced into the script text by expression expansion.
          # `|| true` forces a zero exit status, so a failing eval run does
          # not fail this step; the steps below still run on whatever was
          # written to the results file.
          npm run test:evals -- --reporter=json --outputFile="eval-results-${GEMINI_MODEL}.json" || true

      - name: 'Upload Results'
        if: 'always()'
        uses: 'actions/upload-artifact@v4'  # ratchet:exclude
        with:
          name: 'eval-results-${{ matrix.model }}'
          path: 'eval-results-${{ matrix.model }}.json'

      - name: 'Job Summary'
        if: 'always()'
        env:
          # Resolved by the workflow engine, then handed to the shell as data.
          EVAL_RESULTS_FILE: 'eval-results-${{ matrix.model }}.json'
        run: |
          npx tsx scripts/aggregate_evals.ts "${EVAL_RESULTS_FILE}" >> "${GITHUB_STEP_SUMMARY}"