Skip to content

Nightly Evaluations #28

Nightly Evaluations

Nightly Evaluations #28

Workflow file for this run

# Nightly evaluation workflow.
#
# Runs the `test:evals` npm script once per model in the matrix, uploads the
# JSON results file as an artifact, and appends the output of
# scripts/aggregate_evals.ts to the job summary.
name: 'Nightly Evaluations'

on:
  schedule:
    - cron: '0 1 * * *' # 1 AM UTC
  workflow_dispatch:
    inputs:
      # NOTE(review): no step in this workflow reads this input - confirm
      # whether it is meant to be forwarded to the evaluation run.
      iterations:
        description: 'Number of iterations per test case'
        required: true
        default: '1'

jobs:
  evaluate:
    runs-on: 'gemini-cli-ubuntu-16-core'
    permissions:
      contents: 'read'
    strategy:
      # Keep the other matrix legs running when one model's job fails.
      fail-fast: false
      matrix:
        model: ['gemini-3-pro-preview', 'gemini-3-flash-preview']
    name: 'Evaluate ${{ matrix.model }}'
    steps:
      - name: 'Checkout code'
        uses: 'actions/checkout@v4' # ratchet:exclude

      - name: 'Set up Node.js'
        uses: 'actions/setup-node@v4' # ratchet:exclude
        with:
          node-version: '20'
          cache: 'npm'

      # Up to three attempts, backing off 10s and then 30s between them.
      - name: 'Install dependencies'
        run: |
          npm ci || (sleep 10 && npm ci) || (sleep 30 && npm ci)

      # Same retry schedule as the dependency install above.
      - name: 'Install Gemini CLI'
        run: |
          npm install -g @google/gemini-cli@0.29.7 \
            || (sleep 10 && npm install -g @google/gemini-cli@0.29.7) \
            || (sleep 30 && npm install -g @google/gemini-cli@0.29.7)

      - name: 'Run Evaluations'
        id: 'run_evals'
        env:
          GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
          GOOGLE_API_KEY: '${{ secrets.GOOGLE_API_KEY }}'
          GEMINI_MODEL: '${{ matrix.model }}'
        run: |
          # Best-effort: a non-zero exit must not fail the step, so the
          # failure is surfaced as a warning annotation instead of being
          # swallowed. The model name is read from the environment rather
          # than interpolated into the script text.
          npm run test:evals -- --reporter=json --outputFile="eval-results-${GEMINI_MODEL}.json" \
            || echo "::warning::test:evals exited non-zero for ${GEMINI_MODEL}; check the uploaded results."

      # Runs even when an earlier step failed.
      - name: 'Upload Results'
        if: 'always()'
        uses: 'actions/upload-artifact@v4' # ratchet:exclude
        with:
          name: 'eval-results-${{ matrix.model }}'
          path: 'eval-results-${{ matrix.model }}.json'

      # Runs even when an earlier step failed.
      - name: 'Job Summary'
        if: 'always()'
        env:
          # Passed through the environment so the expression is not expanded
          # inside the shell script itself.
          RESULTS_FILE: 'eval-results-${{ matrix.model }}.json'
        run: |
          npx tsx scripts/aggregate_evals.ts "${RESULTS_FILE}" >> "$GITHUB_STEP_SUMMARY"