Run Evals #44
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow display name.
name: Run Evals

# Manual trigger only: the workflow runs when dispatched, with three optional
# string inputs.
on:
  workflow_dispatch:
    inputs:
      # Selects which eval files are run.
      suite_filter:
        description: 'Comma-separated glob patterns for eval files to run'
        required: false
        default: ''
      # Forces one target for every eval; left empty, no override is applied.
      target:
        description: "Optional target override (leave empty to use each eval's own target)"
        required: false
        default: ''
      # Kept as a quoted string so the value is passed through verbatim.
      threshold:
        description: 'Minimum score threshold (0-1)'
        required: false
        default: '0.8'
jobs:
  # Builds the CLI, runs the selected evals, publishes the results, and fails
  # the job in a final gate step when the eval run reported a non-zero exit.
  evals:
    name: Run AgentV Evals
    runs-on: ubuntu-latest
    permissions:
      contents: read
      checks: write  # presumably needed by the JUnit reporter step - confirm
      models: read
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: "22"
      - uses: ./.github/actions/setup-bun
      - name: Build
        run: bun run build
      - name: Install GitHub Copilot CLI
        run: npm install -g @github/copilot
      - name: Install Pi CLI
        # Best-effort: a failed install only prints a message.
        run: npm install -g @mariozechner/pi-coding-agent || echo "pi-cli install failed (non-fatal)"
      - name: Install uv (Python package manager)
        run: curl -LsSf https://astral.sh/uv/install.sh | sh
      - name: Configure credentials
        run: |
          # The heredoc delimiter is quoted so the shell writes the values
          # verbatim: a `$`, backtick, or backslash inside a secret must not be
          # expanded. The expressions below are substituted by the runner
          # before the script is executed.
          cat > .env <<'EOF'
          GH_MODELS_TOKEN=${{ secrets.COPILOT_PAT || secrets.GH_MODELS_TOKEN || secrets.GITHUB_TOKEN }}
          GH_MODELS_MODEL=${{ vars.GH_MODELS_MODEL || 'gpt-5-mini' }}
          COPILOT_MODEL=${{ vars.COPILOT_MODEL || 'gpt-5-mini' }}
          AGENT_TARGET=${{ vars.AGENT_TARGET || 'copilot-cli' }}
          GRADER_TARGET=${{ vars.GRADER_TARGET || 'openrouter' }}
          GOOGLE_GENERATIVE_AI_API_KEY=${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
          OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY }}
          OPENROUTER_MODEL=${{ vars.OPENROUTER_MODEL || 'openai/gpt-5.4-mini' }}
          GEMINI_MODEL_NAME=${{ vars.GEMINI_MODEL_NAME || 'gemini-2.0-flash' }}
          EOF
      - name: Resolve inputs
        id: filter
        # Inputs and variables are handed over as environment variables rather
        # than pasted into the script, so their content is never parsed as
        # shell code.
        env:
          WF_SUITE_FILTER: ${{ github.event.inputs.suite_filter || vars.EVAL_PATTERNS }}
          WF_EXCLUDES: ${{ vars.EVAL_EXCLUDE_PATTERNS }}
          WF_TARGET: ${{ github.event.inputs.target || vars.EVAL_TARGET || '' }}
          WF_THRESHOLD: ${{ github.event.inputs.threshold || '0.8' }}
        run: |
          PATTERNS="$WF_SUITE_FILTER"
          if [ -n "$WF_EXCLUDES" ]; then
            # Only insert the comma when there is something in front of it, so
            # an empty filter does not produce an empty leading pattern.
            PATTERNS="${PATTERNS:+$PATTERNS,}$WF_EXCLUDES"
          fi
          {
            echo "patterns=$PATTERNS"
            echo "target=$WF_TARGET"
            echo "threshold=$WF_THRESHOLD"
          } >> "$GITHUB_OUTPUT"
      - name: Run AgentV evals
        id: run-evals
        env:
          COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_PAT }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          WF_PATTERNS: ${{ steps.filter.outputs.patterns }}
          WF_TARGET: ${{ steps.filter.outputs.target }}
          WF_THRESHOLD: ${{ steps.filter.outputs.threshold }}
        run: |
          mkdir -p .agentv/ci-results
          # Split comma-separated patterns into positional args
          IFS=',' read -ra PATTERNS <<< "$WF_PATTERNS"
          # Build optional --target flag (empty = use each eval's own target)
          TARGET_FLAG=()
          if [ -n "$WF_TARGET" ]; then
            TARGET_FLAG=(--target "$WF_TARGET")
          fi
          # The default shell runs with `-e`, so a failing command would end
          # the script before its status could be saved. `|| EXIT_CODE=$?`
          # records the status and lets the step finish; the final gate step
          # is what fails the job.
          EXIT_CODE=0
          bun apps/cli/dist/cli.js eval run "${PATTERNS[@]}" \
            "${TARGET_FLAG[@]}" \
            --workers 3 \
            --threshold "$WF_THRESHOLD" \
            --output .agentv/ci-results/junit.xml \
            --benchmark-json .agentv/ci-results/benchmark.json \
            --artifacts .agentv/ci-results/artifacts || EXIT_CODE=$?
          echo "exit_code=$EXIT_CODE" >> "$GITHUB_OUTPUT"
      - name: Post eval summary
        if: always()
        run: bun run scripts/ci-summary.ts .agentv/ci-results >> "$GITHUB_STEP_SUMMARY"
      - name: Publish JUnit test results
        if: always()
        continue-on-error: true
        uses: dorny/test-reporter@v1
        with:
          name: AgentV Eval Results
          path: .agentv/ci-results/junit.xml
          reporter: java-junit
          fail-on-error: false
      - name: Upload eval artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-results-${{ github.run_id }}
          path: |
            .agentv/ci-results/
            .agentv/logs/
          retention-days: 30
      - name: Fail if threshold not met
        if: always()
        env:
          WF_EXIT_CODE: ${{ steps.run-evals.outputs.exit_code }}
          WF_THRESHOLD: ${{ steps.filter.outputs.threshold }}
        run: |
          # An empty value means the eval step never recorded a result (it was
          # skipped or aborted); report that instead of blaming the threshold.
          if [ -z "$WF_EXIT_CODE" ]; then
            echo "::error::Eval run did not record an exit code (evals were skipped or aborted)"
            exit 1
          fi
          if [ "$WF_EXIT_CODE" != "0" ]; then
            echo "::error::Eval score below threshold ($WF_THRESHOLD)"
            exit 1
          fi