Enforce version sync and parallelize ready eval DAG stages #153
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Eval Quality workflow.
#
# Builds the current branch and an origin/main baseline, runs `diffscope eval`
# for both, and fails when the current branch misses the absolute F1 floors or
# regresses too far against the baseline. When the OPENAI_API_KEY secret is
# empty, every real step is skipped and the job only prints a skip message.
name: Eval Quality

on:
  pull_request:
    paths:
      - 'src/**'
      - 'eval/**'
      - '.github/workflows/eval.yml'
      - 'Cargo.toml'
      - 'Cargo.lock'
  workflow_dispatch:

permissions:
  contents: read

# Each run makes paid API calls, so a newer push to the same pull request
# cancels the run it supersedes. Manually dispatched runs are never cancelled.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}

env:
  CARGO_TERM_COLOR: always
  # Shared by the baseline and current-branch runs so both reports are always
  # produced with the same model settings.
  EVAL_MODEL: gpt-4o-mini
  EVAL_TEMPERATURE: '0'

jobs:
  eval:
    runs-on: ubuntu-latest
    timeout-minutes: 60
    steps:
      # Secrets cannot be tested directly in a step-level `if:`, so the check
      # is done in shell and exposed as the `configured` step output.
      - name: Check eval secret
        id: secret-check
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          if [ -n "${OPENAI_API_KEY}" ]; then
            echo "configured=true" >> "$GITHUB_OUTPUT"
          else
            echo "configured=false" >> "$GITHUB_OUTPUT"
          fi

      - uses: actions/checkout@v4
        if: ${{ steps.secret-check.outputs.configured == 'true' }}
        with:
          # Full history, so origin/main is available for the baseline build.
          fetch-depth: 0

      - uses: dtolnay/rust-toolchain@1.88.0
        if: ${{ steps.secret-check.outputs.configured == 'true' }}

      - uses: Swatinem/rust-cache@v2
        if: ${{ steps.secret-check.outputs.configured == 'true' }}

      - name: Build current branch binary
        if: ${{ steps.secret-check.outputs.configured == 'true' }}
        run: cargo build --release

      - name: Build baseline report from origin/main
        if: ${{ steps.secret-check.outputs.configured == 'true' }}
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        # NOTE(review): the baseline runs inside the origin/main worktree, so
        # `eval/fixtures` here is main's copy, while the current-branch step
        # reads the branch's copy. Confirm that comparing reports produced
        # from different fixture sets is intended.
        run: |
          # The checkout already fetched full history; a plain fetch refreshes
          # origin/main without turning the clone into a shallow one.
          git fetch origin main
          git worktree add /tmp/diffscope-main origin/main
          cd /tmp/diffscope-main
          cargo build --release
          ./target/release/diffscope eval \
            --model "${EVAL_MODEL}" \
            --temperature "${EVAL_TEMPERATURE}" \
            --fixtures eval/fixtures \
            --output /tmp/eval-baseline.json

      - name: Run eval thresholds on current branch
        if: ${{ steps.secret-check.outputs.configured == 'true' }}
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          ./target/release/diffscope eval \
            --model "${EVAL_MODEL}" \
            --temperature "${EVAL_TEMPERATURE}" \
            --fixtures eval/fixtures \
            --output eval-current.json \
            --baseline /tmp/eval-baseline.json \
            --max-micro-f1-drop 0.20 \
            --min-micro-f1 0.20 \
            --min-rule-f1 sec.shell.injection=0.10 \
            --min-rule-f1 reliability.unwrap_panic=0.10 \
            --max-rule-f1-drop sec.shell.injection=0.25 \
            --max-rule-f1-drop reliability.unwrap_panic=0.25

      # `always()` keeps the reports available when an earlier step failed.
      - name: Upload eval reports
        if: ${{ always() && steps.secret-check.outputs.configured == 'true' }}
        uses: actions/upload-artifact@v4
        with:
          name: eval-reports
          path: |
            eval-current.json
            /tmp/eval-baseline.json

      - name: Skip message
        if: ${{ steps.secret-check.outputs.configured != 'true' }}
        run: echo "Skipping eval workflow because OPENAI_API_KEY secret is not configured."