Skip to content

Commit 7c08190

Browse files
committed
Add CI workflow for skill evals (port from #54 + unit-test job)
1 parent 5115186 commit 7c08190

1 file changed

Lines changed: 273 additions & 0 deletions

File tree

.github/workflows/eval-skills.yml

Lines changed: 273 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,273 @@
1+
name: 'Skill Evals'
2+
3+
# Diff-gated evaluation runner for the public-facing skills under skills/.
4+
#
5+
# Triggers:
6+
# - pull_request: comment score diff vs main; do not commit anything.
7+
# - schedule: nightly run on changed skills, commit refreshed
8+
# eval-scores.json + per-skill README badges back to main.
9+
# - workflow_dispatch: manual full or partial re-run; like schedule, it commits
# refreshed scores and badges back to the branch it was dispatched on.
10+
#
11+
# Cost shape: only suites whose source has actually changed (per
12+
# evals/scripts/diff-changed-skills.js) get re-evaluated, so a typical PR
13+
# touching one skill costs roughly one suite's worth of API tokens.
14+
15+
on:
  # PR runs fire only when skills, the eval harness, or this workflow file
  # itself is touched.
  pull_request:
    paths:
      - 'skills/**'
      - 'evals/**'
      - '.github/workflows/eval-skills.yml'
  schedule:
    # Daily at 09:17 UTC; deliberately off the hour to avoid lining up with
    # API rate limits.
    - cron: '17 9 * * *'
  # Manual trigger; `run_all` forces every suite to be evaluated instead of
  # only the changed ones.
  workflow_dispatch:
    inputs:
      run_all:
        description: 'Re-run every suite regardless of diff'
        type: boolean
        default: false
30+
31+
concurrency:
  # One live run per ref: starting a new run on the same ref cancels the run
  # that is still in progress.
  group: 'skill-evals-${{ github.ref }}'
  cancel-in-progress: true
34+
35+
permissions:
  # Write access to contents is exercised only by `schedule` and
  # `workflow_dispatch` runs, where the aggregate job pushes the refreshed
  # eval-scores.json and per-skill README badges back to main. Pull request
  # runs share this workflow, but their commit step is gated on event_name,
  # so in practice they only read repository contents.
  contents: write
  pull-requests: write
42+
43+
jobs:
  # Runs the eval harness's own unit tests. `evaluate` lists this job in its
  # `needs`, so no suite is evaluated when these tests fail.
  unit-test:
    name: "Unit tests"
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "22"
          cache: "npm"
          cache-dependency-path: evals/package-lock.json

      - name: Install eval dependencies
        run: npm ci
        working-directory: evals

      - name: Run unit tests
        run: npm test
        working-directory: evals

  # Works out which suites need to run and exposes them as job outputs:
  #   slugs       - JSON array of suite slugs (feeds the evaluate matrix)
  #   has_changes - 'false' exactly when slugs is the literal "[]"
  diff:
    name: "Compute changed suites"
    runs-on: ubuntu-latest
    outputs:
      slugs: ${{ steps.compute.outputs.slugs }}
      has_changes: ${{ steps.compute.outputs.has_changes }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # Full history is fetched for the diff script.
          fetch-depth: 0

      - name: Compute changed suites
        id: compute
        env:
          # Passed via the environment instead of being interpolated into the
          # script text. Empty on events other than workflow_dispatch.
          RUN_ALL: ${{ inputs.run_all }}
        run: |
          if [[ "${RUN_ALL}" == "true" ]]; then
            # run_all overrides: list every suite
            slugs="$(node -e 'const m=require("./evals/scripts/_manifest");process.stdout.write(JSON.stringify(m.SUITES.map(s=>s.suite)))')"
          else
            slugs="$(node evals/scripts/diff-changed-skills.js --json --verbose)"
          fi
          echo "slugs=${slugs}" >> "$GITHUB_OUTPUT"
          if [[ "${slugs}" == "[]" ]]; then
            echo "has_changes=false" >> "$GITHUB_OUTPUT"
          else
            echo "has_changes=true" >> "$GITHUB_OUTPUT"
          fi
          echo "Changed suites: ${slugs}"

  # One matrix leg per changed suite. Each leg runs the suite and uploads its
  # results.json as the artifact `results-<suite>`.
  evaluate:
    name: "Evaluate ${{ matrix.suite }}"
    needs: [unit-test, diff]
    if: needs.diff.outputs.has_changes == 'true'
    runs-on: ubuntu-latest
    strategy:
      # A failing suite must not cancel the remaining legs.
      fail-fast: false
      max-parallel: 3
      matrix:
        suite: ${{ fromJson(needs.diff.outputs.slugs) }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "22"
          cache: "npm"
          cache-dependency-path: evals/package-lock.json

      - name: Install eval dependencies
        run: npm ci
        working-directory: evals

      - name: Run eval suite
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          AGENT_MODEL: ${{ vars.AGENT_MODEL || 'claude-sonnet-4-6' }}
          RUBRIC_MODEL: ${{ vars.RUBRIC_MODEL || 'anthropic:messages:claude-haiku-4-5-20251001' }}
          # The suite slug reaches the shell as data (quoted variable), never
          # as part of the script text.
          EVAL_SUITE: ${{ matrix.suite }}
        run: node scripts/aggregate.js --run --only="${EVAL_SUITE}"
        working-directory: evals

      - name: Upload suite results
        uses: actions/upload-artifact@v4
        with:
          name: results-${{ matrix.suite }}
          path: evals/${{ matrix.suite }}/results.json
          retention-days: 14

  # Collects every suite's results, rebuilds eval-scores.json and the README
  # badges, then either comments on the PR or commits the refreshed files.
  aggregate:
    name: "Aggregate scores"
    needs: [diff, evaluate]
    # always() lets this job run when some evaluate legs failed. It is still
    # skipped when evaluate was cancelled, or skipped outright (for example
    # because unit tests failed), since there are no fresh results to
    # aggregate in those cases.
    if: >-
      always()
      && needs.diff.outputs.has_changes == 'true'
      && needs.evaluate.result != 'cancelled'
      && needs.evaluate.result != 'skipped'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "22"
          # Same npm cache settings as the unit-test and evaluate jobs.
          cache: "npm"
          cache-dependency-path: evals/package-lock.json

      - name: Install eval dependencies
        run: npm ci
        working-directory: evals

      - name: Download all suite results
        uses: actions/download-artifact@v4
        with:
          path: artifact-results
          pattern: results-*
          # Keep one directory per artifact; the staging loop below derives
          # the suite name from the directory name.
          merge-multiple: false

      - name: Stage suite results into evals/<suite>/results.json
        run: |
          set -e
          # nullglob: with no artifacts the loop body simply never runs.
          shopt -s nullglob
          for d in artifact-results/results-*; do
            name=$(basename "$d" | sed 's/^results-//')
            mkdir -p "evals/$name"
            if [[ -f "$d/results.json" ]]; then
              cp "$d/results.json" "evals/$name/results.json"
              echo "Staged evals/$name/results.json"
            fi
          done

      - name: Save previous eval-scores.json for diff
        run: |
          if [[ -f eval-scores.json ]]; then
            cp eval-scores.json /tmp/eval-scores-before.json
          else
            # No scores recorded yet: compare against an empty baseline.
            echo '{"schemaVersion":1,"updatedAt":null,"skills":{}}' > /tmp/eval-scores-before.json
          fi

      - name: Aggregate
        env:
          SUITES_JSON: ${{ needs.diff.outputs.slugs }}
        run: |
          # Turn the JSON array of slugs into the comma-separated list that is
          # passed to --only.
          slugs=$(echo "$SUITES_JSON" | node -e 'let s="";process.stdin.on("data",c=>s+=c);process.stdin.on("end",()=>{const a=JSON.parse(s);process.stdout.write(a.join(","))})')
          if [[ -z "$slugs" ]]; then
            echo "No suites to aggregate"
            exit 0
          fi
          node scripts/aggregate.js --only="$slugs"
        working-directory: evals

      - name: Render README badges
        run: node evals/scripts/render-badges.js

      - name: PR comment with score diff
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        env:
          BEFORE_PATH: /tmp/eval-scores-before.json
          AFTER_PATH: ${{ github.workspace }}/eval-scores.json
        with:
          script: |
            const fs = require('node:fs');
            const before = JSON.parse(fs.readFileSync(process.env.BEFORE_PATH, 'utf-8'));
            const after = JSON.parse(fs.readFileSync(process.env.AFTER_PATH, 'utf-8'));
            const lines = [
              '<!-- skill-evals-comment -->',
              '## Skill eval results',
              '',
              '| Skill | Before | After | Δ |',
              '|-------|-------:|------:|----:|',
            ];
            const keys = new Set([
              ...Object.keys(before.skills || {}),
              ...Object.keys(after.skills || {}),
            ]);
            for (const key of [...keys].sort()) {
              const b = (before.skills || {})[key];
              const a = (after.skills || {})[key];
              // Skills that are absent from the new scores are not reported.
              if (!a) continue;
              const beforeStr = b && b.score !== null ? `${b.score}/100 (${b.passed}/${b.total})` : '-';
              const afterStr = a.score !== null ? `${a.score}/100 (${a.passed}/${a.total})` : 'errored';
              const delta = (b && b.score !== null && a.score !== null)
                ? (a.score - b.score === 0 ? 'no change' : (a.score - b.score > 0 ? `+${a.score - b.score}` : `${a.score - b.score}`))
                : 'new';
              lines.push(`| \`${key}\` | ${beforeStr} | ${afterStr} | ${delta} |`);
            }
            lines.push('');
            lines.push('_Only suites whose source actually changed since their last recorded score were re-run. Soft-failing while we stabilise the baseline._');
            const body = lines.join('\n');

            // Walk every page of comments: a single listComments call returns
            // only the first page, so on a busy PR the marker comment would be
            // missed and a duplicate posted on each run.
            const comments = await github.paginate(github.rest.issues.listComments, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              per_page: 100,
            });
            const existing = comments.find((c) => c.body && c.body.startsWith('<!-- skill-evals-comment -->'));
            if (existing) {
              await github.rest.issues.updateComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                comment_id: existing.id,
                body,
              });
            } else {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: context.issue.number,
                body,
              });
            }

      - name: Commit refreshed scores and badges
        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
        env:
          # Branch name is passed as data rather than interpolated into the
          # script text.
          TARGET_REF: ${{ github.ref_name }}
        run: |
          # Stage first, then inspect the index: `git diff` on the working tree
          # ignores untracked files, so a first-ever eval-scores.json would
          # never be committed. The :(glob) pathspec lets git expand `**`
          # itself, matching README.md at any depth under skills/ regardless
          # of the shell's globstar setting.
          git add -- eval-scores.json ':(glob)skills/**/README.md'
          if git diff --cached --quiet; then
            echo "No score or badge changes to commit"
            exit 0
          fi
          git config user.name 'github-actions[bot]'
          git config user.email '41898282+github-actions[bot]@users.noreply.github.com'
          git commit -m "chore(evals): refresh eval-scores.json and README badges"
          git push origin "HEAD:${TARGET_REF}"

0 commit comments

Comments
 (0)