Skip to content

Commit 716e2ba

Browse files
authored
feat(skill): add eval framework to measure SKILL.md effectiveness (#602)
## Summary

Adds an evaluation framework that measures how effectively SKILL.md guides an LLM agent to use the Sentry CLI efficiently. Inspired by the [skill-creator](https://github.com/anthropics/claude-plugins-official/tree/main/plugins/skill-creator) plugin approach of prompt → plan → grade.

- **Two-phase eval:** sends SKILL.md + user prompt to an LLM, then grades the planned commands with deterministic checks (string matching) and an LLM judge (coherence)
- **8 test cases** covering the failure modes from #598: no pre-auth, no org/project lookup, correct fields, minimal calls, trusts auto-detection
- **Anthropic API** with `claude-sonnet-4-6` + `claude-opus-4-6` as agents, `claude-haiku-4-5` as judge
- **CI job** runs on skill-related file changes, protected by the `skill-eval` environment (requires reviewer approval to use the API key)
- **Blocking** — added to CI Status, fails below 75% threshold
- **Baseline:** 8/8 cases passed (100%) on both models

### Running locally

With an Anthropic API key:

```bash
ANTHROPIC_API_KEY=sk-ant-... bun run eval:skill
```

Test a single model:

```bash
EVAL_AGENT_MODELS=claude-sonnet-4-6 ANTHROPIC_API_KEY=... bun run eval:skill
```

Ref #598
1 parent 27a9f0f commit 716e2ba

35 files changed

Lines changed: 1150 additions & 1062 deletions

File tree

.github/workflows/ci.yml

Lines changed: 60 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ jobs:
4242
- 'docs/**'
4343
- 'plugins/**'
4444
- 'script/generate-skill.ts'
45+
- 'script/eval-skill.ts'
46+
- 'test/skill-eval/**'
4547
code:
4648
- 'src/**'
4749
- 'test/**'
@@ -133,6 +135,58 @@ jobs:
133135
echo "::error::Generated files are out of date. Run 'bun run generate:skill' and 'bun run generate:command-docs' locally and commit the result."
134136
exit 1
135137
138+
eval-skill:
  # Gate for skill-related changes. Internal PRs and pushes run the eval
  # directly; fork PRs only verify that an "eval-skill/fork" commit status
  # with state "success" exists for the PR head commit.
  name: Eval SKILL.md
  needs: [changes]
  if: needs.changes.outputs.skill == 'true'
  runs-on: ubuntu-latest
  steps:
    # For fork PRs: check if eval has already passed via commit status
    - name: Detect fork
      id: detect-fork
      # Expressions are passed through env instead of being interpolated
      # into the script, so a fork-controlled value can never be parsed
      # as shell syntax.
      env:
        EVENT_NAME: ${{ github.event_name }}
        HEAD_REPO: ${{ github.event.pull_request.head.repo.full_name }}
        BASE_REPO: ${{ github.repository }}
      run: |
        if [[ "$EVENT_NAME" == "pull_request" && "$HEAD_REPO" != "$BASE_REPO" ]]; then
          echo "is_fork=true" >> "$GITHUB_OUTPUT"
        fi

    - name: Check fork eval status
      if: steps.detect-fork.outputs.is_fork == 'true'
      env:
        GH_TOKEN: ${{ github.token }}
        REPO: ${{ github.repository }}
        SHA: ${{ github.event.pull_request.head.sha }}
      run: |
        # Statuses are listed newest first, so `first` picks the latest
        # "eval-skill/fork" entry. per_page=100 keeps that entry on the
        # first page even when the commit carries many other statuses.
        STATUS=$(gh api "repos/$REPO/commits/$SHA/statuses?per_page=100" \
          --jq '[.[] | select(.context == "eval-skill/fork")] | first | .state // "none"')
        if [[ "$STATUS" != "success" ]]; then
          echo "::error::Fork PR modifies skill files but eval has not passed for commit $SHA."
          echo "::error::A maintainer must review the code and add the 'eval-skill' label."
          exit 1
        fi
        echo "Fork eval passed for $SHA"

    # For internal PRs: run the eval directly
    - uses: actions/checkout@v6
      if: steps.detect-fork.outputs.is_fork != 'true'

    - uses: oven-sh/setup-bun@v2
      if: steps.detect-fork.outputs.is_fork != 'true'

    - uses: actions/cache@v5
      if: steps.detect-fork.outputs.is_fork != 'true'
      id: cache
      with:
        path: node_modules
        key: node-modules-${{ hashFiles('bun.lock', 'patches/**') }}

    # Install only when the node_modules cache missed.
    - if: steps.detect-fork.outputs.is_fork != 'true' && steps.cache.outputs.cache-hit != 'true'
      run: bun install --frozen-lockfile

    - name: Eval SKILL.md
      if: steps.detect-fork.outputs.is_fork != 'true'
      run: bun run eval:skill
      env:
        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}

    # always(): upload the results file even when the eval step failed.
    - name: Upload eval results
      if: always() && steps.detect-fork.outputs.is_fork != 'true'
      uses: actions/upload-artifact@v7
      with:
        name: skill-eval-results
        path: test/skill-eval/results.json
189+
136190
lint:
137191
name: Lint & Typecheck
138192
needs: [changes]
@@ -493,15 +547,15 @@ jobs:
493547
ci-status:
494548
name: CI Status
495549
if: always()
496-
needs: [changes, check-skill, build-binary, build-npm, build-docs, test-e2e, publish-nightly]
550+
needs: [changes, check-skill, eval-skill, build-binary, build-npm, build-docs, test-e2e, publish-nightly]
497551
runs-on: ubuntu-latest
498552
permissions: {}
499553
steps:
500554
- name: Check CI status
501555
run: |
502556
# Check for explicit failures or cancellations in all jobs
503557
# publish-nightly is skipped on PRs (if: github.ref == 'refs/heads/main') — that's expected
504-
results="${{ needs.check-skill.result }} ${{ needs.build-binary.result }} ${{ needs.build-npm.result }} ${{ needs.build-docs.result }} ${{ needs.test-e2e.result }} ${{ needs.publish-nightly.result }}"
558+
results="${{ needs.check-skill.result }} ${{ needs.eval-skill.result }} ${{ needs.build-binary.result }} ${{ needs.build-npm.result }} ${{ needs.build-docs.result }} ${{ needs.test-e2e.result }} ${{ needs.publish-nightly.result }}"
505559
for result in $results; do
506560
if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then
507561
echo "::error::CI failed"
@@ -519,5 +573,9 @@ jobs:
519573
echo "::error::CI failed - upstream job failed causing check-skill to be skipped"
520574
exit 1
521575
fi
576+
if [[ "${{ needs.changes.outputs.skill }}" == "true" && "${{ needs.eval-skill.result }}" == "skipped" ]]; then
577+
echo "::error::CI failed - upstream job failed causing eval-skill to be skipped"
578+
exit 1
579+
fi
522580
523581
echo "CI passed"
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
name: Eval SKILL.md (Fork PRs)

# Runs the skill eval for fork PRs with access to repository secrets.
#
# SECURITY: this workflow is triggered by pull_request_target and checks
# out and executes code from the fork's head commit. It only runs the
# eval when the 'eval-skill' label is added, and every new push to the
# PR removes the eval labels again.

on:
  pull_request_target:
    types: [labeled, synchronize]

permissions:
  contents: read
  statuses: write
  pull-requests: write

jobs:
  remove-labels-on-sync:
    name: Reset eval labels
    if: github.event.action == 'synchronize'
    runs-on: ubuntu-latest
    steps:
      - name: Remove eval labels
        env:
          GH_TOKEN: ${{ github.token }}
          PR: ${{ github.event.number }}
          REPO: ${{ github.repository }}
        # Best effort: a failed DELETE (e.g. the label is not on the PR)
        # is deliberately ignored.
        run: |
          gh api "repos/$REPO/issues/$PR/labels/eval-skill" -X DELETE 2>/dev/null || true
          gh api "repos/$REPO/issues/$PR/labels/eval-skill-passed" -X DELETE 2>/dev/null || true

  eval:
    name: Run skill eval
    if: >-
      github.event.action == 'labeled'
      && github.event.label.name == 'eval-skill'
      && github.event.pull_request.head.repo.fork == true
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
        with:
          ref: ${{ github.event.pull_request.head.sha }}
          # The checked-out code comes from the fork; do not leave the
          # write-scoped GITHUB_TOKEN behind where that code can read it.
          persist-credentials: false

      - uses: oven-sh/setup-bun@v2

      # Restore-only: this job must never save a node_modules cache that
      # was produced while fork code was checked out, because caches
      # written from pull_request_target land in the base repository's
      # cache scope.
      - uses: actions/cache/restore@v5
        id: cache
        with:
          path: node_modules
          key: node-modules-${{ hashFiles('bun.lock', 'patches/**') }}

      - if: steps.cache.outputs.cache-hit != 'true'
        run: bun install --frozen-lockfile

      # continue-on-error keeps the job going so the outcome can be
      # reported as a commit status below.
      - name: Eval SKILL.md
        id: eval
        run: bun run eval:skill
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        continue-on-error: true

      - name: Post commit status
        env:
          GH_TOKEN: ${{ github.token }}
          REPO: ${{ github.repository }}
          SHA: ${{ github.event.pull_request.head.sha }}
          EVAL_OUTCOME: ${{ steps.eval.outcome }}
        run: |
          if [[ "$EVAL_OUTCOME" == "success" ]]; then
            STATE="success"
            DESC="Skill eval passed"
          else
            STATE="failure"
            DESC="Skill eval failed"
          fi
          gh api "repos/$REPO/statuses/$SHA" \
            -f state="$STATE" \
            -f context="eval-skill/fork" \
            -f description="$DESC"

      - name: Remove eval-skill label
        if: always()
        env:
          GH_TOKEN: ${{ github.token }}
          PR: ${{ github.event.number }}
          REPO: ${{ github.repository }}
        run: |
          gh api "repos/$REPO/issues/$PR/labels/eval-skill" \
            -X DELETE 2>/dev/null || true

      # Use the SENTRY_RELEASE_BOT app token to add the label — app tokens
      # can trigger workflow runs, unlike GITHUB_TOKEN (recursion protection).
      - name: Get app token
        id: token
        if: steps.eval.outcome == 'success'
        uses: actions/create-github-app-token@v3
        with:
          app-id: ${{ vars.SENTRY_RELEASE_BOT_CLIENT_ID }}
          private-key: ${{ secrets.SENTRY_RELEASE_BOT_PRIVATE_KEY }}

      - name: Add eval-skill-passed label (triggers main CI re-run)
        if: steps.eval.outcome == 'success'
        env:
          GH_TOKEN: ${{ steps.token.outputs.token }}
          PR: ${{ github.event.number }}
          REPO: ${{ github.repository }}
        run: |
          gh api "repos/$REPO/issues/$PR/labels" \
            --input - <<< '{"labels":["eval-skill-passed"]}'

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ coverage-isolated
1818

1919
# test artifacts
2020
*.junit.xml
21+
test/skill-eval/results.json
2122

2223
# logs
2324
logs

0 commit comments

Comments
 (0)