getsentry
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 60 additions & 2 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 60 additions & 2 deletions
diff --git a/‎.github/workflows/eval-skill-fork.yml‎
Lines changed: 97 additions & 0 deletions b/‎.github/workflows/eval-skill-fork.yml‎
Lines changed: 97 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
@@ -42,6 +42,8 @@ jobs:
               - 'docs/**'
               - 'plugins/**'
               - 'script/generate-skill.ts'
+              - 'script/eval-skill.ts'
+              - 'test/skill-eval/**'
             code:
               - 'src/**'
               - 'test/**'
@@ -133,6 +135,58 @@ jobs:
           echo "::error::Generated files are out of date. Run 'bun run generate:skill' and 'bun run generate:command-docs' locally and commit the result."
           exit 1
 
+  # Gate for PRs whose `changes` job reports skill-related changes.
+  # Internal PRs run the eval in this job. Fork PRs skip the eval steps and
+  # must already carry a successful "eval-skill/fork" commit status on their
+  # head commit.
+  eval-skill:
+    name: Eval SKILL.md
+    needs: [changes]
+    if: needs.changes.outputs.skill == 'true'
+    runs-on: ubuntu-latest
+    steps:
+      # For fork PRs: check if eval has already passed via commit status
+      - name: Detect fork
+        id: detect-fork
+        env:
+          # Event data is handed to the script through the environment instead
+          # of being expanded into the script text, so PR-controlled values are
+          # never parsed as shell source.
+          EVENT_NAME: ${{ github.event_name }}
+          HEAD_REPO: ${{ github.event.pull_request.head.repo.full_name }}
+          BASE_REPO: ${{ github.repository }}
+        run: |
+          if [[ "$EVENT_NAME" == "pull_request" && "$HEAD_REPO" != "$BASE_REPO" ]]; then
+            echo "is_fork=true" >> "$GITHUB_OUTPUT"
+          fi
+      - name: Check fork eval status
+        if: steps.detect-fork.outputs.is_fork == 'true'
+        env:
+          GH_TOKEN: ${{ github.token }}
+          REPO: ${{ github.repository }}
+          SHA: ${{ github.event.pull_request.head.sha }}
+        run: |
+          # Take the first "eval-skill/fork" status in the API response;
+          # "none" when the commit has no status with that context.
+          STATUS=$(gh api "repos/$REPO/commits/$SHA/statuses" \
+            --jq '[.[] | select(.context == "eval-skill/fork")] | first | .state // "none"')
+          if [[ "$STATUS" != "success" ]]; then
+            echo "::error::Fork PR modifies skill files but eval has not passed for commit $SHA."
+            echo "::error::A maintainer must review the code and add the 'eval-skill' label."
+            exit 1
+          fi
+          echo "Fork eval passed for $SHA"
+      # For internal PRs: run the eval directly
+      - uses: actions/checkout@v6
+        if: steps.detect-fork.outputs.is_fork != 'true'
+      - uses: oven-sh/setup-bun@v2
+        if: steps.detect-fork.outputs.is_fork != 'true'
+      - uses: actions/cache@v5
+        if: steps.detect-fork.outputs.is_fork != 'true'
+        id: cache
+        with:
+          path: node_modules
+          key: node-modules-${{ hashFiles('bun.lock', 'patches/**') }}
+      # Install dependencies only when the cache did not provide node_modules.
+      - if: steps.detect-fork.outputs.is_fork != 'true' && steps.cache.outputs.cache-hit != 'true'
+        run: bun install --frozen-lockfile
+      - name: Eval SKILL.md
+        if: steps.detect-fork.outputs.is_fork != 'true'
+        run: bun run eval:skill
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+      # always(): upload the results file even when the eval step failed.
+      - name: Upload eval results
+        if: always() && steps.detect-fork.outputs.is_fork != 'true'
+        uses: actions/upload-artifact@v7
+        with:
+          name: skill-eval-results
+          path: test/skill-eval/results.json
+
   lint:
     name: Lint & Typecheck
     needs: [changes]
@@ -493,15 +547,15 @@ jobs:
   ci-status:
     name: CI Status
     if: always()
-    needs: [changes, check-skill, build-binary, build-npm, build-docs, test-e2e, publish-nightly]
+    needs: [changes, check-skill, eval-skill, build-binary, build-npm, build-docs, test-e2e, publish-nightly]
     runs-on: ubuntu-latest
     permissions: {}
     steps:
       - name: Check CI status
         run: |
           # Check for explicit failures or cancellations in all jobs
           # publish-nightly is skipped on PRs (if: github.ref == 'refs/heads/main') — that's expected
-          results="${{ needs.check-skill.result }} ${{ needs.build-binary.result }} ${{ needs.build-npm.result }} ${{ needs.build-docs.result }} ${{ needs.test-e2e.result }} ${{ needs.publish-nightly.result }}"
+          results="${{ needs.check-skill.result }} ${{ needs.eval-skill.result }} ${{ needs.build-binary.result }} ${{ needs.build-npm.result }} ${{ needs.build-docs.result }} ${{ needs.test-e2e.result }} ${{ needs.publish-nightly.result }}"
           for result in $results; do
             if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then
               echo "::error::CI failed"
@@ -519,5 +573,9 @@ jobs:
             echo "::error::CI failed - upstream job failed causing check-skill to be skipped"
             exit 1
           fi
+          if [[ "${{ needs.changes.outputs.skill }}" == "true" && "${{ needs.eval-skill.result }}" == "skipped" ]]; then
+            echo "::error::CI failed - upstream job failed causing eval-skill to be skipped"
+            exit 1
+          fi
 
           echo "CI passed"
@@ -0,0 +1,97 @@
+# Runs the skill eval for pull requests that come from forks, on demand.
+name: Eval SKILL.md (Fork PRs)
+
+on:
+  pull_request_target:
+    # labeled: a label was added to the PR; synchronize: the PR head changed.
+    types:
+      - labeled
+      - synchronize
+
+# Token scopes for the jobs below: read the repository, write commit
+# statuses, and edit PR labels.
+permissions:
+  contents: read
+  pull-requests: write
+  statuses: write
+
+jobs:
+  # When the PR head changes, drop both eval labels so the eval has to be
+  # requested again for the new commit.
+  remove-labels-on-sync:
+    name: Reset eval labels
+    runs-on: ubuntu-latest
+    if: github.event.action == 'synchronize'
+    steps:
+      - name: Remove eval labels
+        env:
+          GH_TOKEN: ${{ github.token }}
+          PR: ${{ github.event.number }}
+          REPO: ${{ github.repository }}
+        run: |
+          # Best effort: failures (e.g. the label is not on the PR) are ignored.
+          for label in eval-skill eval-skill-passed; do
+            gh api --method DELETE "repos/$REPO/issues/$PR/labels/$label" 2>/dev/null || true
+          done
+
+  # Runs the eval against a fork PR once the 'eval-skill' label is added,
+  # records the result as an "eval-skill/fork" commit status on the PR head,
+  # and on success adds the 'eval-skill-passed' label.
+  eval:
+    name: Run skill eval
+    if: >-
+      github.event.action == 'labeled'
+      && github.event.label.name == 'eval-skill'
+      && github.event.pull_request.head.repo.fork == true
+    runs-on: ubuntu-latest
+    steps:
+      # This checks out and later executes code from the fork. Do not leave
+      # the job token behind in the checkout where that code could read it.
+      - uses: actions/checkout@v6
+        with:
+          ref: ${{ github.event.pull_request.head.sha }}
+          persist-credentials: false
+
+      - uses: oven-sh/setup-bun@v2
+
+      # Restore-only: this job must never save a node_modules cache, because
+      # its contents are produced alongside fork code and the key is shared
+      # with other workflows.
+      - uses: actions/cache/restore@v5
+        id: cache
+        with:
+          path: node_modules
+          key: node-modules-${{ hashFiles('bun.lock', 'patches/**') }}
+      - if: steps.cache.outputs.cache-hit != 'true'
+        run: bun install --frozen-lockfile
+
+      # continue-on-error keeps the job going so the result can be reported
+      # as a commit status below; later steps branch on steps.eval.outcome.
+      - name: Eval SKILL.md
+        id: eval
+        run: bun run eval:skill
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+        continue-on-error: true
+
+      # NOTE(review): the steps below use write-capable tokens on the same
+      # runner that just executed fork code; consider moving them into a
+      # separate job — confirm against the intended trust model.
+      - name: Post commit status
+        env:
+          GH_TOKEN: ${{ github.token }}
+          REPO: ${{ github.repository }}
+          SHA: ${{ github.event.pull_request.head.sha }}
+          EVAL_OUTCOME: ${{ steps.eval.outcome }}
+        run: |
+          if [[ "$EVAL_OUTCOME" == "success" ]]; then
+            STATE="success"
+            DESC="Skill eval passed"
+          else
+            STATE="failure"
+            DESC="Skill eval failed"
+          fi
+          gh api "repos/$REPO/statuses/$SHA" \
+            -f state="$STATE" \
+            -f context="eval-skill/fork" \
+            -f description="$DESC"
+
+      # Best effort, and always attempted: the trigger label is removed
+      # whether or not the earlier steps succeeded.
+      - name: Remove eval-skill label
+        if: always()
+        env:
+          GH_TOKEN: ${{ github.token }}
+          REPO: ${{ github.repository }}
+          PR: ${{ github.event.number }}
+        run: |
+          gh api "repos/$REPO/issues/$PR/labels/eval-skill" \
+            -X DELETE 2>/dev/null || true
+
+      # Use the SENTRY_RELEASE_BOT app token to add the label — app tokens
+      # can trigger workflow runs, unlike GITHUB_TOKEN (recursion protection).
+      - name: Get app token
+        id: token
+        if: steps.eval.outcome == 'success'
+        uses: actions/create-github-app-token@v3
+        with:
+          app-id: ${{ vars.SENTRY_RELEASE_BOT_CLIENT_ID }}
+          private-key: ${{ secrets.SENTRY_RELEASE_BOT_PRIVATE_KEY }}
+
+      - name: Add eval-skill-passed label (triggers main CI re-run)
+        if: steps.eval.outcome == 'success'
+        env:
+          GH_TOKEN: ${{ steps.token.outputs.token }}
+          REPO: ${{ github.repository }}
+          PR: ${{ github.event.number }}
+        run: |
+          gh api "repos/$REPO/issues/$PR/labels" \
+            --input - <<< '{"labels":["eval-skill-passed"]}'
@@ -18,6 +18,7 @@ coverage-isolated
 
 # test artifacts
 *.junit.xml
+test/skill-eval/results.json
 
 # logs
 logs