# Source: anyplot/.github/workflows/impl-review.yml at 4cecc77192a6463eefac30812e5ddf93e706f193 (MarkusNeusinger/anyplot)
name: 'Impl: Review'
run-name: 'Review: PR #${{ inputs.pr_number }}'

# Runs an AI quality review against an implementation PR.
# Dispatched by impl-generate.yml once the PR has been created.

on:
  workflow_dispatch:
    inputs:
      # The only input: which pull request to review.
      pr_number:
        description: "PR number to review"
        required: true
        type: string

jobs:
  review:
    runs-on: ubuntu-latest
    # Scopes granted to GITHUB_TOKEN for this job.
    permissions:
      actions: write  # later steps dispatch other workflows via `gh workflow run`
      contents: write  # Needed for pushing quality score to PR branch
      id-token: write
      issues: write  # later steps label and comment on the parent issue
      pull-requests: write  # later steps label and comment on the PR

    steps:
      # fetch-depth 0 requests the full history instead of a shallow clone.
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      # Resolves the PR number into branch, head commit, spec id, library and
      # parent issue, and publishes them as step outputs for the rest of the job.
      - name: Extract PR info
        id: pr
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUMBER: ${{ inputs.pr_number }}
        run: |
          # One API call for everything we need from the PR.
          pr_json=$(gh pr view "$PR_NUMBER" --json headRefName,headRefOid,body)
          head_ref=$(jq -r '.headRefName' <<< "$pr_json")
          head_sha=$(jq -r '.headRefOid' <<< "$pr_json")
          pr_body=$(jq -r '.body' <<< "$pr_json")

          # Branch layout is implementation/{spec-id}/{library}: fields 2 and 3.
          spec_id=$(cut -d'/' -f2 <<< "$head_ref")
          library=$(cut -d'/' -f3 <<< "$head_ref")

          # The PR body names its parent issue as "**Parent Issue:** #<n>";
          # the result is empty when no such line is present.
          issue_number=$(grep -oP '\*\*Parent Issue:\*\* #\K\d+' <<< "$pr_body" | head -1 || echo "")

          {
            echo "pr_number=$PR_NUMBER"
            echo "specification_id=$spec_id"
            echo "library=$library"
            echo "branch=$head_ref"
            echo "head_sha=$head_sha"
            echo "issue_number=$issue_number"
          } >> "$GITHUB_OUTPUT"

          echo "::notice::Reviewing PR #$PR_NUMBER for $library implementation of $spec_id (branch: $head_ref)"

      # Switches the working tree to the exact head commit reported by the
      # "Extract PR info" step, so the review reads the PR's files.
      # The SHA is passed through the environment instead of being expanded
      # into the script text, matching how every other step receives its inputs
      # and keeping expression values out of the shell source.
      - name: Checkout PR code
        env:
          HEAD_SHA: ${{ steps.pr.outputs.head_sha }}
        run: |
          git fetch origin "$HEAD_SHA"
          git checkout "$HEAD_SHA"

      # Derives the number of previous review attempts from the PR's
      # ai-attempt-N labels.
      #   count   - highest attempt label found (0 when none / lookup failed)
      #   display - attempt number shown in the review, i.e. count + 1 capped at 3
      - name: Check attempt count
        id: attempts
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUMBER: ${{ steps.pr.outputs.pr_number }}
        run: |
          # A failed label lookup is treated the same as "no labels".
          labels=$(gh pr view "$PR_NUMBER" --json labels -q '.labels[].name' 2>/dev/null || echo "")

          # Highest attempt label wins; patterns are tested top to bottom.
          case "$labels" in
            *ai-attempt-3*) count=3; display=3 ;;
            *ai-attempt-2*) count=2; display=3 ;;
            *ai-attempt-1*) count=1; display=2 ;;
            *)              count=0; display=1 ;;
          esac

          echo "count=$count" >> "$GITHUB_OUTPUT"
          echo "display=$display" >> "$GITHUB_OUTPUT"

      # Google Cloud access is best-effort: if authentication fails the job
      # continues, and the two dependent steps below are skipped.
      - name: Setup GCS authentication
        id: gcs
        uses: google-github-actions/auth@v2
        continue-on-error: true
        with:
          credentials_json: ${{ secrets.GCS_CREDENTIALS }}

      - name: Setup gcloud CLI
        uses: google-github-actions/setup-gcloud@v2
        if: steps.gcs.outcome == 'success'

      # Copies the staged renders for this spec/library into ./plot_images.
      - name: Download plot images from staging
        if: steps.gcs.outcome == 'success'
        env:
          SPEC_ID: ${{ steps.pr.outputs.specification_id }}
          LIBRARY: ${{ steps.pr.outputs.library }}
        run: |
          mkdir -p plot_images

          # A failed copy is reported but does not fail the step.
          staging_glob="gs://pyplots-images/staging/${SPEC_ID}/${LIBRARY}/*"
          if ! gsutil -m cp "$staging_glob" plot_images/ 2>/dev/null; then
            echo "No images found"
          fi

          # Log what was downloaded, for debugging.
          if ! ls -la plot_images/ 2>/dev/null; then
            echo "Empty"
          fi

      # Adds an "eyes" reaction to the PR to signal that a review has started.
      # Skipped once the attempt counter has reached 3.
      - name: React with eyes emoji
        if: steps.attempts.outputs.count != '3'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUMBER: ${{ steps.pr.outputs.pr_number }}
        run: |
          # GITHUB_REPOSITORY is the runner-provided owner/name of this repository.
          gh api "repos/${GITHUB_REPOSITORY}/issues/${PR_NUMBER}/reactions" -f content=eyes

      # Hands the review to the Claude Code action. The prompt asks the model to
      # read the specification, implementation and library rules, inspect the
      # images in plot_images/, post a verdict comment on the PR and write the
      # numeric score to quality_score.txt. Skipped once the attempt counter
      # has reached 3.
      - name: Run AI Quality Review
        id: review
        if: steps.attempts.outputs.count != '3'
        # A failing review must not abort the job, so later steps can react.
        # NOTE(review): with continue-on-error the step's `conclusion` is reported
        # as 'success' even when it fails; `steps.review.outcome` is the field
        # that carries the real result.
        continue-on-error: true
        timeout-minutes: 30
        uses: anthropics/claude-code-action@v1
        with:
          claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
          claude_args: "--model opus"
          # Everything below `prompt: |` is sent to the model verbatim (after
          # expression substitution).
          prompt: |
            ## Task: AI Quality Review for **${{ steps.pr.outputs.library }}** (Attempt ${{ steps.attempts.outputs.display }}/3)

            Review the implementation and evaluate if it meets quality standards.

            ### Your Task

            1. **Read the specification**: `plots/${{ steps.pr.outputs.specification_id }}/specification.md`

            2. **Read the implementation**:
               `plots/${{ steps.pr.outputs.specification_id }}/implementations/${{ steps.pr.outputs.library }}.py`

            3. **Read library rules**: `prompts/library/${{ steps.pr.outputs.library }}.md`

            4. **View plot images** in `plot_images/` directory
               - Use vision to analyze each image
               - Compare with spec requirements

            5. **Evaluate against quality criteria** from `prompts/quality-criteria.md`

            6. **Post verdict as PR comment** on PR #${{ steps.pr.outputs.pr_number }}:

            ```markdown
            ## AI Review - Attempt ${{ steps.attempts.outputs.display }}/3

            ### Quality Score: XX/100

            ### Criteria Checklist
            - [x] VQ-001: Axes labeled correctly
            - [x] VQ-002: Grid is subtle
            - [ ] VQ-003: Elements clear
            ...

            ### Issues Found
            1. **Issue**: Description
            2. **Issue**: Description

            ### AI Feedback
            > Specific suggestions for improvement

            ### Verdict: APPROVED / REJECTED
            ```

            7. **Save score to file**:
               ```bash
               echo "XX" > quality_score.txt
               ```

            8. **DO NOT add ai-approved or ai-rejected labels** - the workflow will add them after updating metadata.

      # Publishes the review's numeric score as the `score` output.
      # Source of truth is quality_score.txt written by the review; if it is
      # missing, the score is scraped from the most recent PR comment.
      # The output is always an integer 0-100; 0 means "no usable score", which
      # downstream steps treat as "do nothing".
      #
      # The gate uses `outcome`, not `conclusion`: the review step sets
      # continue-on-error, so its conclusion is 'success' even when it failed.
      - name: Extract quality score
        id: score
        if: steps.attempts.outputs.count != '3' && steps.review.outcome == 'success'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUM: ${{ steps.pr.outputs.pr_number }}
        run: |
          if [ -f "quality_score.txt" ]; then
            SCORE=$(tr -d '[:space:]' < quality_score.txt)
          else
            # Without pipefail the pipeline's status is that of `head`, so a
            # missing match yields an empty string rather than an error; the
            # validation below turns that into 0.
            SCORE=$(gh pr view "$PR_NUM" --json comments -q '.comments[-1].body' | grep -oP 'Score: \K\d+' | head -1 || true)
          fi

          # Accept only a 1-3 digit integer no greater than 100.
          if [[ "$SCORE" =~ ^[0-9]{1,3}$ ]] && [ "$((10#$SCORE))" -le 100 ]; then
            # Force base 10 so values such as "085" are normalised to "85".
            SCORE=$((10#$SCORE))
          else
            echo "::warning::No valid quality score found (got '${SCORE}'); defaulting to 0"
            SCORE=0
          fi

          echo "score=$SCORE" >> "$GITHUB_OUTPUT"

      # Attaches a `quality:<score>` label to the PR, creating the label first
      # if the repository does not have it yet.
      #
      # Runs only when the review really succeeded (`outcome`, because the
      # review step's `conclusion` is always 'success' under continue-on-error)
      # and a usable score exists. The empty-string check prevents a bare
      # "quality:" label when no score could be extracted.
      - name: Add quality score label
        if: steps.attempts.outputs.count != '3' && steps.review.outcome == 'success' && steps.score.outputs.score != '' && steps.score.outputs.score != '0'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUM: ${{ steps.pr.outputs.pr_number }}
          SCORE: ${{ steps.score.outputs.score }}
        run: |
          LABEL="quality:${SCORE}"
          # Creation fails harmlessly when the label already exists.
          gh label create "$LABEL" --color "0e8a16" --description "Quality score ${SCORE}/100" 2>/dev/null || true
          gh pr edit "$PR_NUM" --add-label "$LABEL"

      # Writes the score into plots/<spec>/metadata/<library>.yaml on the PR
      # branch and pushes the change.
      #
      # Runs only when the review really succeeded (`outcome`, because the
      # review step's `conclusion` is always 'success' under continue-on-error)
      # and a usable, non-empty score exists.
      - name: Update quality score in metadata
        if: steps.attempts.outputs.count != '3' && steps.review.outcome == 'success' && steps.score.outputs.score != '' && steps.score.outputs.score != '0'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SPEC_ID: ${{ steps.pr.outputs.specification_id }}
          LIBRARY: ${{ steps.pr.outputs.library }}
          SCORE: ${{ steps.score.outputs.score }}
          BRANCH: ${{ steps.pr.outputs.branch }}
        run: |
          METADATA_FILE="plots/${SPEC_ID}/metadata/${LIBRARY}.yaml"

          # Configure git auth and checkout the PR branch
          # (the job has been on a detached head commit since "Checkout PR code").
          git remote set-url origin "https://x-access-token:${GH_TOKEN}@github.com/${{ github.repository }}.git"
          git fetch origin "$BRANCH"
          git checkout -B "$BRANCH" "origin/$BRANCH"

          if [ ! -f "$METADATA_FILE" ]; then
            echo "::warning::Metadata file ${METADATA_FILE} not found; quality score not recorded"
            exit 0
          fi

          # Replace either `quality_score: null` or an existing numeric value.
          sed -i -E "s/quality_score: (null|[0-9]+).*/quality_score: ${SCORE}/" "$METADATA_FILE"

          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git add "$METADATA_FILE"

          # Commit only when the file actually changed.
          if git diff --cached --quiet; then
            echo "::notice::Quality score in ${METADATA_FILE} unchanged; nothing to commit"
          else
            git commit -m "chore(${LIBRARY}): set quality score ${SCORE} for ${SPEC_ID}"
            git push origin "$BRANCH"
            echo "::notice::Quality score ${SCORE} committed to ${BRANCH}"
          fi

      # Labels the PR and leaves an explanatory comment when the review action
      # itself failed.
      #
      # The check must be on `outcome`: the review step sets continue-on-error,
      # so its `conclusion` is 'success' even on failure and a test against
      # `conclusion == 'failure'` would never match.
      - name: Handle review failure
        if: steps.attempts.outputs.count != '3' && steps.review.outcome == 'failure'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUM: ${{ steps.pr.outputs.pr_number }}
        run: |
          gh pr edit "$PR_NUM" --add-label "ai-review-failed"
          gh pr comment "$PR_NUM" --body "## :warning: AI Review Failed

          The AI review action failed or timed out.

          **Options:**
          1. Re-run the workflow manually
          2. Request manual human review

          ---
          :robot: *[impl-review](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})*"

      # Turns the score into a verdict and starts the follow-up workflow:
      #   score >= 85 -> label ai-approved, dispatch impl-merge.yml
      #   score <  85 -> label ai-rejected and ai-attempt-<n>, dispatch impl-repair.yml
      #
      # Runs only when the review really succeeded (`outcome`, because the
      # review step's `conclusion` is always 'success' under continue-on-error)
      # and a usable, non-empty score exists.
      #
      # The follow-up is chosen from this run's score rather than by re-reading
      # the PR's labels, so a verdict label left over from an earlier attempt
      # cannot redirect the decision.
      - name: Add verdict label and take action
        if: steps.attempts.outputs.count != '3' && steps.review.outcome == 'success' && steps.score.outputs.score != '' && steps.score.outputs.score != '0'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUM: ${{ steps.pr.outputs.pr_number }}
          SPEC_ID: ${{ steps.pr.outputs.specification_id }}
          LIBRARY: ${{ steps.pr.outputs.library }}
          SCORE: ${{ steps.score.outputs.score }}
          ATTEMPT: ${{ steps.attempts.outputs.display }}
        run: |
          if [ "$SCORE" -ge 85 ]; then
            gh pr edit "$PR_NUM" --add-label "ai-approved"
            echo "::notice::Added ai-approved label (score $SCORE >= 85)"

            echo "Triggering impl-merge.yml for approved PR"
            gh workflow run impl-merge.yml -f pr_number="$PR_NUM"
            echo "::notice::PR approved. Triggered impl-merge.yml"
          else
            gh pr edit "$PR_NUM" --add-label "ai-rejected"
            echo "::notice::Added ai-rejected label (score $SCORE < 85)"

            echo "Triggering impl-repair.yml for rejected PR"
            # Record the attempt; best-effort, a labelling error does not stop the repair.
            gh pr edit "$PR_NUM" --add-label "ai-attempt-${ATTEMPT}" 2>/dev/null || true
            gh workflow run impl-repair.yml \
              -f pr_number="$PR_NUM" \
              -f specification_id="$SPEC_ID" \
              -f library="$LIBRARY" \
              -f attempt="$ATTEMPT"
          fi

      # Terminal path: once the attempt counter has reached 3 no further review
      # is run. The PR is labelled not-feasible and commented on, and, when the
      # PR body named a parent issue, that issue is labelled and notified too.
      - name: Mark as not-feasible after 3 attempts
        if: steps.attempts.outputs.count == '3'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUM: ${{ steps.pr.outputs.pr_number }}
          LIBRARY: ${{ steps.pr.outputs.library }}
          SPEC_ID: ${{ steps.pr.outputs.specification_id }}
          ISSUE_NUMBER: ${{ steps.pr.outputs.issue_number }}
        run: |
          gh pr edit "$PR_NUM" --add-label "not-feasible"

          # Build the comment text first; the heredoc expands ${LIBRARY}.
          final_comment=$(cat <<EOF
          ## AI Review - Final Status

          ### Status: Not Feasible

          AI Review failed after **3 attempts**. This ${LIBRARY} implementation could not meet quality standards.

          **Options:**
          1. Manual review and fix
          2. Mark this library as unsupported for this plot type

          ---
          :robot: *[impl-review](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})*
          EOF
          )
          gh pr comment "$PR_NUM" --body "$final_comment"

          # Nothing more to do when no parent issue is known.
          if [ -z "$ISSUE_NUMBER" ]; then
            exit 0
          fi

          # Labelling the issue is best-effort; the comment is not.
          gh issue edit "$ISSUE_NUMBER" --add-label "impl:${LIBRARY}:failed" 2>/dev/null || true
          gh issue comment "$ISSUE_NUMBER" --body "**${LIBRARY}** implementation failed after 3 AI review attempts. See PR #${PR_NUM}."