Skip to content

Review: PR #3284

Review: PR #3284 #2963

Workflow file for this run

name: "Impl: Review"
run-name: "Review: PR #${{ inputs.pr_number || github.event.client_payload.pr_number }}"
# AI quality review for implementation PRs
# Triggered by impl-generate.yml after PR creation
# Last updated: 2025-12-23
on:
workflow_dispatch:
inputs:
pr_number:
description: 'PR number to review'
required: true
type: string
repository_dispatch:
types: [review-pr]
jobs:
review:
runs-on: ubuntu-latest
permissions:
contents: write # Needed for pushing quality score to PR branch
pull-requests: write
issues: write
id-token: write
actions: write
steps:
- name: Checkout repository
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Extract PR info
id: pr
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ inputs.pr_number || github.event.client_payload.pr_number }}
run: |
PR_DATA=$(gh pr view "$PR_NUMBER" --json headRefName,headRefOid,body)
HEAD_REF=$(echo "$PR_DATA" | jq -r '.headRefName')
HEAD_SHA=$(echo "$PR_DATA" | jq -r '.headRefOid')
BODY=$(echo "$PR_DATA" | jq -r '.body')
# Extract spec-id and library from branch: implementation/{spec-id}/{library}
SPEC_ID=$(echo "$HEAD_REF" | cut -d'/' -f2)
LIBRARY=$(echo "$HEAD_REF" | cut -d'/' -f3)
# Extract issue number from PR body
ISSUE_NUMBER=$(echo "$BODY" | grep -oP '\*\*Parent Issue:\*\* #\K\d+' | head -1 || echo "")
echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT
echo "specification_id=$SPEC_ID" >> $GITHUB_OUTPUT
echo "library=$LIBRARY" >> $GITHUB_OUTPUT
echo "branch=$HEAD_REF" >> $GITHUB_OUTPUT
echo "head_sha=$HEAD_SHA" >> $GITHUB_OUTPUT
echo "issue_number=$ISSUE_NUMBER" >> $GITHUB_OUTPUT
echo "::notice::Reviewing PR #$PR_NUMBER for $LIBRARY implementation of $SPEC_ID (branch: $HEAD_REF)"
- name: Checkout PR code
run: |
git fetch origin ${{ steps.pr.outputs.head_sha }}
git checkout ${{ steps.pr.outputs.head_sha }}
- name: Check attempt count
id: attempts
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ steps.pr.outputs.pr_number }}
run: |
LABELS=$(gh pr view "$PR_NUMBER" --json labels -q '.labels[].name' 2>/dev/null || echo "")
if echo "$LABELS" | grep -q "ai-attempt-3"; then
echo "count=3" >> $GITHUB_OUTPUT
echo "display=3" >> $GITHUB_OUTPUT
elif echo "$LABELS" | grep -q "ai-attempt-2"; then
echo "count=2" >> $GITHUB_OUTPUT
echo "display=3" >> $GITHUB_OUTPUT
elif echo "$LABELS" | grep -q "ai-attempt-1"; then
echo "count=1" >> $GITHUB_OUTPUT
echo "display=2" >> $GITHUB_OUTPUT
else
echo "count=0" >> $GITHUB_OUTPUT
echo "display=1" >> $GITHUB_OUTPUT
fi
- name: Setup GCS authentication
id: gcs
continue-on-error: true
uses: google-github-actions/auth@v3
with:
credentials_json: ${{ secrets.GCS_CREDENTIALS }}
- name: Setup gcloud CLI
if: steps.gcs.outcome == 'success'
uses: google-github-actions/setup-gcloud@v3
- name: Download plot images from staging
if: steps.gcs.outcome == 'success'
env:
SPEC_ID: ${{ steps.pr.outputs.specification_id }}
LIBRARY: ${{ steps.pr.outputs.library }}
run: |
mkdir -p plot_images
gsutil -m cp "gs://pyplots-images/staging/${SPEC_ID}/${LIBRARY}/*" plot_images/ 2>/dev/null || true
ls -la plot_images/
- name: Verify plot image exists
run: |
if [ ! -f "plot_images/plot.png" ]; then
echo "::error::No plot.png found in staging - cannot review without image"
echo "::error::The impl-generate workflow may have failed to produce an image"
exit 1
fi
echo "::notice::Found plot.png for review"
- name: React with eyes emoji
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ steps.pr.outputs.pr_number }}
run: |
gh api "repos/${{ github.repository }}/issues/$PR_NUMBER/reactions" -f content=eyes
- name: Run AI Quality Review
id: review
continue-on-error: true
timeout-minutes: 30
uses: anthropics/claude-code-action@v1
with:
claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
claude_args: "--model opus"
prompt: |
## Task: AI Quality Review for **${{ steps.pr.outputs.library }}** (Attempt ${{ steps.attempts.outputs.display }}/3)
Review the implementation and evaluate if it meets quality standards.
### Your Task
1. **Read the specification**: `plots/${{ steps.pr.outputs.specification_id }}/specification.md`
2. **Read the implementation**:
`plots/${{ steps.pr.outputs.specification_id }}/implementations/${{ steps.pr.outputs.library }}.py`
3. **Read library rules**: `prompts/library/${{ steps.pr.outputs.library }}.md`
4. **Read impl-tags guide**: `prompts/impl-tags-generator.md` (for step 8)
5. **MANDATORY: View the plot image**
- You MUST use the Read tool to open `plot_images/plot.png`
- Visually analyze the image - this is critical for the review
- DO NOT skip this step - a review without seeing the image is invalid
- If the image cannot be read, STOP and report the error
6. **Evaluate against quality criteria** from `prompts/quality-criteria.md`
7. **Post verdict as PR comment** on PR #${{ steps.pr.outputs.pr_number }}:
```markdown
## AI Review - Attempt ${{ steps.attempts.outputs.display }}/3
### Image Description
> Describe what you see in the plot: colors used, axis labels, title, data representation, overall layout.
> This proves you actually looked at the image.
### Quality Score: XX/100
### Criteria Checklist
**Visual Quality (40 pts)**
- [ ] VQ-01: Text Legibility (10) - all text readable at full size
- [ ] VQ-02: No Overlap (8) - no overlapping text
- [ ] VQ-03: Element Visibility (8) - markers/lines sized for data density
- [ ] VQ-04: Color Accessibility (5) - colorblind-safe
- [ ] VQ-05: Layout Balance (5) - good proportions
**Spec Compliance (25 pts)**
- [ ] SC-01: Plot Type (8) - correct chart type
- [ ] SC-02: Data Mapping (5) - X/Y correctly assigned
- [ ] SC-03: Required Features (5) - all spec features present
- [ ] SC-06: Title Format (2) - uses {spec-id} · {library} · pyplots.ai
**Data Quality (20 pts)**
- [ ] DQ-01: Feature Coverage (8) - shows ALL aspects of plot type
- [ ] DQ-02: Realistic Context (7) - plausible scenario
- [ ] DQ-03: Appropriate Scale (5) - sensible values
**Code Quality (10 pts)**
- [ ] CQ-01: KISS Structure (3) - no functions/classes
- [ ] CQ-02: Reproducibility (3) - fixed seed
**Library Features (5 pts)**
- [ ] LF-01: Uses distinctive library features
### Strengths
- Strength 1 (keep these aspects)
- Strength 2
### Weaknesses
- Weakness 1 (AI will fix these - let it decide HOW)
### Verdict: APPROVED / REJECTED
```
8. **Save review data to files** (for the workflow to parse):
```bash
echo "XX" > quality_score.txt
# Save structured feedback as JSON (one array per file)
echo '["Strength 1", "Strength 2"]' > review_strengths.json
echo '["Weakness 1"]' > review_weaknesses.json
# Save verdict
echo "APPROVED" > review_verdict.txt # or "REJECTED"
# Save image description (multi-line text)
cat > review_image_description.txt << 'EOF'
The plot shows a scatter plot with blue markers...
[Your full image description here]
EOF
# Save criteria checklist as structured JSON
cat > review_checklist.json << 'EOF'
{
"visual_quality": {
"score": 36,
"max": 40,
"items": [
{"id": "VQ-01", "name": "Text Legibility", "score": 10, "max": 10, "passed": true, "comment": "All text readable"},
{"id": "VQ-02", "name": "No Overlap", "score": 8, "max": 8, "passed": true, "comment": "No overlapping elements"}
]
},
"spec_compliance": {"score": 23, "max": 25, "items": [...]},
"data_quality": {"score": 18, "max": 20, "items": [...]},
"code_quality": {"score": 10, "max": 10, "items": [...]},
"library_features": {"score": 5, "max": 5, "items": [...]}
}
EOF
```
9. **Generate impl_tags** (based on prompts/impl-tags-generator.md):
Analyze the implementation code and create impl_tags with 5 dimensions:
- `dependencies`: External packages beyond numpy/pandas/plotting library
- `techniques`: Visualization techniques (twin-axes, colorbar, etc.)
- `patterns`: Code patterns (data-generation, iteration-over-groups, etc.)
- `dataprep`: Data transformations (kde, binning, correlation-matrix, etc.)
- `styling`: Visual style (publication-ready, alpha-blending, etc.)
```bash
cat > review_impl_tags.json << 'EOF'
{
"dependencies": [],
"techniques": ["colorbar", "annotations"],
"patterns": ["data-generation"],
"dataprep": [],
"styling": ["publication-ready"]
}
EOF
```
10. **DO NOT add ai-approved or ai-rejected labels** - the workflow will add them after updating metadata.
**IMPORTANT**: Your review MUST include the "Image Description" section. A review without an image description will be considered invalid.
**IMPORTANT**: All review data (strengths, weaknesses, image_description, criteria_checklist) is saved to metadata for future regeneration. Be specific!
- name: Extract quality score
id: score
if: steps.review.conclusion == 'success'
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUM: ${{ steps.pr.outputs.pr_number }}
run: |
if [ -f "quality_score.txt" ]; then
SCORE=$(cat quality_score.txt | tr -d '[:space:]')
else
SCORE=$(gh pr view "$PR_NUM" --json comments -q '.comments[-1].body' | grep -oP 'Score: \K\d+' | head -1 || echo "0")
fi
# Validate score is a number between 1-100, default to 0 if invalid
if ! [[ "$SCORE" =~ ^[0-9]+$ ]] || [ "$SCORE" -lt 1 ] || [ "$SCORE" -gt 100 ]; then
echo "::warning::Invalid quality score '$SCORE', defaulting to 0"
SCORE="0"
fi
echo "score=$SCORE" >> $GITHUB_OUTPUT
- name: Add quality score label
if: steps.review.conclusion == 'success' && steps.score.outputs.score != '0'
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUM: ${{ steps.pr.outputs.pr_number }}
SCORE: ${{ steps.score.outputs.score }}
run: |
LABEL="quality:${SCORE}"
gh label create "$LABEL" --color "0e8a16" --description "Quality score ${SCORE}/100" 2>/dev/null || true
gh pr edit "$PR_NUM" --add-label "$LABEL"
- name: Install Python dependencies for metadata update
if: steps.review.conclusion == 'success' && steps.score.outputs.score != '0'
run: pip install pyyaml
- name: Update metadata and implementation header
if: steps.review.conclusion == 'success' && steps.score.outputs.score != '0'
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
SPEC_ID: ${{ steps.pr.outputs.specification_id }}
LIBRARY: ${{ steps.pr.outputs.library }}
SCORE: ${{ steps.score.outputs.score }}
BRANCH: ${{ steps.pr.outputs.branch }}
run: |
METADATA_FILE="plots/${SPEC_ID}/metadata/${LIBRARY}.yaml"
IMPL_FILE="plots/${SPEC_ID}/implementations/${LIBRARY}.py"
# Configure git auth and checkout the PR branch
git remote set-url origin "https://x-access-token:${GH_TOKEN}@github.com/${{ github.repository }}.git"
git fetch origin "$BRANCH"
git checkout -B "$BRANCH" "origin/$BRANCH"
# Update metadata file with quality score, timestamp, and review feedback
if [ -f "$METADATA_FILE" ]; then
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
# Write Python script to temp file to avoid YAML/shell escaping issues
cat > /tmp/update_metadata.py << 'EOF'
import yaml
import json
import sys
from pathlib import Path
metadata_file = sys.argv[1]
score = int(sys.argv[2])
timestamp = sys.argv[3]
# Read existing review data files
strengths = []
weaknesses = []
image_description = None
criteria_checklist = None
verdict = None
impl_tags = None
if Path('review_strengths.json').exists():
try:
with open('review_strengths.json') as f:
strengths = json.load(f)
except:
pass
if Path('review_weaknesses.json').exists():
try:
with open('review_weaknesses.json') as f:
weaknesses = json.load(f)
except:
pass
if Path('review_image_description.txt').exists():
try:
with open('review_image_description.txt') as f:
image_description = f.read().strip()
except:
pass
if Path('review_checklist.json').exists():
try:
with open('review_checklist.json') as f:
criteria_checklist = json.load(f)
except:
pass
if Path('review_verdict.txt').exists():
try:
with open('review_verdict.txt') as f:
verdict = f.read().strip()
except:
pass
if Path('review_impl_tags.json').exists():
try:
with open('review_impl_tags.json') as f:
impl_tags = json.load(f)
except:
pass
# Load existing metadata
with open(metadata_file, 'r') as f:
data = yaml.safe_load(f)
data['quality_score'] = score
data['updated'] = timestamp
if 'review' not in data:
data['review'] = {}
# Update review section with all fields
data['review']['strengths'] = strengths
data['review']['weaknesses'] = weaknesses
# Add extended review data (issue #2845)
if image_description:
data['review']['image_description'] = image_description
if criteria_checklist:
data['review']['criteria_checklist'] = criteria_checklist
if verdict:
data['review']['verdict'] = verdict
# Add impl_tags (issue #2434)
if impl_tags:
data['impl_tags'] = impl_tags
def str_representer(dumper, data):
if isinstance(data, str) and data.endswith('Z') and 'T' in data:
return dumper.represent_scalar('tag:yaml.org,2002:str', data, style="'")
# Use literal block style for multi-line strings (image_description)
if isinstance(data, str) and '\n' in data:
return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
return dumper.represent_scalar('tag:yaml.org,2002:str', data)
yaml.add_representer(str, str_representer)
with open(metadata_file, 'w') as f:
yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True)
EOF
python3 /tmp/update_metadata.py "$METADATA_FILE" "$SCORE" "$TIMESTAMP"
echo "::notice::Updated metadata with quality score ${SCORE} and extended review data"
fi
# Update implementation header with quality score
if [ -f "$IMPL_FILE" ]; then
# Get library and python versions from metadata
LIBRARY_VERSION=$(python3 -c "import yaml; print(yaml.safe_load(open('$METADATA_FILE'))['library_version'])" 2>/dev/null || echo "unknown")
PYTHON_VERSION=$(python3 -c "import yaml; print(yaml.safe_load(open('$METADATA_FILE'))['python_version'])" 2>/dev/null || echo "3.13")
CREATED_DATE=$(python3 -c "import yaml; print(yaml.safe_load(open('$METADATA_FILE'))['created'][:10])" 2>/dev/null || echo "$(date +%Y-%m-%d)")
# Get title from specification.yaml
TITLE=$(python3 -c "import yaml; print(yaml.safe_load(open('plots/${SPEC_ID}/specification.yaml'))['title'])" 2>/dev/null || echo "${SPEC_ID}")
# Replace old header using Python
python3 -c "
import re
impl_file = '$IMPL_FILE'
spec_id = '${SPEC_ID}'
title = '''${TITLE}'''
library = '${LIBRARY}'
lib_version = '${LIBRARY_VERSION}'
py_version = '${PYTHON_VERSION}'
score = '${SCORE}'
created = '${CREATED_DATE}'
new_header = f'''\"\"\" pyplots.ai
{spec_id}: {title}
Library: {library} {lib_version} | Python {py_version}
Quality: {score}/100 | Created: {created}
\"\"\"'''
with open(impl_file, 'r') as f:
content = f.read()
pattern = r'^\"\"\".*?\"\"\"'
new_content = re.sub(pattern, new_header, content, count=1, flags=re.DOTALL)
with open(impl_file, 'w') as f:
f.write(new_content)
"
echo "::notice::Updated implementation header with quality score ${SCORE}"
fi
# Commit and push
git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com"
git add "$METADATA_FILE" "$IMPL_FILE"
if ! git diff --cached --quiet; then
git commit -m "chore(${LIBRARY}): update quality score ${SCORE} and review feedback for ${SPEC_ID}"
git push origin "$BRANCH"
echo "::notice::Changes committed to ${BRANCH}"
fi
- name: Handle review failure
if: steps.attempts.outputs.count != '3' && steps.review.conclusion == 'failure'
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUM: ${{ steps.pr.outputs.pr_number }}
run: |
gh pr edit "$PR_NUM" --add-label "ai-review-failed"
gh pr comment "$PR_NUM" --body "## :warning: AI Review Failed
The AI review action failed or timed out.
**Options:**
1. Re-run the workflow manually
2. Request manual human review
---
:robot: *[impl-review](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})*"
- name: Add verdict label and take action
if: steps.review.conclusion == 'success' && steps.score.outputs.score != '0'
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUM: ${{ steps.pr.outputs.pr_number }}
SPEC_ID: ${{ steps.pr.outputs.specification_id }}
LIBRARY: ${{ steps.pr.outputs.library }}
SCORE: ${{ steps.score.outputs.score }}
ATTEMPT: ${{ steps.attempts.outputs.display }}
ATTEMPT_COUNT: ${{ steps.attempts.outputs.count }}
ISSUE_NUMBER: ${{ steps.pr.outputs.issue_number }}
run: |
# Score thresholds:
# >= 90: Excellent (approved, merge immediately)
# < 90: Rejected (repair loop, up to 3 attempts)
# After 3 attempts: >= 50 = merge, < 50 = close PR
if [ "$SCORE" -ge 90 ]; then
# Approved - merge immediately
gh pr edit "$PR_NUM" --add-label "ai-approved"
echo "::notice::Added ai-approved label (score $SCORE >= 90)"
echo "Triggering impl-merge.yml for approved PR"
gh workflow run impl-merge.yml -f pr_number="$PR_NUM"
elif [ "$ATTEMPT_COUNT" -eq 3 ]; then
# This is the 4th review (after 3 repair attempts)
if [ "$SCORE" -ge 50 ]; then
# Score 50-89 after 3 attempts: acceptable, merge to repo
gh pr edit "$PR_NUM" --add-label "ai-approved"
gh pr comment "$PR_NUM" --body "## AI Review - Final Status
### Score: ${SCORE}/100 (Acceptable)
After **3 repair attempts**, ${LIBRARY} reached ${SCORE}/100.
Score ≥ 50 is acceptable for the repository. Merging.
---
:robot: *[impl-review](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})*"
echo "Triggering impl-merge.yml for acceptable PR"
gh workflow run impl-merge.yml -f pr_number="$PR_NUM"
else
# Score < 50 after 3 attempts: not acceptable, close PR
gh pr edit "$PR_NUM" --add-label "quality-poor"
gh pr comment "$PR_NUM" --body "## AI Review - Final Status
### Score: ${SCORE}/100 (Poor Quality)
After **3 repair attempts**, the score is still below 50. This is not acceptable for the repository.
**This is treated like an auto-reject.** The PR will be closed and any old implementation will be removed from main.
**Options:**
1. Regenerate from scratch with \`generate:${LIBRARY}\` label
2. Manual complete rewrite
---
:robot: *[impl-review](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})*"
gh pr close "$PR_NUM"
# Remove old implementation from main if it exists
IMPL_FILE="plots/${SPEC_ID}/implementations/${LIBRARY}.py"
META_FILE="plots/${SPEC_ID}/metadata/${LIBRARY}.yaml"
git fetch origin main
if git show origin/main:"$IMPL_FILE" &>/dev/null; then
echo "::notice::Removing old implementation from main: $IMPL_FILE"
git checkout main
git pull origin main
# Delete implementation and metadata
rm -f "$IMPL_FILE" "$META_FILE"
# Commit and push
git add -A
git commit -m "chore(${LIBRARY}): remove ${SPEC_ID} impl (score ${SCORE} < 50 after 3 attempts)"
git push origin main
echo "::notice::Old implementation removed from main"
else
echo "::notice::No old implementation found on main to remove"
fi
if [ -n "$ISSUE_NUMBER" ]; then
gh issue edit "$ISSUE_NUMBER" --add-label "impl:${LIBRARY}:failed" 2>/dev/null || true
gh issue comment "$ISSUE_NUMBER" --body "**${LIBRARY}** implementation: score ${SCORE}/100 after 3 attempts (< 50 = rejected). PR #${PR_NUM} closed. Old implementation removed from repository."
fi
fi
else
# Score < 90, still have repair attempts left
gh pr edit "$PR_NUM" --add-label "ai-rejected"
if [ "$SCORE" -lt 50 ]; then
gh pr edit "$PR_NUM" --add-label "quality-poor"
echo "::notice::Added ai-rejected + quality-poor labels (score $SCORE < 50)"
else
echo "::notice::Added ai-rejected label (score $SCORE: 50-89)"
fi
echo "Triggering impl-repair.yml for rejected PR (attempt $ATTEMPT)"
gh pr edit "$PR_NUM" --add-label "ai-attempt-${ATTEMPT}" 2>/dev/null || true
gh workflow run impl-repair.yml \
-f pr_number="$PR_NUM" \
-f specification_id="$SPEC_ID" \
-f library="$LIBRARY" \
-f attempt="$ATTEMPT"
fi