Skip to content

Commit d04d929

Browse files
committed
feat: add complete automation infrastructure
Core Components: - Spec template (.template.md) for consistent plot specifications - Example spec: scatter-basic-001.md - Plot generator with Claude + versioned rules + self-review loop - Three GitHub Actions workflows for full automation Workflows: 1. spec-to-code.yml: Auto-generates code when issue gets 'approved' label - Extracts spec from issue - Generates matplotlib + seaborn implementations - Self-review loop (max 3 attempts) - Creates PR automatically 2. test-and-preview.yml: Tests code and generates preview images - Multi-version testing (Python 3.10-3.13) - Generates preview PNGs - Uploads to GCS - Comments on PR with preview links 3. quality-check.yml: AI quality evaluation with Claude Vision - Downloads previews from GCS - Evaluates against spec quality criteria - Scores each implementation (0-100) - Comments with detailed feedback - Adds labels (quality-approved or quality-check-failed) This infrastructure enables: - Complete automation from GitHub Issue → Code → Test → Preview → Quality - Self-review and quality gates built-in - No manual steps required for plot generation - Production-ready code output Next: Test with scatter-basic-001 issue
1 parent b1c85aa commit d04d929

6 files changed

Lines changed: 1105 additions & 0 deletions

File tree

Lines changed: 313 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,313 @@
1+
# Runs an AI quality evaluation (Claude Vision) on plot previews produced by
# the "Test and Generate Previews" workflow, then reports results to the PR.
name: Quality Check

on:
  workflow_run:
    workflows: ["Test and Generate Previews"]
    types: [completed]

jobs:
  quality-check:
    name: AI Quality Evaluation
    runs-on: ubuntu-latest
    # Only evaluate previews coming from a successful test run.
    if: ${{ github.event.workflow_run.conclusion == 'success' }}
    permissions:
      contents: read
      pull-requests: write
      issues: write

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: |
          pip install anthropic requests

      # Cross-run artifact download: run-id + github-token are required
      # because the artifact belongs to the triggering workflow run.
      - name: Download preview metadata
        uses: actions/download-artifact@v4
        with:
          name: preview-metadata
          run-id: ${{ github.event.workflow_run.id }}
          github-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Load metadata
        id: metadata
        run: |
          if [ ! -f preview_metadata.json ]; then
            echo "No preview metadata found, skipping quality check"
            exit 0
          fi

          PR_NUMBER=$(jq -r '.pr_number' preview_metadata.json)
          BUCKET=$(jq -r '.bucket' preview_metadata.json)
          BASE_PATH=$(jq -r '.base_path' preview_metadata.json)

          echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT
          echo "bucket=$BUCKET" >> $GITHUB_OUTPUT
          echo "base_path=$BASE_PATH" >> $GITHUB_OUTPUT

      - name: Setup Google Cloud authentication
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.GCS_CREDENTIALS }}

      - name: Download preview images
        run: |
          mkdir -p preview_images
          gsutil -m cp -r "gs://${{ steps.metadata.outputs.bucket }}/${{ steps.metadata.outputs.base_path }}/*" preview_images/

      - name: Run quality evaluation
        id: quality_eval
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: |
          # Quoted 'EOF' so the shell does not expand anything inside the script.
          cat > quality_evaluator.py <<'EOF'
70+
import os
71+
import sys
72+
import json
73+
import base64
74+
from pathlib import Path
75+
import anthropic
76+
77+
def load_spec(spec_id: str) -> str:
    """Return the markdown spec for *spec_id*, or "" when no spec file exists."""
    spec_file = Path("specs") / f"{spec_id}.md"
    return spec_file.read_text() if spec_file.exists() else ""
83+
84+
def encode_image(image_path: str) -> str:
    """Read the file at *image_path* and return its bytes as a base64 string."""
    raw = Path(image_path).read_bytes()
    return base64.standard_b64encode(raw).decode('utf-8')
88+
89+
def _extract_json(response_text: str) -> dict:
    """Best-effort parse of the model's reply into a dict.

    Tries, in order: a ```json fenced block, any generic ``` fenced block,
    the text as-is, and finally the outermost {...} span — models sometimes
    wrap the JSON in prose without fences, which the original fence-only
    extraction could not handle.

    Returns {"error": ..., "raw": ...} when nothing parses.
    """
    text = response_text
    if "```json" in text:
        text = text.split("```json")[1].split("```")[0].strip()
    elif "```" in text:
        text = text.split("```")[1].split("```")[0].strip()

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Fallback: outermost brace-delimited span of the raw reply.
    start = response_text.find("{")
    end = response_text.rfind("}")
    if 0 <= start < end:
        try:
            return json.loads(response_text[start:end + 1])
        except json.JSONDecodeError:
            pass

    return {"error": "Failed to parse response", "raw": response_text}


def evaluate_plot(spec_id: str, library: str, variant: str, image_path: str) -> dict:
    """Evaluate a single plot implementation with Claude Vision.

    Args:
        spec_id: Spec identifier; the spec is read from ``specs/{spec_id}.md``.
        library: Plotting library used (e.g. "matplotlib", "seaborn").
        variant: Implementation variant name.
        image_path: Path to the rendered preview PNG.

    Returns:
        The parsed evaluation dict (overall_score, verdict, criteria_results,
        ...), or a dict with an "error" key when the spec is missing or the
        model's reply cannot be parsed as JSON.
    """
    client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

    # Load spec; without it there is nothing to evaluate against.
    spec_content = load_spec(spec_id)
    if not spec_content:
        return {"error": f"Spec not found: {spec_id}"}

    # Encode image for the vision content block.
    image_data = encode_image(image_path)

    # Build evaluation prompt.
    prompt = f"""You are an expert data visualization reviewer.

# Task
Evaluate this plot implementation against its specification.

# Specification
{spec_content}

# Implementation Details
- Library: {library}
- Variant: {variant}

# Quality Evaluation Criteria
Review the plot image against EACH quality criterion listed in the specification.

# Response Format
Provide your evaluation in the following JSON format:

{{
  "overall_score": <0-100>,
  "verdict": "<PASS|FAIL>",
  "criteria_results": [
    {{
      "criterion": "<criterion text>",
      "status": "<PASS|FAIL>",
      "comment": "<brief explanation>"
    }}
  ],
  "strengths": ["<strength 1>", "<strength 2>"],
  "improvements": ["<improvement 1>", "<improvement 2>"],
  "summary": "<2-3 sentence overall assessment>"
}}

# Scoring Guidelines
- 90-100: Excellent - All criteria met, production ready
- 85-89: Good - Minor issues, acceptable
- 75-84: Needs improvement - Some criteria failed
- <75: Rejected - Major issues, regeneration required

# Instructions
1. Review the image carefully
2. Check EACH quality criterion from the spec
3. Provide specific, actionable feedback
4. Be objective and constructive

Provide ONLY the JSON response, no additional text."""

    # Call Claude with vision: the image block first, then the text prompt.
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=2000,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": image_data
                    }
                },
                {
                    "type": "text",
                    "text": prompt
                }
            ]
        }]
    )

    # Parse response (handles fenced, bare, and prose-wrapped JSON).
    return _extract_json(response.content[0].text)
186+
187+
# Script entry point: evaluate every preview image, write quality_results.json,
# and print a human-readable summary.
if __name__ == "__main__":
    results = []

    # Filenames follow {spec_id}_{library}_{variant}.png; the spec_id itself
    # may contain underscores, so the last two segments are taken from the right.
    for img_file in Path("preview_images").glob("*.png"):
        parts = img_file.stem.split('_')
        if len(parts) < 3:
            continue
        *spec_parts, library, variant = parts
        spec_id = '_'.join(spec_parts)

        print(f"🔍 Evaluating: {spec_id}/{library}/{variant}")
        result = evaluate_plot(spec_id, library, variant, str(img_file))
        result.update(spec_id=spec_id, library=library, variant=variant)
        results.append(result)

    # Persist results for the "Parse quality results" workflow step.
    with open("quality_results.json", "w") as f:
        json.dump(results, f, indent=2)

    # Print summary
    banner = "=" * 60
    print("\n" + banner)
    print("Quality Evaluation Results")
    print(banner)
    for r in results:
        score = r.get("overall_score", 0)
        verdict = r.get("verdict", "UNKNOWN")
        print(f"{r['spec_id']}/{r['library']}/{r['variant']}: {score}/100 - {verdict}")
    print(banner)
221+
EOF
222+
223+
python quality_evaluator.py
224+
225+
- name: Parse quality results
226+
id: results
227+
run: |
228+
if [ ! -f quality_results.json ]; then
229+
echo "No quality results found"
230+
exit 0
231+
fi
232+
233+
# Calculate overall verdict
234+
TOTAL=$(jq '. | length' quality_results.json)
235+
PASSED=$(jq '[.[] | select(.verdict == "PASS")] | length' quality_results.json)
236+
FAILED=$((TOTAL - PASSED))
237+
238+
MIN_SCORE=$(jq '[.[] | .overall_score] | min' quality_results.json)
239+
240+
echo "total=$TOTAL" >> $GITHUB_OUTPUT
241+
echo "passed=$PASSED" >> $GITHUB_OUTPUT
242+
echo "failed=$FAILED" >> $GITHUB_OUTPUT
243+
echo "min_score=$MIN_SCORE" >> $GITHUB_OUTPUT
244+
245+
# Overall verdict: PASS if all pass and min_score >= 85
246+
if [ $FAILED -eq 0 ] && [ $MIN_SCORE -ge 85 ]; then
247+
echo "overall_verdict=PASS" >> $GITHUB_OUTPUT
248+
else
249+
echo "overall_verdict=FAIL" >> $GITHUB_OUTPUT
250+
fi
251+
252+
- name: Generate quality report
253+
if: steps.results.outputs.total != ''
254+
run: |
255+
cat > quality_report.md <<'EOF'
256+
## 🤖 Quality Check Results
257+
258+
**Overall Verdict:** ${{ steps.results.outputs.overall_verdict }}
259+
260+
**Summary:**
261+
- Total implementations: ${{ steps.results.outputs.total }}
262+
- Passed: ${{ steps.results.outputs.passed }}
263+
- Failed: ${{ steps.results.outputs.failed }}
264+
- Minimum score: ${{ steps.results.outputs.min_score }}/100
265+
266+
### Detailed Results
267+
268+
EOF
269+
270+
# Add detailed results for each implementation
271+
jq -r '.[] | "#### \(.spec_id) / \(.library) / \(.variant)\n\n**Score:** \(.overall_score)/100 | **Verdict:** \(.verdict)\n\n**Summary:** \(.summary)\n\n**Strengths:**\n\(.strengths | map("- " + .) | join("\n"))\n\n**Improvements Needed:**\n\(.improvements | map("- " + .) | join("\n"))\n\n---\n"' quality_results.json >> quality_report.md
272+
273+
echo "" >> quality_report.md
274+
echo "*Generated by [quality-check workflow](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})*" >> quality_report.md
275+
276+
- name: Comment on PR with quality results
277+
if: steps.results.outputs.total != ''
278+
uses: actions/github-script@v7
279+
with:
280+
script: |
281+
const fs = require('fs');
282+
const report = fs.readFileSync('quality_report.md', 'utf8');
283+
284+
await github.rest.issues.createComment({
285+
owner: context.repo.owner,
286+
repo: context.repo.repo,
287+
issue_number: ${{ steps.metadata.outputs.pr_number }},
288+
body: report
289+
});
290+
291+
- name: Add labels based on verdict
292+
if: steps.results.outputs.total != ''
293+
uses: actions/github-script@v7
294+
with:
295+
script: |
296+
const verdict = '${{ steps.results.outputs.overall_verdict }}';
297+
const prNumber = ${{ steps.metadata.outputs.pr_number }};
298+
299+
if (verdict === 'PASS') {
300+
await github.rest.issues.addLabels({
301+
owner: context.repo.owner,
302+
repo: context.repo.repo,
303+
issue_number: prNumber,
304+
labels: ['quality-approved']
305+
});
306+
} else {
307+
await github.rest.issues.addLabels({
308+
owner: context.repo.owner,
309+
repo: context.repo.repo,
310+
issue_number: prNumber,
311+
labels: ['quality-check-failed']
312+
});
313+
}

0 commit comments

Comments
 (0)