@@ -14,19 +14,13 @@ jobs:
1414 contents : read
1515 pull-requests : write
1616 issues : write
17+ id-token : write
1718
1819 steps :
1920 - name : Checkout code
2021 uses : actions/checkout@v4
21-
22- - name : Set up Python
23- uses : actions/setup-python@v5
2422 with :
25- python-version : ' 3.12'
26-
27- - name : Install dependencies
28- run : |
29- pip install anthropic requests
23+ fetch-depth : 0
3024
3125 - name : Download preview metadata
3226 uses : actions/download-artifact@v4
@@ -46,10 +40,12 @@ jobs:
4640 PR_NUMBER=$(jq -r '.pr_number' preview_metadata.json)
4741 BUCKET=$(jq -r '.bucket' preview_metadata.json)
4842 BASE_PATH=$(jq -r '.base_path' preview_metadata.json)
43+ CHANGED_FILES=$(jq -r '.changed_files[]' preview_metadata.json | tr '\n' ' ')
4944
5045 echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT
5146 echo "bucket=$BUCKET" >> $GITHUB_OUTPUT
5247 echo "base_path=$BASE_PATH" >> $GITHUB_OUTPUT
48+ echo "changed_files=$CHANGED_FILES" >> $GITHUB_OUTPUT
5349
5450 - name : Setup Google Cloud authentication
5551 uses : google-github-actions/auth@v2
@@ -59,255 +55,48 @@ jobs:
5955 - name : Download preview images
6056 run : |
6157 mkdir -p preview_images
62- gsutil -m cp -r "gs://${{ steps.metadata.outputs.bucket }}/${{ steps.metadata.outputs.base_path }}/*" preview_images/
63-
64- - name : Run quality evaluation
65- id : quality_eval
66- env :
67- ANTHROPIC_API_KEY : ${{ secrets.ANTHROPIC_API_KEY }}
68- run : |
69- cat > quality_evaluator.py <<'EOF'
70- import os
71- import sys
72- import json
73- import base64
74- from pathlib import Path
75- import anthropic
76-
77- def load_spec(spec_id: str) -> str:
78- """Load spec content"""
79- spec_path = Path(f"specs/{spec_id}.md")
80- if not spec_path.exists():
81- return ""
82- return spec_path.read_text()
83-
84- def encode_image(image_path: str) -> str:
85- """Encode image as base64"""
86- with open(image_path, 'rb') as f:
87- return base64.standard_b64encode(f.read()).decode('utf-8')
88-
89- def evaluate_plot(spec_id: str, library: str, variant: str, image_path: str) -> dict:
90- """Evaluate a single plot implementation"""
91- client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
92-
93- # Load spec
94- spec_content = load_spec(spec_id)
95- if not spec_content:
96- return {"error": f"Spec not found: {spec_id}"}
97-
98- # Encode image
99- image_data = encode_image(image_path)
100-
101- # Build evaluation prompt
102- prompt = f"""You are an expert data visualization reviewer.
103-
104- # Task
105- Evaluate this plot implementation against its specification.
106-
107- # Specification
108- {spec_content}
109-
110- # Implementation Details
111- - Library: {library}
112- - Variant: {variant}
113-
114- # Quality Evaluation Criteria
115- Review the plot image against EACH quality criterion listed in the specification.
116-
117- # Response Format
118- Provide your evaluation in the following JSON format:
119-
120- {{
121- "overall_score": <0-100>,
122- "verdict": "<PASS|FAIL>",
123- "criteria_results": [
124- {{
125- "criterion": "<criterion text>",
126- "status": "<PASS|FAIL>",
127- "comment": "<brief explanation>"
128- }}
129- ],
130- "strengths": ["<strength 1>", "<strength 2>"],
131- "improvements": ["<improvement 1>", "<improvement 2>"],
132- "summary": "<2-3 sentence overall assessment>"
133- }}
134-
135- # Scoring Guidelines
136- - 90-100: Excellent - All criteria met, production ready
137- - 85-89: Good - Minor issues, acceptable
138- - 75-84: Needs improvement - Some criteria failed
139- - <75: Rejected - Major issues, regeneration required
140-
141- # Instructions
142- 1. Review the image carefully
143- 2. Check EACH quality criterion from the spec
144- 3. Provide specific, actionable feedback
145- 4. Be objective and constructive
146-
147- Provide ONLY the JSON response, no additional text."""
148-
149- # Call Claude with vision
150- response = client.messages.create(
151- model="claude-sonnet-4-20250514",
152- max_tokens=2000,
153- messages=[{
154- "role": "user",
155- "content": [
156- {
157- "type": "image",
158- "source": {
159- "type": "base64",
160- "media_type": "image/png",
161- "data": image_data
162- }
163- },
164- {
165- "type": "text",
166- "text": prompt
167- }
168- ]
169- }]
170- )
171-
172- # Parse response
173- response_text = response.content[0].text
174-
175- # Extract JSON (handle markdown code blocks)
176- if "```json" in response_text:
177- response_text = response_text.split("```json")[1].split("```")[0].strip()
178- elif "```" in response_text:
179- response_text = response_text.split("```")[1].split("```")[0].strip()
180-
181- try:
182- result = json.loads(response_text)
183- return result
184- except json.JSONDecodeError:
185- return {"error": "Failed to parse response", "raw": response_text}
186-
187- # Main execution
188- if __name__ == "__main__":
189- results = []
190-
191- # Process all preview images
192- preview_dir = Path("preview_images")
193- for img_file in preview_dir.glob("*.png"):
194- # Parse filename: {spec_id}_{library}_{variant}.png
195- parts = img_file.stem.split('_')
196- if len(parts) >= 3:
197- spec_id = '_'.join(parts[:-2])
198- library = parts[-2]
199- variant = parts[-1]
58+ gsutil -m cp -r "gs://${{ steps.metadata.outputs.bucket }}/${{ steps.metadata.outputs.base_path }}/*" preview_images/ || echo "No images to download"
20059
201- print(f"🔍 Evaluating: {spec_id}/{library}/{variant}")
202- result = evaluate_plot(spec_id, library, variant, str(img_file))
203- result["spec_id"] = spec_id
204- result["library"] = library
205- result["variant"] = variant
206- results.append(result)
207-
208- # Save results
209- with open("quality_results.json", "w") as f:
210- json.dump(results, f, indent=2)
211-
212- # Print summary
213- print("\n" + "="*60)
214- print("Quality Evaluation Results")
215- print("="*60)
216- for r in results:
217- score = r.get("overall_score", 0)
218- verdict = r.get("verdict", "UNKNOWN")
219- print(f"{r['spec_id']}/{r['library']}/{r['variant']}: {score}/100 - {verdict}")
220- print("="*60)
221- EOF
222-
223- python quality_evaluator.py
224-
225- - name : Parse quality results
226- id : results
227- run : |
228- if [ ! -f quality_results.json ]; then
229- echo "No quality results found"
230- exit 0
231- fi
232-
233- # Calculate overall verdict
234- TOTAL=$(jq '. | length' quality_results.json)
235- PASSED=$(jq '[.[] | select(.verdict == "PASS")] | length' quality_results.json)
236- FAILED=$((TOTAL - PASSED))
237-
238- MIN_SCORE=$(jq '[.[] | .overall_score] | min' quality_results.json)
239-
240- echo "total=$TOTAL" >> $GITHUB_OUTPUT
241- echo "passed=$PASSED" >> $GITHUB_OUTPUT
242- echo "failed=$FAILED" >> $GITHUB_OUTPUT
243- echo "min_score=$MIN_SCORE" >> $GITHUB_OUTPUT
244-
245- # Overall verdict: PASS if all pass and min_score >= 85
246- if [ $FAILED -eq 0 ] && [ $MIN_SCORE -ge 85 ]; then
247- echo "overall_verdict=PASS" >> $GITHUB_OUTPUT
248- else
249- echo "overall_verdict=FAIL" >> $GITHUB_OUTPUT
250- fi
251-
252- - name : Generate quality report
253- if : steps.results.outputs.total != ''
254- run : |
255- cat > quality_report.md <<'EOF'
256- ## 🤖 Quality Check Results
257-
258- **Overall Verdict:** ${{ steps.results.outputs.overall_verdict }}
259-
260- **Summary:**
261- - Total implementations: ${{ steps.results.outputs.total }}
262- - Passed: ${{ steps.results.outputs.passed }}
263- - Failed: ${{ steps.results.outputs.failed }}
264- - Minimum score: ${{ steps.results.outputs.min_score }}/100
265-
266- ### Detailed Results
267-
268- EOF
269-
270- # Add detailed results for each implementation
271- jq -r '.[] | "#### \(.spec_id) / \(.library) / \(.variant)\n\n**Score:** \(.overall_score)/100 | **Verdict:** \(.verdict)\n\n**Summary:** \(.summary)\n\n**Strengths:**\n\(.strengths | map("- " + .) | join("\n"))\n\n**Improvements Needed:**\n\(.improvements | map("- " + .) | join("\n"))\n\n---\n"' quality_results.json >> quality_report.md
272-
273- echo "" >> quality_report.md
274- echo "*Generated by [quality-check workflow](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})*" >> quality_report.md
275-
276- - name : Comment on PR with quality results
277- if : steps.results.outputs.total != ''
278- uses : actions/github-script@v7
279- with :
280- script : |
281- const fs = require('fs');
282- const report = fs.readFileSync('quality_report.md', 'utf8');
283-
284- await github.rest.issues.createComment({
285- owner: context.repo.owner,
286- repo: context.repo.repo,
287- issue_number: ${{ steps.metadata.outputs.pr_number }},
288- body: report
289- });
290-
291- - name : Add labels based on verdict
292- if : steps.results.outputs.total != ''
293- uses : actions/github-script@v7
60+ - name : Quality evaluation with Claude Code
61+ uses : anthropics/claude-code-action@v1
29462 with :
295- script : |
296- const verdict = '${{ steps.results.outputs.overall_verdict }}';
297- const prNumber = ${{ steps.metadata.outputs.pr_number }};
298-
299- if (verdict === 'PASS') {
300- await github.rest.issues.addLabels({
301- owner: context.repo.owner,
302- repo: context.repo.repo,
303- issue_number: prNumber,
304- labels: ['quality-approved']
305- });
306- } else {
307- await github.rest.issues.addLabels({
308- owner: context.repo.owner,
309- repo: context.repo.repo,
310- issue_number: prNumber,
311- labels: ['quality-check-failed']
312- });
313- }
63+ claude_code_oauth_token : ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
64+ prompt : |
65+ TASK: Evaluate plot implementations against specifications
66+
67+ PR: #${{ steps.metadata.outputs.pr_number }}
68+ Preview images location: preview_images/
69+
70+ Instructions:
71+ 1. List all PNG files in preview_images/ directory
72+ 2. For each preview image:
73+ a. Parse filename (pattern: {spec_id}_{library}_{variant}.png) to extract: spec_id, library, variant — library and variant are the last two underscore-separated parts; spec_id is everything before them and may itself contain underscores
74+ b. Read corresponding spec file: specs/{spec_id}.md
75+ c. View the preview image
76+ d. Evaluate against quality criteria in spec
77+
78+ 3. For each implementation, check:
79+ - Does it meet ALL quality criteria listed in spec?
80+ - Are visual elements clear and readable?
81+ - Are colors appropriate and accessible?
82+ - Is the layout well-structured?
83+ - Score: 0-100 (≥85 to pass)
84+
85+ 4. Generate quality report with:
86+ - Overall verdict (PASS if all ≥85, FAIL otherwise)
87+ - Score for each implementation
88+ - Specific feedback for each quality criterion
89+ - Strengths and improvements needed
90+
91+ 5. Use gh CLI to:
92+ - Post quality report as comment on PR #${{ steps.metadata.outputs.pr_number }}
93+ - Add label "quality-approved" if PASS
94+ - Add label "quality-check-failed" if FAIL
95+
96+ Scoring Guidelines:
97+ - 90-100: Excellent - All criteria met, production ready
98+ - 85-89: Good - Minor issues, acceptable
99+ - 75-84: Needs improvement
100+ - <75: Rejected - Major issues
101+
102+ Be objective and constructive in your feedback.
0 commit comments