Skip to content

Commit c6e008f

Browse files
committed
refactor: convert workflows to use Claude Code Action
Replace direct Anthropic API calls with Claude Code Action for AI tasks. This allows the workflows to use the existing CLAUDE_CODE_OAUTH_TOKEN instead of requiring a separate ANTHROPIC_API_KEY.

Changes:

spec-to-code.yml:
- ✅ Trigger remains label-based ('approved' label on issue)
- ✅ Spec extraction remains unchanged
- ✅ Code generation now uses Claude Code Action
- ✅ Provides detailed prompt with rules and requirements
- ✅ Claude Code reads specs, rules, and generates implementations
- ✅ Self-review loop integrated into Claude Code execution
- ✅ Automatic commit and PR creation

quality-check.yml:
- ✅ Trigger remains workflow_run based
- ✅ Preview image download from GCS unchanged
- ✅ Quality evaluation now uses Claude Code Action
- ✅ Claude Code views images (Vision) and evaluates against specs
- ✅ Uses gh CLI to comment on PR and add labels
- ✅ Scores each implementation (0-100, ≥85 to pass)

Benefits:
- No separate API key needed
- Uses existing Claude Code Max subscription
- Integrated with Claude Code ecosystem
- Cleaner, more maintainable code
- Claude Code handles commits automatically
- Better error handling and retries

Hybrid Approach:
- Label triggers → Automation starts
- Claude Code → AI-powered tasks (generation, evaluation)
- Regular Actions → Infrastructure (tests, GCS upload, PR creation)

This enables full automation using only CLAUDE_CODE_OAUTH_TOKEN and GCS_CREDENTIALS secrets.
1 parent 1b3cc51 commit c6e008f

File tree

2 files changed

+120
-328
lines changed

2 files changed

+120
-328
lines changed

.github/workflows/quality-check.yml

Lines changed: 47 additions & 258 deletions
Original file line number | Diff line number | Diff line change
@@ -14,19 +14,13 @@ jobs:
1414
contents: read
1515
pull-requests: write
1616
issues: write
17+
id-token: write
1718

1819
steps:
1920
- name: Checkout code
2021
uses: actions/checkout@v4
21-
22-
- name: Set up Python
23-
uses: actions/setup-python@v5
2422
with:
25-
python-version: '3.12'
26-
27-
- name: Install dependencies
28-
run: |
29-
pip install anthropic requests
23+
fetch-depth: 0
3024

3125
- name: Download preview metadata
3226
uses: actions/download-artifact@v4
@@ -46,10 +40,12 @@ jobs:
4640
PR_NUMBER=$(jq -r '.pr_number' preview_metadata.json)
4741
BUCKET=$(jq -r '.bucket' preview_metadata.json)
4842
BASE_PATH=$(jq -r '.base_path' preview_metadata.json)
43+
CHANGED_FILES=$(jq -r '.changed_files[]' preview_metadata.json | tr '\n' ' ')
4944
5045
echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT
5146
echo "bucket=$BUCKET" >> $GITHUB_OUTPUT
5247
echo "base_path=$BASE_PATH" >> $GITHUB_OUTPUT
48+
echo "changed_files=$CHANGED_FILES" >> $GITHUB_OUTPUT
5349
5450
- name: Setup Google Cloud authentication
5551
uses: google-github-actions/auth@v2
@@ -59,255 +55,48 @@ jobs:
5955
- name: Download preview images
6056
run: |
6157
mkdir -p preview_images
62-
gsutil -m cp -r "gs://${{ steps.metadata.outputs.bucket }}/${{ steps.metadata.outputs.base_path }}/*" preview_images/
63-
64-
- name: Run quality evaluation
65-
id: quality_eval
66-
env:
67-
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
68-
run: |
69-
cat > quality_evaluator.py <<'EOF'
70-
import os
71-
import sys
72-
import json
73-
import base64
74-
from pathlib import Path
75-
import anthropic
76-
77-
def load_spec(spec_id: str) -> str:
78-
"""Load spec content"""
79-
spec_path = Path(f"specs/{spec_id}.md")
80-
if not spec_path.exists():
81-
return ""
82-
return spec_path.read_text()
83-
84-
def encode_image(image_path: str) -> str:
85-
"""Encode image as base64"""
86-
with open(image_path, 'rb') as f:
87-
return base64.standard_b64encode(f.read()).decode('utf-8')
88-
89-
def evaluate_plot(spec_id: str, library: str, variant: str, image_path: str) -> dict:
90-
"""Evaluate a single plot implementation"""
91-
client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
92-
93-
# Load spec
94-
spec_content = load_spec(spec_id)
95-
if not spec_content:
96-
return {"error": f"Spec not found: {spec_id}"}
97-
98-
# Encode image
99-
image_data = encode_image(image_path)
100-
101-
# Build evaluation prompt
102-
prompt = f"""You are an expert data visualization reviewer.
103-
104-
# Task
105-
Evaluate this plot implementation against its specification.
106-
107-
# Specification
108-
{spec_content}
109-
110-
# Implementation Details
111-
- Library: {library}
112-
- Variant: {variant}
113-
114-
# Quality Evaluation Criteria
115-
Review the plot image against EACH quality criterion listed in the specification.
116-
117-
# Response Format
118-
Provide your evaluation in the following JSON format:
119-
120-
{{
121-
"overall_score": <0-100>,
122-
"verdict": "<PASS|FAIL>",
123-
"criteria_results": [
124-
{{
125-
"criterion": "<criterion text>",
126-
"status": "<PASS|FAIL>",
127-
"comment": "<brief explanation>"
128-
}}
129-
],
130-
"strengths": ["<strength 1>", "<strength 2>"],
131-
"improvements": ["<improvement 1>", "<improvement 2>"],
132-
"summary": "<2-3 sentence overall assessment>"
133-
}}
134-
135-
# Scoring Guidelines
136-
- 90-100: Excellent - All criteria met, production ready
137-
- 85-89: Good - Minor issues, acceptable
138-
- 75-84: Needs improvement - Some criteria failed
139-
- <75: Rejected - Major issues, regeneration required
140-
141-
# Instructions
142-
1. Review the image carefully
143-
2. Check EACH quality criterion from the spec
144-
3. Provide specific, actionable feedback
145-
4. Be objective and constructive
146-
147-
Provide ONLY the JSON response, no additional text."""
148-
149-
# Call Claude with vision
150-
response = client.messages.create(
151-
model="claude-sonnet-4-20250514",
152-
max_tokens=2000,
153-
messages=[{
154-
"role": "user",
155-
"content": [
156-
{
157-
"type": "image",
158-
"source": {
159-
"type": "base64",
160-
"media_type": "image/png",
161-
"data": image_data
162-
}
163-
},
164-
{
165-
"type": "text",
166-
"text": prompt
167-
}
168-
]
169-
}]
170-
)
171-
172-
# Parse response
173-
response_text = response.content[0].text
174-
175-
# Extract JSON (handle markdown code blocks)
176-
if "```json" in response_text:
177-
response_text = response_text.split("```json")[1].split("```")[0].strip()
178-
elif "```" in response_text:
179-
response_text = response_text.split("```")[1].split("```")[0].strip()
180-
181-
try:
182-
result = json.loads(response_text)
183-
return result
184-
except json.JSONDecodeError:
185-
return {"error": "Failed to parse response", "raw": response_text}
186-
187-
# Main execution
188-
if __name__ == "__main__":
189-
results = []
190-
191-
# Process all preview images
192-
preview_dir = Path("preview_images")
193-
for img_file in preview_dir.glob("*.png"):
194-
# Parse filename: {spec_id}_{library}_{variant}.png
195-
parts = img_file.stem.split('_')
196-
if len(parts) >= 3:
197-
spec_id = '_'.join(parts[:-2])
198-
library = parts[-2]
199-
variant = parts[-1]
58+
gsutil -m cp -r "gs://${{ steps.metadata.outputs.bucket }}/${{ steps.metadata.outputs.base_path }}/*" preview_images/ || echo "No images to download"
20059
201-
print(f"🔍 Evaluating: {spec_id}/{library}/{variant}")
202-
result = evaluate_plot(spec_id, library, variant, str(img_file))
203-
result["spec_id"] = spec_id
204-
result["library"] = library
205-
result["variant"] = variant
206-
results.append(result)
207-
208-
# Save results
209-
with open("quality_results.json", "w") as f:
210-
json.dump(results, f, indent=2)
211-
212-
# Print summary
213-
print("\n" + "="*60)
214-
print("Quality Evaluation Results")
215-
print("="*60)
216-
for r in results:
217-
score = r.get("overall_score", 0)
218-
verdict = r.get("verdict", "UNKNOWN")
219-
print(f"{r['spec_id']}/{r['library']}/{r['variant']}: {score}/100 - {verdict}")
220-
print("="*60)
221-
EOF
222-
223-
python quality_evaluator.py
224-
225-
- name: Parse quality results
226-
id: results
227-
run: |
228-
if [ ! -f quality_results.json ]; then
229-
echo "No quality results found"
230-
exit 0
231-
fi
232-
233-
# Calculate overall verdict
234-
TOTAL=$(jq '. | length' quality_results.json)
235-
PASSED=$(jq '[.[] | select(.verdict == "PASS")] | length' quality_results.json)
236-
FAILED=$((TOTAL - PASSED))
237-
238-
MIN_SCORE=$(jq '[.[] | .overall_score] | min' quality_results.json)
239-
240-
echo "total=$TOTAL" >> $GITHUB_OUTPUT
241-
echo "passed=$PASSED" >> $GITHUB_OUTPUT
242-
echo "failed=$FAILED" >> $GITHUB_OUTPUT
243-
echo "min_score=$MIN_SCORE" >> $GITHUB_OUTPUT
244-
245-
# Overall verdict: PASS if all pass and min_score >= 85
246-
if [ $FAILED -eq 0 ] && [ $MIN_SCORE -ge 85 ]; then
247-
echo "overall_verdict=PASS" >> $GITHUB_OUTPUT
248-
else
249-
echo "overall_verdict=FAIL" >> $GITHUB_OUTPUT
250-
fi
251-
252-
- name: Generate quality report
253-
if: steps.results.outputs.total != ''
254-
run: |
255-
cat > quality_report.md <<'EOF'
256-
## 🤖 Quality Check Results
257-
258-
**Overall Verdict:** ${{ steps.results.outputs.overall_verdict }}
259-
260-
**Summary:**
261-
- Total implementations: ${{ steps.results.outputs.total }}
262-
- Passed: ${{ steps.results.outputs.passed }}
263-
- Failed: ${{ steps.results.outputs.failed }}
264-
- Minimum score: ${{ steps.results.outputs.min_score }}/100
265-
266-
### Detailed Results
267-
268-
EOF
269-
270-
# Add detailed results for each implementation
271-
jq -r '.[] | "#### \(.spec_id) / \(.library) / \(.variant)\n\n**Score:** \(.overall_score)/100 | **Verdict:** \(.verdict)\n\n**Summary:** \(.summary)\n\n**Strengths:**\n\(.strengths | map("- " + .) | join("\n"))\n\n**Improvements Needed:**\n\(.improvements | map("- " + .) | join("\n"))\n\n---\n"' quality_results.json >> quality_report.md
272-
273-
echo "" >> quality_report.md
274-
echo "*Generated by [quality-check workflow](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})*" >> quality_report.md
275-
276-
- name: Comment on PR with quality results
277-
if: steps.results.outputs.total != ''
278-
uses: actions/github-script@v7
279-
with:
280-
script: |
281-
const fs = require('fs');
282-
const report = fs.readFileSync('quality_report.md', 'utf8');
283-
284-
await github.rest.issues.createComment({
285-
owner: context.repo.owner,
286-
repo: context.repo.repo,
287-
issue_number: ${{ steps.metadata.outputs.pr_number }},
288-
body: report
289-
});
290-
291-
- name: Add labels based on verdict
292-
if: steps.results.outputs.total != ''
293-
uses: actions/github-script@v7
60+
- name: Quality evaluation with Claude Code
61+
uses: anthropics/claude-code-action@v1
29462
with:
295-
script: |
296-
const verdict = '${{ steps.results.outputs.overall_verdict }}';
297-
const prNumber = ${{ steps.metadata.outputs.pr_number }};
298-
299-
if (verdict === 'PASS') {
300-
await github.rest.issues.addLabels({
301-
owner: context.repo.owner,
302-
repo: context.repo.repo,
303-
issue_number: prNumber,
304-
labels: ['quality-approved']
305-
});
306-
} else {
307-
await github.rest.issues.addLabels({
308-
owner: context.repo.owner,
309-
repo: context.repo.repo,
310-
issue_number: prNumber,
311-
labels: ['quality-check-failed']
312-
});
313-
}
63+
claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
64+
prompt: |
65+
TASK: Evaluate plot implementations against specifications
66+
67+
PR: #${{ steps.metadata.outputs.pr_number }}
68+
Preview images location: preview_images/
69+
70+
Instructions:
71+
1. List all PNG files in preview_images/ directory
72+
2. For each preview image:
73+
a. Parse filename to extract: spec_id, library, variant
74+
b. Read corresponding spec file: specs/{spec_id}.md
75+
c. View the preview image
76+
d. Evaluate against quality criteria in spec
77+
78+
3. For each implementation, check:
79+
- Does it meet ALL quality criteria listed in spec?
80+
- Are visual elements clear and readable?
81+
- Are colors appropriate and accessible?
82+
- Is the layout well-structured?
83+
- Score: 0-100 (≥85 to pass)
84+
85+
4. Generate quality report with:
86+
- Overall verdict (PASS if all ≥85, FAIL otherwise)
87+
- Score for each implementation
88+
- Specific feedback for each quality criterion
89+
- Strengths and improvements needed
90+
91+
5. Use gh CLI to:
92+
- Post quality report as comment on PR #${{ steps.metadata.outputs.pr_number }}
93+
- Add label "quality-approved" if PASS
94+
- Add label "quality-check-failed" if FAIL
95+
96+
Scoring Guidelines:
97+
- 90-100: Excellent - All criteria met, production ready
98+
- 85-89: Good - Minor issues, acceptable
99+
- 75-84: Needs improvement
100+
- <75: Rejected - Major issues
101+
102+
Be objective and constructive in your feedback.

0 commit comments

Comments
 (0)