name: Quality Check

on:
  workflow_run:
    workflows: ["Test and Generate Previews"]
    types: [completed]

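# Note: workflow_run jobs execute in the context of the base repository, so
# this workflow can read repository secrets even when the triggering pull
# request comes from a fork.
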
jobs:
  quality-check:
    name: AI Quality Evaluation
    runs-on: ubuntu-latest
    # Only run when the upstream preview workflow succeeded.
    if: ${{ github.event.workflow_run.conclusion == 'success' }}
    # PR comments and labels are written through the Issues REST API,
    # which is why both pull-requests and issues write scopes are needed.
    permissions:
      contents: read
      pull-requests: write
      issues: write

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: |
          # The evaluator script below only needs the Anthropic SDK.
          pip install anthropic

      - name: Download preview metadata
        uses: actions/download-artifact@v4
        with:
          name: preview-metadata
          run-id: ${{ github.event.workflow_run.id }}
          github-token: ${{ secrets.GITHUB_TOKEN }}

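      # The artifact above is fetched across workflow runs, which is why
      # download-artifact@v4 needs an explicit run-id plus a token that can
      # read that run. preview_metadata.json is assumed to carry at least
      # the pr_number, bucket, and base_path fields written upstream.
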
      - name: Load metadata
        id: metadata
        run: |
          if [ ! -f preview_metadata.json ]; then
            echo "No preview metadata found, skipping quality check"
            # exit 0 ends only this step; the steps below skip themselves
            # when the pr_number output is empty.
            exit 0
          fi

          PR_NUMBER=$(jq -r '.pr_number' preview_metadata.json)
          BUCKET=$(jq -r '.bucket' preview_metadata.json)
          BASE_PATH=$(jq -r '.base_path' preview_metadata.json)

          echo "pr_number=$PR_NUMBER" >> "$GITHUB_OUTPUT"
          echo "bucket=$BUCKET" >> "$GITHUB_OUTPUT"
          echo "base_path=$BASE_PATH" >> "$GITHUB_OUTPUT"

      - name: Setup Google Cloud authentication
        if: steps.metadata.outputs.pr_number != ''
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.GCS_CREDENTIALS }}

      - name: Download preview images
        if: steps.metadata.outputs.pr_number != ''
        run: |
          mkdir -p preview_images
          gsutil -m cp -r "gs://${{ steps.metadata.outputs.bucket }}/${{ steps.metadata.outputs.base_path }}/*" preview_images/

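      # gsutil cp -r preserves any sub-directory structure under base_path,
      # so the evaluator script below globs recursively rather than assuming
      # a flat layout.
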
      - name: Run quality evaluation
        id: quality_eval
        if: steps.metadata.outputs.pr_number != ''
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: |
          cat > quality_evaluator.py <<'EOF'
          import os
          import json
          import base64
          from pathlib import Path

          import anthropic

          def load_spec(spec_id: str) -> str:
              """Load the markdown spec for a given spec id."""
              spec_path = Path(f"specs/{spec_id}.md")
              if not spec_path.exists():
                  return ""
              return spec_path.read_text()

          def encode_image(image_path: str) -> str:
              """Encode an image file as base64 for the vision API."""
              with open(image_path, 'rb') as f:
                  return base64.standard_b64encode(f.read()).decode('utf-8')

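          # Assumed repository layout: specs live at specs/<spec_id>.md in
          # the checkout, and preview PNGs are named
          # {spec_id}_{library}_{variant}.png by the upstream workflow.
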
          def evaluate_plot(spec_id: str, library: str, variant: str, image_path: str) -> dict:
              """Evaluate a single plot implementation against its spec."""
              client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

              # Load spec
              spec_content = load_spec(spec_id)
              if not spec_content:
                  return {"error": f"Spec not found: {spec_id}"}

              # Encode image
              image_data = encode_image(image_path)

              # Build evaluation prompt
              prompt = f"""You are an expert data visualization reviewer.

          # Task
          Evaluate this plot implementation against its specification.

          # Specification
          {spec_content}

          # Implementation Details
          - Library: {library}
          - Variant: {variant}

          # Quality Evaluation Criteria
          Review the plot image against EACH quality criterion listed in the specification.

          # Response Format
          Provide your evaluation in the following JSON format:

          {{
            "overall_score": <0-100>,
            "verdict": "<PASS|FAIL>",
            "criteria_results": [
              {{
                "criterion": "<criterion text>",
                "status": "<PASS|FAIL>",
                "comment": "<brief explanation>"
              }}
            ],
            "strengths": ["<strength 1>", "<strength 2>"],
            "improvements": ["<improvement 1>", "<improvement 2>"],
            "summary": "<2-3 sentence overall assessment>"
          }}

          # Scoring Guidelines
          - 90-100: Excellent - All criteria met, production ready
          - 85-89: Good - Minor issues, acceptable
          - 75-84: Needs improvement - Some criteria failed
          - <75: Rejected - Major issues, regeneration required

          # Instructions
          1. Review the image carefully
          2. Check EACH quality criterion from the spec
          3. Provide specific, actionable feedback
          4. Be objective and constructive

          Provide ONLY the JSON response, no additional text."""

              # Call Claude with vision
              response = client.messages.create(
                  model="claude-sonnet-4-20250514",
                  max_tokens=2000,
                  messages=[{
                      "role": "user",
                      "content": [
                          {
                              "type": "image",
                              "source": {
                                  "type": "base64",
                                  "media_type": "image/png",
                                  "data": image_data
                              }
                          },
                          {
                              "type": "text",
                              "text": prompt
                          }
                      ]
                  }]
              )
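
              # Placing the image block before the text prompt follows
              # Anthropic's vision guidance; max_tokens=2000 leaves ample
              # room for the JSON response.
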
              # Parse response
              response_text = response.content[0].text

              # Extract JSON (handle markdown code blocks)
              if "```json" in response_text:
                  response_text = response_text.split("```json")[1].split("```")[0].strip()
              elif "```" in response_text:
                  response_text = response_text.split("```")[1].split("```")[0].strip()

              try:
                  return json.loads(response_text)
              except json.JSONDecodeError:
                  return {"error": "Failed to parse response", "raw": response_text}

          # Main execution
          if __name__ == "__main__":
              results = []

              # Process all preview images; rglob also catches PNGs that
              # landed in sub-directories during the gsutil download
              preview_dir = Path("preview_images")
              for img_file in sorted(preview_dir.rglob("*.png")):
                  # Parse filename: {spec_id}_{library}_{variant}.png
                  parts = img_file.stem.split('_')
                  if len(parts) >= 3:
                      spec_id = '_'.join(parts[:-2])
                      library = parts[-2]
                      variant = parts[-1]

                      print(f"🔍 Evaluating: {spec_id}/{library}/{variant}")
                      result = evaluate_plot(spec_id, library, variant, str(img_file))
                      result["spec_id"] = spec_id
                      result["library"] = library
                      result["variant"] = variant
                      results.append(result)

              # Save results
              with open("quality_results.json", "w") as f:
                  json.dump(results, f, indent=2)

              # Print summary
              print("\n" + "=" * 60)
              print("Quality Evaluation Results")
              print("=" * 60)
              for r in results:
                  score = r.get("overall_score", 0)
                  verdict = r.get("verdict", "UNKNOWN")
                  print(f"{r['spec_id']}/{r['library']}/{r['variant']}: {score}/100 - {verdict}")
              print("=" * 60)
          EOF

          python quality_evaluator.py

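      # Illustrative quality_results.json entry (field names come from the
      # script above; values are examples only):
      #   {"spec_id": "scatter_basic", "library": "matplotlib",
      #    "variant": "default", "overall_score": 92, "verdict": "PASS", ...}
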
      - name: Parse quality results
        id: results
        run: |
          if [ ! -f quality_results.json ]; then
            echo "No quality results found"
            exit 0
          fi

          # Calculate overall verdict. Error entries without a score count
          # as 0, so they can never clear the threshold.
          TOTAL=$(jq '. | length' quality_results.json)
          PASSED=$(jq '[.[] | select(.verdict == "PASS")] | length' quality_results.json)
          FAILED=$((TOTAL - PASSED))

          MIN_SCORE=$(jq '[.[] | .overall_score // 0] | min // 0' quality_results.json)

          echo "total=$TOTAL" >> "$GITHUB_OUTPUT"
          echo "passed=$PASSED" >> "$GITHUB_OUTPUT"
          echo "failed=$FAILED" >> "$GITHUB_OUTPUT"
          echo "min_score=$MIN_SCORE" >> "$GITHUB_OUTPUT"

          # Overall verdict: PASS only if every implementation passed and
          # the lowest score clears the "Good" band (>= 85) defined in the
          # prompt's scoring guidelines.
          if [ "$FAILED" -eq 0 ] && [ "$MIN_SCORE" -ge 85 ]; then
            echo "overall_verdict=PASS" >> "$GITHUB_OUTPUT"
          else
            echo "overall_verdict=FAIL" >> "$GITHUB_OUTPUT"
          fi

      - name: Generate quality report
        if: steps.results.outputs.total != ''
        run: |
          # The ${{ }} expressions below are expanded by the Actions runner
          # before the shell runs, so the quoted heredoc still receives the
          # resolved values.
          cat > quality_report.md <<'EOF'
          ## 🤖 Quality Check Results

          **Overall Verdict:** ${{ steps.results.outputs.overall_verdict }}

          **Summary:**
          - Total implementations: ${{ steps.results.outputs.total }}
          - Passed: ${{ steps.results.outputs.passed }}
          - Failed: ${{ steps.results.outputs.failed }}
          - Minimum score: ${{ steps.results.outputs.min_score }}/100

          ### Detailed Results

          EOF

          # Append a section per implementation; the "//" defaults keep jq
          # from failing on error entries that lack score or list fields.
          jq -r '.[] | "#### \(.spec_id) / \(.library) / \(.variant)\n\n**Score:** \(.overall_score // 0)/100 | **Verdict:** \(.verdict // "UNKNOWN")\n\n**Summary:** \(.summary // "(no summary)")\n\n**Strengths:**\n\((.strengths // []) | map("- " + .) | join("\n"))\n\n**Improvements Needed:**\n\((.improvements // []) | map("- " + .) | join("\n"))\n\n---\n"' quality_results.json >> quality_report.md

          echo "" >> quality_report.md
          echo "*Generated by [quality-check workflow](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})*" >> quality_report.md

      - name: Comment on PR with quality results
        if: steps.results.outputs.total != ''
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const report = fs.readFileSync('quality_report.md', 'utf8');

            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: ${{ steps.metadata.outputs.pr_number }},
              body: report
            });

      - name: Add labels based on verdict
        if: steps.results.outputs.total != ''
        uses: actions/github-script@v7
        with:
          script: |
            const verdict = '${{ steps.results.outputs.overall_verdict }}';
            const prNumber = ${{ steps.metadata.outputs.pr_number }};
            const label = verdict === 'PASS' ? 'quality-approved' : 'quality-check-failed';

            await github.rest.issues.addLabels({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: prNumber,
              labels: [label]
            });
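
      # Note: if a later run flips the verdict, the label added by the
      # earlier run is left in place; a follow-up call to
      # github.rest.issues.removeLabel could reconcile the pair if needed.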