Skip to content

Commit 3b02df0

Browse files
feat(autocurrency): add agent-driven currency fix loop
Add a workflow and script that automatically diagnoses and fixes CI failures on auto-update PRs for vLLM and SGLang.

Architecture (Option C from design doc):
- workflow_run triggers on PR CI failure for auto-update/* branches
- Circuit breaker via GitHub Actions cache prevents infinite loops
- Git commit counting limits to 3 fix attempts before escalating
- Calls Bedrock Claude Opus 4.6 to reason about failures
- Applies returned file edits, commits, and pushes to PR branch
- PR CI re-runs naturally on new commit

Trigger decision: uses a startsWith(branch, 'auto-update/') filter rather than a PR label check, because the workflow_run payload doesn't include labels and the branch naming is deterministic.

Scope: Ubuntu variants only (PR - vLLM EC2, PR - vLLM SageMaker, PR - SGLang EC2, PR - SGLang SageMaker). Will expand to amzn2023 variants after testing.

Design doc: https://quip-amazon.com/TeVQAeJ1f0D7
1 parent d2f9d72 commit 3b02df0

2 files changed

Lines changed: 456 additions & 0 deletions

File tree

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
# agent-currency-fix.yml — Automatically diagnoses and fixes CI failures on
2+
# auto-update PRs by calling Bedrock Claude to reason about the failure and
3+
# generate file edits.
4+
#
5+
# Triggers when any PR CI workflow completes with failure on an auto-update/* branch.
6+
# Pushes fix commits to the PR branch, which re-triggers CI naturally.
7+
# Stops after 3 attempts or when CI passes.
8+
name: Currency Fix Agent

# Fires each time one of the PR CI workflows listed below finishes a run.
on:
  workflow_run:
    workflows:
      - 'PR - vLLM EC2'
      - 'PR - vLLM SageMaker'
      - 'PR - SGLang EC2'
      - 'PR - SGLang SageMaker'
    types:
      - completed
19+
# One agent run at a time per PR branch.
#
# Workflow-level concurrency is applied when a run is queued, i.e. before any
# job-level `if:` is evaluated. With cancel-in-progress enabled, every sibling
# CI workflow that completes on the same branch (even successfully) would
# cancel an agent run that is already working, possibly between its commit
# and its push. Keep it disabled so an in-flight fix always finishes; later
# runs for the same branch wait in the queue instead.
concurrency:
  group: currency-fix-${{ github.event.workflow_run.head_branch }}
  cancel-in-progress: false
23+
# Scopes granted to the default GITHUB_TOKEN for this workflow: read-only.
permissions:
  actions: read
  contents: read
27+
# Workflow-wide environment.
env:
  # Number of [agent-fix] commits allowed on a branch before the job
  # escalates to a human instead of trying again.
  MAX_ATTEMPTS: '3'
30+
jobs:
  # fix-agent: one repair attempt for a failed auto-update PR.
  #
  # Flow: circuit breaker -> auth -> checkout -> attempt accounting ->
  #       either escalate (limit reached)
  #       or download logs -> run agent -> commit and push.
  #
  # Gating outputs produced by the `retry` step:
  #   max_reached == 'true'  -> escalation steps run
  #   proceed     == 'true'  -> fix steps run
  # Neither is set when the exhaustion flag was restored (the `retry` step is
  # skipped) or when the branch head has moved past the failed commit.
  fix-agent:
    # Only run when: (1) the CI run failed, (2) it ran on an auto-update
    # branch, and (3) that branch belongs to this repository. Without (3), a
    # fork PR could pick a matching branch name and trigger this job, which
    # has access to secrets.
    if: >-
      github.event.workflow_run.conclusion == 'failure' &&
      startsWith(github.event.workflow_run.head_branch, 'auto-update/') &&
      github.event.workflow_run.head_repository.full_name == github.repository
    runs-on:
      - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
        fleet:default-runner
        buildspec-override:true
    env:
      # Event data is handed to the shell as environment variables and never
      # interpolated into script text: a branch name may contain shell
      # metacharacters such as $( ), and `${{ }}` expansion happens before the
      # shell parses the script.
      HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }}
      FAILED_SHA: ${{ github.event.workflow_run.head_sha }}
    steps:
      # --- Circuit breaker: check if already exhausted (no auth needed) ---
      - name: Check exhaustion flag
        id: exhausted
        uses: actions/cache/restore@v4
        with:
          path: .agent-exhausted
          key: agent-exhausted-${{ github.event.workflow_run.head_branch }}

      - name: Skip if exhausted
        if: steps.exhausted.outputs.cache-hit == 'true'
        run: |
          echo "::notice::Agent already exhausted for this branch. Skipping."

      # --- Auth ---
      - name: Decode the GitHub App Private Key
        if: steps.exhausted.outputs.cache-hit != 'true'
        id: decode
        env:
          APP_PRIVATE_KEY_B64: ${{ secrets.ASIMOVBOT_APP_PRIVATE_KEY }}
        run: |
          # Decode the key and flatten it onto one line, replacing each real
          # newline with a literal "\n"; `head -c -2` drops the final "\n".
          # The trailing redirect is kept from the original script.
          private_key=$(echo "$APP_PRIVATE_KEY_B64" | base64 -d | awk 'BEGIN {ORS="\\n"} {print}' | head -c -2) &> /dev/null
          if [ -z "$private_key" ]; then
            echo "::error::Could not decode the GitHub App private key."
            exit 1
          fi
          echo "::add-mask::$private_key"
          echo "private-key=$private_key" >> "$GITHUB_OUTPUT"

      - name: Generate GitHub App Token
        if: steps.exhausted.outputs.cache-hit != 'true'
        id: app-token
        uses: actions/create-github-app-token@v1
        with:
          app-id: ${{ vars.ASIMOVBOT_APP_ID }}
          private-key: ${{ steps.decode.outputs.private-key }}

      # --- Checkout PR branch ---
      - name: Checkout PR branch
        if: steps.exhausted.outputs.cache-hit != 'true'
        uses: actions/checkout@v5
        with:
          ref: ${{ github.event.workflow_run.head_branch }}
          # Full history: the attempt counter walks origin/main..HEAD.
          fetch-depth: 0
          # The App token is what later authenticates `git push`.
          token: ${{ steps.app-token.outputs.token }}

      # --- Count previous attempts ---
      - name: Count previous attempts
        if: steps.exhausted.outputs.cache-hit != 'true'
        id: retry
        run: |
          # Every commit the agent pushes carries the [agent-fix] marker.
          ATTEMPTS=$(git rev-list --count --grep='\[agent-fix\]' origin/main..HEAD)
          echo "attempts=$ATTEMPTS" >> "$GITHUB_OUTPUT"
          echo "Agent fix attempts so far: $ATTEMPTS"

          if [ "$ATTEMPTS" -ge "$MAX_ATTEMPTS" ]; then
            echo "max_reached=true" >> "$GITHUB_OUTPUT"
          elif [ "$(git rev-parse HEAD)" != "$FAILED_SHA" ]; then
            # Several CI workflows can fail on the same commit. Once a fix (or
            # any other commit) has been pushed, the remaining failure events
            # describe an outdated tree; acting on them would only burn
            # attempts. CI on the new head will trigger this workflow again.
            echo "::notice::Branch head has moved past the failed commit. Waiting for CI on the new head."
          else
            echo "proceed=true" >> "$GITHUB_OUTPUT"
          fi

      # --- Save exhaustion flag if max reached ---
      - name: Save exhaustion flag
        if: steps.retry.outputs.max_reached == 'true'
        run: echo "exhausted" > .agent-exhausted

      - name: Cache exhaustion flag
        if: steps.retry.outputs.max_reached == 'true'
        uses: actions/cache/save@v4
        with:
          path: .agent-exhausted
          key: agent-exhausted-${{ github.event.workflow_run.head_branch }}

      # --- Escalate if max reached ---
      - name: Escalate to human
        if: steps.retry.outputs.max_reached == 'true'
        env:
          GH_TOKEN: ${{ steps.app-token.outputs.token }}
          RUN_URL: ${{ github.event.workflow_run.html_url }}
        run: |
          # Find the PR number for this branch. `// empty` turns a missing PR
          # into an empty string rather than a literal "null".
          PR_NUMBER=$(gh pr list --head "$HEAD_BRANCH" --json number --jq '.[0].number // empty')
          if [ -n "$PR_NUMBER" ]; then
            BODY=$(printf '🔴 **Currency Fix Agent exhausted** (%s attempts). Needs human review.\n\nFailed workflow: %s' "$MAX_ATTEMPTS" "$RUN_URL")
            gh pr comment "$PR_NUMBER" --body "$BODY"
          fi

          # TODO: Enable Slack notification after configuring webhook payload support
          # if [ -n "${SLACK_WEBHOOK_URL}" ]; then
          #   curl -s --max-time 10 -X POST "$SLACK_WEBHOOK_URL" \
          #     -H 'Content-Type: application/json' \
          #     -d "{
          #       \"workflow_name\": \"agent_exhausted\",
          #       \"framework_name\": \"$(echo "$HEAD_BRANCH" | sed 's|auto-update/||' | sed 's|-[0-9].*||')\",
          #       \"pr_url\": \"https://github.com/${GITHUB_REPOSITORY}/pull/${PR_NUMBER}\",
          #       \"run_url\": \"${RUN_URL}\"
          #     }" || true
          # fi

      # --- Download failed workflow logs ---
      - name: Download failed run logs
        if: steps.retry.outputs.proceed == 'true'
        env:
          GH_TOKEN: ${{ steps.app-token.outputs.token }}
          FAILED_RUN_ID: ${{ github.event.workflow_run.id }}
        run: |
          mkdir -p /tmp/ci-logs
          # Best effort: the agent still runs if the logs cannot be fetched.
          gh api "/repos/${GITHUB_REPOSITORY}/actions/runs/${FAILED_RUN_ID}/logs" \
            > /tmp/ci-logs.zip || true
          if [ -s /tmp/ci-logs.zip ]; then
            unzip -o /tmp/ci-logs.zip -d /tmp/ci-logs/ || true
          else
            echo "::warning::No CI logs could be downloaded for run ${FAILED_RUN_ID}."
          fi

      # --- Extract framework from branch name ---
      - name: Determine framework
        if: steps.retry.outputs.proceed == 'true'
        id: framework
        run: |
          # auto-update/vllm-0.21.0 → vllm
          FRAMEWORK=$(printf '%s\n' "$HEAD_BRANCH" | sed -e 's|auto-update/||' -e 's|-[0-9].*||')
          echo "name=$FRAMEWORK" >> "$GITHUB_OUTPUT"
          echo "Framework: $FRAMEWORK"

      # --- Run the fix agent ---
      - name: Run fix agent
        if: steps.retry.outputs.proceed == 'true'
        id: fix
        env:
          AWS_REGION: us-west-2
          # Derived from the branch name, so it is also passed via env rather
          # than interpolated into the script.
          AGENT_FRAMEWORK: ${{ steps.framework.outputs.name }}
        run: |
          python3 scripts/autocurrency/agent-fix.py \
            --logs-dir /tmp/ci-logs/ \
            --framework "$AGENT_FRAMEWORK" \
            --branch "$HEAD_BRANCH"

      # --- Commit and push if fix was generated ---
      - name: Commit and push fix
        if: steps.retry.outputs.proceed == 'true' && steps.fix.outcome == 'success'
        run: |
          # Stage first: `git diff` alone ignores untracked files, so a fix
          # that only adds new files would otherwise look like "no changes".
          git add -A
          if git diff --cached --quiet; then
            echo "::notice::No changes generated by agent. Nothing to push."
            exit 0
          fi
          git config user.name "asimov-bot[bot]"
          git config user.email "asimov-bot[bot]@users.noreply.github.com"
          # The agent may leave a description here; fall back to a default
          # when the file is missing or empty.
          DESCRIPTION=$(cat /tmp/agent-fix-description.txt 2>/dev/null || true)
          if [ -z "$DESCRIPTION" ]; then
            DESCRIPTION="automated fix"
          fi
          # The [agent-fix] prefix is what the attempt counter greps for.
          git commit -m "[agent-fix] ${DESCRIPTION}"
          git push

0 commit comments

Comments
 (0)