Skip to content

Commit c2bee98

Browse files
authored
fix: incremental PR review, auto-approve, and bot operational improvements (#263)
* fix: reduce PR review false positives, increase context budget * fix: incremental PR review, auto-approve, and bot operational improvements
1 parent cd60ca6 commit c2bee98

9 files changed

Lines changed: 1172 additions & 48 deletions

File tree

.github/workflows/auto-approve.yml

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
name: Auto-Approve Clean PRs
2+
3+
on:
4+
workflow_run:
5+
workflows: [".github/workflows/base.yml", "PyDeequ Bot"]
6+
types: [completed]
7+
8+
permissions:
9+
pull-requests: write
10+
actions: read
11+
12+
jobs:
13+
approve:
14+
runs-on: ubuntu-latest
15+
if: github.event.workflow_run.event == 'pull_request' || github.event.workflow_run.event == 'pull_request_target'
16+
timeout-minutes: 2
17+
18+
steps:
19+
- name: Find PR and check both conditions
20+
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
21+
with:
22+
script: |
23+
const sha = context.payload.workflow_run.head_sha;
24+
const owner = context.repo.owner;
25+
const repo = context.repo.repo;
26+
27+
// Find the PR for this SHA
28+
let prNumber = null;
29+
const prs = context.payload.workflow_run.pull_requests;
30+
if (prs && prs.length > 0) {
31+
prNumber = prs[0].number;
32+
} else {
33+
const {data: searchResult} = await github.rest.pulls.list({
34+
owner, repo, state: 'open', sort: 'updated', direction: 'desc', per_page: 30
35+
});
36+
const match = searchResult.find(pr => pr.head.sha === sha);
37+
if (match) {
38+
prNumber = match.number;
39+
}
40+
}
41+
42+
if (!prNumber) {
43+
core.info(`No open PR found for SHA ${sha}, skipping`);
44+
return;
45+
}
46+
47+
core.info(`Found PR #${prNumber} for SHA ${sha}`);
48+
49+
// Verify the PR head SHA still matches (no new push since trigger)
50+
const {data: pr} = await github.rest.pulls.get({
51+
owner, repo, pull_number: prNumber
52+
});
53+
if (pr.head.sha !== sha) {
54+
core.info(`PR head ${pr.head.sha} differs from trigger SHA ${sha} — new push arrived, skipping`);
55+
return;
56+
}
57+
58+
// Condition 1: CI must have passed for this SHA
59+
const {data: workflowRuns} = await github.rest.actions.listWorkflowRunsForRepo({
60+
owner, repo, head_sha: sha, status: 'completed'
61+
});
62+
const ciRun = workflowRuns.workflow_runs.find(r =>
63+
r.name === '.github/workflows/base.yml' && r.conclusion === 'success'
64+
);
65+
if (!ciRun) {
66+
core.info(`CI has not passed for SHA ${sha}, skipping`);
67+
return;
68+
}
69+
70+
// Condition 2: Bot must have posted a clean review for this SHA
71+
const {data: reviews} = await github.rest.pulls.listReviews({
72+
owner, repo, pull_number: prNumber
73+
});
74+
75+
const CLEAN_MARKER = '<!-- deequ-bot:clean -->';
76+
77+
const latestBot = reviews
78+
.filter(r => r.user.login === 'github-actions[bot]')
79+
.sort((a, b) => new Date(b.submitted_at) - new Date(a.submitted_at))[0];
80+
81+
if (!latestBot || !latestBot.body.includes(CLEAN_MARKER) || latestBot.commit_id !== sha) {
82+
core.info('Bot has not posted a clean review for this SHA, skipping');
83+
return;
84+
}
85+
86+
// Both conditions met — check for existing approval to prevent doubles
87+
const botApprovals = reviews.filter(r =>
88+
r.user.login === 'github-actions[bot]' && r.state === 'APPROVED'
89+
);
90+
if (botApprovals.length > 0) {
91+
core.info('Bot already approved this PR, skipping');
92+
return;
93+
}
94+
95+
// Approve
96+
core.info(`Approving PR #${prNumber}: bot review clean + CI passed for SHA ${sha}`);
97+
await github.rest.pulls.createReview({
98+
owner, repo, pull_number: prNumber,
99+
event: 'APPROVE',
100+
body: `No issues found and CI is passing. Auto-approved.\n\n---\n*Generated by AI — human merge required.*`
101+
});

.github/workflows/issue-bot.yml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,16 +61,20 @@ jobs:
6161
ISSUE_NUMBER: ${{ github.event.issue.number || github.event.pull_request.number || inputs.issue_number }}
6262
EVENT_TYPE: ${{ github.event_name }}
6363
EVENT_ACTION: ${{ github.event.action }}
64+
EVENT_BEFORE: ${{ github.event.before }}
65+
EVENT_AFTER: ${{ github.event.pull_request.head.sha || github.event.after }}
6466
GITHUB_ACTOR: ${{ github.actor }}
6567
KB_S3_BUCKET: ${{ secrets.KB_S3_BUCKET }}
6668
KB_S3_KEY: ${{ secrets.KB_S3_KEY }}
6769
BEDROCK_MODEL_ID: ${{ secrets.BEDROCK_MODEL_ID }}
6870
GUARDRAIL_ID: ${{ secrets.GUARDRAIL_ID }}
6971
GUARDRAIL_VERSION: ${{ secrets.GUARDRAIL_VERSION }}
70-
ISSUE_CLASSIFY_PROMPT: ${{ secrets.ISSUE_CLASSIFY_PROMPT }}
71-
ISSUE_RESPOND_PROMPT: ${{ secrets.ISSUE_RESPOND_PROMPT }}
72-
PR_FILE_REVIEW_PROMPT: ${{ secrets.PR_FILE_REVIEW_PROMPT }}
73-
FOLLOWUP_PROMPT: ${{ secrets.FOLLOWUP_PROMPT }}
72+
SM_ISSUE_CLASSIFY_PROMPT: pydeequ-bot/issue-classify-prompt
73+
SM_ISSUE_RESPOND_PROMPT: pydeequ-bot/issue-respond-prompt
74+
SM_PR_FILE_REVIEW_PROMPT: pydeequ-bot/pr-file-review-prompt
75+
SM_FOLLOWUP_PROMPT: pydeequ-bot/followup-prompt
76+
CODEBASE_SRC_DIR: pydeequ
77+
CODEBASE_FILE_EXT: .py
7478
DRY_RUN: ${{ inputs.dry_run || 'false' }}
7579
ARTIFACT_PATH: ${{ runner.temp }}/bot_result.json
7680
run: python -m issue_bot.main analyze

.github/workflows/stale.yml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
name: Manage Stale Issues and PRs
2+
3+
on:
4+
schedule:
5+
- cron: '0 9 * * MON'
6+
workflow_dispatch:
7+
8+
permissions:
9+
issues: write
10+
pull-requests: write
11+
12+
jobs:
13+
stale:
14+
runs-on: ubuntu-latest
15+
steps:
16+
- uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
17+
with:
18+
days-before-stale: 60
19+
days-before-close: 14
20+
stale-issue-label: 'stale'
21+
stale-pr-label: 'stale'
22+
stale-issue-message: >
23+
This issue has been inactive for 60 days. It will be closed in 14 days
24+
if there is no further activity. If this is still relevant, please comment
25+
to keep it open.
26+
stale-pr-message: >
27+
This PR has been inactive for 60 days. It will be closed in 14 days
28+
if there is no further activity. If you are still working on this,
29+
please push an update or comment to keep it open.
30+
close-issue-message: >
31+
Closed due to inactivity. Feel free to reopen if this is still relevant.
32+
close-pr-message: >
33+
Closed due to inactivity. Feel free to reopen if you'd like to continue this work.
34+
exempt-issue-labels: 'bug,enhancement,help-wanted'
35+
exempt-pr-labels: 'help-wanted'
36+
operations-per-run: 50

scripts/issue_bot/config.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ def __init__(self):
1717
sys.exit(1)
1818
self.repo = _require("GITHUB_REPOSITORY")
1919
self.actor = os.getenv("GITHUB_ACTOR", "")
20+
self.event_before = os.getenv("EVENT_BEFORE", "")
21+
self.event_after = os.getenv("EVENT_AFTER", "")
2022

2123
self.bedrock_model_id = os.getenv("BEDROCK_MODEL_ID", "us.anthropic.claude-opus-4-6-v1")
2224

@@ -32,14 +34,16 @@ def __init__(self):
3234
self.enable_repo_search = os.getenv("ENABLE_REPO_SEARCH", "true").lower() == "true"
3335

3436
self.upstream_repo = os.getenv("UPSTREAM_REPO", "awslabs/python-deequ")
37+
self.codebase_src_dir = os.getenv("CODEBASE_SRC_DIR", "pydeequ")
38+
self.codebase_file_ext = os.getenv("CODEBASE_FILE_EXT", ".py")
3539

36-
self.bedrock_timeout = 120
37-
self.max_context_chars = 200000
40+
self.bedrock_timeout = 240
41+
self.max_context_chars = 800000
3842
self.max_github_search_results = 8
3943
self.github_api_timeout = 10
4044
self.allowed_labels = {
4145
"bug", "enhancement", "question", "documentation",
42-
"help-wanted", "analyzer", "check", "spark-compatibility", "installation",
46+
"help wanted", "python",
4347
}
4448

4549

scripts/issue_bot/github_client.py

Lines changed: 87 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ def __init__(self, cfg):
1111
self._repo = cfg.repo
1212
self._timeout = cfg.github_api_timeout
1313
self._dry_run = cfg.dry_run
14+
self._cfg = cfg
1415
self._repo_root = os.getenv("GITHUB_WORKSPACE", os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
1516
self._headers = {
1617
"Authorization": f"token {self._token}",
@@ -48,6 +49,59 @@ def get_pr_diff(self, number):
4849
logger.error(f"PR diff fetch failed: {e}")
4950
return ""
5051

52+
def get_compare_diff(self, base_sha, head_sha):
53+
"""Fetch the diff between two commits using the Compare API.
54+
Returns the diff text, or empty string on failure (e.g. force-push
55+
where base_sha no longer exists)."""
56+
headers = {**self._headers, "Accept": "application/vnd.github.v3.diff"}
57+
try:
58+
resp = requests.get(
59+
f"https://api.github.com/repos/{self._repo}/compare/{base_sha}...{head_sha}",
60+
headers=headers, timeout=self._timeout,
61+
)
62+
if resp.status_code == 200:
63+
return resp.text
64+
logger.warning(f"Compare API {base_sha[:7]}...{head_sha[:7]}: {resp.status_code}")
65+
return ""
66+
except Exception as e:
67+
logger.error(f"Compare diff failed: {e}")
68+
return ""
69+
70+
def get_ci_status(self, sha):
71+
"""Check commit statuses and check runs. Returns (passed, summary).
72+
passed: True (all green), False (something failed), None (pending/unknown)."""
73+
status = self._get(f"/repos/{self._repo}/commits/{sha}/status")
74+
if status is None:
75+
return None, "CI status unavailable"
76+
combined_state = status.get("state", "pending")
77+
78+
check_data = self._get(f"/repos/{self._repo}/commits/{sha}/check-runs")
79+
runs = check_data.get("check_runs", []) if check_data else []
80+
81+
def _is_own_check(name):
82+
lower = name.lower()
83+
return "bot" in lower and ("analyze" in lower or "/ act" in lower)
84+
85+
external_runs = [r for r in runs if not _is_own_check(r.get("name", ""))]
86+
87+
failed = []
88+
pending = []
89+
for r in external_runs:
90+
if r.get("status") != "completed":
91+
pending.append(r["name"])
92+
elif r.get("conclusion") not in ("success", "neutral", "skipped"):
93+
failed.append(r["name"])
94+
95+
if failed:
96+
return False, f"CI failing: {', '.join(failed)}"
97+
if pending:
98+
return None, f"CI pending: {', '.join(pending)}"
99+
if combined_state == "failure":
100+
return False, "CI failing (status checks)"
101+
if combined_state == "pending":
102+
return None, "CI pending (status checks)"
103+
return True, "CI passed"
104+
51105
def get_pr_files(self, number):
52106
return self._get(f"/repos/{self._repo}/pulls/{number}/files") or []
53107

@@ -64,16 +118,18 @@ def get_pr_review_comments(self, number, max_pages=10):
64118
page += 1
65119
return comments
66120

67-
def get_codebase_map(self, src_dir="pydeequ"):
68-
"""List all Python source files (excluding tests) as relative paths."""
121+
def get_codebase_map(self):
122+
"""List source files (excluding tests) as relative paths."""
123+
src_dir = self._cfg.codebase_src_dir
124+
file_ext = self._cfg.codebase_file_ext
69125
full_dir = os.path.join(self._repo_root, src_dir)
70126
prefix = self._repo_root.rstrip("/") + "/"
71127
try:
72128
paths = []
73129
for root, dirs, files in os.walk(full_dir):
74-
dirs[:] = [d for d in dirs if d not in ("tests", "__pycache__", ".git")]
130+
dirs[:] = [d for d in dirs if d not in ("examples", "__pycache__", ".git", "tests", "test")]
75131
for f in files:
76-
if f.endswith(".py"):
132+
if f.endswith(file_ext):
77133
full = os.path.join(root, f)
78134
rel = full[len(prefix):] if full.startswith(prefix) else full
79135
paths.append(rel)
@@ -116,9 +172,9 @@ def post_comment(self, number, body):
116172
return True
117173
return self._post(f"/repos/{self._repo}/issues/{number}/comments", {"body": body})
118174

119-
def post_pr_review(self, number, summary, inline_comments):
175+
def post_pr_review(self, number, summary, inline_comments, event="COMMENT"):
120176
if self._dry_run:
121-
logger.info(f"[DRY RUN] PR review on #{number}: {len(inline_comments)} inline comments")
177+
logger.info(f"[DRY RUN] PR review on #{number}: {len(inline_comments)} inline comments, event={event}")
122178
return True
123179

124180
# Get valid diff lines per file from the PR
@@ -134,31 +190,34 @@ def post_pr_review(self, number, summary, inline_comments):
134190
else:
135191
invalid_comments.append(ic)
136192

193+
body = summary
194+
if invalid_comments:
195+
body += "\n\n**Additional feedback:**\n"
196+
for ic in invalid_comments:
197+
line_ref = f":{ic['line']}" if ic.get('line') else ""
198+
body += f"\n`{ic['file']}{line_ref}` — {ic['comment']}\n"
199+
200+
payload = {"body": body, "event": event}
137201
if valid_comments:
138-
body = summary
139-
if invalid_comments:
140-
body += "\n\n**Additional feedback:**\n"
141-
for ic in invalid_comments:
142-
line_ref = f":{ic['line']}" if ic.get('line') else ""
143-
body += f"\n`{ic['file']}{line_ref}` — {ic['comment']}\n"
144-
payload = {"body": body, "event": "REQUEST_CHANGES", "comments": valid_comments}
145-
try:
146-
resp = requests.post(
147-
f"https://api.github.com/repos/{self._repo}/pulls/{number}/reviews",
148-
headers=self._headers, json=payload, timeout=self._timeout,
149-
)
150-
if resp.status_code in (200, 201):
151-
return True
152-
logger.error(f"PR review API failed: {resp.status_code}, falling back to comment")
153-
except Exception as e:
154-
logger.error(f"PR review API failed: {e}, falling back to comment")
155-
156-
# Fallback: post all as regular comment
157-
all_comments = inline_comments
202+
payload["comments"] = valid_comments
203+
204+
try:
205+
resp = requests.post(
206+
f"https://api.github.com/repos/{self._repo}/pulls/{number}/reviews",
207+
headers=self._headers, json=payload, timeout=self._timeout,
208+
)
209+
if resp.status_code in (200, 201):
210+
return True
211+
logger.error(f"PR review API failed: {resp.status_code}, falling back to comment")
212+
logger.error(f"Response: {resp.text[:500]}")
213+
except Exception as e:
214+
logger.error(f"PR review API failed: {e}, falling back to comment")
215+
216+
# Fallback: post as regular comment if review API fails
158217
body = summary
159-
if all_comments:
218+
if inline_comments:
160219
body += "\n\n**Inline feedback:**\n"
161-
for ic in all_comments:
220+
for ic in inline_comments:
162221
line_ref = f":{ic['line']}" if ic.get('line') else ""
163222
body += f"\n`{ic['file']}{line_ref}` — {ic['comment']}\n"
164223
return self._post(f"/repos/{self._repo}/issues/{number}/comments", {"body": body})

0 commit comments

Comments
 (0)