Skip to content

Commit f04ee69

Browse files
authored
Help mitigate sporadic network failures (#17535)
1 parent d63c7ad commit f04ee69

File tree

2 files changed

+260
-0
lines changed

2 files changed

+260
-0
lines changed
Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
#!/usr/bin/env python3
2+
"""Rerun up to two failed jobs for recent pull request CI runs.
3+
4+
Scans recent failed runs of build-pull-request.yml, ignores the synthetic
5+
required-status-check job, and reruns eligible failed jobs once.
6+
"""
7+
8+
from __future__ import annotations
9+
10+
import json
11+
import os
12+
import subprocess
13+
import sys
14+
import urllib.parse
15+
from datetime import datetime, timezone
16+
from pathlib import Path
17+
18+
# Only runs created within this many hours are considered.
LOOKBACK_HOURS = 2
# Skip runs with more than this many failed (non-synthetic) jobs; per the
# commit intent ("mitigate sporadic network failures"), broad failures are
# presumably real breakage rather than flakiness.
MAX_ELIGIBLE_FAILURES = 2
20+
21+
22+
def main() -> None:
    """Scan recent failed PR CI runs and rerun eligible failed jobs once each.

    Emits ::notice annotations for reruns, plain log lines for skip/failure
    notes, and a Markdown step summary.
    """
    # Prefer the Actions-provided repository; fall back to `gh` when run locally.
    full_name = os.environ.get("GITHUB_REPOSITORY") or get_current_repository()
    owner, repo = full_name.split("/", 1)
    cutoff = datetime.now(timezone.utc).timestamp() - LOOKBACK_HOURS * 60 * 60

    candidates = latest_run_per_pull_request(
        list_recent_pull_request_runs(owner, repo, cutoff)
    )

    notes: list[str] = []
    reran: list[str] = []

    for run in candidates.values():
        # Only completed runs that concluded in failure are interesting.
        if run["status"] != "completed" or run.get("conclusion") != "failure":
            continue

        # Never rerun the same run a second time.
        if run["run_attempt"] > 1:
            notes.append(f"Skipped {format_run(run)}: already rerun once.")
            continue

        # Ignore the synthetic aggregate job; only real job failures count.
        failures = [
            job
            for job in list_jobs_for_run(owner, repo, run["id"])
            if job.get("conclusion") == "failure" and job["name"] != "required-status-check"
        ]

        if not failures:
            notes.append(f"Skipped {format_run(run)}: only synthetic jobs failed.")
            continue

        if len(failures) > MAX_ELIGIBLE_FAILURES:
            notes.append(
                f"Skipped {format_run(run)}: {len(failures)} failed jobs exceeded limit {MAX_ELIGIBLE_FAILURES}."
            )
            continue

        for job in failures:
            try:
                github_request("POST", f"/repos/{owner}/{repo}/actions/jobs/{job['id']}/rerun")
                reran.append(f"{format_run(run)}: reran {job['name']} ({job['id']}).")
            except subprocess.CalledProcessError as e:
                # A failed rerun request is logged, not fatal.
                message = read_process_error(e)
                notes.append(
                    f"Failed rerun for {format_run(run)} job {job['name']} ({job['id']}): {message}"
                )

    if not notes and not reran:
        notes.append("No recent failed PR runs matched the rerun policy.")

    if reran:
        for message in reran:
            print(f"::notice::{message}")
    else:
        print("::notice::No eligible failed jobs were rerun.")

    for message in notes:
        print(message)

    write_summary(notes, reran)
81+
82+
83+
def list_recent_pull_request_runs(owner: str, repo: str, lookback_cutoff: float) -> list[dict]:
    """Return pull_request runs of build-pull-request.yml created at or after the cutoff.

    Pages through the workflow-runs API and filters out anything older than
    *lookback_cutoff* (a POSIX timestamp).
    """
    page_size = 100
    collected: list[dict] = []
    page_number = 1

    while True:
        payload = github_request(
            "GET",
            f"/repos/{owner}/{repo}/actions/workflows/build-pull-request.yml/runs",
            {
                "event": "pull_request",
                "per_page": str(page_size),
                "page": str(page_number),
            },
        )

        batch = payload["workflow_runs"]
        if not batch:
            break

        collected.extend(batch)

        # Early stop once the oldest run on this page predates the window
        # (assumes the API returns runs newest-first, GitHub's default order).
        if parse_github_time(batch[-1]["created_at"]).timestamp() < lookback_cutoff:
            break
        if len(batch) < page_size:
            break

        page_number += 1

    return [r for r in collected if parse_github_time(r["created_at"]).timestamp() >= lookback_cutoff]
115+
116+
117+
def latest_run_per_pull_request(runs: list[dict]) -> dict[int, dict]:
    """Map each PR number to its most recently created run.

    Runs without an associated pull request are ignored.
    """
    newest_by_pr: dict[int, dict] = {}

    for candidate in runs:
        number = get_pr_number(candidate)
        if number is None:
            continue

        incumbent = newest_by_pr.get(number)
        if incumbent is None:
            newest_by_pr[number] = candidate
        elif parse_github_time(candidate["created_at"]) > parse_github_time(incumbent["created_at"]):
            newest_by_pr[number] = candidate

    return newest_by_pr
130+
131+
132+
def list_jobs_for_run(owner: str, repo: str, run_id: int) -> list[dict]:
    """Fetch every job of a workflow run (latest attempt only), paging as needed."""
    page_size = 100
    all_jobs: list[dict] = []
    page_number = 1

    while True:
        payload = github_request(
            "GET",
            f"/repos/{owner}/{repo}/actions/runs/{run_id}/jobs",
            {
                "filter": "latest",
                "per_page": str(page_size),
                "page": str(page_number),
            },
        )

        batch = payload["jobs"]
        all_jobs.extend(batch)

        # A short page means we have seen everything.
        if len(batch) < page_size:
            return all_jobs

        page_number += 1
154+
155+
156+
def github_request(method: str, path: str, query: dict[str, str] | None = None) -> dict:
    """Call the GitHub REST API via `gh api` and return the parsed JSON body.

    Returns {} for an empty response body. Raises subprocess.CalledProcessError
    when `gh` exits non-zero.
    """
    endpoint = path.removeprefix("/")
    if query:
        endpoint = f"{endpoint}?{urllib.parse.urlencode(query)}"

    command = [
        "gh",
        "api",
        "--method",
        method,
        "-H",
        "Accept: application/vnd.github+json",
        "-H",
        "X-GitHub-Api-Version: 2022-11-28",
        endpoint,
    ]
    completed = subprocess.run(command, capture_output=True, text=True, check=True)

    if not completed.stdout.strip():
        return {}
    return json.loads(completed.stdout)
181+
182+
183+
def read_process_error(error: subprocess.CalledProcessError) -> str:
    """Return the most useful human-readable description of a failed subprocess.

    Prefers stderr, then stdout, then falls back to the exit code. Unlike a
    bare `.strip()` chain, this also tolerates a CalledProcessError whose
    stderr/stdout are None (process run without capture_output), which would
    otherwise raise AttributeError.
    """
    stderr_text = (error.stderr or "").strip()
    stdout_text = (error.stdout or "").strip()
    return stderr_text or stdout_text or f"exit code {error.returncode}"
185+
186+
187+
def get_current_repository() -> str:
    """Resolve the current repository's "owner/name" via the `gh` CLI.

    Used as a fallback when GITHUB_REPOSITORY is not set (local execution).
    """
    completed = subprocess.run(
        ["gh", "repo", "view", "--json", "nameWithOwner", "--jq", ".nameWithOwner"],
        capture_output=True,
        text=True,
        check=True,
    )
    return completed.stdout.strip()
195+
196+
197+
def get_pr_number(run: dict) -> int | None:
    """Return the first linked pull-request number, or None when absent."""
    linked = run.get("pull_requests") or []
    return linked[0].get("number") if linked else None
202+
203+
204+
def format_run(run: dict) -> str:
    """Build a human-readable label: PR number (if known), run id, and attempt."""
    number = get_pr_number(run)
    if number is None:
        pr_label = "PR unknown"
    else:
        pr_label = f"PR #{number}"
    return f"{pr_label}, run {run['id']}, attempt {run['run_attempt']}"
208+
209+
210+
def parse_github_time(value: str) -> datetime:
    """Parse a GitHub ISO-8601 timestamp into a timezone-aware datetime.

    GitHub uses a trailing "Z" for UTC; normalize it to "+00:00" so
    datetime.fromisoformat accepts it on all supported Python versions.
    """
    normalized = value.replace("Z", "+00:00")
    return datetime.fromisoformat(normalized)
212+
213+
214+
def write_summary(processed_runs: list[str], rerun_jobs: list[str]) -> None:
    """Append a Markdown report to the GitHub Actions step summary.

    No-op when GITHUB_STEP_SUMMARY is unset or empty (i.e. outside Actions).
    The file is opened in append mode — GitHub's documented usage is to append
    to the summary file, so any content written earlier in the step survives —
    and with an explicit UTF-8 encoding so non-ASCII text in job names or
    messages cannot raise under a non-UTF-8 locale.
    """
    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
    if not summary_path:
        return

    lines = [
        "# Rerun flaky PR jobs",
        "",
        f"Eligible reruns: {len(rerun_jobs)}",
        "",
    ]
    lines.extend(f"- {message}" for message in processed_runs + rerun_jobs)

    with Path(summary_path).open("a", encoding="utf-8") as summary_file:
        summary_file.write("\n".join(lines) + "\n")
227+
228+
229+
# Entry point: surface any unexpected exception as a workflow error
# annotation, then re-raise so the Actions job exits non-zero.
if __name__ == "__main__":
    try:
        main()
    except Exception as e:  # fail the job on unexpected errors
        print(f"::error::{e}", file=sys.stderr)
        raise
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Scheduled workflow that reruns eligible failed jobs on recent PR CI runs.
# All selection logic lives in .github/scripts/rerun-flaky-pr-jobs.py.
name: Rerun flaky PR jobs

on:
  schedule:
    # Every 15 minutes; the script itself limits which runs get retried.
    - cron: "*/15 * * * *"
  # Allow manual runs for testing/debugging.
  workflow_dispatch:

# At most one instance at a time; never cancel one already in progress.
concurrency:
  group: rerun-flaky-pr-jobs
  cancel-in-progress: false

permissions:
  # actions: write is needed by the job-rerun API calls the script makes.
  actions: write
  contents: read

jobs:
  rerun-failed-jobs:
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      # Checkout is needed so the script file is present on the runner.
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Rerun eligible failed jobs
        env:
          # `gh` authenticates from GH_TOKEN.
          GH_TOKEN: ${{ github.token }}
        run: python .github/scripts/rerun-flaky-pr-jobs.py

0 commit comments

Comments
 (0)