Skip to content

Commit 86c5c17

Browse files
committed
Trigger flaky PR job reruns on workflow failure
Replace the 15-minute schedule with a workflow_run trigger that fires when 'Build pull request' completes with failure, and simplify the rerun script to process just that one run.
1 parent 0917e81 commit 86c5c17

2 files changed

Lines changed: 60 additions & 246 deletions

File tree

.github/scripts/rerun-flaky-pr-jobs.py

Lines changed: 53 additions & 243 deletions
Original file line numberDiff line numberDiff line change
@@ -1,294 +1,104 @@
11
#!/usr/bin/env python3
2-
"""Rerun up to two failed jobs for recent pull request CI runs.
2+
"""Rerun up to two failed jobs for the pull request CI run that triggered this workflow.
33
4-
Scans recent failed pull request workflow runs, ignores the synthetic
5-
required-status-check job, and reruns eligible failed jobs up to two times.
4+
Reads the triggering workflow run id from the WORKFLOW_RUN_ID environment variable,
5+
ignores the synthetic required-status-check job, and reruns eligible failed jobs
6+
up to two times.
67
"""
78

89
from __future__ import annotations
910

1011
import json
1112
import os
1213
import subprocess
13-
import sys
1414
import urllib.parse
15-
from datetime import datetime, timezone
16-
from pathlib import Path
1715

18-
LOOKBACK_HOURS = 2
1916
MAX_FAILED_JOBS_PER_WORKFLOW_RUN = 5
2017
MAX_RERUN_ATTEMPTS = 2
2118

2219

2320
def main() -> None:
24-
repository = os.environ.get("GITHUB_REPOSITORY") or get_current_repository()
25-
owner, repo = repository.split("/", 1)
26-
lookback_cutoff = datetime.now(timezone.utc).timestamp() - LOOKBACK_HOURS * 60 * 60
21+
owner, repo = os.environ["GITHUB_REPOSITORY"].split("/", 1)
22+
run_id = os.environ["WORKFLOW_RUN_ID"]
2723

28-
recent_runs = list_recent_pull_request_runs(owner, repo, lookback_cutoff)
29-
latest_run_by_pull_request = latest_run_per_pull_request_workflow(owner, repo, recent_runs)
30-
31-
processed_runs: list[str] = []
32-
rerun_jobs: list[str] = []
33-
34-
for run in latest_run_by_pull_request.values():
35-
if run["status"] != "completed" or run.get("conclusion") != "failure":
36-
continue
37-
38-
rerun_attempts = run["run_attempt"] - 1
39-
if rerun_attempts >= MAX_RERUN_ATTEMPTS:
40-
processed_runs.append(
41-
f"Skipped {format_run(run)}: already rerun {rerun_attempts} times."
42-
)
43-
continue
44-
45-
jobs = list_jobs_for_run(owner, repo, run["id"])
46-
failed_real_jobs = [
47-
job
48-
for job in jobs
49-
if job.get("conclusion") == "failure" and job["name"] != "required-status-check"
50-
]
51-
52-
if not failed_real_jobs:
53-
processed_runs.append(f"Skipped {format_run(run)}: only synthetic jobs failed.")
54-
continue
55-
56-
if len(failed_real_jobs) > MAX_FAILED_JOBS_PER_WORKFLOW_RUN:
57-
processed_runs.append(
58-
f"Skipped {format_run(run)}: {len(failed_real_jobs)} failed jobs exceeded limit {MAX_FAILED_JOBS_PER_WORKFLOW_RUN}."
59-
)
60-
continue
61-
62-
try:
63-
github_request("POST", f"/repos/{owner}/{repo}/actions/runs/{run['id']}/rerun-failed-jobs")
64-
rerun_jobs.append(f"{format_run(run)}: reran failed jobs {format_jobs(failed_real_jobs)}.")
65-
except subprocess.CalledProcessError as e:
66-
message = read_process_error(e)
67-
processed_runs.append(
68-
f"Failed rerun for {format_run(run)} jobs {format_jobs(failed_real_jobs)}: {message}"
69-
)
70-
71-
if not processed_runs and not rerun_jobs:
72-
processed_runs.append("No recent failed PR runs matched the rerun policy.")
73-
74-
if rerun_jobs:
75-
for message in rerun_jobs:
76-
print(f"::notice::{message}")
77-
else:
78-
print("::notice::No eligible failed jobs were rerun.")
24+
run = gh_get(f"/repos/{owner}/{repo}/actions/runs/{run_id}")
25+
pr_number = resolve_pr_number(owner, repo, run)
26+
pr_label = f"PR #{pr_number}" if pr_number is not None else "PR unknown"
27+
label = f"{pr_label}, run {run['id']}, attempt {run['run_attempt']}"
7928

80-
for message in processed_runs:
81-
print(message)
29+
if run["status"] != "completed" or run.get("conclusion") != "failure":
30+
print(f"Skipped {label}: status={run['status']}, conclusion={run.get('conclusion')}.")
31+
return
8232

83-
write_summary(processed_runs, rerun_jobs)
33+
rerun_attempts = run["run_attempt"] - 1
34+
if rerun_attempts >= MAX_RERUN_ATTEMPTS:
35+
print(f"Skipped {label}: already rerun {rerun_attempts} times.")
36+
return
8437

38+
failed_real_jobs = [
39+
job
40+
for job in list_jobs_for_run(owner, repo, run["id"])
41+
if job.get("conclusion") == "failure"
42+
and not job["name"].endswith("required-status-check")
43+
]
8544

86-
def list_recent_pull_request_runs(owner: str, repo: str, lookback_cutoff: float) -> list[dict]:
87-
per_page = 100
88-
runs: list[dict] = []
89-
page = 1
45+
if not failed_real_jobs:
46+
print(f"Skipped {label}: only synthetic jobs failed.")
47+
return
9048

91-
while True:
92-
response = github_request_object(
93-
"GET",
94-
f"/repos/{owner}/{repo}/actions/runs",
95-
{
96-
"event": "pull_request",
97-
"per_page": str(per_page),
98-
"page": str(page),
99-
},
49+
if len(failed_real_jobs) > MAX_FAILED_JOBS_PER_WORKFLOW_RUN:
50+
print(
51+
f"Skipped {label}: {len(failed_real_jobs)} failed jobs"
52+
f" exceeded limit {MAX_FAILED_JOBS_PER_WORKFLOW_RUN}."
10053
)
54+
return
10155

102-
page_runs = response["workflow_runs"]
103-
if not page_runs:
104-
break
105-
106-
runs.extend(page_runs)
107-
108-
oldest_run = page_runs[-1]
109-
if parse_github_time(oldest_run["created_at"]).timestamp() < lookback_cutoff:
110-
break
111-
112-
if len(page_runs) < per_page:
113-
break
114-
115-
page += 1
116-
117-
return [run for run in runs if parse_github_time(run["created_at"]).timestamp() >= lookback_cutoff]
118-
119-
120-
def latest_run_per_pull_request_workflow(owner: str, repo: str, runs: list[dict]) -> dict[tuple[int, int], dict]:
121-
latest_by_pr_workflow: dict[tuple[int, int], dict] = {}
122-
branch_cache: dict[tuple[str, str], int | None] = {}
123-
124-
for run in runs:
125-
pr_number = resolve_pr_number(owner, repo, run, branch_cache)
126-
if pr_number is None:
127-
continue
128-
129-
key = (pr_number, run["workflow_id"])
130-
existing = latest_by_pr_workflow.get(key)
131-
if existing is None or parse_github_time(run["created_at"]) > parse_github_time(existing["created_at"]):
132-
latest_by_pr_workflow[key] = run
133-
134-
return latest_by_pr_workflow
56+
subprocess.run(
57+
["gh", "api", "--method", "POST", f"repos/{owner}/{repo}/actions/runs/{run['id']}/rerun-failed-jobs"],
58+
check=True,
59+
)
60+
job_list = ", ".join(f"{j['name']} ({j['id']})" for j in failed_real_jobs)
61+
print(f"::notice::{label}: reran failed jobs {job_list}.")
13562

13663

13764
def list_jobs_for_run(owner: str, repo: str, run_id: int) -> list[dict]:
138-
per_page = 100
13965
jobs: list[dict] = []
14066
page = 1
141-
14267
while True:
143-
response = github_request_object(
144-
"GET",
68+
response = gh_get(
14569
f"/repos/{owner}/{repo}/actions/runs/{run_id}/jobs",
146-
{
147-
"filter": "latest",
148-
"per_page": str(per_page),
149-
"page": str(page),
150-
},
70+
{"filter": "latest", "per_page": "100", "page": str(page)},
15171
)
152-
153-
page_jobs = response["jobs"]
154-
jobs.extend(page_jobs)
155-
if len(page_jobs) < per_page:
72+
jobs.extend(response["jobs"])
73+
if len(response["jobs"]) < 100:
15674
return jobs
157-
15875
page += 1
15976

16077

161-
def github_request(method: str, path: str, query: dict[str, str] | None = None) -> dict | list[dict]:
78+
def gh_get(path: str, query: dict[str, str] | None = None):
16279
url = path.removeprefix("/")
16380
if query:
16481
url += "?" + urllib.parse.urlencode(query)
82+
result = subprocess.run(["gh", "api", url], capture_output=True, text=True, check=True)
83+
return json.loads(result.stdout) if result.stdout.strip() else {}
16584

166-
result = subprocess.run(
167-
[
168-
"gh",
169-
"api",
170-
"--method",
171-
method,
172-
"-H",
173-
"Accept: application/vnd.github+json",
174-
"-H",
175-
"X-GitHub-Api-Version: 2022-11-28",
176-
url,
177-
],
178-
capture_output=True,
179-
text=True,
180-
check=True,
181-
)
182-
183-
if not result.stdout.strip():
184-
return {}
185-
return json.loads(result.stdout)
186-
187-
188-
def github_request_object(method: str, path: str, query: dict[str, str] | None = None) -> dict:
189-
response = github_request(method, path, query)
190-
if isinstance(response, list):
191-
raise TypeError(f"Expected object response for {path}")
192-
return response
193-
194-
195-
def github_request_list(method: str, path: str, query: dict[str, str] | None = None) -> list[dict]:
196-
response = github_request(method, path, query)
197-
if not isinstance(response, list):
198-
raise TypeError(f"Expected list response for {path}")
199-
return response
200-
201-
202-
def read_process_error(error: subprocess.CalledProcessError) -> str:
203-
return error.stderr.strip() or error.stdout.strip() or f"exit code {error.returncode}"
204-
205-
206-
def get_current_repository() -> str:
207-
result = subprocess.run(
208-
["gh", "repo", "view", "--json", "nameWithOwner", "--jq", ".nameWithOwner"],
209-
capture_output=True,
210-
text=True,
211-
check=True,
212-
)
213-
return result.stdout.strip()
214-
215-
216-
def resolve_pr_number(
217-
owner: str, repo: str, run: dict, branch_cache: dict[tuple[str, str], int | None]
218-
) -> int | None:
219-
cached_pr_number = run.get("resolved_pr_number")
220-
if isinstance(cached_pr_number, int):
221-
return cached_pr_number
22285

86+
def resolve_pr_number(owner: str, repo: str, run: dict) -> int | None:
22387
pull_requests = run.get("pull_requests") or []
22488
if pull_requests:
225-
pr_number = pull_requests[0].get("number")
226-
run["resolved_pr_number"] = pr_number
227-
return pr_number
89+
return pull_requests[0].get("number")
22890

229-
head_repository = run.get("head_repository") or {}
230-
head_owner = (head_repository.get("owner") or {}).get("login")
91+
head_owner = ((run.get("head_repository") or {}).get("owner") or {}).get("login")
23192
head_branch = run.get("head_branch")
23293
if not head_owner or not head_branch:
233-
run["resolved_pr_number"] = None
23494
return None
23595

236-
cache_key = (head_owner, head_branch)
237-
if cache_key not in branch_cache:
238-
pull_requests = github_request_list(
239-
"GET",
240-
f"/repos/{owner}/{repo}/pulls",
241-
{
242-
"state": "open",
243-
"head": f"{head_owner}:{head_branch}",
244-
"per_page": "5",
245-
},
246-
)
247-
pr_number = None
248-
for pull_request in pull_requests:
249-
if pull_request.get("head", {}).get("sha") == run.get("head_sha"):
250-
pr_number = pull_request["number"]
251-
break
252-
if pr_number is None and pull_requests:
253-
pr_number = pull_requests[0]["number"]
254-
branch_cache[cache_key] = pr_number
255-
256-
run["resolved_pr_number"] = branch_cache[cache_key]
257-
return branch_cache[cache_key]
258-
259-
260-
def format_run(run: dict) -> str:
261-
pr_number = run.get("resolved_pr_number")
262-
pr_label = f"PR #{pr_number}" if pr_number is not None else "PR unknown"
263-
return f"{pr_label}, run {run['id']}, attempt {run['run_attempt']}"
264-
265-
266-
def format_jobs(jobs: list[dict]) -> str:
267-
return ", ".join(f"{job['name']} ({job['id']})" for job in jobs)
268-
269-
270-
def parse_github_time(value: str) -> datetime:
271-
return datetime.fromisoformat(value.replace("Z", "+00:00"))
272-
273-
274-
def write_summary(processed_runs: list[str], rerun_jobs: list[str]) -> None:
275-
summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
276-
if not summary_path:
277-
return
278-
279-
lines = [
280-
"# Rerun flaky PR jobs",
281-
"",
282-
f"Eligible reruns: {len(rerun_jobs)}",
283-
"",
284-
]
285-
lines.extend(f"- {message}" for message in processed_runs + rerun_jobs)
286-
Path(summary_path).write_text("\n".join(lines) + "\n")
96+
matches = gh_get(
97+
f"/repos/{owner}/{repo}/pulls",
98+
{"state": "open", "head": f"{head_owner}:{head_branch}", "per_page": "1"},
99+
)
100+
return matches[0]["number"] if matches else None
287101

288102

289103
if __name__ == "__main__":
290-
try:
291-
main()
292-
except Exception as e: # fail the job on unexpected errors
293-
print(f"::error::{e}", file=sys.stderr)
294-
raise
104+
main()

.github/workflows/rerun-flaky-pr-jobs.yml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
name: Rerun flaky PR jobs
22

33
on:
4-
schedule:
5-
- cron: "*/15 * * * *"
6-
workflow_dispatch:
4+
workflow_run:
5+
workflows:
6+
- "Build pull request"
7+
types:
8+
- completed
79

810
concurrency:
911
group: rerun-flaky-pr-jobs
@@ -17,10 +19,12 @@ jobs:
1719
rerun-failed-jobs:
1820
runs-on: ubuntu-latest
1921
timeout-minutes: 10
22+
if: github.event.workflow_run.conclusion == 'failure'
2023
steps:
2124
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
2225

2326
- name: Rerun eligible failed jobs
2427
env:
2528
GH_TOKEN: ${{ github.token }}
29+
WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
2630
run: python .github/scripts/rerun-flaky-pr-jobs.py

0 commit comments

Comments (0)