Skip to content

Commit a2e657a

Browse files
authored
Add runner status script (#1185)
This script is handy for looking at the workload of the machines for the offload test suite. It might be especially helpful in the near future, when the split build/test feature is commonplace. Presently, it displays which jobs are being allocated (Done, Active, Queued) to which machine SKUs. It should, in the future, display "build job" and "test job" allocations, for greater clarity. It accepts one optional argument to scope the output to a specific SKU; omit it to get status on all SKUs. For example, here's the output of this command: <img width="1457" height="1171" alt="image" src="https://github.com/user-attachments/assets/f16097fe-e30c-4fea-96cb-4018bd1feb75" /> Fixes #1227 Assisted by GitHub Copilot
1 parent b457504 commit a2e657a

1 file changed

Lines changed: 317 additions & 0 deletions

File tree

utils/runner_status.py

Lines changed: 317 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,317 @@
1+
"""
2+
Show runner job status across all workflows in llvm/offload-test-suite.
3+
4+
Includes queued/in-progress runs and recently completed runs so you
5+
can see what is (or was) occupying the runners.
6+
7+
Requires the GitHub CLI (`gh`) to be installed and authenticated
8+
(`gh auth login`). API calls are issued through `gh api`.
9+
10+
Usage:
11+
python runner_status.py [vendor]
12+
13+
vendor: intel | amd | nvidia | qc (omit to show all vendors)
14+
"""
15+
16+
import sys
17+
import os
18+
import json
19+
import subprocess
20+
from concurrent.futures import ThreadPoolExecutor, as_completed
21+
from datetime import datetime, timezone, timedelta
22+
23+
# Repository whose GitHub Actions runs and runners are inspected.
OWNER = "llvm"
REPO = "offload-test-suite"

# Vendor names accepted on the command line, in the order they are displayed.
VALID_VENDORS = ("intel", "amd", "nvidia", "qc")

# Completed runs updated within this many hours are still considered.
COMPLETED_WINDOW_HOURS = 3

# ANSI escape that restores the terminal's default color.
RESET = "\033[0m"

# ANSI color codes per vendor
VENDOR_COLORS = {
    "intel": "\033[34m",  # blue
    "amd": "\033[31m",  # red
    "nvidia": "\033[32m",  # green
    "qc": "\033[90m",  # gray
}

# Workflow names that are exclusive to a specific vendor: each vendor's
# keyword is simply its own name.
VENDOR_WORKFLOW_KEYWORDS = {name: name for name in VALID_VENDORS}
44+
45+
46+
def runner_label(vendor):
    """Return the runner label string built from ``vendor``."""
    prefix = "hlsl-windows-"
    return f"{prefix}{vendor}"
48+
49+
50+
def colorize(vendor, text):
    """Return ``text`` wrapped in the ANSI color registered for ``vendor``.

    When VENDOR_COLORS has no (or an empty) entry for the vendor, ``text``
    is handed back untouched.
    """
    color = VENDOR_COLORS.get(vendor, "")
    if not color:
        return text
    return f"{color}{text}{RESET}"
54+
55+
56+
def api_get(path):
    """Run ``gh api`` against ``path`` and return the decoded JSON response.

    Raises:
        subprocess.CalledProcessError: ``gh`` exited with a non-zero status
            (e.g. on a 403).
    """
    command = ["gh", "api", "-H", "Accept: application/vnd.github+json", path]
    completed = subprocess.run(
        command,
        capture_output=True,
        text=True,
        check=True,
    )
    return json.loads(completed.stdout)
68+
69+
70+
def get_runners(label):
    """Return the self-hosted runners carrying ``label``, or None on API error.

    Only the first page (up to 100 runners) of the repository's runner list
    is requested.
    """
    endpoint = f"/repos/{OWNER}/{REPO}/actions/runners?per_page=100"
    try:
        payload = api_get(endpoint)
    except subprocess.CalledProcessError:
        # `gh` failed; signal "unknown" to the caller.
        return None

    matching = []
    for runner in payload.get("runners", []):
        label_names = [entry["name"] for entry in runner.get("labels", [])]
        if label in label_names:
            matching.append(runner)
    return matching
78+
79+
80+
def run_could_match_vendor(run, vendor):
    """Quick heuristic: can this run possibly have jobs for the given vendor?

    Args:
        run: Workflow-run dict; only its "name" entry is read. A missing or
            null name is treated as an empty string.
        vendor: Vendor key to test against VENDOR_WORKFLOW_KEYWORDS.

    Returns:
        True when the run should be kept for ``vendor``:
          * the workflow name contains "execution testing" or "hlsl test"
            (always kept), or
          * the name mentions no vendor keyword at all (ambiguous), or
          * ``vendor`` is among the vendors whose keyword appears in the name.
        False when the name mentions only other vendors.
    """
    # The workflow name may be absent or null; fall back to an empty string
    # instead of failing on `.lower()`.
    name_lower = (run.get("name") or "").lower()

    # "Execution Testing" (PR matrix) always includes intel, sometimes others
    if "execution testing" in name_lower or "hlsl test" in name_lower:
        return True

    # Collect *every* vendor mentioned in the name. Deciding on the first
    # keyword hit alone would wrongly drop a workflow that names several
    # vendors for all but the first one found.
    mentioned = [
        v for v, kw in VENDOR_WORKFLOW_KEYWORDS.items() if kw in name_lower
    ]
    if not mentioned:
        return True
    return vendor in mentioned
96+
97+
98+
def get_runs(vendors):
    """Collect queued, in-progress, and recently completed workflow runs.

    Completed runs are kept only if their ``updated_at`` timestamp lies within
    the last COMPLETED_WINDOW_HOURS hours. Runs are de-duplicated by id
    (first occurrence wins). When exactly one vendor is requested, runs whose
    workflow clearly belongs to another vendor are dropped so their jobs are
    never fetched.
    """
    base = f"/repos/{OWNER}/{REPO}/actions/runs"

    collected = []
    for state in ("queued", "in_progress"):
        collected += api_get(f"{base}?status={state}&per_page=100")["workflow_runs"]

    # Recently completed runs: anything updated at or after the cutoff.
    oldest_allowed = datetime.now(timezone.utc) - timedelta(
        hours=COMPLETED_WINDOW_HOURS
    )
    finished = api_get(f"{base}?status=completed&per_page=50")["workflow_runs"]
    collected += [
        run
        for run in finished
        if datetime.fromisoformat(run["updated_at"].replace("Z", "+00:00"))
        >= oldest_allowed
    ]

    # De-duplicate by run id, keeping the first occurrence and original order.
    by_id = {}
    for run in collected:
        by_id.setdefault(run["id"], run)
    unique = list(by_id.values())

    if len(vendors) == 1:
        only_vendor = vendors[0]
        unique = [run for run in unique if run_could_match_vendor(run, only_vendor)]

    return unique
132+
133+
134+
def prefetch_jobs(runs, jobs_cache):
    """Populate ``jobs_cache`` with the jobs of every run not cached yet.

    The lookups are issued through a pool of up to 8 worker threads so the
    individual API calls overlap. ``jobs_cache`` is mutated in place; nothing
    is returned.
    """
    missing_ids = [run["id"] for run in runs if run["id"] not in jobs_cache]
    if not missing_ids:
        return

    with ThreadPoolExecutor(max_workers=8) as executor:
        pending = {executor.submit(get_jobs, rid): rid for rid in missing_ids}
        for finished in as_completed(pending):
            # .result() re-raises any exception from the worker.
            jobs_cache[pending[finished]] = finished.result()
148+
149+
150+
def get_jobs(run_id):
    """Return the "jobs" list of workflow run ``run_id`` (one page, up to 100)."""
    endpoint = f"/repos/{OWNER}/{REPO}/actions/runs/{run_id}/jobs?per_page=100"
    payload = api_get(endpoint)
    return payload["jobs"]
153+
154+
155+
def tz_abbrev(dt):
    """Return a short timezone name for ``dt``.

    Names of at most five characters (e.g. 'PDT') are returned as-is; longer
    ones such as 'Pacific Daylight Time' are reduced to the initials of their
    whitespace-separated words.
    """
    zone_name = dt.strftime("%Z")
    if len(zone_name) > 5:
        return "".join(word[0] for word in zone_name.split())
    return zone_name
161+
162+
163+
def format_time(iso_str):
    """Render an ISO timestamp as e.g. '12:40 PM PDT / 7:40 PM UTC'.

    The first half is the timestamp converted to the machine's local timezone;
    the second half uses the clock fields of the timestamp exactly as parsed
    (a trailing 'Z' is read as +00:00) and labels them UTC.
    """
    parsed = datetime.fromisoformat(iso_str.replace("Z", "+00:00"))
    localized = parsed.astimezone()

    def twelve_hour(moment):
        # 0 and 12 o'clock both display as "12" on a 12-hour clock.
        meridiem = "PM" if moment.hour >= 12 else "AM"
        return f"{moment.hour % 12 or 12}:{moment.minute:02d} {meridiem}"

    local_part = f"{twelve_hour(localized)} {tz_abbrev(localized)}"
    utc_part = f"{twelve_hour(parsed)} UTC"
    return f"{local_part} / {utc_part}"
174+
175+
176+
def _short_job_name(job_name):
    """Return the last comma-separated field of a job name without a ") / build" tail."""
    short = job_name.split(",")[-1].strip()
    suffix = " / build"
    # Remove the suffix as a whole string. str.rstrip(") / build") treats its
    # argument as a *set of characters* and would also eat the trailing
    # letters of names such as "amd" (-> "am") or "intel" (-> "inte").
    if short.endswith(suffix):
        short = short[: -len(suffix)]
    return short.rstrip(")").rstrip()


def _clock_12h(moment):
    """Format a datetime as a 12-hour clock string such as '7:05 PM'."""
    # Done by hand because the "%#I" strftime flag (no zero padding) is only
    # understood on Windows.
    hour = moment.hour % 12 or 12
    meridiem = "AM" if moment.hour < 12 else "PM"
    return f"{hour}:{moment.minute:02d} {meridiem}"


def print_vendor_table(vendor, runs, jobs_cache, runners_cache):
    """Print the status table for a single vendor.

    Args:
        vendor: Vendor key (e.g. "amd"); drives the runner label, the output
            color, and the runner-name match.
        runs: Workflow-run dicts to inspect, in the order they are displayed.
        jobs_cache: Dict mapping run id -> list of job dicts. Runs missing
            from it are fetched with get_jobs() and stored.
        runners_cache: Dict mapping runner label -> result of get_runners().
            The vendor's label is looked up and stored if missing.

    Returns:
        True if at least one run has in-progress or queued jobs for the
        vendor (a table was printed), False otherwise.
    """
    label = runner_label(vendor)

    # Fetch and cache runners for this label
    if label not in runners_cache:
        runners_cache[label] = get_runners(label)
    runners = runners_cache[label]
    if runners is not None:
        online = sum(1 for r in runners if r.get("status") == "online")
        runner_info = f", {online}/{len(runners)} online"
    else:
        # Runner lookup failed; leave the online count out of the header.
        runner_info = ""

    rows = []
    active_details = []

    for run in runs:
        run_id = run["id"]
        if run_id not in jobs_cache:
            jobs_cache[run_id] = get_jobs(run_id)
        jobs = jobs_cache[run_id]

        # Match by label OR by runner name containing the vendor (covers
        # workflow_dispatch runs that didn't pin a vendor SKU label).
        vendor_jobs = [
            j
            for j in jobs
            if label in j.get("labels", [])
            or vendor.lower() in (j.get("runner_name") or "").lower()
        ]
        if not vendor_jobs:
            continue

        title = run["display_title"]
        # A null workflow name would break len() and column formatting below.
        workflow = run["name"] or ""
        created = format_time(run["created_at"])

        done = sum(1 for j in vendor_jobs if j["status"] == "completed")
        active = [j for j in vendor_jobs if j["status"] == "in_progress"]
        queued = sum(1 for j in vendor_jobs if j["status"] == "queued")

        # Skip runs where all jobs are done (nothing active or queued)
        if not active and queued == 0:
            continue

        if run["event"] == "schedule":
            prefix = "[Scheduled]"
        elif run["event"] == "pull_request":
            prefix = "[PR]"
        else:
            prefix = f"[{run['event']}]"
        run_label = f"{prefix} {title} ({created})"
        rows.append((run_label, workflow, done, len(active), queued))

        for j in active:
            short = _short_job_name(j["name"])
            # The key can be present with a null value, so `.get(key, "?")`
            # alone would not supply the placeholder.
            active_details.append((title, short, j.get("runner_name") or "?"))

    header_text = f"=== {vendor.upper()} (runner: {label}{runner_info}) ==="
    print(colorize(vendor, header_text))
    print()

    if not rows:
        print(f"No runs with {vendor} jobs found.\n")
        return False

    now_local = datetime.now().astimezone()
    local_str = f"{_clock_12h(now_local)} {tz_abbrev(now_local)}"
    utc_str = f"{_clock_12h(now_local.astimezone(timezone.utc))} UTC"
    timestamp = f"as of {local_str} / {utc_str}"

    # Column widths: wide enough for both the header cell and every row.
    run_col_header = f"Run ({timestamp})"
    col1_w = max(max(len(r[0]) for r in rows), len(run_col_header))
    col2_w = max(max(len(r[1]) for r in rows), len("Workflow"))
    header = (
        f"{run_col_header:<{col1_w}}  {'Workflow':<{col2_w}}"
        f"  {'Done':>6}  {'Active':>6}  {'Queued':>6}"
    )
    sep = "=" * len(header)

    print(colorize(vendor, sep))
    print(header)
    print(colorize(vendor, sep))
    for run_label, workflow, done, active, queued in rows:
        # Non-zero active counts are highlighted with asterisks.
        active_str = str(active) if active == 0 else f"*{active}*"
        print(
            f"{run_label:<{col1_w}}  {workflow:<{col2_w}}"
            f"  {done:>6}  {active_str:>6}  {queued:>6}"
        )
    print(colorize(vendor, sep))

    total_done = sum(r[2] for r in rows)
    total_active = sum(r[3] for r in rows)
    total_queued = sum(r[4] for r in rows)
    print(
        f"{'TOTAL':<{col1_w}}  {'':<{col2_w}}"
        f"  {total_done:>6}  {total_active:>6}  {total_queued:>6}"
    )
    print()

    if active_details:
        print(colorize(vendor, f"Currently running on {vendor}:"))
        for title, job, runner_name in active_details:
            print(f"  -> {job}  (runner: {runner_name}, run: {title})")
    else:
        print(f"No {vendor} jobs currently running.")
    print()
    return True
286+
287+
288+
def main():
    """Parse the optional vendor argument and print one table per vendor.

    With no argument every vendor in VALID_VENDORS is reported. An unknown
    vendor prints a usage hint and exits with status 1.
    """
    cli_args = sys.argv[1:2]
    if cli_args:
        vendor = cli_args[0].lower()
        if vendor not in VALID_VENDORS:
            print(f"Unknown vendor '{vendor}'. Choose from: {', '.join(VALID_VENDORS)}")
            print(f"Usage: python {os.path.basename(__file__)} [vendor]")
            sys.exit(1)
        vendors = [vendor]
    else:
        vendors = list(VALID_VENDORS)

    runs = get_runs(vendors)
    if not runs:
        print("No queued, in-progress, or recently completed runs found.")
        return

    # Oldest runs first.
    ordered_runs = sorted(runs, key=lambda run: run["created_at"])

    # Fetch all jobs in parallel upfront so the per-vendor tables reuse them.
    jobs_cache = {}
    prefetch_jobs(ordered_runs, jobs_cache)

    runners_cache = {}
    for current_vendor in vendors:
        print_vendor_table(current_vendor, ordered_runs, jobs_cache, runners_cache)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)