Skip to content

Commit 7ea6b5c

Browse files
authored
misc: Phoenix runner management scripts (#1314)
1 parent b2468f1 commit 7ea6b5c

19 files changed

Lines changed: 1388 additions & 110 deletions

misc/runners/common/README.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Common Runner Management Scripts
2+
3+
Site-agnostic scripts shared between the Frontier and Phoenix runner setups.
4+
All shared logic lives here; site directories contain only site-specific files
5+
(`config.sh` and scripts unique to that cluster).
6+
7+
Scripts are invoked via the dispatcher at `misc/runners/runner.sh`:
8+
```bash
9+
bash misc/runners/runner.sh <site> <command> [args...]
10+
```
11+
12+
## Scripts
13+
14+
| Script | Purpose |
15+
|---|---|
16+
| `runner-lib.sh` | Shared library: GitHub API helpers, EXE-based process discovery, parallel node sweep, start/stop primitives. Sourced by site `config.sh` files. |
17+
| `check-runners.sh` | Per-node health check: Runner.Listener processes with name, idle/BUSY, slurm PATH, RSS. Optional cgroup memory footer. |
18+
| `list-runners.sh` | Full table: GitHub API status × parallel node sweep. Shows slurm status, flags stale `runner.node`. |
19+
| `rebalance-runners.sh` | Compute optimal distribution and move runners across nodes. Handles offline runners. Writes `runner.node`. Dry run by default. |
20+
| `restart-runner.sh` | Stop and restart one runner on a given node. Verifies slurm in PATH. Writes `runner.node`. |
21+
| `restart-all.sh` | Restart all runners in place. Skips busy unless `FORCE=1`. Dry run by default. |
22+
| `move-runner.sh` | Move a runner to a different login node by name. Stops on current node, starts on target. Writes `runner.node`. |
23+
| `stop-runner.sh` | Stop a runner process and remove its GitHub registration. |
24+
| `rerun-failed.sh` | Rerun failed GitHub Actions workflows on open non-draft PRs and master. Dry run by default. |
25+
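| `create-runner.sh` | Download, register, and start a new runner. Requires `runner_install_dir()` from site config; `TARBALL_CACHE_DIR` is optional (when set, the runner tarball is cached there and reused across installs). Usage: `bash misc/runners/runner.sh <site> create-runner <name> <node> [install-dir]` |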
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#!/usr/bin/env bash
# Check runner health across all login nodes.
#
# Sourced by site wrappers (frontier/check-runners.sh, phoenix/check-runners.sh)
# after config.sh is loaded. Shows Runner.Listener processes per node with
# name, busy/idle status, slurm availability, and RSS memory.
# If CGROUP_LIMIT > 0, also shows per-node total memory vs the cgroup limit.
#
# Reads from the environment set up by the caller:
#   NODES         array of login nodes to visit
#   SSH_OPTS      ssh options as one string (word-split on purpose)
#   CGROUP_LIMIT  optional; per-node memory limit in MB (0/unset disables)
# If a sync_runner_nodes function is defined, it is called first.
#
# Usage: bash check-runners.sh
set -euo pipefail

if declare -f sync_runner_nodes > /dev/null 2>&1; then
    echo "==> Syncing runner node locations..."
    sync_runner_nodes
fi

for node in "${NODES[@]}"; do
    echo "=== $node ==="
    # The remote script is single-quoted, so every expansion in it happens on
    # the node. Two details matter for correctness:
    #   * The script must finish with status 0 whenever the node answered,
    #     because any non-zero status from ssh is reported as "(unreachable)".
    #     The closing "no runners" check is therefore an `if`, not `[ ] && echo`
    #     (which returns 1 whenever runners WERE found).
    #   * `grep -c` prints a count AND exits 1 on zero matches, so it must not
    #     be combined with `|| echo 0` (that would yield two lines, "0\n0").
    # The `ps | grep | grep -v grep` pipelines are kept as-is: the remote
    # shell's own command line contains the word "grep", so `grep -v grep`
    # also filters out the shell that is running this script.
    # shellcheck disable=SC2086  # SSH_OPTS is intentionally word-split
    ssh $SSH_OPTS "$node" '
        found=0
        for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do
            found=1
            exe=$(readlink -f "/proc/$p/exe" 2>/dev/null || echo "???")
            dir=$(dirname "$(dirname "$exe")" 2>/dev/null || echo "???")
            name=$(basename "$dir")
            worker=$(ps aux | grep "Runner.Worker" | grep "$dir" | grep -v grep | awk "{print \$2}" | head -1)
            if [ -n "$worker" ]; then status="BUSY"; else status="idle"; fi
            rss=$(ps -p "$p" -o rss= 2>/dev/null | awk "{printf \"%.0f\", \$1/1024}")
            [ -n "$rss" ] || rss="?"
            slurm=$(tr "\0" "\n" < "/proc/$p/environ" 2>/dev/null | grep -c "^PATH=.*slurm")
            if [ "${slurm:-0}" -gt 0 ]; then slurm_ok="ok"; else slurm_ok="MISSING"; fi
            printf " %-30s %5s slurm=%-7s %s MB\n" "$name" "$status" "$slurm_ok" "$rss"
        done
        if [ "$found" -eq 0 ]; then
            echo " (no runners)"
        fi
    ' 2>/dev/null || echo " (unreachable)"

    if [ "${CGROUP_LIMIT:-0}" -gt 0 ]; then
        # Sum the RSS of every process owned by the remote user, in MB.
        # shellcheck disable=SC2086  # SSH_OPTS is intentionally word-split
        rss=$(ssh $SSH_OPTS "$node" \
            "ps -u \$(whoami) -o rss= 2>/dev/null | awk '{sum+=\$1} END {printf \"%.0f\", sum/1024}'" \
            2>/dev/null || echo "?")
        # Anything non-numeric (e.g. "?" from an unreachable node) counts as 0
        # so the arithmetic below cannot fail.
        [[ "$rss" =~ ^[0-9]+$ ]] || rss=0
        echo " --- Total: ${rss} MB / ${CGROUP_LIMIT} MB ($(( CGROUP_LIMIT - rss )) MB free) ---"
    fi
    echo ""
done
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
#!/usr/bin/env bash
# Create, register, and start a GitHub Actions runner.
#
# Sourced by misc/runners/runner.sh after config is loaded.
# Config must define runner_install_dir() and may set TARBALL_CACHE_DIR.
#
# runner_install_dir <name> [override-dir]
#   Returns the directory where the runner should be installed.
#   If override-dir is given it is used directly; otherwise the site
#   computes the path (e.g. SHARED_DIR/<name> on Frontier, or an
#   auto-numbered actions-runner-N/ directory on Phoenix).
#
# TARBALL_CACHE_DIR
#   If non-empty, the runner tarball is cached here and reused across
#   installs (useful on Frontier where shared Lustre is visible from all
#   login nodes). If empty or unset, a fresh download is made for each
#   runner and the temporary file is removed after extraction.
#
# RUNNER_VERSION
#   Optional override. When unset/empty, gh_latest_runner_version is asked;
#   if that fails or prints nothing, 2.332.0 is used.
#
# Failure handling: the script refuses to run if the install directory already
# exists, so a failed download or extraction removes the directory it created.
# Otherwise a retry would be blocked by its own leftovers.
#
# Usage: runner.sh <site> create-runner <name> <node> [install-dir]
#   name         Runner name (e.g. frontier-23, phoenix-11)
#   node         Login node to start the runner on
#   install-dir  Optional: override the computed installation directory
set -euo pipefail

RUNNER_NAME="${1:?Usage: create-runner <name> <node> [install-dir]}"
TARGET_NODE="${2:?Usage: create-runner <name> <node> [install-dir]}"
INSTALL_DIR_OVERRIDE="${3:-}"

RUNNER_DIR=$(runner_install_dir "$RUNNER_NAME" "$INSTALL_DIR_OVERRIDE")

# Resolve the version in separate steps so that a lookup that fails OR
# succeeds with empty output both fall back to the pinned default.
if [ -z "${RUNNER_VERSION:-}" ]; then
    RUNNER_VERSION=$(gh_latest_runner_version 2>/dev/null) || RUNNER_VERSION=""
    RUNNER_VERSION="${RUNNER_VERSION:-2.332.0}"
fi
TARBALL="actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz"
TARBALL_URL="https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/${TARBALL}"

echo "=== Creating runner ==="
echo " Name: $RUNNER_NAME"
echo " Node: $TARGET_NODE"
echo " Directory: $RUNNER_DIR"
echo " Org: $ORG"
echo " Group: $RUNNER_GROUP"
echo " Label: $RUNNER_LABEL"
echo " Version: $RUNNER_VERSION"
echo ""

if [ -d "$RUNNER_DIR" ]; then
    echo "ERROR: Directory already exists: $RUNNER_DIR" >&2
    exit 1
fi

# --- Download tarball ---
if [ -n "${TARBALL_CACHE_DIR:-}" ]; then
    tarball_path="$TARBALL_CACHE_DIR/$TARBALL"
    if [ ! -f "$tarball_path" ]; then
        echo "==> Downloading runner v${RUNNER_VERSION} to cache..."
        mkdir -p "$TARBALL_CACHE_DIR"
        # Download under a per-process temp name and rename afterwards, so the
        # cache never holds a partially written file under its final name.
        tmp="$tarball_path.tmp.$$"
        if ! curl -fsSL "$TARBALL_URL" -o "$tmp"; then
            rm -f -- "$tmp"
            echo "ERROR: Download failed: $TARBALL_URL" >&2
            exit 1
        fi
        mv "$tmp" "$tarball_path"
    fi
else
    echo "==> Downloading runner v${RUNNER_VERSION}..."
    mkdir -p "$RUNNER_DIR"
    tarball_path="$RUNNER_DIR/runner-download.tmp.$$"
    if ! curl -fsSL "$TARBALL_URL" -o "$tarball_path"; then
        # The directory did not exist before this run (checked above), so it
        # is safe to remove it wholesale.
        rm -rf -- "${RUNNER_DIR:?}"
        echo "ERROR: Download failed: $TARBALL_URL" >&2
        exit 1
    fi
fi

# --- Extract ---
mkdir -p "$RUNNER_DIR"
echo "==> Extracting into $RUNNER_DIR..."
if ! tar xzf "$tarball_path" -C "$RUNNER_DIR"; then
    rm -rf -- "${RUNNER_DIR:?}"
    echo "ERROR: Extraction failed — could not unpack $tarball_path" >&2
    exit 1
fi
if [ -z "${TARBALL_CACHE_DIR:-}" ]; then
    rm -f -- "$tarball_path"
fi

if [ ! -f "$RUNNER_DIR/run.sh" ]; then
    echo "ERROR: Extraction failed — run.sh not found in $RUNNER_DIR" >&2
    rm -rf -- "${RUNNER_DIR:?}"
    exit 1
fi

# --- Register ---
echo "==> Fetching registration token..."
# `|| token=""` keeps `set -e` from aborting here, so a failed lookup reaches
# the explanatory error below instead of exiting silently.
token=$(gh_registration_token) || token=""
if [ -z "$token" ]; then
    echo "ERROR: Failed to get registration token." >&2
    echo " Run: gh auth refresh -h github.com -s admin:org" >&2
    exit 1
fi

echo "==> Configuring runner..."
"$RUNNER_DIR/config.sh" \
    --url "https://github.com/$ORG" \
    --token "$token" \
    --name "$RUNNER_NAME" \
    --runnergroup "$RUNNER_GROUP" \
    --labels "$RUNNER_LABEL" \
    --work "_work" \
    --unattended \
    --replace
echo "==> Configured."

# --- Start ---
echo "==> Starting on $TARGET_NODE..."
if start_runner "$TARGET_NODE" "$RUNNER_DIR"; then
    # Record where the runner lives so other scripts can locate it.
    echo "$TARGET_NODE" > "$RUNNER_DIR/runner.node"
    pids=$(find_pids "$TARGET_NODE" "$RUNNER_DIR")
    # Keep only the first entry of the space-separated PID list.
    pid=${pids%% *}
    if has_slurm "$TARGET_NODE" "$pid"; then
        echo "==> OK: $RUNNER_NAME running on $TARGET_NODE (PID $pid, slurm in PATH)"
    else
        echo "==> WARNING: $RUNNER_NAME running on $TARGET_NODE (PID $pid) but slurm MISSING from PATH"
    fi
else
    echo "ERROR: $RUNNER_NAME did not start on $TARGET_NODE" >&2
    exit 1
fi

echo ""
echo "==> Log: $RUNNER_DIR/runner.log"
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#!/usr/bin/env bash
# Print one table row per known runner, joining what the GitHub API reports
# with what is actually running on the login nodes.
#
# Sourced by site wrappers (frontier/list-runners.sh, phoenix/list-runners.sh)
# once config.sh has been loaded. Node data comes from a single parallel SSH
# sweep (one connection per node, independent of the number of runners).
# Columns: name, GitHub status, node, slurm availability, RSS.
# When CGROUP_LIMIT > 0 a per-node memory summary is appended.
#
# Usage: bash list-runners.sh
set -euo pipefail

if declare -f sync_runner_nodes > /dev/null 2>&1; then
    echo "==> Syncing runner node locations..."
    sync_runner_nodes
fi

tmpdir=$(mktemp -d)
trap 'rm -rf "$tmpdir"' EXIT

sweep_all_nodes "$tmpdir"

# Index the sweep output by runner directory. Each relevant line has the form
#   RUNNER <node> <dir> <rss> <slurm>
declare -A live_node live_rss live_slurm
for node in "${NODES[@]}"; do
    while read -r _tag host rdir mem slurm_state; do
        live_node["$rdir"]="$host"
        live_rss["$rdir"]="$mem"
        live_slurm["$rdir"]="$slurm_state"
    done < <(grep '^RUNNER ' "$tmpdir/$node.out" 2>/dev/null || true)
done

# Index the GitHub API listing by runner name.
declare -A api_status api_busy
while read -r _id name status busy; do
    api_status["$name"]="$status"
    api_busy["$name"]="$busy"
done < <(gh_list_runners)

# Header row followed by a 70-character rule.
printf "%-25s %-8s %-20s %-8s %s\n" "NAME" "GITHUB" "NODE" "SLURM" "RSS"
printf -v rule '%.0s-' {1..70}
printf "%s\n" "$rule"

while IFS= read -r dir; do
    name=$(get_runner_name "$dir")
    if [ -z "$name" ]; then
        continue
    fi

    # A busy runner is shown as BUSY; otherwise show the API status verbatim.
    if [ "${api_busy[$name]:-false}" = "true" ]; then
        gh_col="BUSY"
    else
        gh_col="${api_status[$name]:-unknown}"
    fi

    actual_node="${live_node[$dir]:-}"
    rss="${live_rss[$dir]:-—}"
    slurm="${live_slurm[$dir]:-—}"

    # No live process found on any node during the sweep.
    if [ -z "$actual_node" ]; then
        printf "%-25s %-8s %-20s %-8s %s\n" "$name" "$gh_col" "offline" "" ""
        continue
    fi

    # Mark the row when runner.node disagrees with where the process was seen.
    node_col="$actual_node"
    if [ -f "$dir/runner.node" ]; then
        recorded=$(cat "$dir/runner.node")
        if [ "$actual_node" != "$recorded" ]; then
            node_col="${actual_node} *(stale: ${recorded})"
        fi
    fi

    printf "%-25s %-8s %-20s %-8s %sMB\n" "$name" "$gh_col" "$node_col" "$slurm" "$rss"
done < <(find_runner_dirs)

# Per-node memory summary (only when the site defines a cgroup limit)
if [ "${CGROUP_LIMIT:-0}" -gt 0 ]; then
    echo ""
    echo "=== Per-node memory ==="
    for node in "${NODES[@]}"; do
        # shellcheck disable=SC2086  # SSH_OPTS is intentionally word-split
        count=$(ssh $SSH_OPTS "$node" \
            "ps aux | grep Runner.Listener | grep -v grep | wc -l" 2>/dev/null || echo 0)
        # shellcheck disable=SC2086  # SSH_OPTS is intentionally word-split
        rss=$(ssh $SSH_OPTS "$node" \
            "ps -u \$(whoami) -o rss= 2>/dev/null | awk '{sum+=\$1} END {printf \"%.0f\", sum/1024}'" \
            2>/dev/null || echo "?")
        # Non-numeric results are treated as 0 so the subtraction cannot fail.
        if ! [[ "$rss" =~ ^[0-9]+$ ]]; then
            rss=0
        fi
        echo " $node: $count runners, ${rss} MB / ${CGROUP_LIMIT} MB ($(( CGROUP_LIMIT - rss )) MB free)"
    done
fi

misc/runners/common/move-runner.sh

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#!/usr/bin/env bash
# Relocate a runner from its current login node to another one.
#
# Sourced by site wrappers (frontier/move-runner.sh, phoenix/move-runner.sh)
# once config.sh has been loaded. The runner is looked up by name, stopped on
# the node it currently occupies (if any), and started on the target node.
# A failed start is retried exactly once after a 5 second pause.
#
# Usage: bash move-runner.sh <runner-name> <target-node>
set -euo pipefail

RUNNER_NAME="${1:?Usage: $0 <runner-name> <target-node>}"
TARGET_NODE="${2:?Usage: $0 <runner-name> <target-node>}"

# The target must be one of the configured login nodes.
valid=0
for node in "${NODES[@]}"; do
    if [ "$node" = "$TARGET_NODE" ]; then
        valid=1
        break
    fi
done
if [ "$valid" -eq 0 ]; then
    echo "ERROR: '$TARGET_NODE' is not a valid login node." >&2
    echo " Valid nodes: ${NODES[*]}" >&2
    exit 1
fi

# Resolve the runner name to its directory; the first match wins.
runner_dir=""
while IFS= read -r dir; do
    candidate_name=$(get_runner_name "$dir")
    if [ "$candidate_name" = "$RUNNER_NAME" ]; then
        runner_dir="$dir"
        break
    fi
done < <(find_runner_dirs)

if [ -z "$runner_dir" ]; then
    echo "ERROR: Runner '$RUNNER_NAME' not found in known runner directories." >&2
    exit 1
fi

if declare -f sync_runner_nodes > /dev/null 2>&1; then
    echo "==> Syncing runner node locations..."
    sync_runner_nodes
fi

echo "==> Locating $RUNNER_NAME..."
current_node=$(find_node "$runner_dir")

if [ "$current_node" = "$TARGET_NODE" ]; then
    echo "==> $RUNNER_NAME is already running on $TARGET_NODE. Nothing to do."
    exit 0
fi

# find_node reports "offline" when there is nothing to stop.
if [ "$current_node" != "offline" ]; then
    echo "==> Stopping $RUNNER_NAME on $current_node..."
    stop_runner "$current_node" "$runner_dir"
fi

echo "==> Starting $RUNNER_NAME on $TARGET_NODE..."
started=0
if start_runner "$TARGET_NODE" "$runner_dir"; then
    started=1
else
    echo " First start attempt failed. Retrying in 5 seconds..."
    sleep 5
    if start_runner "$TARGET_NODE" "$runner_dir"; then
        started=1
    fi
fi

if [ "$started" -eq 1 ]; then
    # Record the new location only once the runner is actually up.
    echo "$TARGET_NODE" > "$runner_dir/runner.node"
    echo "==> $RUNNER_NAME is now running on $TARGET_NODE."
else
    echo "ERROR: $RUNNER_NAME failed to start on $TARGET_NODE after retry." >&2
    exit 1
fi

0 commit comments

Comments
 (0)