Skip to content

Commit caa669f

Browse files
fix: scalable process lifecycle via PGID-based cleanup (#180)
1 parent 264d718 commit caa669f

2 files changed

Lines changed: 60 additions & 92 deletions

File tree

pi/skills/control-agent/startup-pi.sh

Lines changed: 23 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,14 @@
1212
# Stale .alias symlinks pointing to removed sockets also get cleaned.
1313
# Then starts the slack-bridge process with the current control-agent UUID.
1414
#
15-
# This script is the SOLE owner of the bridge lifecycle. start.sh only does
16-
# pre-cleanup (kill stale processes, release port) — it never launches the bridge.
15+
# Process lifecycle is managed via process groups (see runtime/start.sh).
16+
# When start.sh kills the old control-agent PGID, all spawned services
17+
# (bridge, workers, etc.) are automatically terminated. This script only needs
18+
# to launch new services; cleanup is handled by the process group mechanism.
1719

1820
set -euo pipefail
1921

20-
# Prevent varlock SEA binary from misinterpreting argv when called from a
21-
# session that was itself launched via varlock (PKG_EXECPATH leaks into child
22-
# processes and causes `varlock run` to treat subcommands as Node module paths).
22+
# Prevent varlock SEA binary from misinterpreting argv
2323
unset PKG_EXECPATH 2>/dev/null || true
2424

2525
RUNTIME_NODE_HELPER="$HOME/runtime/bin/lib/runtime-node.sh"
@@ -71,7 +71,7 @@ echo "Cleaned $cleaned stale socket(s)."
7171

7272
# Restart Slack bridge with current control-agent UUID
7373
echo ""
74-
echo "=== Slack Bridge Restart ==="
74+
echo "=== Slack Bridge Startup ==="
7575

7676
# Find control-agent UUID from alias
7777
CONTROL_ALIAS="$SOCKET_DIR/control-agent.alias"
@@ -86,56 +86,10 @@ fi
8686
BRIDGE_LOG_DIR="$HOME/.pi/agent/logs"
8787
BRIDGE_LOG_FILE="$BRIDGE_LOG_DIR/slack-bridge.log"
8888
BRIDGE_DIR="/opt/baudbot/current/slack-bridge"
89-
BRIDGE_TMUX_SESSION="slack-bridge"
89+
BRIDGE_TMUX_SESSION="baudbot-slack-bridge"
9090

9191
mkdir -p "$BRIDGE_LOG_DIR"
9292

93-
# --- Kill anything holding port 7890, any existing bridge tmux session,
94-
# and any leftover old-style PID-file supervisor.
95-
echo "Cleaning up old bridge..."
96-
97-
# Kill the tmux session first — this stops the restart loop from respawning
98-
# the bridge while we're trying to clean up the port.
99-
tmux kill-session -t "$BRIDGE_TMUX_SESSION" 2>/dev/null || true
100-
101-
# Kill ALL bridge processes (broker-bridge.mjs and bridge.mjs) to prevent
102-
# orphaned processes from holding port 7890 after control-agent restarts.
103-
# This is more aggressive than just killing port holders, but prevents the
104-
# common failure mode where a bridge process survives tmux session cleanup
105-
# (e.g., detached, zombied, or in a different session tree).
106-
BRIDGE_PIDS=$(pgrep -f 'node (broker-)?bridge\.mjs' 2>/dev/null || true)
107-
if [ -n "$BRIDGE_PIDS" ]; then
108-
echo "Killing all bridge processes (SIGTERM): $BRIDGE_PIDS"
109-
echo "$BRIDGE_PIDS" | xargs kill 2>/dev/null || true
110-
# Wait up to 3s for graceful shutdown
111-
for i in 1 2 3; do
112-
sleep 1
113-
BRIDGE_PIDS=$(pgrep -f 'node (broker-)?bridge\.mjs' 2>/dev/null || true)
114-
[ -z "$BRIDGE_PIDS" ] && break
115-
done
116-
# Force-kill anything that didn't exit
117-
if [ -n "$BRIDGE_PIDS" ]; then
118-
echo "Force-killing stubborn bridge processes: $BRIDGE_PIDS"
119-
echo "$BRIDGE_PIDS" | xargs kill -9 2>/dev/null || true
120-
sleep 1
121-
fi
122-
fi
123-
124-
# Final safety check: kill anything still on port 7890
125-
PORT_PIDS=$(lsof -ti :7890 2>/dev/null || true)
126-
if [ -n "$PORT_PIDS" ]; then
127-
echo "Force-killing remaining processes on port 7890: $PORT_PIDS"
128-
echo "$PORT_PIDS" | xargs kill -9 2>/dev/null || true
129-
sleep 1
130-
fi
131-
132-
OLD_PID_FILE="$HOME/.pi/agent/slack-bridge.pid"
133-
if [ -f "$OLD_PID_FILE" ]; then
134-
OLD_PID="$(cat "$OLD_PID_FILE" 2>/dev/null || true)"
135-
[ -n "$OLD_PID" ] && kill "$OLD_PID" 2>/dev/null || true
136-
rm -f "$OLD_PID_FILE"
137-
fi
138-
13993
# --- Detect bridge mode ---
14094
BRIDGE_SCRIPT=""
14195
if [ -f "$BRIDGE_DIR/broker-bridge.mjs" ] && varlock run --path "$HOME/.config/" -- sh -c '
@@ -156,7 +110,7 @@ fi
156110
if [ -z "$BRIDGE_SCRIPT" ]; then
157111
echo "No Slack transport configured (missing broker keys and socket tokens); skipping bridge startup."
158112
echo ""
159-
echo "=== Cleanup Complete ==="
113+
echo "=== Startup Complete ==="
160114
exit 0
161115
fi
162116

@@ -167,8 +121,22 @@ fi
167121
# - Tracks consecutive fast failures (<60s runtime) and gives up after 10
168122
# - Backs off: 5s base + 2s per failure, capped at 60s
169123
# - Kills port holders before retrying (avoids EADDRINUSE spin)
124+
#
125+
# Note: tmux creates its own session (PGID), so it's not killed by process group
126+
# termination. We need to explicitly kill old sessions before creating new ones.
170127
MAX_CONSECUTIVE_FAILURES=10
171128

129+
# Kill all agent tmux sessions (prefix: baudbot-*)
130+
# Tmux sessions create their own PGID, so they survive process group cleanup.
131+
# Using a naming convention allows us to kill all agent sessions without
132+
# tracking individual session names.
133+
AGENT_SESSIONS=$(tmux list-sessions -F '#{session_name}' 2>/dev/null | grep '^baudbot-' || true)
134+
if [ -n "$AGENT_SESSIONS" ]; then
135+
echo "Killing agent tmux sessions: $AGENT_SESSIONS"
136+
echo "$AGENT_SESSIONS" | xargs -r -I{} tmux kill-session -t {} 2>/dev/null || true
137+
sleep 1
138+
fi
139+
172140
echo "Starting slack-bridge ($BRIDGE_SCRIPT) via tmux..."
173141
NODE_BIN_DIR="${NODE_BIN_DIR:-$HOME/opt/node/bin}"
174142
if command -v bb_resolve_runtime_node_bin_dir >/dev/null 2>&1; then
@@ -228,4 +196,4 @@ else
228196
fi
229197

230198
echo ""
231-
echo "=== Cleanup Complete ==="
199+
echo "=== Startup Complete ==="

start.sh

Lines changed: 37 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@ set -euo pipefail
1414
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
1515
# shellcheck source=bin/lib/runtime-node.sh
1616
source "$SCRIPT_DIR/bin/lib/runtime-node.sh"
17-
# bridge-restart-policy.sh no longer needed — bridge is started by
18-
# startup-pi.sh, not start.sh (see PR #164)
1917
cd ~
2018

2119
NODE_BIN_DIR="$(bb_resolve_runtime_node_bin_dir "$HOME")"
@@ -24,7 +22,6 @@ NODE_BIN_DIR="$(bb_resolve_runtime_node_bin_dir "$HOME")"
2422
export PATH="$HOME/.varlock/bin:$NODE_BIN_DIR:$PATH"
2523

2624
# Work around varlock telemetry config crash by opting out at runtime.
27-
# This avoids loading anonymousId from user config and keeps startup deterministic.
2825
export VARLOCK_TELEMETRY_DISABLED=1
2926

3027
# Validate and load secrets via varlock
@@ -33,7 +30,7 @@ varlock load --path ~/.config/ || {
3330
exit 1
3431
}
3532
set -a
36-
# shellcheck disable=SC1090 # path is dynamic (agent home)
33+
# shellcheck disable=SC1090
3734
source ~/.config/.env
3835
set +a
3936

@@ -48,7 +45,6 @@ umask 077
4845
~/runtime/bin/redact-logs.sh 2>/dev/null || true
4946

5047
# Verify deployed runtime integrity against deploy manifest.
51-
# Modes: off | warn | strict (default: warn)
5248
INTEGRITY_MODE="${BAUDBOT_STARTUP_INTEGRITY_MODE:-warn}"
5349
if [ -x "$HOME/runtime/bin/verify-manifest.sh" ]; then
5450
if ! BAUDBOT_STARTUP_INTEGRITY_MODE="$INTEGRITY_MODE" "$HOME/runtime/bin/verify-manifest.sh"; then
@@ -66,15 +62,13 @@ if [ -d "$SOCKET_DIR" ]; then
6662
if command -v fuser &>/dev/null; then
6763
for sock in "$SOCKET_DIR"/*.sock; do
6864
[ -e "$sock" ] || continue
69-
# If no process has the socket open, it's stale
7065
if ! fuser "$sock" &>/dev/null 2>&1; then
7166
rm -f "$sock"
7267
fi
7368
done
7469
else
7570
echo " fuser not found, skipping socket cleanup (install psmisc)"
7671
fi
77-
# Clean broken alias symlinks
7872
for alias in "$SOCKET_DIR"/*.alias; do
7973
[ -L "$alias" ] || continue
8074
target=$(readlink "$alias")
@@ -84,35 +78,33 @@ if [ -d "$SOCKET_DIR" ]; then
8478
done
8579
fi
8680

87-
# ── Slack bridge cleanup (bridge is started by startup-pi.sh) ──
88-
# The bridge needs the control-agent's session UUID (PI_SESSION_ID) to deliver
89-
# messages to the correct socket. That UUID isn't known until pi starts and
90-
# registers its socket. So we DON'T start the bridge here — the control-agent's
91-
# startup-pi.sh handles it after the session is live.
92-
#
93-
# We DO kill any stale bridge processes from previous runs to avoid port
94-
# conflicts when startup-pi.sh launches a fresh one.
95-
BRIDGE_PID_FILE="$HOME/.pi/agent/slack-bridge.pid"
96-
if [ -f "$BRIDGE_PID_FILE" ]; then
97-
old_pid="$(cat "$BRIDGE_PID_FILE" 2>/dev/null || true)"
98-
if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then
99-
echo "Stopping stale bridge supervisor (PID $old_pid)..."
100-
kill "$old_pid" 2>/dev/null || true
101-
sleep 1
102-
kill -9 "$old_pid" 2>/dev/null || true
81+
# ── Process Group Management ──
82+
# Kill old control-agent process group to ensure clean slate.
83+
# This automatically terminates all spawned services (bridge, workers, etc.)
84+
# without needing to track individual PIDs or process names.
85+
CONTROL_PGID_FILE="$HOME/.pi/agent/control-agent.pgid"
86+
87+
if [ -f "$CONTROL_PGID_FILE" ]; then
88+
OLD_PGID=$(cat "$CONTROL_PGID_FILE" 2>/dev/null || echo "")
89+
if [ -n "$OLD_PGID" ] && kill -0 -"$OLD_PGID" 2>/dev/null; then
90+
echo "Terminating old control-agent process group (PGID $OLD_PGID)..."
91+
kill -TERM -"$OLD_PGID" 2>/dev/null || true
92+
# Wait up to 5s for graceful shutdown
93+
for _i in 1 2 3 4 5; do
94+
if ! kill -0 -"$OLD_PGID" 2>/dev/null; then
95+
echo " Process group terminated cleanly"
96+
break
97+
fi
98+
sleep 1
99+
done
100+
# Force-kill any survivors
101+
if kill -0 -"$OLD_PGID" 2>/dev/null; then
102+
echo " Force-killing stubborn processes in group $OLD_PGID..."
103+
kill -KILL -"$OLD_PGID" 2>/dev/null || true
104+
sleep 1
105+
fi
103106
fi
104-
rm -f "$BRIDGE_PID_FILE"
105-
fi
106-
# Kill the tmux session too (startup-pi.sh uses this)
107-
tmux kill-session -t slack-bridge 2>/dev/null || true
108-
# Force-release port 7890 in case anything survived
109-
PORT_PIDS="$(lsof -ti :7890 2>/dev/null || true)"
110-
if [ -n "$PORT_PIDS" ]; then
111-
echo "Releasing port 7890 (PIDs: $PORT_PIDS)..."
112-
echo "$PORT_PIDS" | xargs kill 2>/dev/null || true
113-
sleep 1
114-
PORT_PIDS="$(lsof -ti :7890 2>/dev/null || true)"
115-
[ -n "$PORT_PIDS" ] && echo "$PORT_PIDS" | xargs kill -9 2>/dev/null || true
107+
rm -f "$CONTROL_PGID_FILE"
116108
fi
117109

118110
# Set session name (read by auto-name.ts extension)
@@ -134,6 +126,14 @@ else
134126
exit 1
135127
fi
136128

137-
# Start control-agent
129+
# Start control-agent.
130+
# Save our PID as the process group ID for cleanup on next restart.
131+
# When systemd launches start.sh (Type=simple), our PID is already the
132+
# process group leader. `exec pi` replaces this process in-place (same PID,
133+
# same PGID), so all child processes (bridge, workers) inherit the group.
134+
# On restart, killing -$PGID terminates the entire tree automatically.
135+
#
138136
# --session-control: enables inter-session communication (handled by control.ts extension)
139-
pi --session-control --model "$MODEL" --skill ~/.pi/agent/skills/control-agent "/skill:control-agent"
137+
echo "Starting control-agent..."
138+
echo $$ > "$CONTROL_PGID_FILE"
139+
exec pi --session-control --model "$MODEL" --skill ~/.pi/agent/skills/control-agent "/skill:control-agent"

0 commit comments

Comments
 (0)