@@ -14,8 +14,6 @@ set -euo pipefail
1414SCRIPT_DIR=" $( cd " $( dirname " $0 " ) " && pwd) "
1515# shellcheck source=bin/lib/runtime-node.sh
1616source " $SCRIPT_DIR /bin/lib/runtime-node.sh"
17- # bridge-restart-policy.sh no longer needed — bridge is started by
18- # startup-pi.sh, not start.sh (see PR #164)
1917cd ~
2018
2119NODE_BIN_DIR=" $( bb_resolve_runtime_node_bin_dir " $HOME " ) "
@@ -24,7 +22,6 @@ NODE_BIN_DIR="$(bb_resolve_runtime_node_bin_dir "$HOME")"
2422export PATH=" $HOME /.varlock/bin:$NODE_BIN_DIR :$PATH "
2523
2624# Work around varlock telemetry config crash by opting out at runtime.
27- # This avoids loading anonymousId from user config and keeps startup deterministic.
2825export VARLOCK_TELEMETRY_DISABLED=1
2926
3027# Validate and load secrets via varlock
@@ -33,7 +30,7 @@ varlock load --path ~/.config/ || {
3330 exit 1
3431}
3532set -a
36- # shellcheck disable=SC1090 # path is dynamic (agent home)
33+ # shellcheck disable=SC1090
3734source ~ /.config/.env
3835set +a
3936
@@ -48,7 +45,6 @@ umask 077
4845~ /runtime/bin/redact-logs.sh 2> /dev/null || true
4946
5047# Verify deployed runtime integrity against deploy manifest.
51- # Modes: off | warn | strict (default: warn)
5248INTEGRITY_MODE=" ${BAUDBOT_STARTUP_INTEGRITY_MODE:- warn} "
5349if [ -x " $HOME /runtime/bin/verify-manifest.sh" ]; then
5450 if ! BAUDBOT_STARTUP_INTEGRITY_MODE=" $INTEGRITY_MODE " " $HOME /runtime/bin/verify-manifest.sh" ; then
@@ -66,15 +62,13 @@ if [ -d "$SOCKET_DIR" ]; then
6662 if command -v fuser & > /dev/null; then
6763 for sock in " $SOCKET_DIR " /* .sock; do
6864 [ -e " $sock " ] || continue
69- # If no process has the socket open, it's stale
7065 if ! fuser " $sock " & > /dev/null 2>&1 ; then
7166 rm -f " $sock "
7267 fi
7368 done
7469 else
7570 echo " fuser not found, skipping socket cleanup (install psmisc)"
7671 fi
77- # Clean broken alias symlinks
7872 for alias in " $SOCKET_DIR " /* .alias; do
7973 [ -L " $alias " ] || continue
8074 target=$( readlink " $alias " )
@@ -84,35 +78,33 @@ if [ -d "$SOCKET_DIR" ]; then
8478 done
8579fi
8680
# ── Process Group Management ──
# Kill the old control-agent process group to ensure a clean slate.
# This terminates all spawned services (bridge, workers, etc.) without
# needing to track individual PIDs or process names. The group ID is read
# from a file written by the previous run just before it exec'd the agent.
CONTROL_PGID_FILE="$HOME/.pi/agent/control-agent.pgid"

#######################################
# Terminate the process group recorded in a PGID file, then remove the file.
# Sends SIGTERM, polls for up to 5s, then SIGKILLs whatever is left.
# Arguments: $1 - path of the file holding the recorded process group ID
# Outputs:   progress on stdout; warnings about skipped cleanup on stderr
# Returns:   0 (cleanup is best-effort and never aborts startup)
#######################################
bb_terminate_recorded_process_group() {
  local pgid_file=$1
  local pgid="" own_pgid="" _i

  [ -f "$pgid_file" ] || return 0

  # First line only; `read` trims surrounding whitespace. A missing trailing
  # newline makes `read` return non-zero while still filling the variable.
  { read -r pgid < "$pgid_file"; } 2>/dev/null || true

  # Only a plain decimal integer greater than 1 is a usable group ID.
  # `kill -- -1` would signal every process we are allowed to signal and
  # `kill -- -0` targets our own group, so those must never reach kill.
  case "$pgid" in
    '' | *[!0-9]* | 0* | 1)
      if [ -n "$pgid" ]; then
        echo "Ignoring invalid PGID '$pgid' in $pgid_file" >&2
      fi
      rm -f -- "$pgid_file"
      return 0
      ;;
  esac

  # Never signal the group this script itself runs in (stale file plus PID
  # reuse, or start.sh launched from inside the old agent's process tree).
  # If ps is unavailable own_pgid stays empty and the check is a no-op.
  own_pgid="$(ps -o pgid= -p "$$" 2>/dev/null | tr -d '[:space:]' || true)"
  if [ "$pgid" = "$own_pgid" ]; then
    echo "Recorded PGID $pgid is our own process group; skipping group kill" >&2
    rm -f -- "$pgid_file"
    return 0
  fi

  # `--` ends option parsing so the negative operand is always read as a
  # process group, never as a signal specification.
  if kill -0 -- "-$pgid" 2>/dev/null; then
    echo "Terminating old control-agent process group (PGID $pgid)..."
    kill -TERM -- "-$pgid" 2>/dev/null || true
    # Wait up to 5s for graceful shutdown
    for _i in 1 2 3 4 5; do
      if ! kill -0 -- "-$pgid" 2>/dev/null; then
        echo "Process group terminated cleanly"
        break
      fi
      sleep 1
    done
    # Force-kill any survivors
    if kill -0 -- "-$pgid" 2>/dev/null; then
      echo "Force-killing stubborn processes in group $pgid..."
      kill -KILL -- "-$pgid" 2>/dev/null || true
      sleep 1
    fi
  fi

  rm -f -- "$pgid_file"
  return 0
}

bb_terminate_recorded_process_group "$CONTROL_PGID_FILE"
117109
118110# Set session name (read by auto-name.ts extension)
@@ -134,6 +126,14 @@ else
134126 exit 1
135127fi
136128
# Start control-agent.
# Save our PID as the process group ID for cleanup on next restart.
# When systemd launches start.sh (Type=simple), our PID is already the
# process group leader. `exec pi` replaces this process in-place (same PID,
# same PGID), so all child processes (bridge, workers) inherit the group.
# On restart, killing -$PGID terminates the entire tree automatically.
# NOTE(review): if start.sh is launched in a way that does not make it a
# group leader, $$ is not a valid PGID and the next start's group kill will
# find no such group and clean up nothing.
#
# --session-control: enables inter-session communication (handled by control.ts extension)
echo "Starting control-agent..."
# Must be written before exec: nothing after the exec line runs.
echo "$$" > "$CONTROL_PGID_FILE"
exec pi --session-control --model "$MODEL" --skill ~/.pi/agent/skills/control-agent "/skill:control-agent"
0 commit comments