Skip to content

Commit 1024a36

Browse files
author
Baudbot
committed
fix: single owner for Slack bridge lifecycle
Two changes to eliminate the dual-launch race condition:

1. start.sh: Remove bridge launch — it only does pre-cleanup now (kill stale PID, tmux session, port holders). The bridge needs PI_SESSION_ID, which isn't known until pi starts, so start.sh can't own it.

2. startup-cleanup.sh: Add max retries + backoff to the restart loop:
   - Tracks consecutive fast failures (<60s runtime)
   - Gives up after 10 consecutive fast failures (logs FATAL)
   - Backs off: 5s base + 2s per failure, capped at 60s
   - Kills port holders before retrying (prevents EADDRINUSE spin)
   - Logs attempt count, runtime duration, and failure state

Previously, start.sh launched a bridge as a background subshell before pi, and startup-cleanup.sh launched another in tmux after pi started. This caused port conflicts, orphaned supervisors, and dropped messages when the pre-pi bridge couldn't find the session socket.
1 parent 3771d6b commit 1024a36

2 files changed

Lines changed: 37 additions & 13 deletions

File tree

pi/skills/control-agent/startup-cleanup.sh

Lines changed: 36 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,9 @@
88
# Any .sock file whose UUID is NOT in the live set gets removed.
99
# Stale .alias symlinks pointing to removed sockets also get cleaned.
1010
# Then restarts the slack-bridge process with the current control-agent UUID.
11+
#
12+
# This script is the SOLE owner of the bridge lifecycle. start.sh only does
13+
# pre-cleanup (kill stale processes, release port) — it never launches the bridge.
1114

1215
set -euo pipefail
1316

@@ -143,9 +146,15 @@ if [ -z "$BRIDGE_SCRIPT" ]; then
143146
exit 0
144147
fi
145148

146-
# --- Launch bridge in a tmux session with restart loop ---
147-
# The tmux session stays alive independently of this script (same pattern as
148-
# sentry-agent). If the bridge crashes, the loop restarts it after 5 seconds.
149+
# --- Launch bridge in a tmux session with supervised restart loop ---
150+
# The restart loop:
151+
# - Re-reads .env on every restart (picks up config changes)
152+
# - Unsets SLACK_BROKER_* before sourcing (avoids stale parent env)
153+
# - Tracks consecutive fast failures (<60s runtime) and gives up after 10
154+
# - Backs off: 5s base + 2s per failure, capped at 60s
155+
# - Kills port holders before retrying (avoids EADDRINUSE spin)
156+
MAX_CONSECUTIVE_FAILURES=10
157+
149158
echo "Starting slack-bridge ($BRIDGE_SCRIPT) via tmux..."
150159
NODE_BIN_DIR="${NODE_BIN_DIR:-$HOME/opt/node/bin}"
151160
if command -v bb_resolve_runtime_node_bin_dir >/dev/null 2>&1; then
@@ -161,20 +170,35 @@ tmux new-session -d -s "$BRIDGE_TMUX_SESSION" "\
161170
export PATH=$NODE_BIN_DIR:\$PATH; \
162171
export PI_SESSION_ID=$MY_UUID; \
163172
cd $BRIDGE_DIR; \
173+
consecutive_failures=0; \
164174
while true; do \
165-
echo \"[\$(date -Is)] bridge: starting $BRIDGE_SCRIPT\" >> $BRIDGE_LOG_FILE; \
175+
echo \"[\$(date -Is)] bridge: starting $BRIDGE_SCRIPT (attempt \$((consecutive_failures + 1)))\" >> $BRIDGE_LOG_FILE; \
176+
start_time=\$(date +%s); \
166177
for v in \$(env | grep ^SLACK_BROKER_ | cut -d= -f1 || true); do unset \$v; done; \
167178
set -a; source \$HOME/.config/.env; set +a; \
168179
node $BRIDGE_SCRIPT >> $BRIDGE_LOG_FILE 2>&1; \
169180
exit_code=\$?; \
170-
echo \"[\$(date -Is)] bridge: exited with code \$exit_code, restarting in 5s\" >> $BRIDGE_LOG_FILE; \
171-
sleep 5; \
172-
tries=0; \
173-
while lsof -ti :7890 >/dev/null 2>&1 && [ \$tries -lt 10 ]; do \
174-
echo \"[\$(date -Is)] bridge: port 7890 still in use, waiting...\" >> $BRIDGE_LOG_FILE; \
175-
sleep 2; \
176-
tries=\$((tries + 1)); \
177-
done; \
181+
runtime=\$(( \$(date +%s) - start_time )); \
182+
echo \"[\$(date -Is)] bridge: exited with code \$exit_code after \${runtime}s\" >> $BRIDGE_LOG_FILE; \
183+
if [ \$runtime -ge 60 ]; then \
184+
consecutive_failures=0; \
185+
else \
186+
consecutive_failures=\$((consecutive_failures + 1)); \
187+
fi; \
188+
if [ \$consecutive_failures -ge $MAX_CONSECUTIVE_FAILURES ]; then \
189+
echo \"[\$(date -Is)] bridge: FATAL — \$consecutive_failures consecutive fast failures, giving up\" >> $BRIDGE_LOG_FILE; \
190+
break; \
191+
fi; \
192+
delay=\$((5 + consecutive_failures * 2)); \
193+
[ \$delay -gt 60 ] && delay=60; \
194+
echo \"[\$(date -Is)] bridge: restarting in \${delay}s (failures: \$consecutive_failures/$MAX_CONSECUTIVE_FAILURES)\" >> $BRIDGE_LOG_FILE; \
195+
sleep \$delay; \
196+
port_pids=\$(lsof -ti :7890 2>/dev/null || true); \
197+
if [ -n \"\$port_pids\" ]; then \
198+
echo \"[\$(date -Is)] bridge: port 7890 still held, killing: \$port_pids\" >> $BRIDGE_LOG_FILE; \
199+
echo \"\$port_pids\" | xargs kill -9 2>/dev/null || true; \
200+
sleep 1; \
201+
fi; \
178202
done"
179203

180204
echo "Bridge tmux session: $BRIDGE_TMUX_SESSION"

start.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
1515
# shellcheck source=bin/lib/runtime-node.sh
1616
source "$SCRIPT_DIR/bin/lib/runtime-node.sh"
1717
# bridge-restart-policy.sh no longer needed — bridge is started by
18-
# startup-cleanup.sh, not start.sh (see PR #XXX)
18+
# startup-cleanup.sh, not start.sh (see PR #164)
1919
cd ~
2020

2121
NODE_BIN_DIR="$(bb_resolve_runtime_node_bin_dir "$HOME")"

0 commit comments

Comments
 (0)