Skip to content

Commit fc4fb6e

Browse files
baudbot-agent and Baudbot authored
fix: replace subshell bridge supervisor with tmux session (#158)
Co-authored-by: Baudbot <hornet@agentmail.to>
1 parent 7fc9762 commit fc4fb6e

1 file changed

Lines changed: 46 additions & 81 deletions

File tree

pi/skills/control-agent/startup-cleanup.sh

Lines changed: 46 additions & 81 deletions
Original file line number | Diff line number | Diff line change
@@ -16,12 +16,6 @@ set -euo pipefail
1616
# processes and causes `varlock run` to treat subcommands as Node module paths).
1717
unset PKG_EXECPATH 2>/dev/null || true
1818

19-
BRIDGE_POLICY_HELPER="$HOME/runtime/bin/lib/bridge-restart-policy.sh"
20-
if [ -r "$BRIDGE_POLICY_HELPER" ]; then
21-
# shellcheck source=bin/lib/bridge-restart-policy.sh
22-
source "$BRIDGE_POLICY_HELPER"
23-
fi
24-
2519
SOCKET_DIR="$HOME/.pi/session-control"
2620

2721
if [ $# -eq 0 ]; then
@@ -77,47 +71,33 @@ else
7771
exit 1
7872
fi
7973

80-
BRIDGE_PID_FILE="$HOME/.pi/agent/slack-bridge.pid"
8174
BRIDGE_LOG_DIR="$HOME/.pi/agent/logs"
8275
BRIDGE_LOG_FILE="$BRIDGE_LOG_DIR/slack-bridge.log"
83-
BRIDGE_STATUS_FILE="$HOME/.pi/agent/slack-bridge-supervisor.json"
84-
85-
kill_bridge_supervisor() {
86-
local bridge_pid="$1"
87-
[ -n "$bridge_pid" ] || return 0
88-
if ! kill -0 "$bridge_pid" 2>/dev/null; then
89-
return 0
90-
fi
76+
BRIDGE_DIR="/opt/baudbot/current/slack-bridge"
77+
BRIDGE_TMUX_SESSION="slack-bridge"
9178

92-
# Best-effort: terminate direct children first so no stale bridge process keeps the port.
93-
local bridge_child_pids
94-
bridge_child_pids="$(pgrep -P "$bridge_pid" 2>/dev/null || true)"
95-
if [ -n "$bridge_child_pids" ]; then
96-
kill $bridge_child_pids 2>/dev/null || true
97-
sleep 1
98-
kill -9 $bridge_child_pids 2>/dev/null || true
99-
fi
79+
mkdir -p "$BRIDGE_LOG_DIR"
10080

101-
kill "$bridge_pid" 2>/dev/null || true
81+
# --- Kill anything holding port 7890, any existing bridge tmux session,
82+
# and any leftover old-style PID-file supervisor.
83+
echo "Cleaning up old bridge..."
84+
PORT_PIDS=$(lsof -ti :7890 2>/dev/null || true)
85+
if [ -n "$PORT_PIDS" ]; then
86+
echo "Killing processes on port 7890: $PORT_PIDS"
87+
echo "$PORT_PIDS" | xargs kill -9 2>/dev/null || true
10288
sleep 1
103-
kill -9 "$bridge_pid" 2>/dev/null || true
104-
}
105-
106-
# Kill existing slack-bridge process if running
107-
if [ -f "$BRIDGE_PID_FILE" ]; then
108-
BRIDGE_PID="$(cat "$BRIDGE_PID_FILE" 2>/dev/null || true)"
109-
if [ -n "$BRIDGE_PID" ] && kill -0 "$BRIDGE_PID" 2>/dev/null; then
110-
echo "Killing existing slack-bridge process (pid=$BRIDGE_PID)..."
111-
kill_bridge_supervisor "$BRIDGE_PID"
112-
fi
113-
rm -f "$BRIDGE_PID_FILE"
89+
fi
90+
tmux kill-session -t "$BRIDGE_TMUX_SESSION" 2>/dev/null || true
91+
OLD_PID_FILE="$HOME/.pi/agent/slack-bridge.pid"
92+
if [ -f "$OLD_PID_FILE" ]; then
93+
OLD_PID="$(cat "$OLD_PID_FILE" 2>/dev/null || true)"
94+
[ -n "$OLD_PID" ] && kill -9 "$OLD_PID" 2>/dev/null || true
95+
rm -f "$OLD_PID_FILE"
11496
fi
11597

116-
# Select bridge script: prefer broker pull mode when SLACK_BROKER_* vars are present,
117-
# then Socket Mode when SLACK_BOT_TOKEN + SLACK_APP_TOKEN are present.
118-
# If neither mode is configured, skip bridge startup.
98+
# --- Detect bridge mode ---
11999
BRIDGE_SCRIPT=""
120-
if [ -f "/opt/baudbot/current/slack-bridge/broker-bridge.mjs" ] && varlock run --path "$HOME/.config/" -- sh -c '
100+
if [ -f "$BRIDGE_DIR/broker-bridge.mjs" ] && varlock run --path "$HOME/.config/" -- sh -c '
121101
test -n "$SLACK_BROKER_URL" &&
122102
test -n "$SLACK_BROKER_WORKSPACE_ID" &&
123103
test -n "$SLACK_BROKER_SERVER_PRIVATE_KEY" &&
@@ -126,7 +106,7 @@ if [ -f "/opt/baudbot/current/slack-bridge/broker-bridge.mjs" ] && varlock run -
126106
test -n "$SLACK_BROKER_PUBLIC_KEY" &&
127107
test -n "$SLACK_BROKER_SIGNING_PUBLIC_KEY"' 2>/dev/null; then
128108
BRIDGE_SCRIPT="broker-bridge.mjs"
129-
elif varlock run --path "$HOME/.config/" -- sh -c '
109+
elif [ -f "$BRIDGE_DIR/bridge.mjs" ] && varlock run --path "$HOME/.config/" -- sh -c '
130110
test -n "$SLACK_BOT_TOKEN" &&
131111
test -n "$SLACK_APP_TOKEN"' 2>/dev/null; then
132112
BRIDGE_SCRIPT="bridge.mjs"
@@ -139,54 +119,39 @@ if [ -z "$BRIDGE_SCRIPT" ]; then
139119
exit 0
140120
fi
141121

142-
# Start fresh slack-bridge
143-
# Keep a supervisor loop (matching start.sh) so bridge restarts automatically on crash.
144-
echo "Starting slack-bridge ($BRIDGE_SCRIPT) with PI_SESSION_ID=$MY_UUID..."
145-
mkdir -p "$BRIDGE_LOG_DIR"
146-
(
147-
unset PKG_EXECPATH
148-
# Clear ALL varlock-managed env vars inherited from the parent session.
149-
# varlock run does not override vars already set in the environment, so
150-
# stale values (e.g. expired broker tokens) would leak through. By unsetting
151-
# every key varlock manages, we guarantee varlock run injects fresh values
152-
# from ~/.config/.env on every bridge restart.
153-
if command -v varlock >/dev/null 2>&1; then
154-
while IFS='=' read -r key _; do
155-
[ -n "$key" ] && unset "$key"
156-
done < <(varlock load --path "$HOME/.config/" --format env --compact 2>/dev/null)
157-
fi
158-
export PATH="$HOME/.varlock/bin:$HOME/opt/node/bin:$PATH"
159-
export PI_SESSION_ID="$MY_UUID"
160-
cd /opt/baudbot/current/slack-bridge
161-
162-
if command -v bb_bridge_supervise >/dev/null 2>&1; then
163-
bb_bridge_supervise "$BRIDGE_LOG_FILE" "$BRIDGE_STATUS_FILE" "$BRIDGE_SCRIPT" \
164-
varlock run --path ~/.config/ -- node "$BRIDGE_SCRIPT"
165-
else
166-
while true; do
167-
if varlock run --path ~/.config/ -- node "$BRIDGE_SCRIPT" >>"$BRIDGE_LOG_FILE" 2>&1; then
168-
exit_code=0
169-
else
170-
exit_code=$?
171-
fi
172-
echo "[$(date -Is)] bridge-supervisor event=restart_scheduled mode=legacy script=$BRIDGE_SCRIPT exit_code=$exit_code delay_seconds=5" >>"$BRIDGE_LOG_FILE"
173-
sleep 5
174-
done
175-
fi
176-
) &
177-
NEW_BRIDGE_PID=$!
178-
echo "$NEW_BRIDGE_PID" > "$BRIDGE_PID_FILE"
179-
chmod 600 "$BRIDGE_PID_FILE"
180-
echo "Bridge pid: $NEW_BRIDGE_PID"
122+
# --- Launch bridge in a tmux session with restart loop ---
123+
# The tmux session stays alive independently of this script (same pattern as
124+
# sentry-agent). If the bridge crashes, the loop restarts it after 5 seconds.
125+
echo "Starting slack-bridge ($BRIDGE_SCRIPT) via tmux..."
126+
NODE_BIN_DIR="$HOME/opt/node/bin"
127+
if [ ! -d "$NODE_BIN_DIR" ]; then
128+
# Fallback: resolve versioned node dir
129+
NODE_BIN_DIR="$(echo "$HOME"/opt/node-v*-linux-x64/bin | awk '{print $1}')"
130+
fi
131+
132+
tmux new-session -d -s "$BRIDGE_TMUX_SESSION" "\
133+
unset PKG_EXECPATH; \
134+
export PATH=\$HOME/.varlock/bin:$NODE_BIN_DIR:\$PATH; \
135+
export PI_SESSION_ID=$MY_UUID; \
136+
cd $BRIDGE_DIR; \
137+
while true; do \
138+
echo \"[\$(date -Is)] bridge: starting $BRIDGE_SCRIPT\" >> $BRIDGE_LOG_FILE; \
139+
varlock run --path \$HOME/.config/ -- node $BRIDGE_SCRIPT >> $BRIDGE_LOG_FILE 2>&1; \
140+
exit_code=\$?; \
141+
echo \"[\$(date -Is)] bridge: exited with code \$exit_code, restarting in 5s\" >> $BRIDGE_LOG_FILE; \
142+
sleep 5; \
143+
done"
144+
145+
echo "Bridge tmux session: $BRIDGE_TMUX_SESSION"
181146
echo "Bridge logs: $BRIDGE_LOG_FILE"
182147

183-
# Wait for bridge to come up
148+
# --- Verify bridge is up ---
184149
sleep 3
185150
HTTP_CODE=$(curl -s -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:7890/send -H 'Content-Type: application/json' -d '{}' 2>/dev/null || echo "000")
186151
if [ "$HTTP_CODE" = "400" ]; then
187152
echo "✅ Slack bridge is up (HTTP $HTTP_CODE)"
188153
else
189-
echo "⚠️ Slack bridge may not be ready yet (HTTP $HTTP_CODE). Check manually."
154+
echo "⚠️ Bridge may not be ready yet (HTTP $HTTP_CODE). Check: tmux attach -t $BRIDGE_TMUX_SESSION"
190155
fi
191156

192157
echo ""

0 commit comments

Comments
 (0)