Skip to content

Commit c6e566f

Browse files
authored
ops: remove bridge tmux guidance, keep tmux for agent sessions (#140)
1 parent 9a3e46b commit c6e566f

6 files changed

Lines changed: 91 additions & 45 deletions

File tree

bin/baudbot

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -351,16 +351,6 @@ case "${1:-}" in
351351
shift
352352
require_root "restart"
353353
if has_systemd; then
354-
# Ensure any pre-existing detached bridge tmux session is torn down so
355-
# restart always boots a fresh bridge from currently deployed runtime files.
356-
AGENT_USER="${BAUDBOT_AGENT_USER:-baudbot_agent}"
357-
if command -v tmux >/dev/null 2>&1; then
358-
if command -v sudo >/dev/null 2>&1; then
359-
sudo -u "$AGENT_USER" tmux kill-session -t slack-bridge 2>/dev/null || true
360-
elif command -v runuser >/dev/null 2>&1; then
361-
runuser -u "$AGENT_USER" -- tmux kill-session -t slack-bridge 2>/dev/null || true
362-
fi
363-
fi
364354
exec systemctl restart baudbot "$@"
365355
else
366356
echo "systemd not available."

bin/baudbot.test.sh

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ EOF
130130
)
131131
}
132132

133-
test_restart_restarts_systemd_and_kills_bridge_tmux() {
133+
test_restart_restarts_systemd() {
134134
(
135135
set -euo pipefail
136136
local tmp fakebin log_file
@@ -168,12 +168,6 @@ if [ "${1:-}" = "-u" ]; then
168168
fi
169169
echo "sudo $*" >> "${BAUDBOT_TEST_LOG}"
170170
exec "$@"
171-
EOF
172-
173-
cat > "$fakebin/tmux" <<'EOF'
174-
#!/bin/bash
175-
echo "tmux $*" >> "${BAUDBOT_TEST_LOG}"
176-
exit 0
177171
EOF
178172

179173
cat > "$fakebin/systemctl" <<'EOF'
@@ -182,11 +176,10 @@ echo "systemctl $*" >> "${BAUDBOT_TEST_LOG}"
182176
exit 0
183177
EOF
184178

185-
chmod +x "$fakebin/id" "$fakebin/sudo" "$fakebin/tmux" "$fakebin/systemctl"
179+
chmod +x "$fakebin/id" "$fakebin/sudo" "$fakebin/systemctl"
186180

187181
PATH="$fakebin:$PATH" BAUDBOT_TEST_LOG="$log_file" BAUDBOT_ROOT="$tmp" bash "$CLI" restart
188182

189-
grep -q '^tmux kill-session -t slack-bridge$' "$log_file"
190183
grep -q '^systemctl restart baudbot$' "$log_file"
191184
)
192185
}
@@ -198,7 +191,7 @@ run_test "version reads package.json" test_version_uses_package_json
198191
run_test "status dispatches via runtime module" test_status_dispatches_via_runtime_module
199192
run_test "attach requires root" test_attach_requires_root
200193
run_test "broker register requires root" test_broker_register_requires_root
201-
run_test "restart kills bridge tmux then restarts systemd" test_restart_restarts_systemd_and_kills_bridge_tmux
194+
run_test "restart restarts systemd" test_restart_restarts_systemd
202195

203196
echo ""
204197
echo "=== $PASSED/$TOTAL passed, $FAILED failed ==="

bin/lib/baudbot-runtime.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -398,7 +398,7 @@ cmd_attach() {
398398
echo " sudo baudbot attach # defaults to control-agent"
399399
echo " sudo baudbot attach --pi control-agent"
400400
echo " sudo baudbot attach --pi <uuid>"
401-
echo " sudo baudbot attach --tmux slack-bridge"
401+
echo " sudo baudbot attach --tmux sentry-agent"
402402
exit 0
403403
;;
404404
*)

pi/skills/control-agent/SKILL.md

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -339,14 +339,12 @@ The sentry-agent operates in **on-demand mode** — it does NOT poll. Sentry ale
339339

340340
### Starting the Slack Bridge
341341

342-
The `startup-cleanup.sh` script handles bridge (re)start automatically — it detects broker vs Socket Mode, reads the control-agent UUID, and launches the bridge in a `slack-bridge` tmux session.
342+
The `startup-cleanup.sh` script handles bridge (re)start automatically — it detects broker vs Socket Mode, reads the control-agent UUID, and starts the bridge as a normal background process.
343343

344-
If you need to restart the bridge manually:
344+
If you need to restart the bridge manually, rerun startup cleanup and then inspect logs:
345345
```bash
346-
MY_UUID=$(readlink ~/.pi/session-control/control-agent.alias | sed 's/.sock$//')
347-
tmux kill-session -t slack-bridge 2>/dev/null || true
348-
tmux new-session -d -s slack-bridge \
349-
"unset PKG_EXECPATH; export PATH=\$HOME/.varlock/bin:\$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && export PI_SESSION_ID=$MY_UUID && cd ~/runtime/slack-bridge && exec varlock run --path ~/.config/ -- node broker-bridge.mjs"
346+
bash ~/.pi/agent/skills/control-agent/startup-cleanup.sh UUID1 UUID2 UUID3
347+
tail -n 200 ~/.pi/agent/logs/slack-bridge.log
350348
```
351349

352350
Verify: `curl -s -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:7890/send -H 'Content-Type: application/json' -d '{}'` → should return `400`.
@@ -364,7 +362,7 @@ If you need to check manually, use `heartbeat trigger` to run all checks immedia
364362
When the heartbeat reports a failure, take the appropriate action:
365363
1. **Missing sentry-agent**: Respawn with tmux and re-send role assignment.
366364
2. **Orphaned dev-agents**: Kill tmux session and remove worktree.
367-
3. **Bridge down**: Restart the `slack-bridge` tmux session.
365+
3. **Bridge down**: Restart via `startup-cleanup.sh`, then check `~/.pi/agent/logs/slack-bridge.log`.
368366
4. **Stale worktrees**: `git worktree remove --force` + `rmdir` empty parents.
369367
5. **Stuck todos**: Escalate to user via Slack.
370368

pi/skills/control-agent/startup-cleanup.sh

Lines changed: 51 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
# Pass the live session UUIDs (from list_sessions) as arguments.
88
# Any .sock file whose UUID is NOT in the live set gets removed.
99
# Stale .alias symlinks pointing to removed sockets also get cleaned.
10-
# Then restarts the slack-bridge tmux session with the current control-agent UUID.
10+
# Then restarts the slack-bridge process with the current control-agent UUID.
1111

1212
set -euo pipefail
1313

@@ -66,11 +66,39 @@ else
6666
exit 1
6767
fi
6868

69-
# Kill existing slack-bridge tmux session if running
70-
if tmux has-session -t slack-bridge 2>/dev/null; then
71-
echo "Killing existing slack-bridge session..."
72-
tmux kill-session -t slack-bridge
69+
BRIDGE_PID_FILE="$HOME/.pi/agent/slack-bridge.pid"
70+
BRIDGE_LOG_DIR="$HOME/.pi/agent/logs"
71+
BRIDGE_LOG_FILE="$BRIDGE_LOG_DIR/slack-bridge.log"
72+
73+
# Stop a bridge supervisor process together with its direct children.
# Arguments: $1 - PID of the supervisor (an empty or already-dead PID is a no-op)
# Returns:   0 in every case; all signals are sent best-effort.
kill_bridge_supervisor() {
  local supervisor_pid="$1"
  local child_pids

  # Nothing to do when no PID was given or the process is already gone.
  if [ -z "$supervisor_pid" ] || ! kill -0 "$supervisor_pid" 2>/dev/null; then
    return 0
  fi

  # Direct children are signalled before the supervisor so that no stale
  # bridge process is left behind holding the port (best-effort).
  child_pids="$(pgrep -P "$supervisor_pid" 2>/dev/null || true)"
  if [ -n "$child_pids" ]; then
    # Word splitting is intentional here: pgrep prints one PID per line.
    # shellcheck disable=SC2086
    kill $child_pids 2>/dev/null || true
    sleep 1
    # shellcheck disable=SC2086
    kill -9 $child_pids 2>/dev/null || true
  fi

  # Same TERM-then-KILL escalation for the supervisor itself.
  kill "$supervisor_pid" 2>/dev/null || true
  sleep 1
  kill -9 "$supervisor_pid" 2>/dev/null || true
}
93+
94+
# Kill existing slack-bridge process if running
95+
# A pid file left by an earlier run names the old bridge supervisor. Stop it
# if it is still alive, and discard the pid file either way.
if [ -f "$BRIDGE_PID_FILE" ]; then
  BRIDGE_PID="$(cat "$BRIDGE_PID_FILE" 2>/dev/null || true)"
  if [ -n "$BRIDGE_PID" ]; then
    if kill -0 "$BRIDGE_PID" 2>/dev/null; then
      echo "Killing existing slack-bridge process (pid=$BRIDGE_PID)..."
      kill_bridge_supervisor "$BRIDGE_PID"
    fi
  fi
  rm -f "$BRIDGE_PID_FILE"
fi
75103

76104
# Select bridge script: prefer broker pull mode when SLACK_BROKER_* vars are present,
@@ -100,9 +128,25 @@ if [ -z "$BRIDGE_SCRIPT" ]; then
100128
fi
101129

102130
# Start fresh slack-bridge
131+
# Keep a supervisor loop (matching start.sh) so bridge restarts automatically on crash.
103132
echo "Starting slack-bridge ($BRIDGE_SCRIPT) with PI_SESSION_ID=$MY_UUID..."
104-
tmux new-session -d -s slack-bridge \
105-
"unset PKG_EXECPATH; export PATH=\$HOME/.varlock/bin:\$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && export PI_SESSION_ID=$MY_UUID && cd /opt/baudbot/current/slack-bridge && exec varlock run --path ~/.config/ -- node $BRIDGE_SCRIPT"
133+
mkdir -p "$BRIDGE_LOG_DIR"
134+
(
  # Supervisor subshell: runs the bridge in the foreground and relaunches it
  # whenever it exits. The caller records this subshell's PID via $!.
  unset PKG_EXECPATH
  export PATH="$HOME/.varlock/bin:$HOME/opt/node-v22.14.0-linux-x64/bin:$PATH"
  export PI_SESSION_ID="$MY_UUID"
  # Without the release directory the relative bridge script cannot be found,
  # so give up instead of retrying forever.
  cd /opt/baudbot/current/slack-bridge || exit 1
  while true; do
    # This script runs under `set -e`, which the subshell inherits: a bare
    # failing command in the loop body would terminate the supervisor on the
    # first crash. `|| bridge_status=$?` keeps the loop alive and also saves
    # the status before `$(date -Is)` below overwrites `$?`.
    bridge_status=0
    varlock run --path ~/.config/ -- node "$BRIDGE_SCRIPT" >>"$BRIDGE_LOG_FILE" 2>&1 || bridge_status=$?
    echo "[$(date -Is)] ⚠️ Bridge exited ($bridge_status), restarting in 5s..." >>"$BRIDGE_LOG_FILE"
    sleep 5
  done
) &
145+
NEW_BRIDGE_PID=$!
146+
echo "$NEW_BRIDGE_PID" > "$BRIDGE_PID_FILE"
147+
chmod 600 "$BRIDGE_PID_FILE"
148+
echo "Bridge pid: $NEW_BRIDGE_PID"
149+
echo "Bridge logs: $BRIDGE_LOG_FILE"
106150

107151
# Wait for bridge to come up
108152
sleep 3

start.sh

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -83,16 +83,37 @@ fi
8383

8484
if [ -n "$BRIDGE_SCRIPT" ]; then
8585
RELEASE_BRIDGE="/opt/baudbot/current/slack-bridge"
86-
tmux kill-session -t slack-bridge 2>/dev/null || true
87-
echo "Starting Slack bridge ($BRIDGE_SCRIPT)..."
88-
tmux new-session -d -s slack-bridge \
89-
"export PATH=$HOME/.varlock/bin:$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && \
90-
cd $RELEASE_BRIDGE && \
91-
while true; do \
92-
varlock run --path ~/.config/ -- node $BRIDGE_SCRIPT; \
93-
echo '⚠️ Bridge exited (\$?), restarting in 5s...'; \
94-
sleep 5; \
95-
done"
86+
BRIDGE_LOG_DIR="$HOME/.pi/agent/logs"
87+
BRIDGE_LOG_FILE="$BRIDGE_LOG_DIR/slack-bridge.log"
88+
BRIDGE_PID_FILE="$HOME/.pi/agent/slack-bridge.pid"
89+
90+
mkdir -p "$BRIDGE_LOG_DIR"
91+
92+
# Stop any previous bridge process tracked by pid file.
93+
# The pid file holds the PID of the supervisor subshell, not of the bridge
# process it is currently running. Signalling only the supervisor would orphan
# that child and leave it holding the bridge port, so its direct children are
# stopped as well.
if [ -f "$BRIDGE_PID_FILE" ]; then
  old_pid="$(cat "$BRIDGE_PID_FILE" 2>/dev/null || true)"
  if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then
    # Snapshot the children while the supervisor is still their parent.
    old_child_pids="$(pgrep -P "$old_pid" 2>/dev/null || true)"

    # Stop the supervisor first so it cannot respawn the bridge, then the
    # children. Word splitting of the PID list is intentional.
    kill "$old_pid" 2>/dev/null || true
    if [ -n "$old_child_pids" ]; then
      # shellcheck disable=SC2086
      kill $old_child_pids 2>/dev/null || true
    fi
    sleep 1
    # Escalate to SIGKILL for anything that ignored SIGTERM (best-effort).
    # shellcheck disable=SC2086
    kill -9 "$old_pid" $old_child_pids 2>/dev/null || true
  fi
  rm -f "$BRIDGE_PID_FILE"
fi
102+
103+
echo "Starting Slack bridge ($BRIDGE_SCRIPT)... logs: $BRIDGE_LOG_FILE"
104+
(
  # Supervisor subshell: runs the bridge in the foreground and relaunches it
  # whenever it exits.
  export PATH="$HOME/.varlock/bin:$HOME/opt/node-v22.14.0-linux-x64/bin:$PATH"
  # Give up rather than loop in the wrong directory if the release is missing.
  cd "${RELEASE_BRIDGE:?}" || exit 1
  while true; do
    # Save the exit status immediately: `$(date -Is)` in the log line below
    # would otherwise overwrite `$?` with date's own status. The `||` form
    # also keeps the loop running if errexit is enabled.
    bridge_status=0
    varlock run --path ~/.config/ -- node "$BRIDGE_SCRIPT" >>"$BRIDGE_LOG_FILE" 2>&1 || bridge_status=$?
    echo "[$(date -Is)] ⚠️ Bridge exited ($bridge_status), restarting in 5s..." >>"$BRIDGE_LOG_FILE"
    sleep 5
  done
) &
113+
# Intentionally track the supervisor subshell PID (not per-restart node child PID)
114+
# so a single kill stops the entire bridge restart loop.
115+
echo $! > "$BRIDGE_PID_FILE"
116+
chmod 600 "$BRIDGE_PID_FILE"
96117
fi
97118

98119
# Set session name (read by auto-name.ts extension)

0 commit comments

Comments
 (0)