diff --git a/bin/baudbot b/bin/baudbot index 37123e4..2b99f4b 100755 --- a/bin/baudbot +++ b/bin/baudbot @@ -351,16 +351,6 @@ case "${1:-}" in shift require_root "restart" if has_systemd; then - # Ensure any pre-existing detached bridge tmux session is torn down so - # restart always boots a fresh bridge from currently deployed runtime files. - AGENT_USER="${BAUDBOT_AGENT_USER:-baudbot_agent}" - if command -v tmux >/dev/null 2>&1; then - if command -v sudo >/dev/null 2>&1; then - sudo -u "$AGENT_USER" tmux kill-session -t slack-bridge 2>/dev/null || true - elif command -v runuser >/dev/null 2>&1; then - runuser -u "$AGENT_USER" -- tmux kill-session -t slack-bridge 2>/dev/null || true - fi - fi exec systemctl restart baudbot "$@" else echo "systemd not available." diff --git a/bin/baudbot.test.sh b/bin/baudbot.test.sh index ba2ba7a..8867be2 100644 --- a/bin/baudbot.test.sh +++ b/bin/baudbot.test.sh @@ -130,7 +130,7 @@ EOF ) } -test_restart_restarts_systemd_and_kills_bridge_tmux() { +test_restart_restarts_systemd() { ( set -euo pipefail local tmp fakebin log_file @@ -168,12 +168,6 @@ if [ "${1:-}" = "-u" ]; then fi echo "sudo $*" >> "${BAUDBOT_TEST_LOG}" exec "$@" -EOF - - cat > "$fakebin/tmux" <<'EOF' -#!/bin/bash -echo "tmux $*" >> "${BAUDBOT_TEST_LOG}" -exit 0 EOF cat > "$fakebin/systemctl" <<'EOF' @@ -182,11 +176,10 @@ echo "systemctl $*" >> "${BAUDBOT_TEST_LOG}" exit 0 EOF - chmod +x "$fakebin/id" "$fakebin/sudo" "$fakebin/tmux" "$fakebin/systemctl" + chmod +x "$fakebin/id" "$fakebin/sudo" "$fakebin/systemctl" PATH="$fakebin:$PATH" BAUDBOT_TEST_LOG="$log_file" BAUDBOT_ROOT="$tmp" bash "$CLI" restart - grep -q '^tmux kill-session -t slack-bridge$' "$log_file" grep -q '^systemctl restart baudbot$' "$log_file" ) } @@ -198,7 +191,7 @@ run_test "version reads package.json" test_version_uses_package_json run_test "status dispatches via runtime module" test_status_dispatches_via_runtime_module run_test "attach requires root" test_attach_requires_root run_test "broker 
register requires root" test_broker_register_requires_root -run_test "restart kills bridge tmux then restarts systemd" test_restart_restarts_systemd_and_kills_bridge_tmux +run_test "restart restarts systemd" test_restart_restarts_systemd echo "" echo "=== $PASSED/$TOTAL passed, $FAILED failed ===" diff --git a/bin/lib/baudbot-runtime.sh b/bin/lib/baudbot-runtime.sh index 6a508ba..4ba3cb8 100644 --- a/bin/lib/baudbot-runtime.sh +++ b/bin/lib/baudbot-runtime.sh @@ -398,7 +398,7 @@ cmd_attach() { echo " sudo baudbot attach # defaults to control-agent" echo " sudo baudbot attach --pi control-agent" echo " sudo baudbot attach --pi " - echo " sudo baudbot attach --tmux slack-bridge" + echo " sudo baudbot attach --tmux sentry-agent" exit 0 ;; *) diff --git a/pi/skills/control-agent/SKILL.md b/pi/skills/control-agent/SKILL.md index 280b1dc..cd4bc3f 100644 --- a/pi/skills/control-agent/SKILL.md +++ b/pi/skills/control-agent/SKILL.md @@ -339,14 +339,12 @@ The sentry-agent operates in **on-demand mode** — it does NOT poll. Sentry ale ### Starting the Slack Bridge -The `startup-cleanup.sh` script handles bridge (re)start automatically — it detects broker vs Socket Mode, reads the control-agent UUID, and launches the bridge in a `slack-bridge` tmux session. +The `startup-cleanup.sh` script handles bridge (re)start automatically — it detects broker vs Socket Mode, reads the control-agent UUID, and starts the bridge as a normal background process. 
-If you need to restart the bridge manually: +If you need to restart the bridge manually, rerun startup cleanup and then inspect logs: ```bash -MY_UUID=$(readlink ~/.pi/session-control/control-agent.alias | sed 's/.sock$//') -tmux kill-session -t slack-bridge 2>/dev/null || true -tmux new-session -d -s slack-bridge \ - "unset PKG_EXECPATH; export PATH=\$HOME/.varlock/bin:\$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && export PI_SESSION_ID=$MY_UUID && cd ~/runtime/slack-bridge && exec varlock run --path ~/.config/ -- node broker-bridge.mjs" +bash ~/.pi/agent/skills/control-agent/startup-cleanup.sh UUID1 UUID2 UUID3 +tail -n 200 ~/.pi/agent/logs/slack-bridge.log ``` Verify: `curl -s -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:7890/send -H 'Content-Type: application/json' -d '{}'` → should return `400`. @@ -364,7 +362,7 @@ If you need to check manually, use `heartbeat trigger` to run all checks immedia When the heartbeat reports a failure, take the appropriate action: 1. **Missing sentry-agent**: Respawn with tmux and re-send role assignment. 2. **Orphaned dev-agents**: Kill tmux session and remove worktree. -3. **Bridge down**: Restart the `slack-bridge` tmux session. +3. **Bridge down**: Restart via `startup-cleanup.sh`, then check `~/.pi/agent/logs/slack-bridge.log`. 4. **Stale worktrees**: `git worktree remove --force` + `rmdir` empty parents. 5. **Stuck todos**: Escalate to user via Slack. diff --git a/pi/skills/control-agent/startup-cleanup.sh b/pi/skills/control-agent/startup-cleanup.sh index ece41ad..81a456d 100755 --- a/pi/skills/control-agent/startup-cleanup.sh +++ b/pi/skills/control-agent/startup-cleanup.sh @@ -7,7 +7,7 @@ # Pass the live session UUIDs (from list_sessions) as arguments. # Any .sock file whose UUID is NOT in the live set gets removed. # Stale .alias symlinks pointing to removed sockets also get cleaned. -# Then restarts the slack-bridge tmux session with the current control-agent UUID. 
+# Then restarts the slack-bridge process with the current control-agent UUID.
 
 set -euo pipefail
 
@@ -66,11 +66,48 @@ else
   exit 1
 fi
 
-# Kill existing slack-bridge tmux session if running
-if tmux has-session -t slack-bridge 2>/dev/null; then
-  echo "Killing existing slack-bridge session..."
-  tmux kill-session -t slack-bridge
+BRIDGE_PID_FILE="$HOME/.pi/agent/slack-bridge.pid"
+BRIDGE_LOG_DIR="$HOME/.pi/agent/logs"
+BRIDGE_LOG_FILE="$BRIDGE_LOG_DIR/slack-bridge.log"
+
+# Stop the bridge supervisor subshell whose pid is $1, plus its direct children.
+# Sends SIGTERM, waits one second, then SIGKILL. Best-effort: always returns 0
+# and does nothing when the pid is empty, not a plain integer > 1, or not alive.
+kill_bridge_supervisor() {
+  local bridge_pid="${1:-}"
+  # A damaged pid file must never reach kill as "0" or "-1": those address the
+  # caller's whole process group / every process this user may signal.
+  case "$bridge_pid" in
+    '' | *[!0-9]*) return 0 ;;
+  esac
+  [ "$bridge_pid" -gt 1 ] 2>/dev/null || return 0
+  if ! kill -0 "$bridge_pid" 2>/dev/null; then
+    return 0
+  fi
+
+  # Best-effort: terminate direct children first so no stale bridge process keeps the port.
+  local bridge_child_pids
+  bridge_child_pids="$(pgrep -P "$bridge_pid" 2>/dev/null || true)"
+  if [ -n "$bridge_child_pids" ]; then
+    # Unquoted on purpose: pgrep prints one pid per line and each must be its own argument.
+    kill $bridge_child_pids 2>/dev/null || true
+    sleep 1
+    kill -9 $bridge_child_pids 2>/dev/null || true
+  fi
+
+  kill "$bridge_pid" 2>/dev/null || true
   sleep 1
+  kill -9 "$bridge_pid" 2>/dev/null || true
+}
+
+# Kill existing slack-bridge process if running
+if [ -f "$BRIDGE_PID_FILE" ]; then
+  BRIDGE_PID="$(cat "$BRIDGE_PID_FILE" 2>/dev/null || true)"
+  if [ -n "$BRIDGE_PID" ] && kill -0 "$BRIDGE_PID" 2>/dev/null; then
+    echo "Killing existing slack-bridge process (pid=$BRIDGE_PID)..."
+    kill_bridge_supervisor "$BRIDGE_PID"
+  fi
+  rm -f "$BRIDGE_PID_FILE"
 fi
 
 # Select bridge script: prefer broker pull mode when SLACK_BROKER_* vars are present,
@@ -100,9 +137,31 @@ if [ -z "$BRIDGE_SCRIPT" ]; then
 fi
 
 # Start fresh slack-bridge
+# Keep a supervisor loop (matching start.sh) so bridge restarts automatically on crash.
 echo "Starting slack-bridge ($BRIDGE_SCRIPT) with PI_SESSION_ID=$MY_UUID..."
-tmux new-session -d -s slack-bridge \
-  "unset PKG_EXECPATH; export PATH=\$HOME/.varlock/bin:\$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && export PI_SESSION_ID=$MY_UUID && cd /opt/baudbot/current/slack-bridge && exec varlock run --path ~/.config/ -- node $BRIDGE_SCRIPT"
+mkdir -p "$BRIDGE_LOG_DIR"
+# The supervisor's stdio is pointed at the log (stdin at /dev/null) so that a
+# caller capturing this script's output is not left waiting on a pipe the
+# long-lived background loop would otherwise keep open.
+(
+  unset PKG_EXECPATH
+  export PATH="$HOME/.varlock/bin:$HOME/opt/node-v22.14.0-linux-x64/bin:$PATH"
+  export PI_SESSION_ID="$MY_UUID"
+  cd /opt/baudbot/current/slack-bridge
+  while true; do
+    # Save the exit status immediately: "|| bridge_rc=$?" stops errexit from
+    # ending the loop on a crash, and $(date) below would overwrite $?.
+    bridge_rc=0
+    varlock run --path ~/.config/ -- node "$BRIDGE_SCRIPT" || bridge_rc=$?
+    echo "[$(date -Is)] ⚠️ Bridge exited ($bridge_rc), restarting in 5s..."
+    sleep 5
+  done
+) >>"$BRIDGE_LOG_FILE" 2>&1 </dev/null &
+NEW_BRIDGE_PID=$!
+echo "$NEW_BRIDGE_PID" > "$BRIDGE_PID_FILE"
+chmod 600 "$BRIDGE_PID_FILE"
+echo "Bridge pid: $NEW_BRIDGE_PID"
+echo "Bridge logs: $BRIDGE_LOG_FILE"
 
 # Wait for bridge to come up
 sleep 3
diff --git a/start.sh b/start.sh
index 2c2b034..a0ad82c 100755
--- a/start.sh
+++ b/start.sh
@@ -83,16 +83,44 @@ fi
 if [ -n "$BRIDGE_SCRIPT" ]; then
   RELEASE_BRIDGE="/opt/baudbot/current/slack-bridge"
 
-  tmux kill-session -t slack-bridge 2>/dev/null || true
-  echo "Starting Slack bridge ($BRIDGE_SCRIPT)..."
-  tmux new-session -d -s slack-bridge \
-    "export PATH=$HOME/.varlock/bin:$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && \
-     cd $RELEASE_BRIDGE && \
-     while true; do \
-       varlock run --path ~/.config/ -- node $BRIDGE_SCRIPT; \
-       echo '⚠️ Bridge exited (\$?), restarting in 5s...'; \
-       sleep 5; \
-     done"
+  BRIDGE_LOG_DIR="$HOME/.pi/agent/logs"
+  BRIDGE_LOG_FILE="$BRIDGE_LOG_DIR/slack-bridge.log"
+  BRIDGE_PID_FILE="$HOME/.pi/agent/slack-bridge.pid"
+
+  mkdir -p "$BRIDGE_LOG_DIR"
+
+  # Stop any previous bridge tracked by the pid file. The pid belongs to the
+  # supervisor subshell, so its direct children are looked up and signalled
+  # as well; killing the supervisor alone would leave the running bridge
+  # orphaned and still holding its port.
+  if [ -f "$BRIDGE_PID_FILE" ]; then
+    old_pid="$(cat "$BRIDGE_PID_FILE" 2>/dev/null || true)"
+    if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then
+      old_child_pids="$(pgrep -P "$old_pid" 2>/dev/null || true)"
+      # $old_child_pids is unquoted on purpose: one argument per pid.
+      kill $old_child_pids "$old_pid" 2>/dev/null || true
+      sleep 1
+      kill -9 $old_child_pids "$old_pid" 2>/dev/null || true
+    fi
+    rm -f "$BRIDGE_PID_FILE"
+  fi
+
+  echo "Starting Slack bridge ($BRIDGE_SCRIPT)... logs: $BRIDGE_LOG_FILE"
+  (
+    export PATH="$HOME/.varlock/bin:$HOME/opt/node-v22.14.0-linux-x64/bin:$PATH"
+    cd "$RELEASE_BRIDGE"
+    while true; do
+      # Save the status right away ($(date) below would overwrite $?); the
+      # "||" form also keeps the loop alive should errexit be enabled.
+      bridge_rc=0
+      varlock run --path ~/.config/ -- node "$BRIDGE_SCRIPT" >>"$BRIDGE_LOG_FILE" 2>&1 || bridge_rc=$?
+      echo "[$(date -Is)] ⚠️ Bridge exited ($bridge_rc), restarting in 5s..." >>"$BRIDGE_LOG_FILE"
+      sleep 5
+    done
+  ) &
+  # Intentionally track the supervisor subshell PID (not per-restart node child PID)
+  # so a single kill stops the entire bridge restart loop.
+  echo $! > "$BRIDGE_PID_FILE"
+  chmod 600 "$BRIDGE_PID_FILE"
 fi
 
 # Set session name (read by auto-name.ts extension)