From a8bc26511115ed74f539358a10d813818c9326e1 Mon Sep 17 00:00:00 2001 From: Ben Vinegar Date: Sun, 22 Feb 2026 20:36:53 -0500 Subject: [PATCH 1/4] ops: remove tmux-based bridge guidance and runtime paths --- README.md | 2 +- bin/baudbot | 14 +-- bin/baudbot.test.sh | 13 +-- bin/lib/baudbot-runtime.sh | 99 +++---------------- install.sh | 15 +-- pi/skills/control-agent/SKILL.md | 49 +++++---- pi/skills/control-agent/memory/operational.md | 2 +- pi/skills/control-agent/startup-cleanup.sh | 36 +++++-- start.sh | 39 ++++++-- 9 files changed, 116 insertions(+), 153 deletions(-) diff --git a/README.md b/README.md index 6d3f14c..5bcc82e 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ Baudbot is designed as shared engineering infrastructure, not a single-user desk | **CPU** | 2 vCPU | 4 vCPU | | **Disk** | 20 GB | 40 GB+ (repos, dependencies, Docker images) | -System package dependencies (installed by `baudbot install`): `git`, `curl`, `tmux`, `iptables`, `docker`, `gh`, `jq`, `sudo`. +System package dependencies (installed by `baudbot install`): `git`, `curl`, `iptables`, `docker`, `gh`, `jq`, `sudo`. 
## Quick Start diff --git a/bin/baudbot b/bin/baudbot index 37123e4..f17043d 100755 --- a/bin/baudbot +++ b/bin/baudbot @@ -131,8 +131,8 @@ usage() { echo " restart Restart the agent" echo " status Show agent status + deployed version + broker connection" echo " logs Tail agent logs" - echo " attach Attach to control-agent by default; supports --pi/--tmux" - echo " sessions List agent tmux and pi sessions (name → id)" + echo " attach Attach to a running pi session (defaults to control-agent)" + echo " sessions List live pi sessions (name → id)" echo "" echo -e "${BOLD}Setup:${RESET}" echo " install Bootstrap install from GitHub (download script, then escalate)" @@ -351,16 +351,6 @@ case "${1:-}" in shift require_root "restart" if has_systemd; then - # Ensure any pre-existing detached bridge tmux session is torn down so - # restart always boots a fresh bridge from currently deployed runtime files. - AGENT_USER="${BAUDBOT_AGENT_USER:-baudbot_agent}" - if command -v tmux >/dev/null 2>&1; then - if command -v sudo >/dev/null 2>&1; then - sudo -u "$AGENT_USER" tmux kill-session -t slack-bridge 2>/dev/null || true - elif command -v runuser >/dev/null 2>&1; then - runuser -u "$AGENT_USER" -- tmux kill-session -t slack-bridge 2>/dev/null || true - fi - fi exec systemctl restart baudbot "$@" else echo "systemd not available." 
diff --git a/bin/baudbot.test.sh b/bin/baudbot.test.sh index ba2ba7a..8867be2 100644 --- a/bin/baudbot.test.sh +++ b/bin/baudbot.test.sh @@ -130,7 +130,7 @@ EOF ) } -test_restart_restarts_systemd_and_kills_bridge_tmux() { +test_restart_restarts_systemd() { ( set -euo pipefail local tmp fakebin log_file @@ -168,12 +168,6 @@ if [ "${1:-}" = "-u" ]; then fi echo "sudo $*" >> "${BAUDBOT_TEST_LOG}" exec "$@" -EOF - - cat > "$fakebin/tmux" <<'EOF' -#!/bin/bash -echo "tmux $*" >> "${BAUDBOT_TEST_LOG}" -exit 0 EOF cat > "$fakebin/systemctl" <<'EOF' @@ -182,11 +176,10 @@ echo "systemctl $*" >> "${BAUDBOT_TEST_LOG}" exit 0 EOF - chmod +x "$fakebin/id" "$fakebin/sudo" "$fakebin/tmux" "$fakebin/systemctl" + chmod +x "$fakebin/id" "$fakebin/sudo" "$fakebin/systemctl" PATH="$fakebin:$PATH" BAUDBOT_TEST_LOG="$log_file" BAUDBOT_ROOT="$tmp" bash "$CLI" restart - grep -q '^tmux kill-session -t slack-bridge$' "$log_file" grep -q '^systemctl restart baudbot$' "$log_file" ) } @@ -198,7 +191,7 @@ run_test "version reads package.json" test_version_uses_package_json run_test "status dispatches via runtime module" test_status_dispatches_via_runtime_module run_test "attach requires root" test_attach_requires_root run_test "broker register requires root" test_broker_register_requires_root -run_test "restart kills bridge tmux then restarts systemd" test_restart_restarts_systemd_and_kills_bridge_tmux +run_test "restart restarts systemd" test_restart_restarts_systemd echo "" echo "=== $PASSED/$TOTAL passed, $FAILED failed ===" diff --git a/bin/lib/baudbot-runtime.sh b/bin/lib/baudbot-runtime.sh index 6a508ba..b41cc4e 100644 --- a/bin/lib/baudbot-runtime.sh +++ b/bin/lib/baudbot-runtime.sh @@ -305,8 +305,9 @@ cmd_logs() { exec journalctl -u baudbot -f "$@" fi - echo "No systemd unit. Check tmux sessions:" - echo " sudo -u baudbot_agent tmux ls" + echo "No systemd unit. 
Check process + logs:" + echo " pgrep -u baudbot_agent -af 'pi --session-control'" + echo " tail -n 200 /home/baudbot_agent/.pi/agent/logs/runtime.log" } cmd_sessions() { @@ -316,14 +317,6 @@ cmd_sessions() { local found alias alias_name alias_uuid sock sess_id name status declare -A ALIASES - echo -e "${BOLD}tmux sessions:${RESET}" - if sudo -u "$AGENT_USER" tmux ls 2>/dev/null; then - : - else - echo " (none)" - fi - - echo "" echo -e "${BOLD}pi sessions:${RESET}" PI_CONTROL_DIR="$(pi_control_dir "$AGENT_USER")" if [ ! -d "$PI_CONTROL_DIR" ]; then @@ -377,28 +370,26 @@ cmd_attach() { local AGENT_USER="baudbot_agent" local AGENT_HOME="/home/$AGENT_USER" - local ATTACH_MODE="auto" local TARGET="" - local tmux_target pi_target + local pi_target while [ "$#" -gt 0 ]; do case "$1" in --pi) - ATTACH_MODE="pi" + # Backward-compatible no-op (pi is the only supported mode now) shift ;; --tmux) - ATTACH_MODE="tmux" - shift + echo "❌ --tmux is no longer supported. Use: sudo baudbot attach [session-name|session-id]" + exit 1 ;; -h|--help) - echo "Usage: sudo baudbot attach [--pi|--tmux] [session-name|session-id]" + echo "Usage: sudo baudbot attach [session-name|session-id]" echo "" echo "Examples:" echo " sudo baudbot attach # defaults to control-agent" - echo " sudo baudbot attach --pi control-agent" - echo " sudo baudbot attach --pi " - echo " sudo baudbot attach --tmux slack-bridge" + echo " sudo baudbot attach control-agent" + echo " sudo baudbot attach " exit 0 ;; *) @@ -416,82 +407,22 @@ cmd_attach() { TARGET="control-agent" fi - attach_tmux_session() { - local tmux_target="$1" - echo -e "${BOLD}${CYAN}Attaching to tmux session:${RESET} $tmux_target" - echo -e "${GREEN}Safe detach:${RESET} Ctrl+b, d ${DIM}(keeps agent running)${RESET}" - echo "" - pause_before_attach - exec sudo -u "$AGENT_USER" tmux attach-session -t "$tmux_target" - } - attach_pi_session() { - local pi_target="$1" - echo -e "${BOLD}${CYAN}Attaching to pi session:${RESET} $pi_target" + local 
target_session="$1" + echo -e "${BOLD}${CYAN}Attaching to pi session:${RESET} $target_session" echo -e "${BOLD}${YELLOW}Safe detach (does NOT stop the agent):${RESET}" echo -e " ${YELLOW}1)${RESET} Press Ctrl+C once to clear input/cancel local prompt" echo -e " ${YELLOW}2)${RESET} Press Ctrl+C again to exit this client" echo -e " ${GREEN}Agent keeps running under systemd in the background.${RESET}" echo "" pause_before_attach - exec sudo -u "$AGENT_USER" bash -lc "export PATH='$AGENT_HOME/.varlock/bin:$AGENT_HOME/opt/node-v22.14.0-linux-x64/bin':\$PATH; cd ~; varlock run --path ~/.config/ -- pi --session '$pi_target'" - } - - choose_tmux_target() { - local requested="${1:-}" - local first - - if [ -n "$requested" ]; then - if sudo -u "$AGENT_USER" tmux has-session -t "$requested" 2>/dev/null; then - echo "$requested" - return 0 - fi - return 1 - fi - - first=$(sudo -u "$AGENT_USER" tmux ls -F '#{session_name}' 2>/dev/null | head -1) - [ -n "$first" ] || return 1 - echo "$first" - return 0 - } - - choose_pi_target() { - local requested="${1:-}" - local resolved - - if ! resolved=$(resolve_pi_session_id "$AGENT_USER" "$requested"); then - return 1 - fi - - [ -n "$resolved" ] || return 1 - echo "$resolved" - return 0 + exec sudo -u "$AGENT_USER" bash -lc "export PATH='$AGENT_HOME/.varlock/bin:$AGENT_HOME/opt/node-v22.14.0-linux-x64/bin':\$PATH; cd ~; varlock run --path ~/.config/ -- pi --session '$target_session'" } - if [ "$ATTACH_MODE" = "tmux" ]; then - if tmux_target=$(choose_tmux_target "$TARGET"); then - attach_tmux_session "$tmux_target" - fi - echo "❌ tmux session not found. See: sudo baudbot sessions" - exit 1 - fi - - if [ "$ATTACH_MODE" = "pi" ]; then - if pi_target=$(choose_pi_target "$TARGET"); then - attach_pi_session "$pi_target" - fi - echo "❌ pi session not found. 
See: sudo baudbot sessions" - exit 1 - fi - - if pi_target=$(choose_pi_target "$TARGET"); then + if pi_target=$(resolve_pi_session_id "$AGENT_USER" "$TARGET"); then attach_pi_session "$pi_target" fi - if tmux_target=$(choose_tmux_target "$TARGET"); then - attach_tmux_session "$tmux_target" - fi - - echo "❌ No matching tmux/pi session found. See: sudo baudbot sessions" + echo "❌ pi session not found. See: sudo baudbot sessions" exit 1 } diff --git a/install.sh b/install.sh index 91a4260..6afce83 100755 --- a/install.sh +++ b/install.sh @@ -163,7 +163,7 @@ install_prereqs_ubuntu() { for attempt in $(seq 1 5); do if DEBIAN_FRONTEND=noninteractive apt-get -o DPkg::Lock::Timeout=120 update -qq \ - && DEBIAN_FRONTEND=noninteractive apt-get -o DPkg::Lock::Timeout=120 install -y -qq git curl tmux iptables docker.io gh jq sudo 2>&1 | tail -3; then + && DEBIAN_FRONTEND=noninteractive apt-get -o DPkg::Lock::Timeout=120 install -y -qq git curl iptables docker.io gh jq sudo 2>&1 | tail -3; then return 0 fi @@ -179,10 +179,10 @@ install_prereqs_ubuntu() { } install_prereqs_arch() { - pacman -Syu --noconfirm --needed git curl tmux iptables docker github-cli jq sudo 2>&1 | tail -5 + pacman -Syu --noconfirm --needed git curl iptables docker github-cli jq sudo 2>&1 | tail -5 } -info "Installing: git, curl, tmux, iptables, docker, gh, jq, sudo" +info "Installing: git, curl, iptables, docker, gh, jq, sudo" "install_prereqs_$DISTRO" info "Prerequisites installed" @@ -318,12 +318,15 @@ else warn "Agent didn't start — check: baudbot logs" fi else - sudo -u baudbot_agent tmux new-session -d -s baudbot "$BAUDBOT_HOME/runtime/start.sh" 2>/dev/null || true + RUNTIME_LOG_DIR="$BAUDBOT_HOME/.pi/agent/logs" + RUNTIME_LOG_FILE="$RUNTIME_LOG_DIR/runtime.log" + sudo -u baudbot_agent mkdir -p "$RUNTIME_LOG_DIR" + sudo -u baudbot_agent bash -lc "nohup '$BAUDBOT_HOME/runtime/start.sh' >> '$RUNTIME_LOG_FILE' 2>&1 &" sleep 2 - if sudo -u baudbot_agent tmux has-session -t baudbot 2>/dev/null; then + if 
pgrep -u baudbot_agent -f "pi --session-control" >/dev/null 2>&1; then info "Agent is running ✓" else - warn "Agent didn't start — try: baudbot start --direct" + warn "Agent didn't start — check: $RUNTIME_LOG_FILE" fi fi else diff --git a/pi/skills/control-agent/SKILL.md b/pi/skills/control-agent/SKILL.md index 280b1dc..7b82762 100644 --- a/pi/skills/control-agent/SKILL.md +++ b/pi/skills/control-agent/SKILL.md @@ -198,19 +198,22 @@ git fetch origin git worktree add ~/workspace/worktrees/$BRANCH -b $BRANCH origin/main # 2. Launch the agent IN the worktree -tmux new-session -d -s $SESSION_NAME \ - "cd ~/workspace/worktrees/$BRANCH && \ - export PATH=\$HOME/.varlock/bin:\$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && \ - export PI_SESSION_NAME=$SESSION_NAME && \ - exec varlock run --path ~/.config/ -- pi --session-control --skill ~/.pi/agent/skills/dev-agent --model " +mkdir -p ~/.pi/agent/logs +nohup bash -lc "cd ~/workspace/worktrees/$BRANCH && \ + export PATH=\$HOME/.varlock/bin:\$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && \ + export PI_SESSION_NAME=$SESSION_NAME && \ + exec varlock run --path ~/.config/ -- pi --session-control --skill ~/.pi/agent/skills/dev-agent --model " \ + > ~/.pi/agent/logs/$SESSION_NAME.log 2>&1 & +DEV_PID=$! 
+echo $DEV_PID > ~/.pi/agent/$SESSION_NAME.pid ``` **Important notes:** - `cd` into the worktree BEFORE launching pi — this ensures pi discovers project context from the repo's CWD -- Use `exec` so the tmux session exits when pi exits - Use `varlock run --path ~/.config/` to validate and inject env vars - Set `PI_SESSION_NAME` so the auto-name extension registers it - Include `--session-control` for `send_to_session` / `list_sessions` +- Read logs with `tail -f ~/.pi/agent/logs/$SESSION_NAME.log` - Wait **~10 seconds** after spawning before sending messages (agent needs time to initialize) - Do NOT use `--name` (not a real pi CLI flag) @@ -225,8 +228,13 @@ SESSION_NAME=dev-agent-myapp-a8b7b331 REPO=myapp BRANCH=fix/some-descriptive-name -# 1. Kill the tmux session (agent should have already exited, but ensure it) -tmux kill-session -t $SESSION_NAME 2>/dev/null || true +# 1. Kill the process if still running +PID_FILE=~/.pi/agent/$SESSION_NAME.pid +if [ -f "$PID_FILE" ]; then + PID=$(cat "$PID_FILE" 2>/dev/null || true) + [ -n "$PID" ] && kill "$PID" 2>/dev/null || true + rm -f "$PID_FILE" +fi # 2. Remove the worktree cd ~/workspace/$REPO @@ -309,10 +317,10 @@ This removes stale `.sock` files, cleans dead aliases, and restarts the Slack br - [ ] Find or create sentry-agent: 1. Use `list_sessions` to look for a session named `sentry-agent` 2. If found, use that session - 3. If not found, launch with tmux (see Sentry Agent section) + 3. If not found, launch as a normal background process (see Sentry Agent section) 4. Wait ~8 seconds, then send role assignment - [ ] Send role assignment to the `sentry-agent` session -- [ ] Clean up any stale dev-agent worktrees/tmux sessions from previous runs +- [ ] Clean up any stale dev-agent worktrees/background processes from previous runs **Note**: Dev agents are NOT started at startup. They are spawned on-demand when tasks arrive. 
@@ -330,7 +338,9 @@ The sentry-agent triages Sentry alerts and investigates critical issues via the | `OPENCODE_ZEN_API_KEY` | `opencode-zen/claude-haiku-4-5` | ```bash -tmux new-session -d -s sentry-agent "export PATH=\$HOME/.varlock/bin:\$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && export PI_SESSION_NAME=sentry-agent && varlock run --path ~/.config/ -- pi --session-control --skill ~/.pi/agent/skills/sentry-agent --model " +mkdir -p ~/.pi/agent/logs +nohup bash -lc "export PATH=\$HOME/.varlock/bin:\$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && export PI_SESSION_NAME=sentry-agent && exec varlock run --path ~/.config/ -- pi --session-control --skill ~/.pi/agent/skills/sentry-agent --model " > ~/.pi/agent/logs/sentry-agent.log 2>&1 & +echo $! > ~/.pi/agent/sentry-agent.pid ``` **Model note**: `github-copilot/*` models reject Personal Access Tokens and will fail in non-interactive sessions. @@ -339,17 +349,14 @@ The sentry-agent operates in **on-demand mode** — it does NOT poll. Sentry ale ### Starting the Slack Bridge -The `startup-cleanup.sh` script handles bridge (re)start automatically — it detects broker vs Socket Mode, reads the control-agent UUID, and launches the bridge in a `slack-bridge` tmux session. +The `startup-cleanup.sh` script handles bridge (re)start automatically — it detects broker vs Socket Mode, reads the control-agent UUID, and starts the bridge as a normal background process. 
-If you need to restart the bridge manually: +If you need to restart the bridge manually, run `startup-cleanup.sh` again and then inspect logs: ```bash -MY_UUID=$(readlink ~/.pi/session-control/control-agent.alias | sed 's/.sock$//') -tmux kill-session -t slack-bridge 2>/dev/null || true -tmux new-session -d -s slack-bridge \ - "unset PKG_EXECPATH; export PATH=\$HOME/.varlock/bin:\$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && export PI_SESSION_ID=$MY_UUID && cd ~/runtime/slack-bridge && exec varlock run --path ~/.config/ -- node broker-bridge.mjs" +tail -n 200 ~/.pi/agent/logs/slack-bridge.log ``` -Verify: `curl -s -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:7890/send -H 'Content-Type: application/json' -d '{}'` → should return `400`. +Verify API readiness: `curl -s -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:7890/send -H 'Content-Type: application/json' -d '{}'` → should return `400`. The bridge forwards: - **Human @mentions and DMs** from allowed users → delivered to you with security boundaries for handling @@ -362,9 +369,9 @@ Health checks run automatically every ~10 minutes via the `heartbeat.ts` extensi If you need to check manually, use `heartbeat trigger` to run all checks immediately. When the heartbeat reports a failure, take the appropriate action: -1. **Missing sentry-agent**: Respawn with tmux and re-send role assignment. -2. **Orphaned dev-agents**: Kill tmux session and remove worktree. -3. **Bridge down**: Restart the `slack-bridge` tmux session. +1. **Missing sentry-agent**: Respawn as a background process and re-send role assignment. +2. **Orphaned dev-agents**: Kill stale process + remove worktree. +3. **Bridge down**: Restart via `startup-cleanup.sh` and read `~/.pi/agent/logs/slack-bridge.log`. 4. **Stale worktrees**: `git worktree remove --force` + `rmdir` empty parents. 5. **Stuck todos**: Escalate to user via Slack. 
diff --git a/pi/skills/control-agent/memory/operational.md b/pi/skills/control-agent/memory/operational.md index 55df260..7a4f632 100644 --- a/pi/skills/control-agent/memory/operational.md +++ b/pi/skills/control-agent/memory/operational.md @@ -8,5 +8,5 @@ Add entries under dated headings. Keep entries concise — one line per learning diff --git a/pi/skills/control-agent/startup-cleanup.sh b/pi/skills/control-agent/startup-cleanup.sh index ece41ad..b9e027b 100755 --- a/pi/skills/control-agent/startup-cleanup.sh +++ b/pi/skills/control-agent/startup-cleanup.sh @@ -7,7 +7,7 @@ # Pass the live session UUIDs (from list_sessions) as arguments. # Any .sock file whose UUID is NOT in the live set gets removed. # Stale .alias symlinks pointing to removed sockets also get cleaned. -# Then restarts the slack-bridge tmux session with the current control-agent UUID. +# Then restarts the slack-bridge process with the current control-agent UUID. set -euo pipefail @@ -66,11 +66,20 @@ else exit 1 fi -# Kill existing slack-bridge tmux session if running -if tmux has-session -t slack-bridge 2>/dev/null; then - echo "Killing existing slack-bridge session..." - tmux kill-session -t slack-bridge - sleep 1 +BRIDGE_PID_FILE="$HOME/.pi/agent/slack-bridge.pid" +BRIDGE_LOG_DIR="$HOME/.pi/agent/logs" +BRIDGE_LOG_FILE="$BRIDGE_LOG_DIR/slack-bridge.log" + +# Kill existing slack-bridge process if running +if [ -f "$BRIDGE_PID_FILE" ]; then + BRIDGE_PID="$(cat "$BRIDGE_PID_FILE" 2>/dev/null || true)" + if [ -n "$BRIDGE_PID" ] && kill -0 "$BRIDGE_PID" 2>/dev/null; then + echo "Killing existing slack-bridge process (pid=$BRIDGE_PID)..." + kill "$BRIDGE_PID" 2>/dev/null || true + sleep 1 + kill -9 "$BRIDGE_PID" 2>/dev/null || true + fi + rm -f "$BRIDGE_PID_FILE" fi # Select bridge script: prefer broker pull mode when SLACK_BROKER_* vars are present, @@ -101,8 +110,19 @@ fi # Start fresh slack-bridge echo "Starting slack-bridge ($BRIDGE_SCRIPT) with PI_SESSION_ID=$MY_UUID..." 
-tmux new-session -d -s slack-bridge \ - "unset PKG_EXECPATH; export PATH=\$HOME/.varlock/bin:\$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && export PI_SESSION_ID=$MY_UUID && cd /opt/baudbot/current/slack-bridge && exec varlock run --path ~/.config/ -- node $BRIDGE_SCRIPT" +mkdir -p "$BRIDGE_LOG_DIR" +( + unset PKG_EXECPATH + export PATH="$HOME/.varlock/bin:$HOME/opt/node-v22.14.0-linux-x64/bin:$PATH" + export PI_SESSION_ID="$MY_UUID" + cd /opt/baudbot/current/slack-bridge + exec varlock run --path ~/.config/ -- node "$BRIDGE_SCRIPT" +) >>"$BRIDGE_LOG_FILE" 2>&1 & +NEW_BRIDGE_PID=$! +echo "$NEW_BRIDGE_PID" > "$BRIDGE_PID_FILE" +chmod 600 "$BRIDGE_PID_FILE" +echo "Bridge pid: $NEW_BRIDGE_PID" +echo "Bridge logs: $BRIDGE_LOG_FILE" # Wait for bridge to come up sleep 3 diff --git a/start.sh b/start.sh index 2c2b034..a3ee453 100755 --- a/start.sh +++ b/start.sh @@ -83,16 +83,35 @@ fi if [ -n "$BRIDGE_SCRIPT" ]; then RELEASE_BRIDGE="/opt/baudbot/current/slack-bridge" - tmux kill-session -t slack-bridge 2>/dev/null || true - echo "Starting Slack bridge ($BRIDGE_SCRIPT)..." - tmux new-session -d -s slack-bridge \ - "export PATH=$HOME/.varlock/bin:$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && \ - cd $RELEASE_BRIDGE && \ - while true; do \ - varlock run --path ~/.config/ -- node $BRIDGE_SCRIPT; \ - echo '⚠️ Bridge exited (\$?), restarting in 5s...'; \ - sleep 5; \ - done" + BRIDGE_LOG_DIR="$HOME/.pi/agent/logs" + BRIDGE_LOG_FILE="$BRIDGE_LOG_DIR/slack-bridge.log" + BRIDGE_PID_FILE="$HOME/.pi/agent/slack-bridge.pid" + + mkdir -p "$BRIDGE_LOG_DIR" + + # Stop any previous bridge process tracked by pid file. + if [ -f "$BRIDGE_PID_FILE" ]; then + old_pid="$(cat "$BRIDGE_PID_FILE" 2>/dev/null || true)" + if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then + kill "$old_pid" 2>/dev/null || true + sleep 1 + kill -9 "$old_pid" 2>/dev/null || true + fi + rm -f "$BRIDGE_PID_FILE" + fi + + echo "Starting Slack bridge ($BRIDGE_SCRIPT)... 
logs: $BRIDGE_LOG_FILE" + ( + export PATH="$HOME/.varlock/bin:$HOME/opt/node-v22.14.0-linux-x64/bin:$PATH" + cd "$RELEASE_BRIDGE" + while true; do + varlock run --path ~/.config/ -- node "$BRIDGE_SCRIPT" >>"$BRIDGE_LOG_FILE" 2>&1 + echo "[$(date -Is)] ⚠️ Bridge exited ($?), restarting in 5s..." >>"$BRIDGE_LOG_FILE" + sleep 5 + done + ) & + echo $! > "$BRIDGE_PID_FILE" + chmod 600 "$BRIDGE_PID_FILE" fi # Set session name (read by auto-name.ts extension) From b1c8ced4f5c9a92ce895a49fa223fc93f6dda00f Mon Sep 17 00:00:00 2001 From: Ben Vinegar Date: Sun, 22 Feb 2026 20:41:15 -0500 Subject: [PATCH 2/4] ops: keep tmux for agent sessions, remove bridge tmux guidance --- README.md | 2 +- bin/baudbot | 4 +- bin/lib/baudbot-runtime.sh | 99 ++++++++++++++++--- install.sh | 15 ++- pi/skills/control-agent/SKILL.md | 43 ++++---- pi/skills/control-agent/memory/operational.md | 2 +- 6 files changed, 111 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index 5bcc82e..6d3f14c 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ Baudbot is designed as shared engineering infrastructure, not a single-user desk | **CPU** | 2 vCPU | 4 vCPU | | **Disk** | 20 GB | 40 GB+ (repos, dependencies, Docker images) | -System package dependencies (installed by `baudbot install`): `git`, `curl`, `iptables`, `docker`, `gh`, `jq`, `sudo`. +System package dependencies (installed by `baudbot install`): `git`, `curl`, `tmux`, `iptables`, `docker`, `gh`, `jq`, `sudo`. 
## Quick Start diff --git a/bin/baudbot b/bin/baudbot index f17043d..2b99f4b 100755 --- a/bin/baudbot +++ b/bin/baudbot @@ -131,8 +131,8 @@ usage() { echo " restart Restart the agent" echo " status Show agent status + deployed version + broker connection" echo " logs Tail agent logs" - echo " attach Attach to a running pi session (defaults to control-agent)" - echo " sessions List live pi sessions (name → id)" + echo " attach Attach to control-agent by default; supports --pi/--tmux" + echo " sessions List agent tmux and pi sessions (name → id)" echo "" echo -e "${BOLD}Setup:${RESET}" echo " install Bootstrap install from GitHub (download script, then escalate)" diff --git a/bin/lib/baudbot-runtime.sh b/bin/lib/baudbot-runtime.sh index b41cc4e..4ba3cb8 100644 --- a/bin/lib/baudbot-runtime.sh +++ b/bin/lib/baudbot-runtime.sh @@ -305,9 +305,8 @@ cmd_logs() { exec journalctl -u baudbot -f "$@" fi - echo "No systemd unit. Check process + logs:" - echo " pgrep -u baudbot_agent -af 'pi --session-control'" - echo " tail -n 200 /home/baudbot_agent/.pi/agent/logs/runtime.log" + echo "No systemd unit. Check tmux sessions:" + echo " sudo -u baudbot_agent tmux ls" } cmd_sessions() { @@ -317,6 +316,14 @@ cmd_sessions() { local found alias alias_name alias_uuid sock sess_id name status declare -A ALIASES + echo -e "${BOLD}tmux sessions:${RESET}" + if sudo -u "$AGENT_USER" tmux ls 2>/dev/null; then + : + else + echo " (none)" + fi + + echo "" echo -e "${BOLD}pi sessions:${RESET}" PI_CONTROL_DIR="$(pi_control_dir "$AGENT_USER")" if [ ! -d "$PI_CONTROL_DIR" ]; then @@ -370,26 +377,28 @@ cmd_attach() { local AGENT_USER="baudbot_agent" local AGENT_HOME="/home/$AGENT_USER" + local ATTACH_MODE="auto" local TARGET="" - local pi_target + local tmux_target pi_target while [ "$#" -gt 0 ]; do case "$1" in --pi) - # Backward-compatible no-op (pi is the only supported mode now) + ATTACH_MODE="pi" shift ;; --tmux) - echo "❌ --tmux is no longer supported. 
Use: sudo baudbot attach [session-name|session-id]" - exit 1 + ATTACH_MODE="tmux" + shift ;; -h|--help) - echo "Usage: sudo baudbot attach [session-name|session-id]" + echo "Usage: sudo baudbot attach [--pi|--tmux] [session-name|session-id]" echo "" echo "Examples:" echo " sudo baudbot attach # defaults to control-agent" - echo " sudo baudbot attach control-agent" - echo " sudo baudbot attach " + echo " sudo baudbot attach --pi control-agent" + echo " sudo baudbot attach --pi " + echo " sudo baudbot attach --tmux sentry-agent" exit 0 ;; *) @@ -407,22 +416,82 @@ cmd_attach() { TARGET="control-agent" fi + attach_tmux_session() { + local tmux_target="$1" + echo -e "${BOLD}${CYAN}Attaching to tmux session:${RESET} $tmux_target" + echo -e "${GREEN}Safe detach:${RESET} Ctrl+b, d ${DIM}(keeps agent running)${RESET}" + echo "" + pause_before_attach + exec sudo -u "$AGENT_USER" tmux attach-session -t "$tmux_target" + } + attach_pi_session() { - local target_session="$1" - echo -e "${BOLD}${CYAN}Attaching to pi session:${RESET} $target_session" + local pi_target="$1" + echo -e "${BOLD}${CYAN}Attaching to pi session:${RESET} $pi_target" echo -e "${BOLD}${YELLOW}Safe detach (does NOT stop the agent):${RESET}" echo -e " ${YELLOW}1)${RESET} Press Ctrl+C once to clear input/cancel local prompt" echo -e " ${YELLOW}2)${RESET} Press Ctrl+C again to exit this client" echo -e " ${GREEN}Agent keeps running under systemd in the background.${RESET}" echo "" pause_before_attach - exec sudo -u "$AGENT_USER" bash -lc "export PATH='$AGENT_HOME/.varlock/bin:$AGENT_HOME/opt/node-v22.14.0-linux-x64/bin':\$PATH; cd ~; varlock run --path ~/.config/ -- pi --session '$target_session'" + exec sudo -u "$AGENT_USER" bash -lc "export PATH='$AGENT_HOME/.varlock/bin:$AGENT_HOME/opt/node-v22.14.0-linux-x64/bin':\$PATH; cd ~; varlock run --path ~/.config/ -- pi --session '$pi_target'" + } + + choose_tmux_target() { + local requested="${1:-}" + local first + + if [ -n "$requested" ]; then + if sudo -u 
"$AGENT_USER" tmux has-session -t "$requested" 2>/dev/null; then + echo "$requested" + return 0 + fi + return 1 + fi + + first=$(sudo -u "$AGENT_USER" tmux ls -F '#{session_name}' 2>/dev/null | head -1) + [ -n "$first" ] || return 1 + echo "$first" + return 0 + } + + choose_pi_target() { + local requested="${1:-}" + local resolved + + if ! resolved=$(resolve_pi_session_id "$AGENT_USER" "$requested"); then + return 1 + fi + + [ -n "$resolved" ] || return 1 + echo "$resolved" + return 0 } - if pi_target=$(resolve_pi_session_id "$AGENT_USER" "$TARGET"); then + if [ "$ATTACH_MODE" = "tmux" ]; then + if tmux_target=$(choose_tmux_target "$TARGET"); then + attach_tmux_session "$tmux_target" + fi + echo "❌ tmux session not found. See: sudo baudbot sessions" + exit 1 + fi + + if [ "$ATTACH_MODE" = "pi" ]; then + if pi_target=$(choose_pi_target "$TARGET"); then + attach_pi_session "$pi_target" + fi + echo "❌ pi session not found. See: sudo baudbot sessions" + exit 1 + fi + + if pi_target=$(choose_pi_target "$TARGET"); then attach_pi_session "$pi_target" fi - echo "❌ pi session not found. See: sudo baudbot sessions" + if tmux_target=$(choose_tmux_target "$TARGET"); then + attach_tmux_session "$tmux_target" + fi + + echo "❌ No matching tmux/pi session found. 
See: sudo baudbot sessions" exit 1 } diff --git a/install.sh b/install.sh index 6afce83..91a4260 100755 --- a/install.sh +++ b/install.sh @@ -163,7 +163,7 @@ install_prereqs_ubuntu() { for attempt in $(seq 1 5); do if DEBIAN_FRONTEND=noninteractive apt-get -o DPkg::Lock::Timeout=120 update -qq \ - && DEBIAN_FRONTEND=noninteractive apt-get -o DPkg::Lock::Timeout=120 install -y -qq git curl iptables docker.io gh jq sudo 2>&1 | tail -3; then + && DEBIAN_FRONTEND=noninteractive apt-get -o DPkg::Lock::Timeout=120 install -y -qq git curl tmux iptables docker.io gh jq sudo 2>&1 | tail -3; then return 0 fi @@ -179,10 +179,10 @@ install_prereqs_ubuntu() { } install_prereqs_arch() { - pacman -Syu --noconfirm --needed git curl iptables docker github-cli jq sudo 2>&1 | tail -5 + pacman -Syu --noconfirm --needed git curl tmux iptables docker github-cli jq sudo 2>&1 | tail -5 } -info "Installing: git, curl, iptables, docker, gh, jq, sudo" +info "Installing: git, curl, tmux, iptables, docker, gh, jq, sudo" "install_prereqs_$DISTRO" info "Prerequisites installed" @@ -318,15 +318,12 @@ else warn "Agent didn't start — check: baudbot logs" fi else - RUNTIME_LOG_DIR="$BAUDBOT_HOME/.pi/agent/logs" - RUNTIME_LOG_FILE="$RUNTIME_LOG_DIR/runtime.log" - sudo -u baudbot_agent mkdir -p "$RUNTIME_LOG_DIR" - sudo -u baudbot_agent bash -lc "nohup '$BAUDBOT_HOME/runtime/start.sh' >> '$RUNTIME_LOG_FILE' 2>&1 &" + sudo -u baudbot_agent tmux new-session -d -s baudbot "$BAUDBOT_HOME/runtime/start.sh" 2>/dev/null || true sleep 2 - if pgrep -u baudbot_agent -f "pi --session-control" >/dev/null 2>&1; then + if sudo -u baudbot_agent tmux has-session -t baudbot 2>/dev/null; then info "Agent is running ✓" else - warn "Agent didn't start — check: $RUNTIME_LOG_FILE" + warn "Agent didn't start — try: baudbot start --direct" fi fi else diff --git a/pi/skills/control-agent/SKILL.md b/pi/skills/control-agent/SKILL.md index 7b82762..cd4bc3f 100644 --- a/pi/skills/control-agent/SKILL.md +++ 
b/pi/skills/control-agent/SKILL.md @@ -198,22 +198,19 @@ git fetch origin git worktree add ~/workspace/worktrees/$BRANCH -b $BRANCH origin/main # 2. Launch the agent IN the worktree -mkdir -p ~/.pi/agent/logs -nohup bash -lc "cd ~/workspace/worktrees/$BRANCH && \ - export PATH=\$HOME/.varlock/bin:\$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && \ - export PI_SESSION_NAME=$SESSION_NAME && \ - exec varlock run --path ~/.config/ -- pi --session-control --skill ~/.pi/agent/skills/dev-agent --model " \ - > ~/.pi/agent/logs/$SESSION_NAME.log 2>&1 & -DEV_PID=$! -echo $DEV_PID > ~/.pi/agent/$SESSION_NAME.pid +tmux new-session -d -s $SESSION_NAME \ + "cd ~/workspace/worktrees/$BRANCH && \ + export PATH=\$HOME/.varlock/bin:\$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && \ + export PI_SESSION_NAME=$SESSION_NAME && \ + exec varlock run --path ~/.config/ -- pi --session-control --skill ~/.pi/agent/skills/dev-agent --model " ``` **Important notes:** - `cd` into the worktree BEFORE launching pi — this ensures pi discovers project context from the repo's CWD +- Use `exec` so the tmux session exits when pi exits - Use `varlock run --path ~/.config/` to validate and inject env vars - Set `PI_SESSION_NAME` so the auto-name extension registers it - Include `--session-control` for `send_to_session` / `list_sessions` -- Read logs with `tail -f ~/.pi/agent/logs/$SESSION_NAME.log` - Wait **~10 seconds** after spawning before sending messages (agent needs time to initialize) - Do NOT use `--name` (not a real pi CLI flag) @@ -228,13 +225,8 @@ SESSION_NAME=dev-agent-myapp-a8b7b331 REPO=myapp BRANCH=fix/some-descriptive-name -# 1. Kill the process if still running -PID_FILE=~/.pi/agent/$SESSION_NAME.pid -if [ -f "$PID_FILE" ]; then - PID=$(cat "$PID_FILE" 2>/dev/null || true) - [ -n "$PID" ] && kill "$PID" 2>/dev/null || true - rm -f "$PID_FILE" -fi +# 1. Kill the tmux session (agent should have already exited, but ensure it) +tmux kill-session -t $SESSION_NAME 2>/dev/null || true # 2. 
Remove the worktree cd ~/workspace/$REPO @@ -317,10 +309,10 @@ This removes stale `.sock` files, cleans dead aliases, and restarts the Slack br - [ ] Find or create sentry-agent: 1. Use `list_sessions` to look for a session named `sentry-agent` 2. If found, use that session - 3. If not found, launch as a normal background process (see Sentry Agent section) + 3. If not found, launch with tmux (see Sentry Agent section) 4. Wait ~8 seconds, then send role assignment - [ ] Send role assignment to the `sentry-agent` session -- [ ] Clean up any stale dev-agent worktrees/background processes from previous runs +- [ ] Clean up any stale dev-agent worktrees/tmux sessions from previous runs **Note**: Dev agents are NOT started at startup. They are spawned on-demand when tasks arrive. @@ -338,9 +330,7 @@ The sentry-agent triages Sentry alerts and investigates critical issues via the | `OPENCODE_ZEN_API_KEY` | `opencode-zen/claude-haiku-4-5` | ```bash -mkdir -p ~/.pi/agent/logs -nohup bash -lc "export PATH=\$HOME/.varlock/bin:\$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && export PI_SESSION_NAME=sentry-agent && exec varlock run --path ~/.config/ -- pi --session-control --skill ~/.pi/agent/skills/sentry-agent --model " > ~/.pi/agent/logs/sentry-agent.log 2>&1 & -echo $! > ~/.pi/agent/sentry-agent.pid +tmux new-session -d -s sentry-agent "export PATH=\$HOME/.varlock/bin:\$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && export PI_SESSION_NAME=sentry-agent && varlock run --path ~/.config/ -- pi --session-control --skill ~/.pi/agent/skills/sentry-agent --model " ``` **Model note**: `github-copilot/*` models reject Personal Access Tokens and will fail in non-interactive sessions. @@ -351,12 +341,13 @@ The sentry-agent operates in **on-demand mode** — it does NOT poll. Sentry ale The `startup-cleanup.sh` script handles bridge (re)start automatically — it detects broker vs Socket Mode, reads the control-agent UUID, and starts the bridge as a normal background process. 
-If you need to restart the bridge manually, run `startup-cleanup.sh` again and then inspect logs: +If you need to restart the bridge manually, rerun startup cleanup and then inspect logs: ```bash +bash ~/.pi/agent/skills/control-agent/startup-cleanup.sh UUID1 UUID2 UUID3 tail -n 200 ~/.pi/agent/logs/slack-bridge.log ``` -Verify API readiness: `curl -s -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:7890/send -H 'Content-Type: application/json' -d '{}'` → should return `400`. +Verify: `curl -s -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:7890/send -H 'Content-Type: application/json' -d '{}'` → should return `400`. The bridge forwards: - **Human @mentions and DMs** from allowed users → delivered to you with security boundaries for handling @@ -369,9 +360,9 @@ Health checks run automatically every ~10 minutes via the `heartbeat.ts` extensi If you need to check manually, use `heartbeat trigger` to run all checks immediately. When the heartbeat reports a failure, take the appropriate action: -1. **Missing sentry-agent**: Respawn as a background process and re-send role assignment. -2. **Orphaned dev-agents**: Kill stale process + remove worktree. -3. **Bridge down**: Restart via `startup-cleanup.sh` and read `~/.pi/agent/logs/slack-bridge.log`. +1. **Missing sentry-agent**: Respawn with tmux and re-send role assignment. +2. **Orphaned dev-agents**: Kill tmux session and remove worktree. +3. **Bridge down**: Restart via `startup-cleanup.sh`, then check `~/.pi/agent/logs/slack-bridge.log`. 4. **Stale worktrees**: `git worktree remove --force` + `rmdir` empty parents. 5. **Stuck todos**: Escalate to user via Slack. diff --git a/pi/skills/control-agent/memory/operational.md b/pi/skills/control-agent/memory/operational.md index 7a4f632..55df260 100644 --- a/pi/skills/control-agent/memory/operational.md +++ b/pi/skills/control-agent/memory/operational.md @@ -8,5 +8,5 @@ Add entries under dated headings. 
Keep entries concise — one line per learning From 0692c67be65fcb4adec23b4cf420d8c4cce50748 Mon Sep 17 00:00:00 2001 From: Ben Vinegar Date: Mon, 23 Feb 2026 09:13:37 -0500 Subject: [PATCH 3/4] ops: clarify bridge pid tracks supervisor loop --- start.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/start.sh b/start.sh index a3ee453..a0ad82c 100755 --- a/start.sh +++ b/start.sh @@ -110,6 +110,8 @@ if [ -n "$BRIDGE_SCRIPT" ]; then sleep 5 done ) & + # Intentionally track the supervisor subshell PID (not per-restart node child PID) + # so a single kill stops the entire bridge restart loop. echo $! > "$BRIDGE_PID_FILE" chmod 600 "$BRIDGE_PID_FILE" fi From 342626d7e2416d99eeba2eadc2c72eb73d3ec3a2 Mon Sep 17 00:00:00 2001 From: Ben Vinegar Date: Mon, 23 Feb 2026 11:26:48 -0500 Subject: [PATCH 4/4] ops: harden bridge restart supervision in startup cleanup --- pi/skills/control-agent/startup-cleanup.sh | 34 ++++++++++++++++++---- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/pi/skills/control-agent/startup-cleanup.sh b/pi/skills/control-agent/startup-cleanup.sh index b9e027b..81a456d 100755 --- a/pi/skills/control-agent/startup-cleanup.sh +++ b/pi/skills/control-agent/startup-cleanup.sh @@ -70,14 +70,33 @@ BRIDGE_PID_FILE="$HOME/.pi/agent/slack-bridge.pid" BRIDGE_LOG_DIR="$HOME/.pi/agent/logs" BRIDGE_LOG_FILE="$BRIDGE_LOG_DIR/slack-bridge.log" +kill_bridge_supervisor() { + local bridge_pid="$1" + [ -n "$bridge_pid" ] || return 0 + if ! kill -0 "$bridge_pid" 2>/dev/null; then + return 0 + fi + + # Best-effort: terminate direct children first so no stale bridge process keeps the port. 
+  local bridge_child_pids
+  bridge_child_pids="$(pgrep -P "$bridge_pid" 2>/dev/null || true)"
+  if [ -n "$bridge_child_pids" ]; then
+    kill $bridge_child_pids 2>/dev/null || true
+    sleep 1
+    kill -9 $bridge_child_pids 2>/dev/null || true
+  fi
+
+  kill "$bridge_pid" 2>/dev/null || true
+  sleep 1
+  kill -9 "$bridge_pid" 2>/dev/null || true
+}
+
 # Kill existing slack-bridge process if running
 if [ -f "$BRIDGE_PID_FILE" ]; then
   BRIDGE_PID="$(cat "$BRIDGE_PID_FILE" 2>/dev/null || true)"
   if [ -n "$BRIDGE_PID" ] && kill -0 "$BRIDGE_PID" 2>/dev/null; then
     echo "Killing existing slack-bridge process (pid=$BRIDGE_PID)..."
-    kill "$BRIDGE_PID" 2>/dev/null || true
-    sleep 1
-    kill -9 "$BRIDGE_PID" 2>/dev/null || true
+    kill_bridge_supervisor "$BRIDGE_PID"
   fi
   rm -f "$BRIDGE_PID_FILE"
 fi
@@ -109,6 +128,7 @@ if [ -z "$BRIDGE_SCRIPT" ]; then
 fi
 
 # Start fresh slack-bridge
+# Keep a supervisor loop (matching start.sh) so bridge restarts automatically on crash.
 echo "Starting slack-bridge ($BRIDGE_SCRIPT) with PI_SESSION_ID=$MY_UUID..."
 mkdir -p "$BRIDGE_LOG_DIR"
 (
@@ -116,8 +136,14 @@ mkdir -p "$BRIDGE_LOG_DIR"
   export PATH="$HOME/.varlock/bin:$HOME/opt/node-v22.14.0-linux-x64/bin:$PATH"
   export PI_SESSION_ID="$MY_UUID"
   cd /opt/baudbot/current/slack-bridge
-  exec varlock run --path ~/.config/ -- node "$BRIDGE_SCRIPT"
-) >>"$BRIDGE_LOG_FILE" 2>&1 &
+  while true; do
+    bridge_rc=0
+    varlock run --path ~/.config/ -- node "$BRIDGE_SCRIPT" >>"$BRIDGE_LOG_FILE" 2>&1 || bridge_rc=$?
+    # Log the saved status: $(date -Is) is expanded first and would overwrite $? with date's exit code.
+    echo "[$(date -Is)] ⚠️ Bridge exited ($bridge_rc), restarting in 5s..." >>"$BRIDGE_LOG_FILE"
+    sleep 5
+  done
+) &
 NEW_BRIDGE_PID=$!
 echo "$NEW_BRIDGE_PID" > "$BRIDGE_PID_FILE"
 chmod 600 "$BRIDGE_PID_FILE"