Skip to content

Commit 1158c1e

Browse files
baudbot-agent and Baudbot authored
fix: graceful bridge restarts — SIGTERM handling + EADDRINUSE retry (#161)
Co-authored-by: Baudbot <hornet@agentmail.to>
1 parent c67a481 commit 1158c1e

2 files changed

Lines changed: 84 additions & 6 deletions

File tree

pi/skills/control-agent/startup-cleanup.sh

Lines changed: 29 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -87,17 +87,35 @@ mkdir -p "$BRIDGE_LOG_DIR"
8787
# --- Kill anything holding port 7890, any existing bridge tmux session,
8888
# and any leftover old-style PID-file supervisor.
8989
echo "Cleaning up old bridge..."
90+
91+
# Kill the tmux session first — this stops the restart loop from respawning
92+
# the bridge while we're trying to clean up the port.
93+
tmux kill-session -t "$BRIDGE_TMUX_SESSION" 2>/dev/null || true
94+
95+
# Now gracefully stop any process on the port. SIGTERM lets the bridge close
96+
# the HTTP server and release the port cleanly; SIGKILL is the fallback.
9097
PORT_PIDS=$(lsof -ti :7890 2>/dev/null || true)
9198
if [ -n "$PORT_PIDS" ]; then
92-
echo "Killing processes on port 7890: $PORT_PIDS"
93-
echo "$PORT_PIDS" | xargs kill -9 2>/dev/null || true
94-
sleep 1
99+
echo "Stopping processes on port 7890 (SIGTERM): $PORT_PIDS"
100+
echo "$PORT_PIDS" | xargs kill 2>/dev/null || true
101+
# Wait up to 3s for graceful shutdown
102+
for i in 1 2 3; do
103+
sleep 1
104+
PORT_PIDS=$(lsof -ti :7890 2>/dev/null || true)
105+
[ -z "$PORT_PIDS" ] && break
106+
done
107+
# Force-kill anything that didn't exit
108+
if [ -n "$PORT_PIDS" ]; then
109+
echo "Force-killing stubborn processes: $PORT_PIDS"
110+
echo "$PORT_PIDS" | xargs kill -9 2>/dev/null || true
111+
sleep 1
112+
fi
95113
fi
96-
tmux kill-session -t "$BRIDGE_TMUX_SESSION" 2>/dev/null || true
114+
97115
OLD_PID_FILE="$HOME/.pi/agent/slack-bridge.pid"
98116
if [ -f "$OLD_PID_FILE" ]; then
99117
OLD_PID="$(cat "$OLD_PID_FILE" 2>/dev/null || true)"
100-
[ -n "$OLD_PID" ] && kill -9 "$OLD_PID" 2>/dev/null || true
118+
[ -n "$OLD_PID" ] && kill "$OLD_PID" 2>/dev/null || true
101119
rm -f "$OLD_PID_FILE"
102120
fi
103121

@@ -151,6 +169,12 @@ tmux new-session -d -s "$BRIDGE_TMUX_SESSION" "\
151169
exit_code=\$?; \
152170
echo \"[\$(date -Is)] bridge: exited with code \$exit_code, restarting in 5s\" >> $BRIDGE_LOG_FILE; \
153171
sleep 5; \
172+
tries=0; \
173+
while lsof -ti :7890 >/dev/null 2>&1 && [ \$tries -lt 10 ]; do \
174+
echo \"[\$(date -Is)] bridge: port 7890 still in use, waiting...\" >> $BRIDGE_LOG_FILE; \
175+
sleep 2; \
176+
tries=\$((tries + 1)); \
177+
done; \
154178
done"
155179

156180
echo "Bridge tmux session: $BRIDGE_TMUX_SESSION"

slack-bridge/broker-bridge.mjs

Lines changed: 55 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -885,6 +885,36 @@ function getLogLinesForResponse(url) {
885885
return lines;
886886
}
887887

888+
/** Reference to the HTTP server so we can close it on shutdown. */
let apiServer = null;

/** Set once shutdown has begun so a second signal does not restart it. */
let shuttingDown = false;

/** How long to wait for in-flight requests before forcing the process down. */
const SHUTDOWN_TIMEOUT_MS = 5000;

/**
 * Graceful shutdown: close the HTTP server (releases the port), then exit.
 * Called on SIGTERM/SIGINT so restarts don't fight over the port.
 *
 * Exit codes: 0 when the server closed cleanly (or was never bound),
 * 1 when the drain timeout expired and the process had to be forced down.
 *
 * @param {string} signal - Name of the signal that triggered the shutdown;
 *   used only for logging.
 * @returns {void}
 */
function gracefulShutdown(signal) {
  // Repeated signals (e.g. SIGINT followed by SIGTERM) are ignored.
  if (shuttingDown) return;
  shuttingDown = true;
  logInfo(`🛑 received ${signal} — shutting down gracefully`);

  // Nothing is bound yet (e.g. still retrying the listen), so just exit.
  if (!apiServer) {
    process.exit(0);
    return;
  }

  apiServer.close(() => {
    logInfo("🛑 HTTP server closed, exiting");
    process.exit(0);
  });

  // close() only stops accepting new connections. On Node versions where it
  // does not also drop idle keep-alive sockets, those sockets would hold the
  // server open until the force-exit below. Close them explicitly when the
  // runtime supports it; requests that are mid-flight are left to finish.
  if (typeof apiServer.closeIdleConnections === "function") {
    apiServer.closeIdleConnections();
  }

  // Force exit after 5s if connections don't drain. unref() keeps this timer
  // from holding the event loop open on its own.
  setTimeout(() => {
    logWarn("🛑 forceful exit after 5s timeout");
    process.exit(1);
  }, SHUTDOWN_TIMEOUT_MS).unref();
}

process.on("SIGTERM", () => gracefulShutdown("SIGTERM"));
process.on("SIGINT", () => gracefulShutdown("SIGINT"));
917+
888918
function startApiServer() {
889919
const server = createServer(async (req, res) => {
890920
const url = new URL(req.url, `http://localhost:${API_PORT}`);
@@ -1024,9 +1054,33 @@ function startApiServer() {
10241054
}
10251055
});
10261056

1027-
server.listen(API_PORT, "127.0.0.1", () => {
1057+
// Retry with backoff if the port is still held by a dying predecessor.
1058+
const MAX_BIND_RETRIES = 5;
1059+
const BIND_RETRY_DELAY_MS = 2000;
1060+
let bindAttempt = 0;
1061+
1062+
function tryListen() {
1063+
bindAttempt++;
1064+
server.listen(API_PORT, "127.0.0.1");
1065+
}
1066+
1067+
server.on("listening", () => {
1068+
apiServer = server;
10281069
logInfo(`📡 Outbound API listening on http://127.0.0.1:${API_PORT}`);
10291070
});
1071+
1072+
server.on("error", (err) => {
1073+
if (err.code === "EADDRINUSE" && bindAttempt < MAX_BIND_RETRIES) {
1074+
logWarn(`⚠️ port ${API_PORT} in use, retrying in ${BIND_RETRY_DELAY_MS}ms (attempt ${bindAttempt}/${MAX_BIND_RETRIES})`);
1075+
server.close();
1076+
setTimeout(tryListen, BIND_RETRY_DELAY_MS);
1077+
} else {
1078+
logError(`❌ HTTP server error: ${err.message}`);
1079+
process.exit(1);
1080+
}
1081+
});
1082+
1083+
tryListen();
10301084
}
10311085

10321086
async function startPollLoop() {

0 commit comments

Comments
 (0)