Skip to content

Commit c8acc8b

Browse files
authored
runtime: add adaptive Slack bridge restart policy (#148)
1 parent 388269b commit c8acc8b

10 files changed

Lines changed: 428 additions & 10 deletions

File tree

.env.schema

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,3 +196,27 @@ BRIDGE_API_PORT=7890
196196
# Target pi session ID (auto-detects control-agent if unset)
197197
# @sensitive=false @type=string
198198
PI_SESSION_ID=
199+
200+
# Bridge restart policy mode: legacy (fixed 5s restart) or adaptive (backoff + jitter). When unset, legacy is used unless any adaptive knob below is set.
201+
# @sensitive=false @type=string
202+
BAUDBOT_BRIDGE_RESTART_POLICY=
203+
204+
# Adaptive restart base delay (seconds)
205+
# @sensitive=false @type=number
206+
BAUDBOT_BRIDGE_RESTART_BASE_DELAY_SECONDS=
207+
208+
# Adaptive restart max delay cap (seconds)
209+
# @sensitive=false @type=number
210+
BAUDBOT_BRIDGE_RESTART_MAX_DELAY_SECONDS=
211+
212+
# Adaptive restart stable runtime window before counters reset (seconds)
213+
# @sensitive=false @type=number
214+
BAUDBOT_BRIDGE_RESTART_STABLE_WINDOW_SECONDS=
215+
216+
# Adaptive restart degraded-state threshold for consecutive failures
217+
# @sensitive=false @type=number
218+
BAUDBOT_BRIDGE_RESTART_MAX_CONSECUTIVE_FAILURES=
219+
220+
# Adaptive restart random jitter upper bound (seconds)
221+
# @sensitive=false @type=number
222+
BAUDBOT_BRIDGE_RESTART_JITTER_SECONDS=

CONFIGURATION.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,12 @@ On startup, Baudbot verifies deployed runtime files against `~/.pi/agent/baudbot
184184
|----------|-------------|---------|
185185
| `BRIDGE_API_PORT` | Local HTTP API port for outbound Slack messages | `7890` |
186186
| `PI_SESSION_ID` | Target pi session ID for the bridge | Auto-detects control-agent |
187+
| `BAUDBOT_BRIDGE_RESTART_POLICY` | Bridge supervisor mode (`legacy` or `adaptive`) | auto (`legacy` unless adaptive knobs are set) |
188+
| `BAUDBOT_BRIDGE_RESTART_BASE_DELAY_SECONDS` | Adaptive mode base restart delay | `5` |
189+
| `BAUDBOT_BRIDGE_RESTART_MAX_DELAY_SECONDS` | Adaptive mode max backoff delay | `300` |
190+
| `BAUDBOT_BRIDGE_RESTART_STABLE_WINDOW_SECONDS` | Minimum bridge runtime (seconds) after which the failure/backoff counters are reset | `120` |
191+
| `BAUDBOT_BRIDGE_RESTART_MAX_CONSECUTIVE_FAILURES` | Threshold that marks supervisor state as degraded (`threshold_exceeded`) | `5` |
192+
| `BAUDBOT_BRIDGE_RESTART_JITTER_SECONDS` | Upper bound (inclusive, seconds) of the random jitter added to each adaptive restart sleep | `2` |
187193

188194
## Example `.env` File
189195

bin/deploy.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ if [ "$DRY_RUN" -eq 0 ]; then
8787
[ -f "$BAUDBOT_SRC/bin/$script" ] && cp --no-preserve=ownership "$BAUDBOT_SRC/bin/$script" "$STAGE_DIR/bin/$script"
8888
done
8989
[ -f "$BAUDBOT_SRC/bin/lib/runtime-node.sh" ] && cp --no-preserve=ownership "$BAUDBOT_SRC/bin/lib/runtime-node.sh" "$STAGE_DIR/bin/lib/runtime-node.sh"
90+
[ -f "$BAUDBOT_SRC/bin/lib/bridge-restart-policy.sh" ] && cp --no-preserve=ownership "$BAUDBOT_SRC/bin/lib/bridge-restart-policy.sh" "$STAGE_DIR/bin/lib/bridge-restart-policy.sh"
9091
[ -f "$BAUDBOT_SRC/pi/settings.json" ] && cp --no-preserve=ownership "$BAUDBOT_SRC/pi/settings.json" "$STAGE_DIR/settings.json"
9192
[ -f "$BAUDBOT_SRC/.env.schema" ] && cp --no-preserve=ownership "$BAUDBOT_SRC/.env.schema" "$STAGE_DIR/.env.schema"
9293
chmod -R a+rX "$STAGE_DIR"
@@ -263,6 +264,12 @@ if [ "$DRY_RUN" -eq 0 ]; then
263264
log "✓ bin/lib/runtime-node.sh"
264265
fi
265266

267+
if [ -f "$STAGE_DIR/bin/lib/bridge-restart-policy.sh" ]; then
268+
as_agent cp "$STAGE_DIR/bin/lib/bridge-restart-policy.sh" "$BAUDBOT_HOME/runtime/bin/lib/bridge-restart-policy.sh"
269+
as_agent chmod u+r "$BAUDBOT_HOME/runtime/bin/lib/bridge-restart-policy.sh"
270+
log "✓ bin/lib/bridge-restart-policy.sh"
271+
fi
272+
266273
as_agent cp "$STAGE_DIR/start.sh" "$BAUDBOT_HOME/runtime/start.sh"
267274
as_agent chmod u+x "$BAUDBOT_HOME/runtime/start.sh"
268275
log "✓ start.sh"

bin/lib/baudbot-runtime.sh

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,61 @@ PY
193193
[ -n "$components_line" ] && echo -e "${BOLD}broker health:${RESET} $components_line"
194194
}
195195

196+
# Print a one-line summary of the Slack bridge supervisor state.
#
# Reads /home/<agent user>/.pi/agent/slack-bridge-supervisor.json, where the
# agent user comes from BAUDBOT_AGENT_USER (default: baudbot_agent).
#
# Globals:  BAUDBOT_AGENT_USER (read), BOLD / RESET (read, optional)
# Outputs:  one "bridge supervisor: ..." line on stdout, or nothing when the
#           status file is missing, unreadable, or not a JSON object
# Returns:  0 in all of the "nothing to report" cases
print_bridge_supervisor_status() {
  local agent_user="${BAUDBOT_AGENT_USER:-baudbot_agent}"
  local status_file="/home/$agent_user/.pi/agent/slack-bridge-supervisor.json"
  local summary=""
  local mode=""
  local state=""
  local failures=""
  local threshold=""

  [ -r "$status_file" ] || return 0

  # Status output is informational: without an interpreter there is simply
  # nothing to report, which must not fail the calling command.
  command -v python3 >/dev/null 2>&1 || return 0

  summary="$(python3 - "$status_file" <<'PY'
import json
import sys

try:
    with open(sys.argv[1], 'r', encoding='utf-8') as f:
        data = json.load(f)
except Exception:
    sys.exit(0)

# Valid JSON that is not an object (list, string, number) has no fields.
if not isinstance(data, dict):
    sys.exit(0)


def field(key, default):
    # One value per output line: collapse embedded whitespace/newlines so a
    # malformed value cannot shift the fields that follow it.
    return ' '.join(str(data.get(key, default)).split())


print(field('mode', 'unknown'))
print(field('state', 'unknown'))
print(field('consecutive_failures', 0))
print(field('max_consecutive_failures', 0))
PY
)" || return 0

  # Split the four lines without spawning sed; the && chain plus "|| true"
  # keeps a short summary from tripping errexit in the caller.
  {
    IFS= read -r mode \
      && IFS= read -r state \
      && IFS= read -r failures \
      && IFS= read -r threshold
  } <<<"$summary" || true

  [ -n "$mode" ] || return 0

  # %b expands the escape sequences in BOLD/RESET only; values read from the
  # status file go through %s so backslashes in them are printed literally.
  local label="${BOLD:-}bridge supervisor:${RESET:-}"
  case "$state" in
    threshold_exceeded)
      printf '%b %s\n' "$label" "degraded (mode=$mode failures=$failures threshold=$threshold)"
      ;;
    restarting)
      printf '%b %s\n' "$label" "restarting (mode=$mode failures=$failures)"
      ;;
    running)
      printf '%b %s\n' "$label" "healthy (mode=$mode)"
      ;;
    *)
      printf '%b %s\n' "$label" "$state (mode=$mode)"
      ;;
  esac
}
250+
196251
pi_control_dir() {
197252
local agent_user="${1:-baudbot_agent}"
198253
echo "/home/$agent_user/.pi/session-control"
@@ -290,6 +345,7 @@ cmd_status() {
290345
echo ""
291346
print_deployed_version
292347
print_broker_connection_status
348+
print_bridge_supervisor_status
293349
exit "$status_rc"
294350
fi
295351

@@ -302,6 +358,7 @@ cmd_status() {
302358
echo ""
303359
print_deployed_version
304360
print_broker_connection_status
361+
print_bridge_supervisor_status
305362
}
306363

307364
cmd_logs() {

bin/lib/bridge-restart-policy.sh

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
#!/usr/bin/env bash
2+
# Shared Slack bridge restart policy helpers.
3+
4+
# Decide which restart policy the bridge supervisor should use.
#
# Globals (read): BAUDBOT_BRIDGE_RESTART_POLICY and the five
#                 BAUDBOT_BRIDGE_RESTART_* tuning knobs.
# Outputs: "adaptive" or "legacy" on stdout.
# Returns: 0
#
# Resolution order:
#   1. An explicit policy of "adaptive" or "legacy" (any letter case) wins.
#   2. Otherwise, any non-empty adaptive knob selects "adaptive".
#   3. Otherwise "legacy", the backward-compatible default.
# An unrecognised explicit value is ignored and falls through to steps 2-3.
bb_bridge_policy_mode() {
  # Bracket patterns make the match case-insensitive without needing
  # bash 4 case conversion or toggling nocasematch for the caller.
  case "${BAUDBOT_BRIDGE_RESTART_POLICY:-}" in
    [Aa][Dd][Aa][Pp][Tt][Ii][Vv][Ee])
      echo "adaptive"
      return 0
      ;;
    [Ll][Ee][Gg][Aa][Cc][Yy])
      echo "legacy"
      return 0
      ;;
  esac

  local knob
  for knob in \
    "${BAUDBOT_BRIDGE_RESTART_BASE_DELAY_SECONDS:-}" \
    "${BAUDBOT_BRIDGE_RESTART_MAX_DELAY_SECONDS:-}" \
    "${BAUDBOT_BRIDGE_RESTART_STABLE_WINDOW_SECONDS:-}" \
    "${BAUDBOT_BRIDGE_RESTART_MAX_CONSECUTIVE_FAILURES:-}" \
    "${BAUDBOT_BRIDGE_RESTART_JITTER_SECONDS:-}"; do
    if [ -n "$knob" ]; then
      echo "adaptive"
      return 0
    fi
  done

  # Backward-compatible fallback when no policy configuration is provided.
  echo "legacy"
}
24+
25+
# Parse a non-negative decimal integer setting, falling back when invalid.
#
# Arguments:
#   $1 - raw value (may be empty or unset)
#   $2 - fallback echoed when $1 is empty, not all digits, or too large
#        (default: 0)
# Outputs: the normalised integer (no leading zeros) or the fallback.
# Returns: 0
bb_bridge_policy_int() {
  local raw="${1:-}"
  local fallback="${2:-0}"

  if ! [[ "$raw" =~ ^[0-9]+$ ]]; then
    echo "$fallback"
    return 0
  fi

  # Strip leading zeros: shell arithmetic reads "08"/"09" as invalid octal,
  # which would break the backoff doubling done with $(( ... )) later on.
  raw="${raw#"${raw%%[!0]*}"}"
  [ -n "$raw" ] || raw=0

  # More than 18 digits can overflow 64-bit shell arithmetic; treat such a
  # value as invalid rather than letting it wrap around.
  if [ "${#raw}" -gt 18 ]; then
    echo "$fallback"
    return 0
  fi

  echo "$raw"
}
41+
42+
# Compute the next backoff delay: double the current one, capped at a maximum.
#
# Arguments:
#   $1 - current delay
#   $2 - maximum delay
# Outputs: the smaller of (2 * $1) and $2 on stdout.
bb_bridge_policy_compute_next_delay() {
  local delay_now="$1"
  local ceiling="$2"
  local candidate=$((delay_now * 2))

  # Guard clause: clamp once the doubled value passes the cap.
  if [ "$candidate" -gt "$ceiling" ]; then
    echo "$ceiling"
    return
  fi

  echo "$candidate"
}
53+
54+
# Pick a random jitter value between 0 and the given upper bound, inclusive.
#
# Arguments:
#   $1 - upper bound; zero or a negative value disables jitter
# Outputs: the chosen jitter on stdout (0 when jitter is disabled).
bb_bridge_policy_random_jitter() {
  local ceiling="$1"

  if ! [ "$ceiling" -le 0 ]; then
    # The "+ 1" makes the upper bound itself a possible result.
    echo $((RANDOM % (ceiling + 1)))
  else
    echo 0
  fi
}
64+
65+
# Append one timestamped "bridge-supervisor" line to a log file.
#
# Arguments:
#   $1    - log file path; when empty the call is a no-op
#   $2... - message words, joined with spaces into a single line
# Outputs: nothing on stdout; appends "[<ISO timestamp>] bridge-supervisor
#          <message>" to the log file.
bb_bridge_policy_log() {
  local target="$1"
  shift

  # No log file configured: nothing to do.
  [ -n "$target" ] || return 0

  local stamp
  stamp="$(date -Is)"
  printf '[%s] bridge-supervisor %s\n' "$stamp" "$*" >>"$target"
}
75+
76+
# Write the supervisor status snapshot as a small JSON document.
#
# Arguments:
#   $1 - status file path; when empty the call is a no-op
#   $2 - policy mode
#   $3 - bridge script label
#   $4 - supervisor state
#   $5 - consecutive failure count        (default 0)
#   $6 - current delay in seconds         (default 0)
#   $7 - max consecutive failures         (default 0)
#   $8 - last exit code                   (default 0)
#   $9 - last runtime in seconds          (default 0)
# Returns: 0 on success or no-op; non-zero when the file cannot be written.
#
# The document is written to a temporary sibling file and renamed into
# place, so a concurrent reader never sees a half-written file.
bb_bridge_policy_write_status() {
  local status_file="${1:-}"
  local mode="${2:-}"
  local bridge_script="${3:-}"
  local state="${4:-}"
  # Numeric fields are emitted bare, so an empty value would yield invalid JSON.
  local consecutive_failures="${5:-0}"
  local delay_seconds="${6:-0}"
  local max_failures="${7:-0}"
  local last_exit_code="${8:-0}"
  local last_runtime_seconds="${9:-0}"

  [ -n "$status_file" ] || return 0
  mkdir -p "$(dirname "$status_file")" 2>/dev/null || true

  # Escape backslashes first, then double quotes, so string values stay
  # valid JSON (control characters are not handled here).
  mode=${mode//\\/\\\\}
  mode=${mode//\"/\\\"}
  bridge_script=${bridge_script//\\/\\\\}
  bridge_script=${bridge_script//\"/\\\"}
  state=${state//\\/\\\\}
  state=${state//\"/\\\"}

  local tmp_file="${status_file}.tmp.$$"

  if cat >"$tmp_file" <<EOF
{
  "updated_at": "$(date -Is)",
  "mode": "$mode",
  "bridge_script": "$bridge_script",
  "state": "$state",
  "consecutive_failures": $consecutive_failures,
  "current_delay_seconds": $delay_seconds,
  "max_consecutive_failures": $max_failures,
  "last_exit_code": $last_exit_code,
  "last_runtime_seconds": $last_runtime_seconds
}
EOF
  then
    # Rename within the same directory replaces the old snapshot in one step.
    mv -f "$tmp_file" "$status_file"
  else
    rm -f "$tmp_file"
    return 1
  fi
}
104+
105+
# Run the Slack bridge command in a loop, restarting it whenever it exits.
#
# Arguments:
#   $1    - log file; receives the bridge's stdout/stderr and supervisor
#           events. When empty, bridge output is discarded and no events
#           are logged.
#   $2    - status file (JSON snapshot); when empty no status is written
#   $3    - bridge script label used in log lines and the status file
#   $4... - the command (and arguments) to supervise
#
# The policy comes from bb_bridge_policy_mode:
#   legacy   - restart after a fixed 5 second delay.
#   adaptive - exponential backoff from a base delay up to a cap, plus random
#              jitter; counters reset after a run that lasted at least the
#              stable window; the state becomes "threshold_exceeded" once the
#              consecutive-failure threshold is reached.
#
# Both policies loop forever; the function has no normal return path.
bb_bridge_supervise() {
  local log_file="$1"
  local status_file="$2"
  local bridge_script="$3"
  shift 3

  # bb_bridge_policy_log treats an empty log path as "logging disabled";
  # mirror that here, because redirecting to "" fails and the bridge would
  # then never be started.
  local output_sink="${log_file:-/dev/null}"

  local mode
  mode="$(bb_bridge_policy_mode)"

  local exit_code=0

  if [ "$mode" = "legacy" ]; then
    bb_bridge_policy_log "$log_file" "event=policy_selected mode=legacy restart_delay_seconds=5"

    while true; do
      # Publish "running" before every launch, not only the first one, so
      # the status file does not keep saying "restarting" while the bridge
      # is actually up again.
      bb_bridge_policy_write_status "$status_file" "$mode" "$bridge_script" "running" 0 5 0 "$exit_code" 0

      if "$@" >>"$output_sink" 2>&1; then
        exit_code=0
      else
        exit_code=$?
      fi

      bb_bridge_policy_log "$log_file" "event=restart_scheduled mode=legacy script=$bridge_script exit_code=$exit_code delay_seconds=5"
      bb_bridge_policy_write_status "$status_file" "$mode" "$bridge_script" "restarting" 0 5 0 "$exit_code" 0
      sleep 5
    done
  fi

  local base_delay max_delay stable_window max_failures max_jitter
  base_delay="$(bb_bridge_policy_int "${BAUDBOT_BRIDGE_RESTART_BASE_DELAY_SECONDS:-}" 5)"
  max_delay="$(bb_bridge_policy_int "${BAUDBOT_BRIDGE_RESTART_MAX_DELAY_SECONDS:-}" 300)"
  stable_window="$(bb_bridge_policy_int "${BAUDBOT_BRIDGE_RESTART_STABLE_WINDOW_SECONDS:-}" 120)"
  max_failures="$(bb_bridge_policy_int "${BAUDBOT_BRIDGE_RESTART_MAX_CONSECUTIVE_FAILURES:-}" 5)"
  max_jitter="$(bb_bridge_policy_int "${BAUDBOT_BRIDGE_RESTART_JITTER_SECONDS:-}" 2)"

  # A cap below the base delay makes no sense; raise it to the base.
  if [ "$max_delay" -lt "$base_delay" ]; then
    max_delay="$base_delay"
  fi

  local consecutive_failures=0
  local current_delay="$base_delay"
  local started_at finished_at
  local runtime_seconds=0
  local reset_failures scheduled_delay
  local jitter_seconds total_sleep_seconds
  local state

  bb_bridge_policy_log "$log_file" "event=policy_selected mode=adaptive base_delay_seconds=$base_delay max_delay_seconds=$max_delay stable_window_seconds=$stable_window max_consecutive_failures=$max_failures max_jitter_seconds=$max_jitter"

  while true; do
    # State published while the bridge runs. A supervisor that is already
    # past the failure threshold stays "threshold_exceeded" so the degraded
    # signal is not hidden by each short-lived relaunch; otherwise report
    # "running" instead of leaving a stale "restarting" behind.
    state="running"
    if [ "$max_failures" -gt 0 ] && [ "$consecutive_failures" -ge "$max_failures" ]; then
      state="threshold_exceeded"
    fi
    bb_bridge_policy_write_status "$status_file" "$mode" "$bridge_script" "$state" "$consecutive_failures" "$current_delay" "$max_failures" "$exit_code" "$runtime_seconds"

    started_at="$(date +%s)"
    if "$@" >>"$output_sink" 2>&1; then
      exit_code=0
    else
      exit_code=$?
    fi
    finished_at="$(date +%s)"
    runtime_seconds=$((finished_at - started_at))

    reset_failures=0
    scheduled_delay="$current_delay"
    if [ "$runtime_seconds" -ge "$stable_window" ]; then
      # The run lasted long enough to count as stable: start over from the
      # base delay with a clean failure count.
      reset_failures=1
      consecutive_failures=0
      scheduled_delay="$base_delay"
      current_delay="$base_delay"
      bb_bridge_policy_log "$log_file" "event=stable_window_reset mode=adaptive script=$bridge_script runtime_seconds=$runtime_seconds stable_window_seconds=$stable_window"
    else
      # Short-lived run: sleep for the current delay now and double it
      # (up to the cap) for the next failure.
      consecutive_failures=$((consecutive_failures + 1))
      scheduled_delay="$current_delay"
      current_delay="$(bb_bridge_policy_compute_next_delay "$current_delay" "$max_delay")"
    fi

    jitter_seconds="$(bb_bridge_policy_random_jitter "$max_jitter")"
    total_sleep_seconds=$((scheduled_delay + jitter_seconds))

    state="restarting"
    if [ "$max_failures" -gt 0 ] && [ "$consecutive_failures" -ge "$max_failures" ]; then
      state="threshold_exceeded"
      bb_bridge_policy_log "$log_file" "event=restart_threshold_exceeded mode=adaptive script=$bridge_script consecutive_failures=$consecutive_failures threshold=$max_failures exit_code=$exit_code runtime_seconds=$runtime_seconds"
    fi

    bb_bridge_policy_log "$log_file" "event=restart_scheduled mode=adaptive script=$bridge_script exit_code=$exit_code runtime_seconds=$runtime_seconds reset_failures=$reset_failures consecutive_failures=$consecutive_failures backoff_seconds=$scheduled_delay next_backoff_seconds=$current_delay jitter_seconds=$jitter_seconds sleep_seconds=$total_sleep_seconds"
    bb_bridge_policy_write_status "$status_file" "$mode" "$bridge_script" "$state" "$consecutive_failures" "$scheduled_delay" "$max_failures" "$exit_code" "$runtime_seconds"

    sleep "$total_sleep_seconds"
  done
}

0 commit comments

Comments
 (0)