Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions .env.schema
Original file line number Diff line number Diff line change
Expand Up @@ -190,3 +190,27 @@ BRIDGE_API_PORT=7890
# Target pi session ID (auto-detects control-agent if unset)
# @sensitive=false @type=string
PI_SESSION_ID=

# Bridge restart policy mode: legacy (fixed 5s restart) or adaptive (backoff + jitter).
# When unset, legacy is used unless any of the adaptive settings below is set.
# @sensitive=false @type=string
BAUDBOT_BRIDGE_RESTART_POLICY=

# Adaptive restart base delay (seconds)
# @sensitive=false @type=number
BAUDBOT_BRIDGE_RESTART_BASE_DELAY_SECONDS=

# Adaptive restart max delay cap (seconds)
# @sensitive=false @type=number
BAUDBOT_BRIDGE_RESTART_MAX_DELAY_SECONDS=

# Adaptive restart stable runtime window before counters reset (seconds)
# @sensitive=false @type=number
BAUDBOT_BRIDGE_RESTART_STABLE_WINDOW_SECONDS=

# Adaptive restart degraded-state threshold for consecutive failures
# @sensitive=false @type=number
BAUDBOT_BRIDGE_RESTART_MAX_CONSECUTIVE_FAILURES=

# Adaptive restart random jitter upper bound (seconds)
# @sensitive=false @type=number
BAUDBOT_BRIDGE_RESTART_JITTER_SECONDS=
6 changes: 6 additions & 0 deletions CONFIGURATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,12 @@ Set during `setup.sh` / `baudbot install` via env vars:
|----------|-------------|---------|
| `BRIDGE_API_PORT` | Local HTTP API port for outbound Slack messages | `7890` |
| `PI_SESSION_ID` | Target pi session ID for the bridge | Auto-detects control-agent |
| `BAUDBOT_BRIDGE_RESTART_POLICY` | Bridge supervisor mode (`legacy` or `adaptive`) | auto (`legacy` unless adaptive knobs are set) |
| `BAUDBOT_BRIDGE_RESTART_BASE_DELAY_SECONDS` | Adaptive mode base restart delay (seconds) | `5` |
| `BAUDBOT_BRIDGE_RESTART_MAX_DELAY_SECONDS` | Adaptive mode maximum backoff delay (seconds) | `300` |
| `BAUDBOT_BRIDGE_RESTART_STABLE_WINDOW_SECONDS` | Runtime (seconds) after which a bridge exit resets the failure/backoff counters | `120` |
| `BAUDBOT_BRIDGE_RESTART_MAX_CONSECUTIVE_FAILURES` | Consecutive-failure threshold that marks supervisor state as degraded (`threshold_exceeded`); `0` disables the threshold | `5` |
| `BAUDBOT_BRIDGE_RESTART_JITTER_SECONDS` | Upper bound (seconds) of the random jitter added to each adaptive restart sleep | `2` |

## Example `.env` File

Expand Down
7 changes: 7 additions & 0 deletions bin/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ if [ "$DRY_RUN" -eq 0 ]; then
[ -f "$BAUDBOT_SRC/bin/$script" ] && cp --no-preserve=ownership "$BAUDBOT_SRC/bin/$script" "$STAGE_DIR/bin/$script"
done
[ -f "$BAUDBOT_SRC/bin/lib/runtime-node.sh" ] && cp --no-preserve=ownership "$BAUDBOT_SRC/bin/lib/runtime-node.sh" "$STAGE_DIR/bin/lib/runtime-node.sh"
[ -f "$BAUDBOT_SRC/bin/lib/bridge-restart-policy.sh" ] && cp --no-preserve=ownership "$BAUDBOT_SRC/bin/lib/bridge-restart-policy.sh" "$STAGE_DIR/bin/lib/bridge-restart-policy.sh"
[ -f "$BAUDBOT_SRC/pi/settings.json" ] && cp --no-preserve=ownership "$BAUDBOT_SRC/pi/settings.json" "$STAGE_DIR/settings.json"
[ -f "$BAUDBOT_SRC/.env.schema" ] && cp --no-preserve=ownership "$BAUDBOT_SRC/.env.schema" "$STAGE_DIR/.env.schema"
chmod -R a+rX "$STAGE_DIR"
Expand Down Expand Up @@ -263,6 +264,12 @@ if [ "$DRY_RUN" -eq 0 ]; then
log "✓ bin/lib/runtime-node.sh"
fi

# Install the bridge restart policy helper into the runtime tree, but only when
# it was staged (the staging step copies it only if it exists in the source).
# NOTE(review): as_agent presumably runs the command as the agent user — confirm.
if [ -f "$STAGE_DIR/bin/lib/bridge-restart-policy.sh" ]; then
as_agent cp "$STAGE_DIR/bin/lib/bridge-restart-policy.sh" "$BAUDBOT_HOME/runtime/bin/lib/bridge-restart-policy.sh"
as_agent chmod u+r "$BAUDBOT_HOME/runtime/bin/lib/bridge-restart-policy.sh"
log "✓ bin/lib/bridge-restart-policy.sh"
fi

as_agent cp "$STAGE_DIR/start.sh" "$BAUDBOT_HOME/runtime/start.sh"
as_agent chmod u+x "$BAUDBOT_HOME/runtime/start.sh"
log "✓ start.sh"
Expand Down
57 changes: 57 additions & 0 deletions bin/lib/baudbot-runtime.sh
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,61 @@ PY
[ -n "$components_line" ] && echo -e "${BOLD}broker health:${RESET} $components_line"
}

# Print a one-line summary of the Slack bridge supervisor state.
#
# Reads /home/<agent user>/.pi/agent/slack-bridge-supervisor.json, where the
# agent user comes from BAUDBOT_AGENT_USER (default: baudbot_agent).
#
# This is best-effort status output: a missing, unreadable, truncated or
# malformed status file (or a failing python3) prints nothing and the function
# still returns 0.
#
# Globals:  BAUDBOT_AGENT_USER (read), BOLD / RESET (read, colour escapes)
# Outputs:  at most one "bridge supervisor: ..." line on stdout
# Returns:  0
print_bridge_supervisor_status() {
  local agent_user="${BAUDBOT_AGENT_USER:-baudbot_agent}"
  local status_file="/home/$agent_user/.pi/agent/slack-bridge-supervisor.json"
  local summary=""
  local mode=""
  local state=""
  local failures=""
  local threshold=""
  local detail=""

  if [ ! -r "$status_file" ]; then
    return 0
  fi

  # The helper prints exactly four lines (mode, state, failures, threshold) or
  # nothing at all when the file cannot be parsed as a JSON object.
  summary="$(python3 - "$status_file" <<'PY'
import json
import sys


def field(data, key, default):
    # Collapse whitespace so every field stays on its own output line.
    return ' '.join(str(data.get(key, default)).split())


try:
    with open(sys.argv[1], 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Valid JSON that is not an object (list, string, ...) has no fields.
    if not isinstance(data, dict):
        raise ValueError('status payload is not a JSON object')
except Exception:
    sys.exit(0)

print(field(data, 'mode', 'unknown'))
print(field(data, 'state', 'unknown'))
print(field(data, 'consecutive_failures', 0))
print(field(data, 'max_consecutive_failures', 0))
PY
  )" || summary=""

  # `read` fails at end of input when fewer than four lines are present; the
  # affected variables are left empty, which the mode check below handles.
  {
    IFS= read -r mode
    IFS= read -r state
    IFS= read -r failures
    IFS= read -r threshold
  } <<<"$summary" || true

  [ -n "$mode" ] || return 0

  case "$state" in
    threshold_exceeded)
      detail="degraded (mode=$mode failures=$failures threshold=$threshold)"
      ;;
    restarting)
      detail="restarting (mode=$mode failures=$failures)"
      ;;
    running)
      detail="healthy (mode=$mode)"
      ;;
    *)
      detail="$state (mode=$mode)"
      ;;
  esac

  # %b expands backslash escapes in the colour variables (as `echo -e` did);
  # %s prints the text derived from the status file verbatim.
  printf '%b %s\n' "${BOLD}bridge supervisor:${RESET}" "$detail"
}

pi_control_dir() {
local agent_user="${1:-baudbot_agent}"
echo "/home/$agent_user/.pi/session-control"
Expand Down Expand Up @@ -290,6 +345,7 @@ cmd_status() {
echo ""
print_deployed_version
print_broker_connection_status
print_bridge_supervisor_status
exit "$status_rc"
fi

Expand All @@ -302,6 +358,7 @@ cmd_status() {
echo ""
print_deployed_version
print_broker_connection_status
print_bridge_supervisor_status
}

cmd_logs() {
Expand Down
189 changes: 189 additions & 0 deletions bin/lib/bridge-restart-policy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
#!/usr/bin/env bash
# Shared Slack bridge restart policy helpers.

# Resolve the effective bridge restart policy and print it on stdout.
#
# Resolution order:
#   1. BAUDBOT_BRIDGE_RESTART_POLICY, matched case-insensitively against
#      "adaptive" / "legacy". Any other value is ignored.
#   2. "adaptive" when at least one adaptive tuning variable is non-empty.
#   3. "legacy" otherwise (backward-compatible default).
#
# Globals:  BAUDBOT_BRIDGE_RESTART_POLICY and the five
#           BAUDBOT_BRIDGE_RESTART_* tuning variables (read)
# Outputs:  "adaptive" or "legacy"
# Returns:  0
bb_bridge_policy_mode() {
  local requested

  # Normalise with tr so every casing is accepted without needing bash 4's
  # ${var,,} expansion.
  requested="$(printf '%s' "${BAUDBOT_BRIDGE_RESTART_POLICY:-}" | tr '[:upper:]' '[:lower:]')"

  case "$requested" in
    adaptive | legacy)
      echo "$requested"
      return 0
      ;;
  esac

  if [ -n "${BAUDBOT_BRIDGE_RESTART_BASE_DELAY_SECONDS:-}" ] \
    || [ -n "${BAUDBOT_BRIDGE_RESTART_MAX_DELAY_SECONDS:-}" ] \
    || [ -n "${BAUDBOT_BRIDGE_RESTART_STABLE_WINDOW_SECONDS:-}" ] \
    || [ -n "${BAUDBOT_BRIDGE_RESTART_MAX_CONSECUTIVE_FAILURES:-}" ] \
    || [ -n "${BAUDBOT_BRIDGE_RESTART_JITTER_SECONDS:-}" ]; then
    echo "adaptive"
    return 0
  fi

  # Backward-compatible fallback when no policy configuration is provided.
  echo "legacy"
}

# Parse a non-negative decimal integer setting, printing a fallback instead
# when the value is empty, not purely digits, or too large to use safely.
#
# Leading zeros are stripped from the result: bash arithmetic treats "08" as
# an invalid octal literal and "010" as 8, so passing such values through
# unchanged would break (or silently skew) the backoff calculations.
#
# Arguments: $1 - raw value (may be empty)
#            $2 - fallback to print when $1 is unusable (default: 0)
# Outputs:   the normalised integer, or the fallback, on stdout
# Returns:   0
bb_bridge_policy_int() {
  local raw="${1:-}"
  local fallback="${2:-0}"
  local digits

  if [[ ! "$raw" =~ ^[0-9]+$ ]]; then
    echo "$fallback"
    return 0
  fi

  # Drop the run of leading zeros; an all-zero value collapses to "0".
  digits="${raw#"${raw%%[!0]*}"}"
  if [ -z "$digits" ]; then
    digits="0"
  fi

  # More than 18 digits cannot be doubled within a signed 64-bit integer.
  if [ "${#digits}" -gt 18 ]; then
    echo "$fallback"
    return 0
  fi

  echo "$digits"
}

# Exponential backoff step: print twice the current delay, capped at the
# configured maximum.
#
# Arguments: $1 - current delay (seconds)
#            $2 - maximum delay (seconds)
# Outputs:   the next delay on stdout
bb_bridge_policy_compute_next_delay() {
  local delay_now="$1"
  local ceiling="$2"
  local next_delay=$((delay_now * 2))

  # Clamp only when the doubled value overshoots the ceiling.
  if [ "$next_delay" -gt "$ceiling" ]; then
    next_delay="$ceiling"
  fi

  printf '%s\n' "$next_delay"
}

# Print a random whole number of seconds between 0 and the given upper bound
# (inclusive). A bound of zero or less always yields 0.
#
# Arguments: $1 - maximum jitter (seconds)
# Outputs:   the jitter value on stdout
bb_bridge_policy_random_jitter() {
  local ceiling="$1"
  local span

  if [ "$ceiling" -le 0 ]; then
    printf '0\n'
    return 0
  fi

  # +1 makes the upper bound itself a possible result.
  span=$((ceiling + 1))
  printf '%d\n' "$((RANDOM % span))"
}

# Append one timestamped "bridge-supervisor" line to the given log file.
#
# Arguments: $1   - log file path; an empty path turns the call into a no-op
#            $2.. - message words, joined into a single line
bb_bridge_policy_log() {
  local destination="$1"
  shift

  # Nothing to do when the caller did not configure a log file.
  [ -n "$destination" ] || return 0

  printf '[%s] bridge-supervisor %s\n' "$(date -Is)" "$*" >>"$destination"
}

# Escape a string for use inside a JSON double-quoted string literal.
_bb_bridge_policy_json_escape() {
  local text="${1:-}"
  local newline=$'\n'
  local carriage_return=$'\r'
  local tab=$'\t'

  # Backslashes first, so the escapes added below are not escaped again.
  text="${text//\\/\\\\}"
  text="${text//\"/\\\"}"
  text="${text//$newline/\\n}"
  text="${text//$carriage_return/\\r}"
  text="${text//$tab/\\t}"
  printf '%s' "$text"
}

# Write the supervisor status JSON document.
#
# String fields are JSON-escaped, and the document is written to a temporary
# file in the same directory and then renamed over the target, so a reader
# does not see a half-written file. If the temporary file cannot be
# created or renamed, the function falls back to writing the target in place.
#
# Arguments: $1 - status file path; an empty path turns the call into a no-op
#            $2 - policy mode
#            $3 - bridge script name/path
#            $4 - supervisor state
#            $5 - consecutive failure count       (default 0)
#            $6 - current delay in seconds        (default 0)
#            $7 - consecutive failure threshold   (default 0)
#            $8 - last bridge exit code           (default 0)
#            $9 - last bridge runtime in seconds  (default 0)
# Returns:   0 on success or when no status file is configured; otherwise the
#            status of the failed in-place write
bb_bridge_policy_write_status() {
  local status_file="${1:-}"
  local mode="${2:-}"
  local bridge_script="${3:-}"
  local state="${4:-}"
  local consecutive_failures="${5:-0}"
  local delay_seconds="${6:-0}"
  local max_failures="${7:-0}"
  local last_exit_code="${8:-0}"
  local last_runtime_seconds="${9:-0}"
  local payload tmp_file

  [ -n "$status_file" ] || return 0
  # Best effort: a failure here surfaces through the write below.
  mkdir -p "$(dirname "$status_file")" 2>/dev/null || true

  payload="$(cat <<EOF
{
  "updated_at": "$(date -Is)",
  "mode": "$(_bb_bridge_policy_json_escape "$mode")",
  "bridge_script": "$(_bb_bridge_policy_json_escape "$bridge_script")",
  "state": "$(_bb_bridge_policy_json_escape "$state")",
  "consecutive_failures": $consecutive_failures,
  "current_delay_seconds": $delay_seconds,
  "max_consecutive_failures": $max_failures,
  "last_exit_code": $last_exit_code,
  "last_runtime_seconds": $last_runtime_seconds
}
EOF
)"

  # Same directory as the target so the rename stays on one filesystem.
  tmp_file="${status_file}.tmp.$$"
  if printf '%s\n' "$payload" 2>/dev/null >"$tmp_file" \
    && mv -f "$tmp_file" "$status_file" 2>/dev/null; then
    return 0
  fi
  rm -f "$tmp_file" 2>/dev/null

  # Fallback: direct write, e.g. when the directory itself is not writable
  # but the existing status file is.
  printf '%s\n' "$payload" >"$status_file"
}

# Run the bridge command forever, restarting it each time it exits.
#
# Arguments: $1   - log file; bridge stdout/stderr and supervisor events are appended to it
#            $2   - status file, rewritten through bb_bridge_policy_write_status
#            $3   - bridge script name, used only as a label in logs and status
#            $4.. - the command (and its arguments) to run
#
# The policy comes from bb_bridge_policy_mode:
#   legacy   - sleep a fixed 5 seconds between runs.
#   adaptive - sleep the current backoff delay plus random jitter; the delay
#              doubles after every short-lived run (capped at the maximum) and
#              resets once a run lasts at least the stable window.
#
# Neither loop contains a break or return, so this function does not return
# on its own.
# NOTE(review): the status file state is set to "running" only once, before the
# first launch; after any bridge exit it stays "restarting"/"threshold_exceeded"
# while the relaunched bridge runs — confirm that this is intended.
bb_bridge_supervise() {
local log_file="$1"
local status_file="$2"
local bridge_script="$3"
# Drop the three bookkeeping arguments; "$@" is now the bridge command line.
shift 3

local mode
mode="$(bb_bridge_policy_mode)"

if [ "$mode" = "legacy" ]; then
bb_bridge_policy_log "$log_file" "event=policy_selected mode=legacy restart_delay_seconds=5"
bb_bridge_policy_write_status "$status_file" "$mode" "$bridge_script" "running" 0 5 0 0 0

# Legacy loop: never exits, so everything after this `if` is adaptive-only.
while true; do
local exit_code=0
# Running the command as the `if` condition captures its exit status
# without making a non-zero exit abort the caller under `set -e`.
if "$@" >>"$log_file" 2>&1; then
exit_code=0
else
exit_code=$?
fi

bb_bridge_policy_log "$log_file" "event=restart_scheduled mode=legacy script=$bridge_script exit_code=$exit_code delay_seconds=5"
bb_bridge_policy_write_status "$status_file" "$mode" "$bridge_script" "restarting" 0 5 0 "$exit_code" 0
sleep 5
done
fi

# Adaptive settings; empty or non-numeric values fall back to these defaults.
local base_delay max_delay stable_window max_failures max_jitter
base_delay="$(bb_bridge_policy_int "${BAUDBOT_BRIDGE_RESTART_BASE_DELAY_SECONDS:-}" 5)"
max_delay="$(bb_bridge_policy_int "${BAUDBOT_BRIDGE_RESTART_MAX_DELAY_SECONDS:-}" 300)"
stable_window="$(bb_bridge_policy_int "${BAUDBOT_BRIDGE_RESTART_STABLE_WINDOW_SECONDS:-}" 120)"
max_failures="$(bb_bridge_policy_int "${BAUDBOT_BRIDGE_RESTART_MAX_CONSECUTIVE_FAILURES:-}" 5)"
max_jitter="$(bb_bridge_policy_int "${BAUDBOT_BRIDGE_RESTART_JITTER_SECONDS:-}" 2)"

# The cap may not be lower than the base delay.
if [ "$max_delay" -lt "$base_delay" ]; then
max_delay="$base_delay"
fi

local consecutive_failures=0
local current_delay="$base_delay"

bb_bridge_policy_log "$log_file" "event=policy_selected mode=adaptive base_delay_seconds=$base_delay max_delay_seconds=$max_delay stable_window_seconds=$stable_window max_consecutive_failures=$max_failures max_jitter_seconds=$max_jitter"
bb_bridge_policy_write_status "$status_file" "$mode" "$bridge_script" "running" "$consecutive_failures" "$current_delay" "$max_failures" 0 0

while true; do
local started_at finished_at runtime_seconds exit_code
# Wall-clock runtime of this run, in whole seconds.
started_at="$(date +%s)"
if "$@" >>"$log_file" 2>&1; then
exit_code=0
else
exit_code=$?
fi
finished_at="$(date +%s)"
runtime_seconds=$((finished_at - started_at))

local reset_failures=0
local scheduled_delay="$current_delay"
# Only the runtime decides between "stable" and "failure"; the exit code is
# recorded but does not influence the backoff.
if [ "$runtime_seconds" -ge "$stable_window" ]; then
# Ran long enough: forget earlier failures and restart from the base delay.
reset_failures=1
consecutive_failures=0
scheduled_delay="$base_delay"
current_delay="$base_delay"
bb_bridge_policy_log "$log_file" "event=stable_window_reset mode=adaptive script=$bridge_script runtime_seconds=$runtime_seconds stable_window_seconds=$stable_window"
else
# Short-lived run: sleep the current delay now and double it (capped) for
# the next failure.
consecutive_failures=$((consecutive_failures + 1))
scheduled_delay="$current_delay"
current_delay="$(bb_bridge_policy_compute_next_delay "$current_delay" "$max_delay")"
fi

local jitter_seconds total_sleep_seconds
jitter_seconds="$(bb_bridge_policy_random_jitter "$max_jitter")"
total_sleep_seconds=$((scheduled_delay + jitter_seconds))

local state="restarting"
# A threshold of 0 disables the degraded state. Crossing the threshold only
# changes the reported state and adds a log line; restarts continue.
if [ "$max_failures" -gt 0 ] && [ "$consecutive_failures" -ge "$max_failures" ]; then
state="threshold_exceeded"
bb_bridge_policy_log "$log_file" "event=restart_threshold_exceeded mode=adaptive script=$bridge_script consecutive_failures=$consecutive_failures threshold=$max_failures exit_code=$exit_code runtime_seconds=$runtime_seconds"
fi

# The status file records the delay being slept now (without jitter); the
# log line additionally carries the next backoff and the jitter.
bb_bridge_policy_log "$log_file" "event=restart_scheduled mode=adaptive script=$bridge_script exit_code=$exit_code runtime_seconds=$runtime_seconds reset_failures=$reset_failures consecutive_failures=$consecutive_failures backoff_seconds=$scheduled_delay next_backoff_seconds=$current_delay jitter_seconds=$jitter_seconds sleep_seconds=$total_sleep_seconds"
bb_bridge_policy_write_status "$status_file" "$mode" "$bridge_script" "$state" "$consecutive_failures" "$scheduled_delay" "$max_failures" "$exit_code" "$runtime_seconds"

sleep "$total_sleep_seconds"
done
}
Loading