|
| 1 | +#!/usr/bin/env bash |
| 2 | +# Shared Slack bridge restart policy helpers. |
| 3 | + |
# Print the restart policy the bridge supervisor should use.
# Globals (read):
#   BAUDBOT_BRIDGE_RESTART_POLICY - explicit selection; only the spellings
#     adaptive/ADAPTIVE/Adaptive and legacy/LEGACY/Legacy are recognised.
#   BAUDBOT_BRIDGE_RESTART_{BASE_DELAY,MAX_DELAY,STABLE_WINDOW}_SECONDS,
#   BAUDBOT_BRIDGE_RESTART_MAX_CONSECUTIVE_FAILURES,
#   BAUDBOT_BRIDGE_RESTART_JITTER_SECONDS - any non-empty value implies adaptive.
# Outputs: "adaptive" or "legacy" on stdout.
# Returns: 0.
bb_bridge_policy_mode() {
  # An explicit, recognised policy always wins. An empty or unrecognised value
  # matches no arm and falls through to the tuning-variable check.
  case "${BAUDBOT_BRIDGE_RESTART_POLICY:-}" in
    adaptive | ADAPTIVE | Adaptive)
      echo "adaptive"
      return 0
      ;;
    legacy | LEGACY | Legacy)
      echo "legacy"
      return 0
      ;;
  esac

  # Concatenate the tuning knobs: the result is non-empty exactly when at
  # least one of them is set to a non-empty value.
  local tuning_values=""
  tuning_values+="${BAUDBOT_BRIDGE_RESTART_BASE_DELAY_SECONDS:-}"
  tuning_values+="${BAUDBOT_BRIDGE_RESTART_MAX_DELAY_SECONDS:-}"
  tuning_values+="${BAUDBOT_BRIDGE_RESTART_STABLE_WINDOW_SECONDS:-}"
  tuning_values+="${BAUDBOT_BRIDGE_RESTART_MAX_CONSECUTIVE_FAILURES:-}"
  tuning_values+="${BAUDBOT_BRIDGE_RESTART_JITTER_SECONDS:-}"

  if [ -z "$tuning_values" ]; then
    # Backward-compatible fallback when no policy configuration is provided.
    echo "legacy"
  else
    echo "adaptive"
  fi
}
| 24 | + |
# Normalise a configuration value to a non-negative decimal integer.
# Arguments:
#   $1 - raw value (may be empty or unset)
#   $2 - fallback printed when $1 is unusable (default: 0)
# Outputs: the normalised integer, or the fallback, on stdout.
# Returns: 0.
bb_bridge_policy_int() {
  local raw="${1:-}"
  local fallback="${2:-0}"

  # Empty, signed, fractional or otherwise non-digit input is rejected.
  if [[ ! "$raw" =~ ^[0-9]+$ ]]; then
    echo "$fallback"
    return 0
  fi

  # Strip leading zeros: Bash arithmetic treats "08"/"09" as invalid octal
  # ("value too great for base"), which would break later $(( )) maths.
  local digits="${raw#"${raw%%[!0]*}"}"
  if [ -z "$digits" ]; then
    digits=0
  fi

  # More than 18 digits cannot be doubled safely in 64-bit shell arithmetic
  # and is rejected by `[ -gt ]`; treat such values as invalid.
  if [ "${#digits}" -gt 18 ]; then
    echo "$fallback"
    return 0
  fi

  echo "$digits"
}
| 41 | + |
# Compute the next backoff delay: twice the current delay, capped at a maximum.
# Arguments:
#   $1 - current delay (integer)
#   $2 - maximum delay (integer)
# Outputs: the next delay on stdout.
bb_bridge_policy_compute_next_delay() {
  local delay_now="$1"
  local ceiling="$2"
  local candidate=$((delay_now * 2))

  # Clamp to the ceiling once doubling overshoots it.
  if [ "$candidate" -gt "$ceiling" ]; then
    echo "$ceiling"
    return 0
  fi

  echo "$candidate"
}
| 53 | + |
# Print a random jitter value between 0 and the given maximum (inclusive).
# Arguments:
#   $1 - maximum jitter; zero or negative disables jitter.
# Outputs: the jitter value on stdout.
bb_bridge_policy_random_jitter() {
  local jitter_cap="$1"

  # Zero or negative cap: jitter is disabled.
  if [ "$jitter_cap" -le 0 ]; then
    echo 0
    return 0
  fi

  # RANDOM reduced modulo (cap + 1) yields a value in 0..cap.
  local span=$((jitter_cap + 1))
  echo $((RANDOM % span))
}
| 64 | + |
# Append one timestamped supervisor line to a log file.
# Arguments:
#   $1  - log file path; when empty nothing is written.
#   $2… - message words, joined into a single line via "$*".
# Outputs: appends "[<date -Is>] bridge-supervisor <message>" to the log file.
# Returns: 0 when logging is disabled, otherwise the status of the append.
bb_bridge_policy_log() {
  local destination="$1"
  shift

  # No log path means logging is disabled.
  [ -n "$destination" ] || return 0

  local stamp
  stamp="$(date -Is)"
  printf '[%s] bridge-supervisor %s\n' "$stamp" "$*" >>"$destination"
}
| 75 | + |
# Escape a string for embedding inside a JSON string literal.
# Handles backslash, double quote, newline, carriage return and tab; other
# control characters are passed through unchanged.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (no trailing newline)
bb_bridge_policy_json_escape() {
  local text="$1"
  local escaped=""
  local index char

  for ((index = 0; index < ${#text}; index++)); do
    char="${text:index:1}"
    case "$char" in
      '\' | '"') escaped+="\\${char}" ;;
      $'\n') escaped+='\n' ;;
      $'\r') escaped+='\r' ;;
      $'\t') escaped+='\t' ;;
      *) escaped+="$char" ;;
    esac
  done

  printf '%s' "$escaped"
}

# Write the supervisor status as a small JSON document.
# Arguments:
#   $1 - status file path; when empty nothing is written
#   $2 - policy mode (string)
#   $3 - bridge script (string)
#   $4 - state (string)
#   $5 - consecutive failures (integer)
#   $6 - current delay in seconds (integer)
#   $7 - max consecutive failures (integer)
#   $8 - last exit code (integer)
#   $9 - last runtime in seconds (integer)
# Outputs: replaces the status file; its parent directory is created
#          best-effort.
# Returns: 0 on success or when no status file is configured, 1 when the
#          file could not be written.
bb_bridge_policy_write_status() {
  local status_file="${1:-}"
  local mode="${2:-}"
  local bridge_script="${3:-}"
  local state="${4:-}"
  # Empty numeric fields would yield invalid JSON, so default them to 0.
  local consecutive_failures="${5:-0}"
  local delay_seconds="${6:-0}"
  local max_failures="${7:-0}"
  local last_exit_code="${8:-0}"
  local last_runtime_seconds="${9:-0}"

  [ -n "$status_file" ] || return 0
  # Best-effort: if this fails, the write below reports the real error.
  mkdir -p "$(dirname "$status_file")" 2>/dev/null || true

  local json_mode json_script json_state updated_at
  json_mode="$(bb_bridge_policy_json_escape "$mode")"
  json_script="$(bb_bridge_policy_json_escape "$bridge_script")"
  json_state="$(bb_bridge_policy_json_escape "$state")"
  updated_at="$(date -Is)"

  # Write to a sibling temp file and rename it into place so readers never
  # observe a truncated or half-written document.
  local tmp_file="${status_file}.tmp.$$"
  if ! {
    printf '{\n'
    printf '  "updated_at": "%s",\n' "$updated_at"
    printf '  "mode": "%s",\n' "$json_mode"
    printf '  "bridge_script": "%s",\n' "$json_script"
    printf '  "state": "%s",\n' "$json_state"
    printf '  "consecutive_failures": %s,\n' "$consecutive_failures"
    printf '  "current_delay_seconds": %s,\n' "$delay_seconds"
    printf '  "max_consecutive_failures": %s,\n' "$max_failures"
    printf '  "last_exit_code": %s,\n' "$last_exit_code"
    printf '  "last_runtime_seconds": %s\n' "$last_runtime_seconds"
    printf '}\n'
  } >"$tmp_file"; then
    rm -f -- "$tmp_file" 2>/dev/null
    return 1
  fi

  if ! mv -f -- "$tmp_file" "$status_file"; then
    rm -f -- "$tmp_file" 2>/dev/null
    return 1
  fi
}
| 104 | + |
# Run a bridge command forever, restarting it according to the selected policy.
# Arguments:
#   $1  - log file path (may be empty: supervisor logging is skipped and the
#         command's output is discarded)
#   $2  - status file path (may be empty: no status is written)
#   $3  - bridge script name, used only in log lines and the status file
#   $4… - the command (and its arguments) to supervise
# Globals (read): BAUDBOT_BRIDGE_RESTART_* via bb_bridge_policy_mode and the
#   tuning values parsed below.
# Behaviour:
#   legacy   - restart after a fixed 5 second delay.
#   adaptive - the delay starts at the base delay and doubles up to the max
#              delay after each run shorter than the stable window; a run at
#              least that long resets the delay and the failure counter.
#              Random jitter is added to every sleep. Reaching the failure
#              threshold is logged and reported as state "threshold_exceeded",
#              but restarts continue.
# Returns: 2 when no command is supplied; otherwise never returns.
bb_bridge_supervise() {
  # Without a command the loops below would "restart" nothing forever.
  if [ "$#" -lt 4 ]; then
    printf 'usage: bb_bridge_supervise LOG_FILE STATUS_FILE BRIDGE_SCRIPT COMMAND [ARG...]\n' >&2
    return 2
  fi

  local log_file="$1"
  local status_file="$2"
  local bridge_script="$3"
  shift 3

  # bb_bridge_policy_log treats an empty log path as "logging disabled";
  # redirecting the child to "" would instead fail before it even starts.
  local output_sink="${log_file:-/dev/null}"

  local mode
  mode="$(bb_bridge_policy_mode)"

  local exit_code

  if [ "$mode" = "legacy" ]; then
    bb_bridge_policy_log "$log_file" "event=policy_selected mode=legacy restart_delay_seconds=5"
    bb_bridge_policy_write_status "$status_file" "$mode" "$bridge_script" "running" 0 5 0 0 0

    while true; do
      exit_code=0
      "$@" >>"$output_sink" 2>&1 || exit_code=$?

      bb_bridge_policy_log "$log_file" "event=restart_scheduled mode=legacy script=$bridge_script exit_code=$exit_code delay_seconds=5"
      bb_bridge_policy_write_status "$status_file" "$mode" "$bridge_script" "restarting" 0 5 0 "$exit_code" 0
      sleep 5
    done
  fi

  local base_delay max_delay stable_window max_failures max_jitter
  base_delay="$(bb_bridge_policy_int "${BAUDBOT_BRIDGE_RESTART_BASE_DELAY_SECONDS:-}" 5)"
  max_delay="$(bb_bridge_policy_int "${BAUDBOT_BRIDGE_RESTART_MAX_DELAY_SECONDS:-}" 300)"
  stable_window="$(bb_bridge_policy_int "${BAUDBOT_BRIDGE_RESTART_STABLE_WINDOW_SECONDS:-}" 120)"
  max_failures="$(bb_bridge_policy_int "${BAUDBOT_BRIDGE_RESTART_MAX_CONSECUTIVE_FAILURES:-}" 5)"
  max_jitter="$(bb_bridge_policy_int "${BAUDBOT_BRIDGE_RESTART_JITTER_SECONDS:-}" 2)"

  # A max below the base would make the cap shrink the very first delay.
  if [ "$max_delay" -lt "$base_delay" ]; then
    max_delay="$base_delay"
  fi

  local consecutive_failures=0
  local current_delay="$base_delay"

  bb_bridge_policy_log "$log_file" "event=policy_selected mode=adaptive base_delay_seconds=$base_delay max_delay_seconds=$max_delay stable_window_seconds=$stable_window max_consecutive_failures=$max_failures max_jitter_seconds=$max_jitter"
  bb_bridge_policy_write_status "$status_file" "$mode" "$bridge_script" "running" "$consecutive_failures" "$current_delay" "$max_failures" 0 0

  local started_at finished_at runtime_seconds
  local reset_failures scheduled_delay
  local jitter_seconds total_sleep_seconds
  local state

  while true; do
    started_at="$(date +%s)"
    exit_code=0
    "$@" >>"$output_sink" 2>&1 || exit_code=$?
    finished_at="$(date +%s)"
    runtime_seconds=$((finished_at - started_at))

    reset_failures=0
    # scheduled_delay is what we sleep now; current_delay is what the next
    # short-lived run will be charged.
    scheduled_delay="$current_delay"
    if [ "$runtime_seconds" -ge "$stable_window" ]; then
      # The run lasted long enough to count as healthy: start over.
      reset_failures=1
      consecutive_failures=0
      scheduled_delay="$base_delay"
      current_delay="$base_delay"
      bb_bridge_policy_log "$log_file" "event=stable_window_reset mode=adaptive script=$bridge_script runtime_seconds=$runtime_seconds stable_window_seconds=$stable_window"
    else
      consecutive_failures=$((consecutive_failures + 1))
      current_delay="$(bb_bridge_policy_compute_next_delay "$current_delay" "$max_delay")"
    fi

    jitter_seconds="$(bb_bridge_policy_random_jitter "$max_jitter")"
    total_sleep_seconds=$((scheduled_delay + jitter_seconds))

    state="restarting"
    # A threshold of 0 disables the check. Exceeding it is only reported;
    # the supervisor keeps restarting.
    if [ "$max_failures" -gt 0 ] && [ "$consecutive_failures" -ge "$max_failures" ]; then
      state="threshold_exceeded"
      bb_bridge_policy_log "$log_file" "event=restart_threshold_exceeded mode=adaptive script=$bridge_script consecutive_failures=$consecutive_failures threshold=$max_failures exit_code=$exit_code runtime_seconds=$runtime_seconds"
    fi

    bb_bridge_policy_log "$log_file" "event=restart_scheduled mode=adaptive script=$bridge_script exit_code=$exit_code runtime_seconds=$runtime_seconds reset_failures=$reset_failures consecutive_failures=$consecutive_failures backoff_seconds=$scheduled_delay next_backoff_seconds=$current_delay jitter_seconds=$jitter_seconds sleep_seconds=$total_sleep_seconds"
    bb_bridge_policy_write_status "$status_file" "$mode" "$bridge_script" "$state" "$consecutive_failures" "$scheduled_delay" "$max_failures" "$exit_code" "$runtime_seconds"

    sleep "$total_sleep_seconds"
  done
}
0 commit comments