terraphim-ai/adf-orchestrator.sh at main · terraphim/terraphim-ai · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
#!/bin/bash
#
# ADF Infrastructure Orchestrator - Health Monitor & Auto-Recovery
# Prevents Gitea infrastructure crashes caused by docker system prune
#
# Installation:
#   sudo cp adf-orchestrator.sh /usr/local/bin/
#   sudo chmod +x /usr/local/bin/adf-orchestrator.sh
#   sudo systemctl enable adf-orchestrator.service
#

set -euo pipefail

# Configuration
STACK_DIR="/home/alex/gitea-stack"
COMPOSE_FILE="docker-compose.yml"
LOG_FILE="/var/log/adf-orchestrator.log"
ALERT_EMAIL="admin@terraphim.cloud"
HEALTH_CHECK_INTERVAL=30
MAX_RESTART_ATTEMPTS=3
RESTART_WINDOW=300  # 5 minutes

# Services to monitor (in dependency order)
SERVICES=("db" "master" "volume" "filer" "s3" "prometheus" "server")

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Logging function
log() {
    local level="$1"
    shift
    local message="$*"
    local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
    echo -e "${timestamp} [${level}] ${message}" | tee -a "$LOG_FILE"
}

# Check if running as root for systemctl access
check_root() {
    if [[ $EUID -ne 0 ]]; then
        log "WARN" "Not running as root. Some features may be limited."
    fi
}

# Get container status
get_container_status() {
    local service="$1"
    cd "$STACK_DIR"
    docker compose ps -q "$service" 2>/dev/null | xargs -I {} docker inspect -f '{{.State.Status}}' {} 2>/dev/null || echo "not_found"
}

# Check if container is healthy
check_health() {
    local service="$1"
    local status=$(get_container_status "$service")

    case "$status" in
        "running")
            # Check health status if available
            local health=$(cd "$STACK_DIR" && docker compose ps -q "$service" 2>/dev/null | xargs -I {} docker inspect -f '{{.State.Health.Status}}' {} 2>/dev/null || echo "none")
            if [[ "$health" == "healthy" ]] || [[ "$health" == "none" ]]; then
                return 0
            else
                log "WARN" "Service $service is running but health check status: $health"
                return 1
            fi
            ;;
        "exited"|"dead"|"not_found")
            return 1
            ;;
        *)
            log "WARN" "Unknown status '$status' for service $service"
            return 1
            ;;
    esac
}

# Restart a service
restart_service() {
    local service="$1"
    log "INFO" "${YELLOW}Restarting service: $service${NC}"

    cd "$STACK_DIR"

    # Stop if running
    docker compose stop "$service" 2>/dev/null || true
    sleep 2

    # Remove container to ensure clean state
    docker compose rm -f "$service" 2>/dev/null || true
    sleep 1

    # Start service
    if docker compose up -d "$service"; then
        log "INFO" "${GREEN}Successfully restarted $service${NC}"

        # Wait for health check
        sleep 10
        local attempts=0
        while [[ $attempts -lt 6 ]]; do
            if check_health "$service"; then
                log "INFO" "${GREEN}Service $service is healthy${NC}"
                return 0
            fi
            attempts=$((attempts + 1))
            log "INFO" "Waiting for $service to become healthy (attempt $attempts/6)..."
            sleep 5
        done

        log "ERROR" "${RED}Service $service failed health check after restart${NC}"
        return 1
    else
        log "ERROR" "${RED}Failed to restart $service${NC}"
        return 1
    fi
}

# Monitor all services
monitor_services() {
    local failed_services=()

    for service in "${SERVICES[@]}"; do
        if ! check_health "$service"; then
            failed_services+=("$service")
        fi
    done

    if [[ ${#failed_services[@]} -eq 0 ]]; then
        return 0
    fi

    log "WARN" "${YELLOW}Detected ${#failed_services[@]} failed service(s): ${failed_services[*]}${NC}"

    # Restart failed services in dependency order
    for service in "${SERVICES[@]}"; do
        if [[ " ${failed_services[*]} " =~ " ${service} " ]]; then
            restart_service "$service" || true
        fi
    done
}

# Protect against docker system prune by ensuring all containers are running
protect_against_prune() {
    # Ensure stack directory exists
    if [[ ! -d "$STACK_DIR" ]]; then
        log "ERROR" "${RED}Stack directory $STACK_DIR does not exist!${NC}"
        return 1
    fi

    local not_running=()

    for service in "${SERVICES[@]}"; do
        local status=$(cd "$STACK_DIR" && docker compose ps -q "$service" 2>/dev/null | xargs -I {} docker inspect -f '{{.State.Status}}' {} 2>/dev/null || echo "not_found")
        if [[ "$status" != "running" ]]; then
            not_running+=("$service")
        fi
    done

    if [[ ${#not_running[@]} -gt 0 ]]; then
        log "WARN" "${YELLOW}Found ${#not_running[@]} stopped container(s) - possible docker prune impact: ${not_running[*]}${NC}"

        # Recreate and restart all services to ensure consistency
        log "INFO" "Performing full stack restart to recover from potential prune..."
        cd "$STACK_DIR"
        docker compose up -d --remove-orphans

        # Wait for all services
        sleep 30
        monitor_services
    fi
}

# Check disk space
check_disk_space() {
    local usage=$(df -h / | awk 'NR==2 {print $5}' | tr -d '%')
    if [[ $usage -gt 90 ]]; then
        log "ERROR" "${RED}CRITICAL: Disk usage is at ${usage}%. Clean up required!${NC}"
        # Alert via email if configured
        if command -v mail &> /dev/null && [[ -n "$ALERT_EMAIL" ]]; then
            echo "Disk usage critical: ${usage}% on $(hostname)" | mail -s "ADF Alert: Low Disk Space" "$ALERT_EMAIL" || true
        fi
        return 1
    elif [[ $usage -gt 80 ]]; then
        log "WARN" "${YELLOW}WARNING: Disk usage is at ${usage}%. Monitor closely.${NC}"
    fi
    return 0
}

# Check memory usage
check_memory() {
    local usage=$(free | grep Mem | awk '{printf("%.0f", $3/$2 * 100.0)}')
    if [[ $usage -gt 95 ]]; then
        log "ERROR" "${RED}CRITICAL: Memory usage is at ${usage}%. Risk of OOM kills!${NC}"
        return 1
    elif [[ $usage -gt 85 ]]; then
        log "WARN" "${YELLOW}WARNING: Memory usage is at ${usage}%.${NC}"
    fi
    return 0
}

# Main monitoring loop
main_loop() {
    log "INFO" "${GREEN}=== ADF Orchestrator Started ===${NC}"
    log "INFO" "Monitoring services: ${SERVICES[*]}"
    log "INFO" "Health check interval: ${HEALTH_CHECK_INTERVAL}s"

    while true; do
        # System health checks
        check_disk_space || true
        check_memory || true

        # Service health checks
        protect_against_prune
        monitor_services

        # Wait before next check
        sleep "$HEALTH_CHECK_INTERVAL"
    done
}

# Handle signals gracefully
cleanup() {
    log "INFO" "${YELLOW}ADF Orchestrator shutting down...${NC}"
    exit 0
}

trap cleanup SIGTERM SIGINT

# Ensure log directory exists
mkdir -p "$(dirname "$LOG_FILE")" || true

# Run main loop
check_root
main_loop