|
| 1 | +# Auto-Healing Swarm Watch (Phase 7) - MGEN-011 |
| 2 | +# Automatic detection and recovery from system failures |
| 3 | + |
| 4 | +name: auto_healing |
| 5 | +version: "1.0.0" |
| 6 | +language: zig |
| 7 | +module: auto_healing |
| 8 | + |
| 9 | +types: |
| 10 | + HealthStatus: |
| 11 | + fields: |
| 12 | + component: String |
| 13 | + status: String |
| 14 | + last_check: Int |
| 15 | + error_count: Int |
| 16 | + last_error: String |
| 17 | + recovery_attempts: Int |
| 18 | + |
| 19 | + HealingAction: |
| 20 | + fields: |
| 21 | + action_type: String |
| 22 | + target_component: String |
| 23 | + priority: Int |
| 24 | + description: String |
| 25 | + estimated_duration_sec: Int |
| 26 | + |
| 27 | + RecoveryResult: |
| 28 | + fields: |
| 29 | + success: Bool |
| 30 | + action_taken: String |
| 31 | + time_taken_sec: Int |
| 32 | + new_status: String |
| 33 | + error_message: Option<String> |
| 34 | + |
| 35 | + AutoHealingConfig: |
| 36 | + fields: |
| 37 | + enabled: Bool |
| 38 | + max_recovery_attempts: Int |
| 39 | + cooldown_between_attempts_sec: Int |
| 40 | + auto_restart_enabled: Bool |
| 41 | + notification_on_failure: Bool |
| 42 | + |
| 43 | + HealingLog: |
| 44 | + fields: |
| 45 | + timestamp: Int |
| 46 | + component: String |
| 47 | + issue_detected: String |
| 48 | + action_performed: String |
| 49 | + result: String |
| 50 | + duration_ms: Int |
| 51 | + |
| 52 | +behaviors: |
| 53 | + - name: detect_failure |
| 54 | + given: component health status |
| 55 | + when: component reports error or timeout |
| 56 | + then: returns true if failure detected and recovery needed |
| 57 | + implementation: | |
/// Reports whether `status` describes a component that needs recovery.
///
/// A failure is detected when any of the following holds:
///   - `status.status` equals "error" or "timeout",
///   - `status.error_count` is at least 3,
///   - more than 300 seconds separate `status.last_check` from
///     `std.time.timestamp()`.
pub fn detectFailure(status: HealthStatus) bool {
    const reported_failure = std.mem.eql(u8, status.status, "error") or
        std.mem.eql(u8, status.status, "timeout");
    if (reported_failure or status.error_count >= 3) return true;

    // No explicit failure signal: fall back to the age of the last check.
    // 300 seconds == 5 minutes without an update.
    const seconds_since_check = std.time.timestamp() - status.last_check;
    return seconds_since_check > 300;
}
| 71 | + |
| 72 | + - name: determine_recovery_action |
| 73 | + given: failed component and error type |
| 74 | + when: deciding how to recover |
| 75 | + then: returns appropriate HealingAction based on failure pattern |
| 76 | + implementation: | |
/// Selects a `HealingAction` for the failed component described by `status`.
///
/// Rules are tried in order and the first match wins:
///   1. `last_error` contains "connection refused" -> "restart", priority 10
///   2. `last_error` contains "out of memory"      -> "scale_up", priority 9
///   3. `error_count >= 5`                         -> "full_restart", priority 8
///   4. anything else                              -> "restart", priority 5
///
/// `description` is duplicated with `allocator`; `action_type` is a string
/// literal and `target_component` is the same slice as `status.component`.
/// Fails only with the error reported by `allocator.dupe`.
pub fn determineRecoveryAction(allocator: Allocator, status: HealthStatus) !HealingAction {
    const failure_text = status.last_error;

    if (std.mem.indexOf(u8, failure_text, "connection refused")) |_| {
        const note = try allocator.dupe(u8, "Restart connection");
        return .{
            .action_type = "restart",
            .target_component = status.component,
            .priority = 10,
            .description = note,
            .estimated_duration_sec = 5,
        };
    }

    if (std.mem.indexOf(u8, failure_text, "out of memory")) |_| {
        const note = try allocator.dupe(u8, "Increase memory allocation");
        return .{
            .action_type = "scale_up",
            .target_component = status.component,
            .priority = 9,
            .description = note,
            .estimated_duration_sec = 30,
        };
    }

    // No recognised error text: decide on the error count alone.
    const many_errors = status.error_count >= 5;
    if (many_errors) {
        const note = try allocator.dupe(u8, "Full component restart");
        return .{
            .action_type = "full_restart",
            .target_component = status.component,
            .priority = 8,
            .description = note,
            .estimated_duration_sec = 10,
        };
    }

    // Fallback when no rule above matched: simple restart.
    const note = try allocator.dupe(u8, "Standard recovery restart");
    return .{
        .action_type = "restart",
        .target_component = status.component,
        .priority = 5,
        .description = note,
        .estimated_duration_sec = 5,
    };
}
| 120 | + |
| 121 | + - name: execute_recovery |
| 122 | + given: healing action and system handle |
| 123 | + when: performing recovery operation |
| 124 | + then: executes action and returns RecoveryResult |
| 125 | + implementation: | |
/// Executes `action` against its target component and reports the outcome.
///
/// When `config.enabled` is false nothing is executed and a "skipped" result
/// is returned. Otherwise `action.action_type` selects the handler:
/// "restart", "scale_up" or "full_restart".
///
/// Errors:
///   - `error.UnknownActionType` when `action.action_type` matches no handler.
///   - the error reported by `allocator.dupe` while building `error_message`.
///
/// Ownership: a non-null `error_message` is allocated with `allocator`;
/// `action_taken` and `new_status` are not allocated here.
pub fn executeRecovery(allocator: Allocator, action: HealingAction, config: AutoHealingConfig) !RecoveryResult {
    if (!config.enabled) return .{
        .success = false,
        .action_taken = "skipped",
        .time_taken_sec = 0,
        .new_status = "unchanged",
        .error_message = try allocator.dupe(u8, "Auto-healing disabled"),
    };

    const start = std.time.timestamp();

    // Dispatch on the action type. The unknown case returns the error
    // directly so that `recovered` stays a plain `bool`; yielding the error
    // as a branch value would turn the whole expression into an error union.
    const recovered: bool = if (std.mem.eql(u8, action.action_type, "restart"))
        executeRestart(action.target_component)
    else if (std.mem.eql(u8, action.action_type, "scale_up"))
        executeScaleUp(action.target_component)
    else if (std.mem.eql(u8, action.action_type, "full_restart"))
        executeFullRestart(action.target_component)
    else
        return error.UnknownActionType;

    // Clamp at zero so a backwards step of the wall clock cannot produce a
    // negative duration (or trip the range check in `@intCast`).
    const elapsed_sec = @max(std.time.timestamp() - start, 0);

    return .{
        .success = recovered,
        .action_taken = action.action_type,
        .time_taken_sec = @intCast(elapsed_sec),
        .new_status = if (recovered) "operational" else "failed",
        .error_message = if (recovered) null else try allocator.dupe(u8, "Recovery failed"),
    };
}
| 157 | + |
// Helper functions (stubs for now)

/// Stub for the "restart" recovery handler.
/// Ignores `component` and always reports success.
fn executeRestart(component: []const u8) bool {
    // TODO: actual restart logic
    _ = component;
    return true;
}
| 164 | + |
/// Stub for the "scale_up" recovery handler.
/// Ignores `component` and always reports success.
fn executeScaleUp(component: []const u8) bool {
    // TODO: actual scale up logic
    _ = component;
    return true;
}
| 170 | + |
/// Stub for the "full_restart" recovery handler.
/// Ignores `component` and always reports success.
fn executeFullRestart(component: []const u8) bool {
    // TODO: actual full restart logic
    _ = component;
    return true;
}
| 176 | + |
| 177 | + - name: check_recovery_cooldown |
| 178 | + given: component recovery history |
| 179 | + when: checking if recovery allowed |
| 180 | + then: returns true if cooldown period has passed |
| 181 | + implementation: | |
/// Reports whether a new recovery attempt for `component` is allowed.
///
/// Scans `logs` for entries whose `component` equals the argument and takes
/// the largest `timestamp` among them. Returns true when no entry matches,
/// or when at least `cooldown_sec` seconds separate that timestamp from
/// `std.time.timestamp()`.
pub fn checkRecoveryCooldown(logs: []HealingLog, component: []const u8, cooldown_sec: i64) bool {
    const now = std.time.timestamp();

    // An optional (rather than a 0 sentinel) keeps "no previous attempt"
    // distinct from a matching entry whose timestamp happens to be <= 0.
    var last_attempt: ?i64 = null;
    for (logs) |log| {
        if (!std.mem.eql(u8, log.component, component)) continue;
        last_attempt = if (last_attempt) |previous|
            @max(previous, log.timestamp)
        else
            log.timestamp;
    }

    const latest = last_attempt orelse return true; // No previous attempts

    // Saturating subtraction: extreme timestamps must not overflow.
    const elapsed = now -| latest;
    return elapsed >= cooldown_sec;
}
| 200 | + |
| 201 | + - name: log_recovery_attempt |
| 202 | + given: recovery action and result |
| 203 | + when: recording healing operation |
| 204 | + then: creates HealingLog entry |
| 205 | + implementation: | |
/// Builds a `HealingLog` entry describing one recovery attempt.
///
/// `component`, `action.description` (stored as `issue_detected`) and
/// `action.action_type` (stored as `action_performed`) are duplicated with
/// `allocator`. `result` is the literal "success" or "failed" depending on
/// `result.success` and is not allocated. `timestamp` is taken from
/// `std.time.timestamp()`.
///
/// Fails with the error reported by `allocator.dupe`; copies made before the
/// failing allocation are released, so nothing leaks on the error path.
pub fn logRecoveryAttempt(allocator: Allocator, component: []const u8, action: HealingAction, result: RecoveryResult, duration_ms: i64) !HealingLog {
    const component_copy = try allocator.dupe(u8, component);
    errdefer allocator.free(component_copy);

    const issue_copy = try allocator.dupe(u8, action.description);
    errdefer allocator.free(issue_copy);

    // Last fallible step: nothing after it can fail, so no errdefer needed.
    const action_copy = try allocator.dupe(u8, action.action_type);

    return .{
        .timestamp = std.time.timestamp(),
        .component = component_copy,
        .issue_detected = issue_copy,
        .action_performed = action_copy,
        .result = if (result.success) "success" else "failed",
        .duration_ms = duration_ms,
    };
}
| 216 | + |
| 217 | + - name: get_healing_stats |
| 218 | + given: healing logs |
| 219 | + when: analyzing recovery effectiveness |
| 220 | + then: returns statistics about success rate, common failures |
| 221 | + implementation: | |
/// Aggregate figures computed from a set of healing log entries.
pub const HealingStats = struct {
    /// Number of log entries that were examined.
    total_attempts: u32,
    /// Entries whose result was "success".
    successful_recoveries: u32,
    /// Entries whose result was anything other than "success".
    failed_recoveries: u32,
    /// Mean of the entries' `duration_ms`, in milliseconds.
    avg_recovery_time_ms: u64,
    /// Label for the most frequent failure. `getHealingStats` allocates this
    /// string with the allocator it is given.
    most_common_failure: []const u8,
    /// `successful_recoveries / total_attempts` as a fraction;
    /// `getHealingStats` reports 1.0 when there are no entries.
    success_rate: f64,
};
| 230 | + |
/// Summarises `logs` into a `HealingStats` value.
///
/// With no entries the result has all counters at zero,
/// `most_common_failure` set to "none" and `success_rate` set to 1.0.
/// Otherwise an entry counts as successful when its `result` equals
/// "success"; `avg_recovery_time_ms` is the integer mean of `duration_ms`,
/// with negative durations counted as zero.
///
/// `most_common_failure` is always allocated with `allocator` and is owned
/// by the caller. Fails only with the error reported by `allocator.dupe`.
pub fn getHealingStats(allocator: Allocator, logs: []HealingLog) !HealingStats {
    if (logs.len == 0) {
        return .{
            .total_attempts = 0,
            .successful_recoveries = 0,
            .failed_recoveries = 0,
            .avg_recovery_time_ms = 0,
            .most_common_failure = try allocator.dupe(u8, "none"),
            .success_rate = 1.0,
        };
    }

    var successful: u32 = 0;
    var total_time_ms: u64 = 0;

    for (logs) |log| {
        if (std.mem.eql(u8, log.result, "success")) successful +|= 1;

        // `duration_ms` is signed while the accumulator is unsigned: clamp
        // negatives to zero and convert explicitly before adding.
        const elapsed_ms: u64 = @intCast(@max(log.duration_ms, 0));
        total_time_ms +|= elapsed_ms;
    }

    // Saturate instead of invoking unchecked-cast behaviour should the slice
    // ever hold more entries than a u32 can count.
    const total_attempts = std.math.cast(u32, logs.len) orelse std.math.maxInt(u32);

    // logs.len is non-zero here (see the early return), so both divisions
    // below are safe.
    const success_rate = @as(f64, @floatFromInt(successful)) / @as(f64, @floatFromInt(logs.len));
    const avg_time_ms = total_time_ms / @as(u64, @intCast(logs.len));

    // Count most common failure
    // TODO: implement frequency counting

    return .{
        .total_attempts = total_attempts,
        .successful_recoveries = successful,
        .failed_recoveries = total_attempts - successful,
        .avg_recovery_time_ms = avg_time_ms,
        .most_common_failure = try allocator.dupe(u8, "unknown"),
        .success_rate = success_rate,
    };
}
| 266 | + |
| 267 | + - name: escalate_failure |
| 268 | + given: component that failed max recovery attempts |
| 269 | + when: automatic recovery exhausted |
| 270 | + then: triggers manual intervention alert |
| 271 | + implementation: | |
/// Formats the alert text for a component whose automatic recovery has been
/// exhausted.
///
/// The message lists `status.component`, `status.error_count`,
/// `status.last_error` and the attempt counter
/// `status.recovery_attempts` / `config.max_recovery_attempts`.
///
/// The returned slice is allocated with `allocator` and owned by the caller.
/// Fails with the error reported by `std.fmt.allocPrint`.
pub fn escalateFailure(allocator: Allocator, status: HealthStatus, config: AutoHealingConfig) ![]const u8 {
    const alert_template =
        \\🚨 CRITICAL: Auto-recovery exhausted
        \\Component: {s}
        \\Errors: {d}
        \\Last error: {s}
        \\Attempts: {d}/{d}
        \\Manual intervention REQUIRED
    ;

    // Values in the order the template's placeholders consume them.
    const alert_fields = .{
        status.component,
        status.error_count,
        status.last_error,
        status.recovery_attempts,
        config.max_recovery_attempts,
    };

    const alert_text = try std.fmt.allocPrint(allocator, alert_template, alert_fields);

    // Send alert notification
    // TODO: Telegram integration

    return alert_text;
}
0 commit comments