Skip to content

Commit 1ad14b8

Browse files
committed
feat(vibee-v10): Cycle 48 Phase 7 Swarm Watch — auto_healing + self_scale + self_improving_v2 (1040 lines .vibee → 1504 LOC generated)
2 parents a2909e5 + 4748966 commit 1ad14b8

3 files changed

Lines changed: 1040 additions & 0 deletions

File tree

Lines changed: 292 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,292 @@
1+
# Auto-Healing Swarm Watch (Phase 7) - MGEN-011
2+
# Automatic detection and recovery from system failures
3+
4+
name: auto_healing
5+
version: "1.0.0"
6+
language: zig
7+
module: auto_healing
8+
9+
types:
10+
HealthStatus:
11+
fields:
12+
component: String
13+
status: String
14+
last_check: Int
15+
error_count: Int
16+
last_error: String
17+
recovery_attempts: Int
18+
19+
HealingAction:
20+
fields:
21+
action_type: String
22+
target_component: String
23+
priority: Int
24+
description: String
25+
estimated_duration_sec: Int
26+
27+
RecoveryResult:
28+
fields:
29+
success: Bool
30+
action_taken: String
31+
time_taken_sec: Int
32+
new_status: String
33+
error_message: Option<String>
34+
35+
AutoHealingConfig:
36+
fields:
37+
enabled: Bool
38+
max_recovery_attempts: Int
39+
cooldown_between_attempts_sec: Int
40+
auto_restart_enabled: Bool
41+
notification_on_failure: Bool
42+
43+
HealingLog:
44+
fields:
45+
timestamp: Int
46+
component: String
47+
issue_detected: String
48+
action_performed: String
49+
result: String
50+
duration_ms: Int
51+
52+
behaviors:
53+
- name: detect_failure
54+
given: component health status
55+
when: component reports error or timeout
56+
then: returns true if failure detected and recovery needed
57+
implementation: |
58+
/// Reports whether `status` describes a component that needs recovery.
///
/// A component counts as failed when its status string is "error" or
/// "timeout", when `error_count` is 3 or more, or when more than 300 seconds
/// separate `std.time.timestamp()` from `status.last_check`.
pub fn detectFailure(status: HealthStatus) bool {
    const reported_failure = std.mem.eql(u8, status.status, "error") or
        std.mem.eql(u8, status.status, "timeout");
    if (reported_failure or status.error_count >= 3) return true;

    // A record that has gone more than five minutes without an update is
    // treated as a failure in its own right.
    const seconds_since_check = std.time.timestamp() - status.last_check;
    return seconds_since_check > 300;
}
71+
72+
- name: determine_recovery_action
73+
given: failed component and error type
74+
when: deciding how to recover
75+
then: returns appropriate HealingAction based on failure pattern
76+
implementation: |
77+
/// Chooses a `HealingAction` for the failed component described by `status`.
///
/// The failure is classified in this order: `last_error` contains
/// "connection refused", `last_error` contains "out of memory",
/// `error_count` is 5 or more, anything else. The first match wins.
///
/// `description` is allocated with `allocator`; `action_type` is a string
/// literal and `target_component` borrows `status.component`.
pub fn determineRecoveryAction(allocator: Allocator, status: HealthStatus) !HealingAction {
    const FailurePattern = enum { connection_refused, out_of_memory, repeated_errors, unclassified };

    const last_error = status.last_error;
    const pattern: FailurePattern = if (std.mem.indexOf(u8, last_error, "connection refused") != null)
        .connection_refused
    else if (std.mem.indexOf(u8, last_error, "out of memory") != null)
        .out_of_memory
    else if (status.error_count >= 5)
        .repeated_errors
    else
        .unclassified;

    return switch (pattern) {
        .connection_refused => HealingAction{
            .action_type = "restart",
            .target_component = status.component,
            .priority = 10,
            .description = try allocator.dupe(u8, "Restart connection"),
            .estimated_duration_sec = 5,
        },
        .out_of_memory => HealingAction{
            .action_type = "scale_up",
            .target_component = status.component,
            .priority = 9,
            .description = try allocator.dupe(u8, "Increase memory allocation"),
            .estimated_duration_sec = 30,
        },
        .repeated_errors => HealingAction{
            .action_type = "full_restart",
            .target_component = status.component,
            .priority = 8,
            .description = try allocator.dupe(u8, "Full component restart"),
            .estimated_duration_sec = 10,
        },
        // Fallback when no known pattern matches: a plain restart.
        .unclassified => HealingAction{
            .action_type = "restart",
            .target_component = status.component,
            .priority = 5,
            .description = try allocator.dupe(u8, "Standard recovery restart"),
            .estimated_duration_sec = 5,
        },
    };
}
120+
121+
- name: execute_recovery
122+
given: healing action and auto-healing config
123+
when: performing recovery operation
124+
then: executes action and returns RecoveryResult
125+
implementation: |
126+
/// Runs the recovery routine selected by `action.action_type` and reports the outcome.
///
/// When `config.enabled` is false nothing is executed and a "skipped" result
/// is returned. Otherwise "restart", "scale_up" and "full_restart" dispatch to
/// the matching helper; any other action type fails with
/// `error.UnknownActionType`.
///
/// `error_message`, when non-null, is allocated with `allocator` and owned by
/// the caller. `action_taken` is either the literal "skipped" or borrows
/// `action.action_type`. Allocation failures are propagated.
pub fn executeRecovery(allocator: Allocator, action: HealingAction, config: AutoHealingConfig) !RecoveryResult {
    if (!config.enabled) {
        return .{
            .success = false,
            .action_taken = "skipped",
            .time_taken_sec = 0,
            .new_status = "unchanged",
            .error_message = try allocator.dupe(u8, "Auto-healing disabled"),
        };
    }

    const start = std.time.timestamp();

    // The unknown case has to leave the function right here. Using
    // `error.UnknownActionType` as a branch value would turn the whole
    // expression into an error union, which cannot be used as a plain bool
    // in the result below.
    const succeeded: bool = if (std.mem.eql(u8, action.action_type, "restart"))
        executeRestart(action.target_component)
    else if (std.mem.eql(u8, action.action_type, "scale_up"))
        executeScaleUp(action.target_component)
    else if (std.mem.eql(u8, action.action_type, "full_restart"))
        executeFullRestart(action.target_component)
    else
        return error.UnknownActionType;

    // `std.time.timestamp()` is wall-clock time and may step backwards; clamp
    // so the cast below never sees a negative duration.
    const elapsed = @max(std.time.timestamp() - start, 0);

    return .{
        .success = succeeded,
        .action_taken = action.action_type,
        .time_taken_sec = @intCast(elapsed),
        .new_status = if (succeeded) "operational" else "failed",
        .error_message = if (succeeded) null else try allocator.dupe(u8, "Recovery failed"),
    };
}
157+
158+
// Helper functions (stubs for now)
159+
/// Restart hook for `component`. Currently a stub: the argument is ignored
/// and success is always reported.
fn executeRestart(component: []const u8) bool {
    // TODO: actual restart logic
    _ = component;
    return true;
}
164+
165+
/// Scale-up hook for `component`. Currently a stub: the argument is ignored
/// and success is always reported.
fn executeScaleUp(component: []const u8) bool {
    // TODO: actual scale up logic
    _ = component;
    return true;
}
170+
171+
/// Full-restart hook for `component`. Currently a stub: the argument is
/// ignored and success is always reported.
fn executeFullRestart(component: []const u8) bool {
    // TODO: actual full restart logic
    _ = component;
    return true;
}
176+
177+
- name: check_recovery_cooldown
178+
given: component recovery history
179+
when: checking if recovery allowed
180+
then: returns true if cooldown period has passed
181+
implementation: |
182+
/// Reports whether a new recovery attempt for `component` is allowed.
///
/// Scans `logs` for the most recent entry whose `component` equals the given
/// name. Returns true when there is no such entry, or when at least
/// `cooldown_sec` seconds separate that entry's timestamp from
/// `std.time.timestamp()`.
///
/// `logs` is only read, so it is taken as a const slice; mutable slices still
/// coerce at existing call sites.
pub fn checkRecoveryCooldown(logs: []const HealingLog, component: []const u8, cooldown_sec: i64) bool {
    // An optional, not a 0 sentinel, distinguishes "no previous attempt" from
    // an attempt that happens to carry timestamp 0.
    var last_attempt: ?i64 = null;
    for (logs) |log| {
        if (!std.mem.eql(u8, log.component, component)) continue;
        last_attempt = if (last_attempt) |previous|
            @max(previous, log.timestamp)
        else
            log.timestamp;
    }

    const latest = last_attempt orelse return true; // No previous attempts

    // Saturating subtraction: a corrupt, extreme timestamp must not trap.
    const elapsed = std.time.timestamp() -| latest;
    return elapsed >= cooldown_sec;
}
200+
201+
- name: log_recovery_attempt
202+
given: recovery action and result
203+
when: recording healing operation
204+
then: creates HealingLog entry
205+
implementation: |
206+
/// Builds a `HealingLog` entry for a finished recovery attempt.
///
/// `component`, `action.description` and `action.action_type` are copied with
/// `allocator`; the caller owns those three slices in the returned entry.
/// `result` is a static string literal ("success" or "failed") and must not
/// be freed. The timestamp is taken from `std.time.timestamp()`.
///
/// Returns an allocation error if any copy fails; copies made before the
/// failure are released, so nothing leaks on the error path.
pub fn logRecoveryAttempt(allocator: Allocator, component: []const u8, action: HealingAction, result: RecoveryResult, duration_ms: i64) !HealingLog {
    const component_copy = try allocator.dupe(u8, component);
    errdefer allocator.free(component_copy);

    const issue_copy = try allocator.dupe(u8, action.description);
    errdefer allocator.free(issue_copy);

    // Last fallible step, so no errdefer is needed after it.
    const action_copy = try allocator.dupe(u8, action.action_type);

    return .{
        .timestamp = std.time.timestamp(),
        .component = component_copy,
        .issue_detected = issue_copy,
        .action_performed = action_copy,
        .result = if (result.success) "success" else "failed",
        .duration_ms = duration_ms,
    };
}
216+
217+
- name: get_healing_stats
218+
given: healing logs
219+
when: analyzing recovery effectiveness
220+
then: returns statistics about success rate, common failures
221+
implementation: |
222+
/// Aggregate view over a set of healing log entries, as produced by
/// `getHealingStats`.
pub const HealingStats = struct {
    /// Number of log entries that were summarised.
    total_attempts: u32,
    /// Entries whose result was "success".
    successful_recoveries: u32,
    /// Entries that were not counted as successful.
    failed_recoveries: u32,
    /// Total recorded duration divided by the number of entries, in milliseconds.
    avg_recovery_time_ms: u64,
    /// Label for the most frequent failure; allocated by `getHealingStats`
    /// with the allocator it was given.
    most_common_failure: []const u8,
    /// Successful entries divided by total entries; 1.0 when there are no entries.
    success_rate: f64,
};
230+
231+
/// Summarises `logs` into a `HealingStats` value.
///
/// - An entry counts as successful when its `result` equals "success"; every
///   other entry counts as failed.
/// - `avg_recovery_time_ms` is the integer mean of `duration_ms` over all
///   entries. Negative durations contribute 0.
/// - `most_common_failure` is the `issue_detected` text that occurs most often
///   among failed entries (the first one to reach the highest count wins a
///   tie), or "none" when there are no failed entries.
/// - For empty `logs` all counters are 0 and `success_rate` is 1.0.
///
/// `most_common_failure` is allocated with `allocator` and owned by the
/// caller. Allocation failures are propagated. `logs` is only read, so it is
/// taken as a const slice; mutable slices still coerce at existing call sites.
pub fn getHealingStats(allocator: Allocator, logs: []const HealingLog) !HealingStats {
    if (logs.len == 0) {
        return .{
            .total_attempts = 0,
            .successful_recoveries = 0,
            .failed_recoveries = 0,
            .avg_recovery_time_ms = 0,
            .most_common_failure = try allocator.dupe(u8, "none"),
            .success_rate = 1.0,
        };
    }

    // Keys borrow `issue_detected` from `logs`, which outlives this map.
    var failure_counts = std.StringHashMap(u32).init(allocator);
    defer failure_counts.deinit();

    var successful: u32 = 0;
    var total_time: u64 = 0;
    var most_common: []const u8 = "none";
    var most_common_count: u32 = 0;

    for (logs) |log| {
        // `duration_ms` is signed while the accumulator is unsigned; a checked
        // cast keeps this compiling and maps bogus negative durations to 0.
        total_time +|= std.math.cast(u64, log.duration_ms) orelse 0;

        if (std.mem.eql(u8, log.result, "success")) {
            successful += 1;
            continue;
        }

        const entry = try failure_counts.getOrPut(log.issue_detected);
        if (!entry.found_existing) entry.value_ptr.* = 0;
        entry.value_ptr.* += 1;

        // Strictly greater: the first issue to reach the top count is kept.
        if (entry.value_ptr.* > most_common_count) {
            most_common_count = entry.value_ptr.*;
            most_common = log.issue_detected;
        }
    }

    // `logs.len` is non-zero here because of the early return above.
    const success_rate = @as(f64, @floatFromInt(successful)) / @as(f64, @floatFromInt(logs.len));
    const avg_time = total_time / @as(u64, @intCast(logs.len));

    return .{
        .total_attempts = @intCast(logs.len),
        .successful_recoveries = successful,
        .failed_recoveries = @intCast(logs.len - successful),
        .avg_recovery_time_ms = avg_time,
        .most_common_failure = try allocator.dupe(u8, most_common),
        .success_rate = success_rate,
    };
}
266+
267+
- name: escalate_failure
268+
given: component that failed max recovery attempts
269+
when: automatic recovery exhausted
270+
then: triggers manual intervention alert
271+
implementation: |
272+
/// Formats the alert text used once automatic recovery has been exhausted.
///
/// The message lists the component name, its error count, its last error and
/// `status.recovery_attempts` out of `config.max_recovery_attempts`. The
/// returned slice is allocated with `allocator` and owned by the caller.
/// No notification is sent from here; the text is only returned.
pub fn escalateFailure(allocator: Allocator, status: HealthStatus, config: AutoHealingConfig) ![]const u8 {
    const alert_template =
        \\🚨 CRITICAL: Auto-recovery exhausted
        \\Component: {s}
        \\Errors: {d}
        \\Last error: {s}
        \\Attempts: {d}/{d}
        \\Manual intervention REQUIRED
    ;

    // Send alert notification
    // TODO: Telegram integration
    return try std.fmt.allocPrint(allocator, alert_template, .{
        status.component,
        status.error_count,
        status.last_error,
        status.recovery_attempts,
        config.max_recovery_attempts,
    });
}

0 commit comments

Comments
 (0)