55 → Health verification → Slack post-mortem
66"""
77
8+ import logging
89import time
910
1011from autosre .config import get_settings
1112from autosre .incident_detection import get_incident_stream
1213from autosre .log_storage import LogStore
13- from autosre .models import IncidentType , PostMortemReport , RecoveryStatus
14+ from autosre .models import Diagnosis , IncidentType , PostMortemReport , RecoveryStatus
1415from autosre .planner import PlannerAgent
1516from autosre .reasoning_agent import ReasoningAgent
17+ from autosre .reasoning_agent .agent import FALLBACK_DIAGNOSIS
1618from autosre .recovery_verification import RecoveryMonitor
1719from autosre .slack_reporter import SlackReporter
1820from autosre .ui_automation import UIActionAgent
1921
22+ logger = logging .getLogger (__name__ )
23+
24+
def _publish_report(slack: SlackReporter, report: PostMortemReport) -> None:
    """Send a post-mortem to Slack on a best-effort basis.

    Any exception raised by ``slack.publish`` is logged at WARNING level
    (traceback included) and then suppressed, so a reporting failure never
    aborts the surrounding workflow.

    Args:
        slack: Reporter whose ``publish`` method delivers the report.
        report: The post-mortem to deliver.
    """
    try:
        slack.publish(report)
    except Exception as exc:  # deliberate catch-all: reporting must not crash the run
        logger.warning("Slack publish failed: %s", exc, exc_info=True)
31+
32+
def _build_report(
    incident_id: str,
    detected_at: str,
    diagnosis: Diagnosis,
    recovery_seconds: float,
    status: RecoveryStatus,
    extra_timeline: list[str] | None = None,
    *,
    prevention_suggestion: str = "Add memory profiling to CI pipeline",
) -> PostMortemReport:
    """Build a post-mortem report from an incident, its diagnosis, and the outcome.

    Args:
        incident_id: Identifier of the incident being reported.
        detected_at: Already-formatted detection timestamp; it is inserted
            verbatim into the first timeline entry.
        diagnosis: Source of the root-cause text (``summary``) and the action
            label (``recommended_action.value``).
        recovery_seconds: Elapsed recovery time; rendered in the timeline
            rounded to whole seconds.
        status: Recovery outcome; its ``value`` appears in the timeline.
        extra_timeline: Optional entries appended after the four standard
            timeline lines. The caller's list is not modified.
        prevention_suggestion: Text for the report's prevention field.
            Defaults to the previously hard-coded suggestion so existing
            callers see no change.

    Returns:
        A ``PostMortemReport`` populated from the arguments above.
    """
    # Read the diagnosis once so the timeline and the report fields cannot drift apart.
    root_cause = diagnosis.summary
    action_taken = diagnosis.recommended_action.value

    timeline = [
        f"Alert received: {detected_at}",
        f"Root cause: {root_cause}",
        f"Action: {action_taken}",
        f"Recovery: {status.value} in {recovery_seconds:.0f}s",
        # None and an empty list both contribute nothing.
        *(extra_timeline or ()),
    ]

    return PostMortemReport(
        incident_id=incident_id,
        root_cause=root_cause,
        action_taken=action_taken,
        recovery_time_seconds=recovery_seconds,
        prevention_suggestion=prevention_suggestion,
        timeline=timeline,
    )
58+
2059
2160def run_once (incident_type : IncidentType | None = None ) -> bool :
2261 """
2362 Run one full cycle: detect one incident, diagnose, act, verify, report.
2463
2564 For demo pass incident_type=IncidentType.LATENCY_SPIKE.
26- Returns True if the cycle completed successfully.
65+ Returns True if the cycle completed successfully (recovered). On escalation,
66+ UI failure, or verification failure still publishes a post-mortem when possible.
2767 """
2868 settings = get_settings ()
2969 log_store = LogStore (data_dir = settings .log_storage_data_dir or None )
@@ -44,49 +84,86 @@ def run_once(incident_type: IncidentType | None = None) -> bool:
4484 stream = get_incident_stream (incident_type = incident_type )
4585 incident = next (stream , None )
4686 if not incident :
87+ logger .warning ("No incident received" )
4788 return False
48- log_store .record_incident (incident )
4989
50- # 2. Root cause analysis (Nova)
90+ try :
91+ log_store .record_incident (incident )
92+ except Exception as e :
93+ logger .warning ("Failed to record incident: %s" , e , exc_info = True )
94+
95+ # 2. Root cause analysis (Nova), with retries
5196 logs = log_store .get_logs_for_incident (incident )
5297 deployment_history = log_store .get_deployment_history (incident .service_name )
53- diagnosis = reasoning .analyze (incident , logs , deployment_history )
98+ diagnosis : Diagnosis = FALLBACK_DIAGNOSIS
99+ max_attempts = 1 + max (0 , settings .reasoning_max_retries )
100+ for attempt in range (max_attempts ):
101+ try :
102+ result = reasoning .analyze (incident , logs , deployment_history )
103+ if result is not None :
104+ diagnosis = result
105+ break
106+ except Exception as e :
107+ logger .warning ("Reasoning attempt %s failed: %s" , attempt + 1 , e , exc_info = True )
108+ if attempt == max_attempts - 1 :
109+ diagnosis = FALLBACK_DIAGNOSIS
54110
55111 # 3. Plan actions
56112 actions = planner .plan (diagnosis )
57113 if not actions :
58- # e.g. ESCALATE
114+ logger .info ("No actions (e.g. escalate); publishing escalation report" )
115+ report = _build_report (
116+ incident .incident_id ,
117+ incident .detected_at .isoformat (),
118+ diagnosis ,
119+ 0.0 ,
120+ RecoveryStatus .UNKNOWN ,
121+ extra_timeline = ["Escalated; no automated action taken." ],
122+ )
123+ _publish_report (slack , report )
59124 return False
60125
61126 # 4. UI automation (Nova Act)
62127 action_start_time = time .monotonic ()
63128 success = ui_agent .execute (actions , service_name = incident .service_name )
64129 if not success :
130+ logger .warning ("UI automation failed; publishing report" )
131+ report = _build_report (
132+ incident .incident_id ,
133+ incident .detected_at .isoformat (),
134+ diagnosis ,
135+ 0.0 ,
136+ RecoveryStatus .NOT_RECOVERED ,
137+ extra_timeline = ["UI action execution failed." ],
138+ )
139+ _publish_report (slack , report )
65140 return False
66141
67142 # 5. Recovery verification
68- status = monitor .verify (
69- incident .incident_id ,
70- incident .service_name ,
71- action_start_time = action_start_time ,
72- )
73- recovery_seconds = monitor .get_recovery_time_seconds ()
143+ timeout = settings .recovery_verify_timeout_seconds
144+ recovery_seconds : float = 0.0
145+ try :
146+ status = monitor .verify (
147+ incident .incident_id ,
148+ incident .service_name ,
149+ timeout_seconds = timeout ,
150+ action_start_time = action_start_time ,
151+ )
152+ recovery_seconds = monitor .get_recovery_time_seconds ()
153+ except Exception as e :
154+ logger .warning ("Recovery verification failed: %s" , e , exc_info = True )
155+ status = RecoveryStatus .NOT_RECOVERED
156+ recovery_seconds = timeout
74157
75158 # 6. Post-mortem to Slack
76- report = PostMortemReport (
77- incident_id = incident .incident_id ,
78- root_cause = diagnosis .summary ,
79- action_taken = f"{ diagnosis .recommended_action .value } " ,
80- recovery_time_seconds = recovery_seconds ,
81- prevention_suggestion = "Add memory profiling to CI pipeline" ,
82- timeline = [
83- f"Alert received: { incident .detected_at .isoformat ()} " ,
84- "Root cause: bad deployment v1.4.2" ,
85- "Action: rollback to v1.4.1" ,
86- f"Recovery: { status .value } in { recovery_seconds :.0f} s" ,
87- ],
159+ report = _build_report (
160+ incident .incident_id ,
161+ incident .detected_at .isoformat (),
162+ diagnosis ,
163+ recovery_seconds ,
164+ status ,
88165 )
89- slack . publish ( report )
166+ _publish_report ( slack , report )
90167
91168 return status == RecoveryStatus .RECOVERED
92169
0 commit comments