Skip to content

Commit 235e580

Browse files
Phase 7: workflow hardening; ruff format
- Config: reasoning_max_retries, recovery_verify_timeout_seconds - Workflow: logging, try/except, reasoning retries, post-mortem on escalation/UI/verify failure - Slack publish wrapped in try/except; _build_report/_publish_report helpers - Tests: no_actions, ui_failure, verify_exception; README updated - Ruff format: store.py, test_reasoning_agent.py
1 parent 122fab2 commit 235e580

6 files changed

Lines changed: 200 additions & 51 deletions

File tree

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,8 @@ ruff format --check src dashboard tests
135135
- **Phase 2:** Operations dashboard (FastAPI + static HTML) with login, services, Deployments panel, Rollback, and /api/health for recovery verification.
136136
- **Phase 5:** Slack reporter: real publish via `slack_sdk.WebClient` and Block Kit; fallback text; no token/channel → skip.
137137
- **Phase 6:** Incident/log storage: `LogStore` records incidents, append_log/append_deployment, get_logs_for_incident and get_deployment_history with stub fallbacks; optional file persistence via `LOG_STORAGE_DATA_DIR`.
138-
- **Next:** Phase 7 (workflow hardening), Phase 8 (demo script).
138+
- **Phase 7:** Workflow hardening: logging, try/except around record_incident/reasoning/verify/Slack; reasoning retries (`REASONING_MAX_RETRIES`); post-mortem on escalation, UI failure, or verify exception; config `recovery_verify_timeout_seconds`.
139+
- **Next:** Phase 8 (demo script).
139140

140141
---
141142

src/autosre/config.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,10 @@ class Settings(BaseSettings):
4141
# Phase 6: incident / log storage (optional file persistence)
4242
log_storage_data_dir: str = ""
4343

44+
# Phase 7: workflow hardening
45+
reasoning_max_retries: int = 2
46+
recovery_verify_timeout_seconds: float = 120.0
47+
4448

4549
def get_settings() -> Settings:
4650
"""Return loaded settings (singleton-style)."""

src/autosre/log_storage/store.py

Lines changed: 28 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ class LogStore:
3636
"""
3737

3838
def __init__(self, data_dir: str | None = None) -> None:
39-
self._data_dir = (Path(data_dir) if data_dir else None)
39+
self._data_dir = Path(data_dir) if data_dir else None
4040
self._incidents: list[dict] = []
4141
self._log_entries: list[dict] = [] # service_name, timestamp, message
4242
self._deployments: list[dict] = [] # service_name, version, timestamp, status
@@ -88,21 +88,23 @@ def get_incident(self, incident_id: str) -> IncidentEvent | None:
8888
incident_id=p["incident_id"],
8989
incident_type=IncidentType(p["incident_type"]),
9090
service_name=p["service_name"],
91-
detected_at=datetime.fromisoformat(
92-
p["detected_at"].replace("Z", "+00:00")
93-
),
91+
detected_at=datetime.fromisoformat(p["detected_at"].replace("Z", "+00:00")),
9492
raw_payload=p.get("raw_payload") or {},
9593
)
9694
return None
9795

98-
def append_log(self, service_name: str, message: str, timestamp: datetime | None = None) -> None:
96+
def append_log(
97+
self, service_name: str, message: str, timestamp: datetime | None = None
98+
) -> None:
9999
"""Append a log line for the given service (for RCA)."""
100100
ts = timestamp or datetime.utcnow()
101-
self._log_entries.append({
102-
"service_name": service_name,
103-
"timestamp": _iso(ts),
104-
"message": message,
105-
})
101+
self._log_entries.append(
102+
{
103+
"service_name": service_name,
104+
"timestamp": _iso(ts),
105+
"message": message,
106+
}
107+
)
106108
self._save(_LOG_ENTRIES_FILE, self._log_entries)
107109

108110
def get_logs_for_incident(self, incident: IncidentEvent, window_seconds: int = 3600) -> str:
@@ -135,19 +137,28 @@ def append_deployment(
135137
) -> None:
136138
"""Record a deployment event for a service."""
137139
ts = _iso(timestamp) if isinstance(timestamp, datetime) else str(timestamp)
138-
self._deployments.append({
139-
"service_name": service_name,
140-
"version": version,
141-
"timestamp": ts,
142-
"status": status,
143-
})
140+
self._deployments.append(
141+
{
142+
"service_name": service_name,
143+
"version": version,
144+
"timestamp": ts,
145+
"status": status,
146+
}
147+
)
144148
self._save(_DEPLOYMENTS_FILE, self._deployments)
145149

146150
def get_deployment_history(self, service_name: str, limit: int = 5) -> list[dict]:
147151
"""Return recent deployments for the service. Fallback to stub list if empty."""
148152
filtered = [d for d in self._deployments if d.get("service_name") == service_name]
149153
filtered.sort(key=lambda d: d.get("timestamp", ""), reverse=True)
150-
result = [{"version": d.get("version"), "timestamp": d.get("timestamp"), "status": d.get("status")} for d in filtered[:limit]]
154+
result = [
155+
{
156+
"version": d.get("version"),
157+
"timestamp": d.get("timestamp"),
158+
"status": d.get("status"),
159+
}
160+
for d in filtered[:limit]
161+
]
151162
if result:
152163
return result
153164
return STUB_DEPLOYMENTS.copy()

src/autosre/workflow.py

Lines changed: 102 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -5,25 +5,65 @@
55
→ Health verification → Slack post-mortem
66
"""
77

8+
import logging
89
import time
910

1011
from autosre.config import get_settings
1112
from autosre.incident_detection import get_incident_stream
1213
from autosre.log_storage import LogStore
13-
from autosre.models import IncidentType, PostMortemReport, RecoveryStatus
14+
from autosre.models import Diagnosis, IncidentType, PostMortemReport, RecoveryStatus
1415
from autosre.planner import PlannerAgent
1516
from autosre.reasoning_agent import ReasoningAgent
17+
from autosre.reasoning_agent.agent import FALLBACK_DIAGNOSIS
1618
from autosre.recovery_verification import RecoveryMonitor
1719
from autosre.slack_reporter import SlackReporter
1820
from autosre.ui_automation import UIActionAgent
1921

22+
logger = logging.getLogger(__name__)
23+
24+
25+
def _publish_report(slack: SlackReporter, report: PostMortemReport) -> None:
26+
"""Publish post-mortem to Slack; log and swallow errors so workflow does not crash."""
27+
try:
28+
slack.publish(report)
29+
except Exception as e:
30+
logger.warning("Slack publish failed: %s", e, exc_info=True)
31+
32+
33+
def _build_report(
34+
incident_id: str,
35+
detected_at: str,
36+
diagnosis: Diagnosis,
37+
recovery_seconds: float,
38+
status: RecoveryStatus,
39+
extra_timeline: list[str] | None = None,
40+
) -> PostMortemReport:
41+
"""Build a post-mortem report from incident, diagnosis, and verification result."""
42+
timeline = [
43+
f"Alert received: {detected_at}",
44+
f"Root cause: {diagnosis.summary}",
45+
f"Action: {diagnosis.recommended_action.value}",
46+
f"Recovery: {status.value} in {recovery_seconds:.0f}s",
47+
]
48+
if extra_timeline:
49+
timeline.extend(extra_timeline)
50+
return PostMortemReport(
51+
incident_id=incident_id,
52+
root_cause=diagnosis.summary,
53+
action_taken=diagnosis.recommended_action.value,
54+
recovery_time_seconds=recovery_seconds,
55+
prevention_suggestion="Add memory profiling to CI pipeline",
56+
timeline=timeline,
57+
)
58+
2059

2160
def run_once(incident_type: IncidentType | None = None) -> bool:
2261
"""
2362
Run one full cycle: detect one incident, diagnose, act, verify, report.
2463
2564
For demo pass incident_type=IncidentType.LATENCY_SPIKE.
26-
Returns True if the cycle completed successfully.
65+
Returns True if the cycle completed successfully (recovered). On escalation,
66+
UI failure, or verification failure still publishes a post-mortem when possible.
2767
"""
2868
settings = get_settings()
2969
log_store = LogStore(data_dir=settings.log_storage_data_dir or None)
@@ -44,49 +84,86 @@ def run_once(incident_type: IncidentType | None = None) -> bool:
4484
stream = get_incident_stream(incident_type=incident_type)
4585
incident = next(stream, None)
4686
if not incident:
87+
logger.warning("No incident received")
4788
return False
48-
log_store.record_incident(incident)
4989

50-
# 2. Root cause analysis (Nova)
90+
try:
91+
log_store.record_incident(incident)
92+
except Exception as e:
93+
logger.warning("Failed to record incident: %s", e, exc_info=True)
94+
95+
# 2. Root cause analysis (Nova), with retries
5196
logs = log_store.get_logs_for_incident(incident)
5297
deployment_history = log_store.get_deployment_history(incident.service_name)
53-
diagnosis = reasoning.analyze(incident, logs, deployment_history)
98+
diagnosis: Diagnosis = FALLBACK_DIAGNOSIS
99+
max_attempts = 1 + max(0, settings.reasoning_max_retries)
100+
for attempt in range(max_attempts):
101+
try:
102+
result = reasoning.analyze(incident, logs, deployment_history)
103+
if result is not None:
104+
diagnosis = result
105+
break
106+
except Exception as e:
107+
logger.warning("Reasoning attempt %s failed: %s", attempt + 1, e, exc_info=True)
108+
if attempt == max_attempts - 1:
109+
diagnosis = FALLBACK_DIAGNOSIS
54110

55111
# 3. Plan actions
56112
actions = planner.plan(diagnosis)
57113
if not actions:
58-
# e.g. ESCALATE
114+
logger.info("No actions (e.g. escalate); publishing escalation report")
115+
report = _build_report(
116+
incident.incident_id,
117+
incident.detected_at.isoformat(),
118+
diagnosis,
119+
0.0,
120+
RecoveryStatus.UNKNOWN,
121+
extra_timeline=["Escalated; no automated action taken."],
122+
)
123+
_publish_report(slack, report)
59124
return False
60125

61126
# 4. UI automation (Nova Act)
62127
action_start_time = time.monotonic()
63128
success = ui_agent.execute(actions, service_name=incident.service_name)
64129
if not success:
130+
logger.warning("UI automation failed; publishing report")
131+
report = _build_report(
132+
incident.incident_id,
133+
incident.detected_at.isoformat(),
134+
diagnosis,
135+
0.0,
136+
RecoveryStatus.NOT_RECOVERED,
137+
extra_timeline=["UI action execution failed."],
138+
)
139+
_publish_report(slack, report)
65140
return False
66141

67142
# 5. Recovery verification
68-
status = monitor.verify(
69-
incident.incident_id,
70-
incident.service_name,
71-
action_start_time=action_start_time,
72-
)
73-
recovery_seconds = monitor.get_recovery_time_seconds()
143+
timeout = settings.recovery_verify_timeout_seconds
144+
recovery_seconds: float = 0.0
145+
try:
146+
status = monitor.verify(
147+
incident.incident_id,
148+
incident.service_name,
149+
timeout_seconds=timeout,
150+
action_start_time=action_start_time,
151+
)
152+
recovery_seconds = monitor.get_recovery_time_seconds()
153+
except Exception as e:
154+
logger.warning("Recovery verification failed: %s", e, exc_info=True)
155+
status = RecoveryStatus.NOT_RECOVERED
156+
recovery_seconds = timeout
74157

75158
# 6. Post-mortem to Slack
76-
report = PostMortemReport(
77-
incident_id=incident.incident_id,
78-
root_cause=diagnosis.summary,
79-
action_taken=f"{diagnosis.recommended_action.value}",
80-
recovery_time_seconds=recovery_seconds,
81-
prevention_suggestion="Add memory profiling to CI pipeline",
82-
timeline=[
83-
f"Alert received: {incident.detected_at.isoformat()}",
84-
"Root cause: bad deployment v1.4.2",
85-
"Action: rollback to v1.4.1",
86-
f"Recovery: {status.value} in {recovery_seconds:.0f}s",
87-
],
159+
report = _build_report(
160+
incident.incident_id,
161+
incident.detected_at.isoformat(),
162+
diagnosis,
163+
recovery_seconds,
164+
status,
88165
)
89-
slack.publish(report)
166+
_publish_report(slack, report)
90167

91168
return status == RecoveryStatus.RECOVERED
92169

tests/test_reasoning_agent.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616

1717
def test_parse_diagnosis_from_text_valid_json():
18-
text = '''{"summary": "Memory leak in v1.4.2", "confidence": 0.9, "recommended_action": "rollback", "reasoning": "Logs point to deployment."}'''
18+
text = """{"summary": "Memory leak in v1.4.2", "confidence": 0.9, "recommended_action": "rollback", "reasoning": "Logs point to deployment."}"""
1919
d = _parse_diagnosis_from_text(text)
2020
assert d is not None
2121
assert d.summary == "Memory leak in v1.4.2"
@@ -109,14 +109,12 @@ def test_reasoning_agent_bedrock_success(mock_get_client, sample_incident):
109109

110110

111111
@patch("autosre.reasoning_agent.agent._get_bedrock_client")
112-
def test_reasoning_agent_bedrock_invalid_response_returns_fallback(mock_get_client, sample_incident):
112+
def test_reasoning_agent_bedrock_invalid_response_returns_fallback(
113+
mock_get_client, sample_incident
114+
):
113115
mock_client = MagicMock()
114116
mock_client.converse.return_value = {
115-
"output": {
116-
"message": {
117-
"content": [{"text": "I'm not JSON, just prose."}]
118-
}
119-
}
117+
"output": {"message": {"content": [{"text": "I'm not JSON, just prose."}]}}
120118
}
121119
mock_get_client.return_value = mock_client
122120

tests/test_workflow.py

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""Smoke test for the full workflow (stub components)."""
1+
"""Smoke test for the full workflow (stub components) and Phase 7 hardening."""
22

33
from unittest.mock import MagicMock, patch
44

@@ -17,3 +17,61 @@ def test_run_once_latency_spike(mock_monitor_class):
1717
result = run_once(incident_type=IncidentType.LATENCY_SPIKE)
1818
assert result is True
1919
mock_monitor.verify.assert_called_once()
20+
21+
22+
@patch("autosre.workflow.SlackReporter")
23+
@patch("autosre.workflow.RecoveryMonitor")
24+
@patch("autosre.workflow.PlannerAgent")
25+
def test_run_once_no_actions_publishes_escalation_report(
26+
mock_planner_class, mock_monitor_class, mock_slack_class
27+
):
28+
"""When planner returns no actions (e.g. ESCALATE), publish report and return False."""
29+
mock_planner = MagicMock()
30+
mock_planner.plan.return_value = []
31+
mock_planner_class.return_value = mock_planner
32+
mock_slack = MagicMock()
33+
mock_slack_class.return_value = mock_slack
34+
35+
result = run_once(incident_type=IncidentType.LATENCY_SPIKE)
36+
assert result is False
37+
mock_slack.publish.assert_called_once()
38+
report = mock_slack.publish.call_args[0][0]
39+
assert any("Escalated" in line for line in report.timeline)
40+
41+
42+
@patch("autosre.workflow.SlackReporter")
43+
@patch("autosre.workflow.RecoveryMonitor")
44+
@patch("autosre.workflow.UIActionAgent")
45+
def test_run_once_ui_failure_publishes_report(mock_ui_class, mock_monitor_class, mock_slack_class):
46+
"""When UI agent fails, publish report and return False."""
47+
mock_ui = MagicMock()
48+
mock_ui.execute.return_value = False
49+
mock_ui_class.return_value = mock_ui
50+
mock_slack = MagicMock()
51+
mock_slack_class.return_value = mock_slack
52+
53+
result = run_once(incident_type=IncidentType.LATENCY_SPIKE)
54+
assert result is False
55+
mock_slack.publish.assert_called_once()
56+
report = mock_slack.publish.call_args[0][0]
57+
assert report.recovery_time_seconds == 0.0
58+
assert "UI action" in report.timeline[-1] or "failed" in report.timeline[-1].lower()
59+
60+
61+
@patch("autosre.workflow.SlackReporter")
62+
@patch("autosre.workflow.RecoveryMonitor")
63+
def test_run_once_verify_exception_returns_false_and_publishes(
64+
mock_monitor_class, mock_slack_class
65+
):
66+
"""When recovery verify raises, still publish report and return False."""
67+
mock_monitor = MagicMock()
68+
mock_monitor.verify.side_effect = RuntimeError("network error")
69+
mock_monitor_class.return_value = mock_monitor
70+
mock_slack = MagicMock()
71+
mock_slack_class.return_value = mock_slack
72+
73+
result = run_once(incident_type=IncidentType.LATENCY_SPIKE)
74+
assert result is False
75+
mock_slack.publish.assert_called_once()
76+
report = mock_slack.publish.call_args[0][0]
77+
assert report.recovery_time_seconds > 0 # timeout used as recovery_seconds

0 commit comments

Comments
 (0)