Skip to content

Commit 0711124

Browse files
Merge pull request #4 from Jeremiah-Sakuda/phase-4-recovery-verification
Phase 4: Recovery verification with real health polling
2 parents 78295a7 + fca8cdd commit 0711124

5 files changed

Lines changed: 187 additions & 12 deletions

File tree

src/autosre/config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ class Settings(BaseSettings):
3131
# Demo / operations dashboard
3232
operations_dashboard_url: str = "http://localhost:3000"
3333
incident_source: str = "simulated"
34+
# Health/metrics URL for recovery verification (default: dashboard + /api/health)
35+
metrics_url: str = ""
3436

3537
# UI automation (Nova Act): set False to use real browser; True for stub (CI/demo without browser)
3638
ui_stub: bool = True
Lines changed: 60 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,76 @@
11
"""Verifies service health after remediation actions."""
22

3+
import logging
34
import time
45

6+
import httpx
7+
58
from autosre.models import RecoveryStatus
69

10+
logger = logging.getLogger(__name__)
11+
12+
DEFAULT_POLL_INTERVAL = 3.0
13+
STUB_RECOVERY_SECONDS = 92.0
14+
715

816
class RecoveryMonitor:
917
"""
10-
Checks metrics (latency, error rate) to confirm recovery.
18+
Checks metrics (health endpoint) to confirm recovery.
1119
12-
Stub: waits briefly then returns RECOVERED for deterministic demo.
20+
When metrics_url is set: polls GET metrics_url until response has
21+
status == "healthy" or timeout. Tracks recovery time from action_start_time.
22+
When metrics_url is empty: stub behavior (short sleep, RECOVERED) for demo/CI.
1323
"""
1424

25+
def __init__(self, metrics_url: str | None = None) -> None:
26+
self._metrics_url = (metrics_url or "").strip()
27+
self._last_recovery_seconds: float = 0.0
28+
1529
def verify(
16-
self, incident_id: str, service_name: str, timeout_seconds: float = 120
30+
self,
31+
incident_id: str,
32+
service_name: str,
33+
timeout_seconds: float = 120,
34+
action_start_time: float | None = None,
1735
) -> RecoveryStatus:
18-
"""Poll until recovery signals or timeout. Returns recovery status."""
19-
# Stub: short wait then recovered
20-
time.sleep(1)
21-
return RecoveryStatus.RECOVERED
36+
"""Poll until recovery (status == healthy) or timeout. Returns recovery status."""
37+
if not self._metrics_url:
38+
time.sleep(1)
39+
self._last_recovery_seconds = STUB_RECOVERY_SECONDS
40+
return RecoveryStatus.RECOVERED
41+
42+
start = action_start_time if action_start_time is not None else time.monotonic()
43+
deadline = start + timeout_seconds
44+
poll_interval = DEFAULT_POLL_INTERVAL
45+
46+
while time.monotonic() < deadline:
47+
try:
48+
with httpx.Client(timeout=5.0) as client:
49+
r = client.get(self._metrics_url)
50+
r.raise_for_status()
51+
data = r.json()
52+
if data.get("status") == "healthy":
53+
self._last_recovery_seconds = time.monotonic() - start
54+
logger.info(
55+
"Recovery verified",
56+
extra={
57+
"incident_id": incident_id,
58+
"service_name": service_name,
59+
"recovery_seconds": self._last_recovery_seconds,
60+
},
61+
)
62+
return RecoveryStatus.RECOVERED
63+
except Exception as e:
64+
logger.debug("Health poll failed: %s", e)
65+
time.sleep(poll_interval)
66+
67+
self._last_recovery_seconds = timeout_seconds
68+
logger.warning(
69+
"Recovery timeout",
70+
extra={"incident_id": incident_id, "service_name": service_name},
71+
)
72+
return RecoveryStatus.NOT_RECOVERED
2273

2374
def get_recovery_time_seconds(self) -> float:
24-
"""Return elapsed time from action to recovery (stub)."""
25-
return 92.0
75+
"""Return elapsed time from action to recovery (from last verify run)."""
76+
return self._last_recovery_seconds

src/autosre/workflow.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
→ Health verification → Slack post-mortem
66
"""
77

8+
import time
9+
810
from autosre.config import get_settings
911
from autosre.incident_detection import get_incident_stream
1012
from autosre.log_storage import LogStore
@@ -58,7 +60,11 @@ def run_once(incident_type: IncidentType | None = None) -> bool:
5860
return False
5961

6062
# 5. Recovery verification
61-
status = monitor.verify(incident.incident_id, incident.service_name)
63+
status = monitor.verify(
64+
incident.incident_id,
65+
incident.service_name,
66+
action_start_time=action_start_time,
67+
)
6268
recovery_seconds = monitor.get_recovery_time_seconds()
6369

6470
# 6. Post-mortem to Slack
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
"""Tests for recovery verification (Phase 4)."""
2+
3+
import time
4+
from unittest.mock import MagicMock, patch
5+
6+
from autosre.models import RecoveryStatus
7+
from autosre.recovery_verification.monitor import STUB_RECOVERY_SECONDS, RecoveryMonitor
8+
9+
10+
def test_monitor_stub_when_no_metrics_url():
11+
"""When metrics_url is empty, returns RECOVERED after short sleep (stub)."""
12+
monitor = RecoveryMonitor(metrics_url="")
13+
status = monitor.verify("inc-1", "checkout", timeout_seconds=120)
14+
assert status == RecoveryStatus.RECOVERED
15+
assert monitor.get_recovery_time_seconds() == STUB_RECOVERY_SECONDS
16+
17+
18+
def test_monitor_stub_with_none_metrics_url():
19+
"""When metrics_url is None, uses stub behavior."""
20+
monitor = RecoveryMonitor(metrics_url=None)
21+
status = monitor.verify("inc-1", "checkout")
22+
assert status == RecoveryStatus.RECOVERED
23+
assert monitor.get_recovery_time_seconds() == STUB_RECOVERY_SECONDS
24+
25+
26+
@patch("autosre.recovery_verification.monitor.time.sleep")
27+
@patch("autosre.recovery_verification.monitor.httpx.Client")
28+
def test_monitor_recovered_when_healthy(mock_client_class, mock_sleep):
29+
"""When GET returns status healthy, returns RECOVERED and sets recovery time."""
30+
mock_response = MagicMock()
31+
mock_response.raise_for_status = MagicMock()
32+
mock_response.json.return_value = {"status": "healthy"}
33+
mock_client = MagicMock()
34+
mock_client.get.return_value = mock_response
35+
mock_client_class.return_value.__enter__.return_value = mock_client
36+
37+
monitor = RecoveryMonitor(metrics_url="http://localhost:3000/api/health")
38+
action_start = time.monotonic()
39+
status = monitor.verify(
40+
"inc-1",
41+
"checkout",
42+
timeout_seconds=30,
43+
action_start_time=action_start,
44+
)
45+
assert status == RecoveryStatus.RECOVERED
46+
recovery_sec = monitor.get_recovery_time_seconds()
47+
assert recovery_sec >= 0
48+
assert recovery_sec < 10
49+
50+
51+
@patch("autosre.recovery_verification.monitor.httpx.Client")
52+
@patch("autosre.recovery_verification.monitor.time.sleep")
53+
def test_monitor_polls_until_healthy(mock_sleep, mock_client_class):
54+
"""Polls until response is healthy; recovery time reflects delay."""
55+
def mk_resp(status: str):
56+
r = MagicMock()
57+
r.raise_for_status = MagicMock()
58+
r.json.return_value = {"status": status}
59+
return r
60+
61+
mock_client = mock_client_class.return_value.__enter__.return_value
62+
mock_client.get.side_effect = [
63+
mk_resp("degraded"),
64+
mk_resp("degraded"),
65+
mk_resp("healthy"),
66+
]
67+
68+
monitor = RecoveryMonitor(metrics_url="http://localhost:3000/api/health")
69+
action_start = time.monotonic()
70+
status = monitor.verify(
71+
"inc-1",
72+
"checkout",
73+
timeout_seconds=30,
74+
action_start_time=action_start,
75+
)
76+
assert status == RecoveryStatus.RECOVERED
77+
assert mock_client.get.call_count == 3
78+
assert mock_sleep.call_count >= 2
79+
80+
81+
@patch("autosre.recovery_verification.monitor.httpx.Client")
82+
@patch("autosre.recovery_verification.monitor.time.monotonic")
83+
@patch("autosre.recovery_verification.monitor.time.sleep")
84+
def test_monitor_timeout_returns_not_recovered(mock_sleep, mock_monotonic, mock_client_class):
85+
"""When timeout is reached without healthy, returns NOT_RECOVERED."""
86+
start = 1000.0
87+
deadline = start + 0.1
88+
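# monotonic() is consumed only by the loop condition (start is passed in via action_start_time): two checks inside the deadline, each followed by get() and sleep(), then a third check past the deadline ends the loop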
89+
mock_monotonic.side_effect = [start, start + 0.05, deadline + 0.01]
90+
def mk_resp(status: str):
91+
r = MagicMock()
92+
r.raise_for_status = MagicMock()
93+
r.json.return_value = {"status": status}
94+
return r
95+
96+
mock_client = mock_client_class.return_value.__enter__.return_value
97+
mock_client.get.return_value = mk_resp("degraded")
98+
99+
monitor = RecoveryMonitor(metrics_url="http://localhost:3000/api/health")
100+
status = monitor.verify(
101+
"inc-1",
102+
"checkout",
103+
timeout_seconds=0.1,
104+
action_start_time=start,
105+
)
106+
assert status == RecoveryStatus.NOT_RECOVERED
107+
assert monitor.get_recovery_time_seconds() == 0.1

tests/test_workflow.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,19 @@
11
"""Smoke test for the full workflow (stub components)."""
22

3-
from autosre.models import IncidentType
3+
from unittest.mock import MagicMock, patch
4+
5+
from autosre.models import IncidentType, RecoveryStatus
46
from autosre.workflow import run_once
57

68

7-
def test_run_once_latency_spike():
9+
@patch("autosre.workflow.RecoveryMonitor")
10+
def test_run_once_latency_spike(mock_monitor_class):
811
"""One full cycle with latency_spike incident should complete."""
12+
mock_monitor = MagicMock()
13+
mock_monitor.verify.return_value = RecoveryStatus.RECOVERED
14+
mock_monitor.get_recovery_time_seconds.return_value = 92.0
15+
mock_monitor_class.return_value = mock_monitor
16+
917
result = run_once(incident_type=IncidentType.LATENCY_SPIKE)
1018
assert result is True
19+
mock_monitor.verify.assert_called_once()

0 commit comments

Comments
 (0)