Skip to content

Commit b27b173

Browse files
Restore Phase 3-5 after merge: config, UI automation, recovery monitor, workflow, tests
- config: add metrics_url, ui_stub, nova_act_api_key
- recovery_verification: full Phase 4 monitor (httpx poll, action_start_time)
- ui_automation: prompts.py and Nova Act agent; pyproject: nova-act dep
- workflow: metrics_url, RecoveryMonitor, UIActionAgent, action_start_time, verify()
- tests: test_recovery_verification, test_ui_automation; workflow test mocks RecoveryMonitor
1 parent 56abe09 commit b27b173

10 files changed

Lines changed: 407 additions & 27 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ dependencies = [
2222
"boto3>=1.34.0",
2323
"fastapi>=0.115.0",
2424
"httpx>=0.27.0",
25+
"nova-act>=3.0.0",
2526
"pydantic>=2.0",
2627
"pydantic-settings>=2.0",
2728
"python-dotenv>=1.0",

src/autosre/config.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,12 @@ class Settings(BaseSettings):
3131
# Demo / operations dashboard
3232
operations_dashboard_url: str = "http://localhost:3000"
3333
incident_source: str = "simulated"
34+
# Health URL for recovery verification (default: dashboard + /api/health)
35+
metrics_url: str = ""
36+
37+
# UI automation (Nova Act): True = stub only; False = use real browser
38+
ui_stub: bool = True
39+
nova_act_api_key: str = ""
3440

3541

3642
def get_settings() -> Settings:
Lines changed: 60 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,76 @@
11
"""Verifies service health after remediation actions."""
22

3+
import logging
34
import time
45

6+
import httpx
7+
58
from autosre.models import RecoveryStatus
69

10+
logger = logging.getLogger(__name__)
11+
12+
DEFAULT_POLL_INTERVAL = 3.0
13+
STUB_RECOVERY_SECONDS = 92.0
14+
715

816
class RecoveryMonitor:
    """
    Checks metrics (health endpoint) to confirm recovery.

    When metrics_url is set: polls GET metrics_url until the JSON response has
    status == "healthy" or the timeout elapses. Recovery time is measured from
    action_start_time (or from the start of verify() when none is given).
    When metrics_url is empty: stub behavior (short sleep, RECOVERED) for demo/CI.
    """

    def __init__(self, metrics_url: str | None = None) -> None:
        """Store the health URL; None or whitespace-only selects stub mode."""
        self._metrics_url = (metrics_url or "").strip()
        self._last_recovery_seconds: float = 0.0

    def verify(
        self,
        incident_id: str,
        service_name: str,
        timeout_seconds: float = 120,
        action_start_time: float | None = None,
    ) -> RecoveryStatus:
        """
        Poll until recovery (status == healthy) or timeout.

        Args:
            incident_id: Incident identifier, attached to log records.
            service_name: Affected service, attached to log records.
            timeout_seconds: Length of the recovery window in seconds.
            action_start_time: time.monotonic() value taken when remediation
                started; the window and the reported recovery time are both
                measured from it. Defaults to "now".

        Returns:
            RecoveryStatus.RECOVERED once a poll reports healthy, otherwise
            RecoveryStatus.NOT_RECOVERED after the window has elapsed.
        """
        if not self._metrics_url:
            time.sleep(1)
            self._last_recovery_seconds = STUB_RECOVERY_SECONDS
            return RecoveryStatus.RECOVERED

        start = action_start_time if action_start_time is not None else time.monotonic()
        deadline = start + timeout_seconds

        # One client for the whole verification so polls reuse the connection.
        with httpx.Client(timeout=5.0) as client:
            while True:
                # Poll before checking the deadline: if remediation itself used
                # up the whole window we still check health at least once
                # instead of reporting NOT_RECOVERED blindly.
                if self._poll_is_healthy(client):
                    self._last_recovery_seconds = time.monotonic() - start
                    logger.info(
                        "Recovery verified",
                        extra={
                            "incident_id": incident_id,
                            "service_name": service_name,
                            "recovery_seconds": self._last_recovery_seconds,
                        },
                    )
                    return RecoveryStatus.RECOVERED

                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    break
                # Never sleep past the deadline.
                time.sleep(min(DEFAULT_POLL_INTERVAL, remaining))

        self._last_recovery_seconds = timeout_seconds
        logger.warning(
            "Recovery timeout",
            extra={"incident_id": incident_id, "service_name": service_name},
        )
        return RecoveryStatus.NOT_RECOVERED

    def _poll_is_healthy(self, client: httpx.Client) -> bool:
        """Issue one health request; True only for a JSON object with status == "healthy"."""
        try:
            response = client.get(self._metrics_url)
            response.raise_for_status()
            data = response.json()
        except Exception as e:
            # Deliberately best-effort: any transport, HTTP, or decode failure
            # just means "not healthy yet" and the caller keeps polling.
            logger.debug("Health poll failed: %s", e)
            return False
        return isinstance(data, dict) and data.get("status") == "healthy"

    def get_recovery_time_seconds(self) -> float:
        """Return elapsed time from action to recovery (from last verify run)."""
        return self._last_recovery_seconds

src/autosre/ui_automation/agent.py

Lines changed: 60 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,75 @@
11
"""UI action agent (Nova Act) — executes planned steps on the operations dashboard."""
22

3+
import logging
4+
import os
5+
36
from autosre.models import PlannedAction
7+
from autosre.ui_automation.prompts import actions_to_prompts
8+
9+
logger = logging.getLogger(__name__)
10+
11+
12+
def _run_nova_act(dashboard_url: str, prompts: list[str], api_key: str | None) -> bool:
    """
    Drive the dashboard through Nova Act, one act() call per prompt.

    The SDK is imported lazily so the rest of the package works without it.
    A truthy api_key is exported as the NOVA_ACT_API_KEY environment variable
    before the session is opened.

    Returns True when every prompt ran without raising, and False when the
    SDK cannot be imported or any step fails.
    """
    try:
        from nova_act import NovaAct
    except ImportError as import_error:
        logger.warning(
            "Nova Act SDK not available; cannot run UI automation: %s", import_error
        )
        return False

    if api_key:
        os.environ["NOVA_ACT_API_KEY"] = api_key

    try:
        with NovaAct(starting_page=dashboard_url) as session:
            for step in prompts:
                session.act(step)
    except Exception as run_error:
        logger.warning("Nova Act execution failed: %s", run_error, exc_info=True)
        return False
    return True
429

530

631
class UIActionAgent:
    """
    Runs planned remediation steps against the operations dashboard.

    Stub mode (the default) only logs each action and reports success; no
    browser is started. With use_nova_act=True the actions are translated to
    natural-language prompts and executed through the Nova Act SDK.
    """

    def __init__(
        self,
        dashboard_url: str = "http://localhost:3000",
        use_nova_act: bool = False,
        api_key: str | None = None,
    ) -> None:
        """Remember the dashboard URL (trailing slashes removed), mode, and API key."""
        self.dashboard_url = dashboard_url.rstrip("/")
        self._use_nova_act = use_nova_act
        self._api_key = api_key or ""

    def execute(
        self,
        actions: list[PlannedAction],
        service_name: str | None = None,
    ) -> bool:
        """
        Execute the planned actions in order.

        Returns True when there is nothing to do, when running in stub mode,
        or when the Nova Act run reports success; False otherwise.
        """
        if not actions:
            return True

        prompts = actions_to_prompts(
            actions,
            service_name=service_name,
            include_login=True,
        )

        if not self._use_nova_act:
            # Stub mode: record what would have been done and report success.
            for planned in actions:
                logger.info(
                    "UIAction (stub) %s on %s params=%s",
                    planned.action_type,
                    planned.target,
                    planned.parameters,
                )
            return True

        return _run_nova_act(
            self.dashboard_url,
            prompts,
            self._api_key or None,
        )
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
"""Convert PlannedAction list into natural-language prompts for Nova Act."""
2+
3+
from autosre.models import PlannedAction
4+
5+
6+
def actions_to_prompts(
    actions: list[PlannedAction],
    service_name: str | None = None,
    include_login: bool = True,
) -> list[str]:
    """
    Build a list of short, direct prompts for Nova Act act() calls.

    Uses deterministic labels matching the operations dashboard:
    Demo login, Services, Checkout/Payments, Deployments tab, Rollback button.

    Args:
        actions: Planned actions; each is read for action_type, target and
            parameters. A parameters value of None is treated as empty.
        service_name: When given, a prompt to open that service is added
            (name rendered with str.capitalize()).
        include_login: When true, the first prompt clicks the demo login.

    Returns:
        Prompts in execution order: optional login, optional service link,
        then one prompt per action. Unrecognized actions get a generic
        "Perform <action_type> on <target>." prompt.
    """
    # (action_type, required target or None for "any target", prompt text).
    fixed_prompts = (
        ("navigate", "deployment_panel", "Click the Deployments tab."),
        ("navigate", "service_instances", "Open the service instances section."),
        ("restart_instance", None, "Click the Restart button for the first instance."),
        ("navigate", "service_scaling", "Open the scaling section."),
        ("navigate", "db_pool", "Open the database pool section."),
        ("restart_pool", None, "Click the Restart pool button."),
    )

    prompts: list[str] = []
    if include_login:
        prompts.append("Click the Demo login button.")
    if service_name:
        display = service_name.capitalize()
        prompts.append(f"Click the {display} service link.")

    for action in actions:
        # Tolerate actions whose parameters are None instead of a dict.
        params = action.parameters or {}

        if action.action_type == "click_rollback":
            to_version = params.get("to_version", "previous")
            prompts.append(f"Click the Rollback button for version {to_version}.")
            continue
        if action.action_type == "scale_replicas":
            replicas = params.get("replicas", 4)
            prompts.append(f"Set replicas to {replicas} and apply.")
            continue

        for kind, target, text in fixed_prompts:
            if action.action_type == kind and (target is None or action.target == target):
                prompts.append(text)
                break
        else:
            # No rule matched (including "navigate" to an unknown target).
            prompts.append(f"Perform {action.action_type} on {action.target}.")
    return prompts

src/autosre/workflow.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
→ Health verification → Slack post-mortem
66
"""
77

8+
import time
9+
810
from autosre.config import get_settings
911
from autosre.incident_detection import get_incident_stream
1012
from autosre.log_storage import LogStore
@@ -27,8 +29,15 @@ def run_once(incident_type: IncidentType | None = None) -> bool:
2729
log_store = LogStore()
2830
reasoning = ReasoningAgent(use_bedrock=settings.reasoning_use_bedrock)
2931
planner = PlannerAgent()
30-
ui_agent = UIActionAgent(dashboard_url=settings.operations_dashboard_url)
31-
monitor = RecoveryMonitor()
32+
metrics_url = settings.metrics_url or (
33+
settings.operations_dashboard_url.rstrip("/") + "/api/health"
34+
)
35+
monitor = RecoveryMonitor(metrics_url=metrics_url)
36+
ui_agent = UIActionAgent(
37+
dashboard_url=settings.operations_dashboard_url,
38+
use_nova_act=not settings.ui_stub,
39+
api_key=settings.nova_act_api_key or None,
40+
)
3241
slack = SlackReporter(bot_token=settings.slack_bot_token, channel_id=settings.slack_channel_id)
3342

3443
# 1. Incident detection
@@ -49,12 +58,17 @@ def run_once(incident_type: IncidentType | None = None) -> bool:
4958
return False
5059

5160
# 4. UI automation (Nova Act)
52-
success = ui_agent.execute(actions)
61+
action_start_time = time.monotonic()
62+
success = ui_agent.execute(actions, service_name=incident.service_name)
5363
if not success:
5464
return False
5565

5666
# 5. Recovery verification
57-
status = monitor.verify(incident.incident_id, incident.service_name)
67+
status = monitor.verify(
68+
incident.incident_id,
69+
incident.service_name,
70+
action_start_time=action_start_time,
71+
)
5872
recovery_seconds = monitor.get_recovery_time_seconds()
5973

6074
# 6. Post-mortem to Slack

tests/test_reasoning_agent.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import pytest
77

8-
from autosre.models import Diagnosis, IncidentEvent, IncidentType, RecommendedAction
8+
from autosre.models import IncidentEvent, IncidentType, RecommendedAction
99
from autosre.reasoning_agent.agent import (
1010
ReasoningAgent,
1111
_extract_text_from_converse_response,

0 commit comments

Comments
 (0)