Restore Phases 3-5 after merge: config, UI automation, recovery monitor, workflow, tests

Jeremiah-Sakuda · Jeremiah-Sakuda · commit b27b17318c7d · 2026-02-11T14:29:53.000-05:00
- config: add metrics_url, ui_stub, nova_act_api_key
- recovery_verification: full Phase 4 monitor (httpx poll, action_start_time)
- ui_automation: prompts.py and Nova Act agent; pyproject: nova-act dep
- workflow: metrics_url, RecoveryMonitor, UIActionAgent, action_start_time, verify()
- tests: test_recovery_verification, test_ui_automation; workflow test mocks RecoveryMonitor
diff --git a/pyproject.toml b/pyproject.toml
@@ -22,6 +22,7 @@ dependencies = [
     "boto3>=1.34.0",
     "fastapi>=0.115.0",
     "httpx>=0.27.0",
+    "nova-act>=3.0.0",
     "pydantic>=2.0",
     "pydantic-settings>=2.0",
     "python-dotenv>=1.0",
diff --git a/src/autosre/config.py b/src/autosre/config.py
@@ -31,6 +31,12 @@ class Settings(BaseSettings):
     # Demo / operations dashboard
     operations_dashboard_url: str = "http://localhost:3000"
     incident_source: str = "simulated"
+    # Health URL for recovery verification (empty = operations_dashboard_url + /api/health)
+    metrics_url: str = ""
+
+    # UI automation (Nova Act): True = stub only; False = use real browser
+    ui_stub: bool = True
+    nova_act_api_key: str = ""
 
 
 def get_settings() -> Settings:
diff --git a/src/autosre/recovery_verification/monitor.py b/src/autosre/recovery_verification/monitor.py
@@ -1,25 +1,76 @@
 """Verifies service health after remediation actions."""
 
+import logging
 import time
 
+import httpx
+
 from autosre.models import RecoveryStatus
 
+logger = logging.getLogger(__name__)
+
+DEFAULT_POLL_INTERVAL = 3.0
+STUB_RECOVERY_SECONDS = 92.0
+
 
 class RecoveryMonitor:
     """
-    Checks metrics (latency, error rate) to confirm recovery.
+    Checks metrics (health endpoint) to confirm recovery.
 
-    Stub: waits briefly then returns RECOVERED for deterministic demo.
+    When metrics_url is set: polls GET metrics_url until response has
+    status == "healthy" or timeout. Tracks recovery time from action_start_time.
+    When metrics_url is empty: stub behavior (short sleep, RECOVERED) for demo/CI.
     """
 
+    def __init__(self, metrics_url: str | None = None) -> None:
+        self._metrics_url = (metrics_url or "").strip()
+        self._last_recovery_seconds: float = 0.0
+
     def verify(
-        self, incident_id: str, service_name: str, timeout_seconds: float = 120
+        self,
+        incident_id: str,
+        service_name: str,
+        timeout_seconds: float = 120,
+        action_start_time: float | None = None,
     ) -> RecoveryStatus:
-        """Poll until recovery signals or timeout. Returns recovery status."""
-        # Stub: short wait then recovered
-        time.sleep(1)
-        return RecoveryStatus.RECOVERED
+        """Poll until recovery (status == healthy) or timeout. Returns recovery status."""
+        if not self._metrics_url:
+            time.sleep(1)
+            self._last_recovery_seconds = STUB_RECOVERY_SECONDS
+            return RecoveryStatus.RECOVERED
+
+        start = action_start_time if action_start_time is not None else time.monotonic()
+        deadline = start + timeout_seconds
+        poll_interval = DEFAULT_POLL_INTERVAL
+
+        while time.monotonic() < deadline:
+            try:
+                with httpx.Client(timeout=5.0) as client:
+                    r = client.get(self._metrics_url)
+                    r.raise_for_status()
+                    data = r.json()
+                    if data.get("status") == "healthy":
+                        self._last_recovery_seconds = time.monotonic() - start
+                        logger.info(
+                            "Recovery verified",
+                            extra={
+                                "incident_id": incident_id,
+                                "service_name": service_name,
+                                "recovery_seconds": self._last_recovery_seconds,
+                            },
+                        )
+                        return RecoveryStatus.RECOVERED
+            except Exception as e:
+                logger.debug("Health poll failed: %s", e)
+            time.sleep(poll_interval)
+
+        self._last_recovery_seconds = timeout_seconds
+        logger.warning(
+            "Recovery timeout",
+            extra={"incident_id": incident_id, "service_name": service_name},
+        )
+        return RecoveryStatus.NOT_RECOVERED
 
     def get_recovery_time_seconds(self) -> float:
-        """Return elapsed time from action to recovery (stub)."""
-        return 92.0
+        """Return elapsed time from action to recovery (from last verify run)."""
+        return self._last_recovery_seconds
diff --git a/src/autosre/ui_automation/agent.py b/src/autosre/ui_automation/agent.py
@@ -1,26 +1,75 @@
 """UI action agent (Nova Act) — executes planned steps on the operations dashboard."""
 
+import logging
+import os
+
 from autosre.models import PlannedAction
+from autosre.ui_automation.prompts import actions_to_prompts
+
+logger = logging.getLogger(__name__)
+
+
+def _run_nova_act(dashboard_url: str, prompts: list[str], api_key: str | None) -> bool:
+    """Run Nova Act with the given prompts. Returns True if all steps succeeded."""
+    try:
+        from nova_act import NovaAct
+    except ImportError as e:
+        logger.warning("Nova Act SDK not available; cannot run UI automation: %s", e)
+        return False
+    if api_key:
+        os.environ["NOVA_ACT_API_KEY"] = api_key
+    try:
+        with NovaAct(starting_page=dashboard_url) as nova:
+            for prompt in prompts:
+                nova.act(prompt)
+        return True
+    except Exception as e:
+        logger.warning("Nova Act execution failed: %s", e, exc_info=True)
+        return False
 
 
 class UIActionAgent:
     """
     Performs UI automation against the operations dashboard.
 
-    Production: Nova Act for real browser/UI interaction.
-    Stub: logs actions for demo.
+    In stub mode (default): logs actions and returns True (no browser).
+    With use_nova_act=True: uses Nova Act SDK to run natural-language prompts.
     """
 
-    def __init__(self, dashboard_url: str = "http://localhost:3000") -> None:
-        self.dashboard_url = dashboard_url
+    def __init__(
+        self,
+        dashboard_url: str = "http://localhost:3000",
+        use_nova_act: bool = False,
+        api_key: str | None = None,
+    ) -> None:
+        self.dashboard_url = dashboard_url.rstrip("/")
+        self._use_nova_act = use_nova_act
+        self._api_key = api_key or ""
 
-    def execute(self, actions: list[PlannedAction]) -> bool:
+    def execute(
+        self,
+        actions: list[PlannedAction],
+        service_name: str | None = None,
+    ) -> bool:
         """Execute the list of planned actions. Returns True if all succeeded."""
+        if not actions:
+            return True
+        prompts = actions_to_prompts(
+            actions,
+            service_name=service_name,
+            include_login=True,
+        )
+        if self._use_nova_act:
+            return _run_nova_act(
+                self.dashboard_url,
+                prompts,
+                self._api_key or None,
+            )
         for action in actions:
-            self._execute_one(action)
+            logger.info(
+                "UIAction (stub) %s on %s params=%s",
+                action.action_type,
+                action.target,
+                action.parameters,
+            )
         return True
-
-    def _execute_one(self, action: PlannedAction) -> None:
-        """Execute a single action (stub: log only)."""
-        # TODO: Nova Act integration
-        print(f"[UIAction] {action.action_type} on {action.target} params={action.parameters}")
diff --git a/src/autosre/ui_automation/prompts.py b/src/autosre/ui_automation/prompts.py
@@ -0,0 +1,44 @@
+"""Convert PlannedAction list into natural-language prompts for Nova Act."""
+
+from autosre.models import PlannedAction
+
+
+def actions_to_prompts(
+    actions: list[PlannedAction],
+    service_name: str | None = None,
+    include_login: bool = True,
+) -> list[str]:
+    """
+    Build a list of short, direct prompts for Nova Act act() calls.
+
+    Uses deterministic labels matching the operations dashboard:
+    Demo login, Services, Checkout/Payments, Deployments tab, Rollback button.
+    """
+    prompts: list[str] = []
+    if include_login:
+        prompts.append("Click the Demo login button.")
+    if service_name:
+        display = service_name.capitalize()
+        prompts.append(f"Click the {display} service link.")
+    for action in actions:
+        if action.action_type == "navigate" and action.target == "deployment_panel":
+            prompts.append("Click the Deployments tab.")
+        elif action.action_type == "click_rollback":
+            to_version = action.parameters.get("to_version", "previous")
+            prompts.append(f"Click the Rollback button for version {to_version}.")
+        elif action.action_type == "navigate" and action.target == "service_instances":
+            prompts.append("Open the service instances section.")
+        elif action.action_type == "restart_instance":
+            prompts.append("Click the Restart button for the first instance.")
+        elif action.action_type == "navigate" and action.target == "service_scaling":
+            prompts.append("Open the scaling section.")
+        elif action.action_type == "scale_replicas":
+            replicas = action.parameters.get("replicas", 4)
+            prompts.append(f"Set replicas to {replicas} and apply.")
+        elif action.action_type == "navigate" and action.target == "db_pool":
+            prompts.append("Open the database pool section.")
+        elif action.action_type == "restart_pool":
+            prompts.append("Click the Restart pool button.")
+        else:
+            prompts.append(f"Perform {action.action_type} on {action.target}.")
+    return prompts
diff --git a/src/autosre/workflow.py b/src/autosre/workflow.py
@@ -5,6 +5,8 @@
     → Health verification → Slack post-mortem
 """
 
+import time
+
 from autosre.config import get_settings
 from autosre.incident_detection import get_incident_stream
 from autosre.log_storage import LogStore
@@ -27,8 +29,15 @@ def run_once(incident_type: IncidentType | None = None) -> bool:
     log_store = LogStore()
     reasoning = ReasoningAgent(use_bedrock=settings.reasoning_use_bedrock)
     planner = PlannerAgent()
-    ui_agent = UIActionAgent(dashboard_url=settings.operations_dashboard_url)
-    monitor = RecoveryMonitor()
+    metrics_url = settings.metrics_url or (
+        settings.operations_dashboard_url.rstrip("/") + "/api/health"
+    )
+    monitor = RecoveryMonitor(metrics_url=metrics_url)
+    ui_agent = UIActionAgent(
+        dashboard_url=settings.operations_dashboard_url,
+        use_nova_act=not settings.ui_stub,
+        api_key=settings.nova_act_api_key or None,
+    )
     slack = SlackReporter(bot_token=settings.slack_bot_token, channel_id=settings.slack_channel_id)
 
     # 1. Incident detection
@@ -49,12 +58,17 @@ def run_once(incident_type: IncidentType | None = None) -> bool:
         return False
 
     # 4. UI automation (Nova Act)
-    success = ui_agent.execute(actions)
+    action_start_time = time.monotonic()
+    success = ui_agent.execute(actions, service_name=incident.service_name)
     if not success:
         return False
 
     # 5. Recovery verification
-    status = monitor.verify(incident.incident_id, incident.service_name)
+    status = monitor.verify(
+        incident.incident_id,
+        incident.service_name,
+        action_start_time=action_start_time,
+    )
     recovery_seconds = monitor.get_recovery_time_seconds()
 
     # 6. Post-mortem to Slack
diff --git a/tests/test_reasoning_agent.py b/tests/test_reasoning_agent.py
@@ -5,7 +5,7 @@
 
 import pytest
 
-from autosre.models import Diagnosis, IncidentEvent, IncidentType, RecommendedAction
+from autosre.models import IncidentEvent, IncidentType, RecommendedAction
 from autosre.reasoning_agent.agent import (
     ReasoningAgent,
     _extract_text_from_converse_response,
diff --git a/tests/test_recovery_verification.py b/tests/test_recovery_verification.py
diff --git a/tests/test_ui_automation.py b/tests/test_ui_automation.py
diff --git a/tests/test_workflow.py b/tests/test_workflow.py