Phase 7: workflow hardening; ruff format

Jeremiah-Sakuda · Jeremiah-Sakuda · commit 235e580a516b · 2026-02-11T15:56:53.000-05:00
- Config: reasoning_max_retries, recovery_verify_timeout_seconds
- Workflow: logging, try/except, reasoning retries, post-mortem on escalation/UI/verify failure
- Slack publish wrapped in try/except; _build_report/_publish_report helpers
- Tests: no_actions, ui_failure, verify_exception; README updated
- Ruff format: store.py, test_reasoning_agent.py
diff --git a/README.md b/README.md
@@ -135,7 +135,8 @@ ruff format --check src dashboard tests
 - **Phase 2:** Operations dashboard (FastAPI + static HTML) with login, services, Deployments panel, Rollback, and /api/health for recovery verification.
 - **Phase 5:** Slack reporter: real publish via `slack_sdk.WebClient` and Block Kit; fallback text; no token/channel → skip.
 - **Phase 6:** Incident/log storage: `LogStore` records incidents, append_log/append_deployment, get_logs_for_incident and get_deployment_history with stub fallbacks; optional file persistence via `LOG_STORAGE_DATA_DIR`.
-- **Next:** Phase 7 (workflow hardening), Phase 8 (demo script).
+- **Phase 7:** Workflow hardening: logging; try/except around record_incident, reasoning, recovery verification, and Slack publish; reasoning retries (`REASONING_MAX_RETRIES`, default 2); a post-mortem is still published on escalation, UI failure, or a verification exception; configurable verification timeout (`RECOVERY_VERIFY_TIMEOUT_SECONDS`, default 120).
+- **Next:** Phase 8 (demo script).
 
 ---
 
diff --git a/src/autosre/config.py b/src/autosre/config.py
@@ -41,6 +41,10 @@ class Settings(BaseSettings):
     # Phase 6: incident / log storage (optional file persistence)
     log_storage_data_dir: str = ""
 
+    # Phase 7: workflow hardening
+    reasoning_max_retries: int = 2
+    recovery_verify_timeout_seconds: float = 120.0
+
 
 def get_settings() -> Settings:
     """Return loaded settings (singleton-style)."""
diff --git a/src/autosre/log_storage/store.py b/src/autosre/log_storage/store.py
@@ -36,7 +36,7 @@ class LogStore:
     """
 
     def __init__(self, data_dir: str | None = None) -> None:
-        self._data_dir = (Path(data_dir) if data_dir else None)
+        self._data_dir = Path(data_dir) if data_dir else None
         self._incidents: list[dict] = []
         self._log_entries: list[dict] = []  # service_name, timestamp, message
         self._deployments: list[dict] = []  # service_name, version, timestamp, status
@@ -88,21 +88,23 @@ def get_incident(self, incident_id: str) -> IncidentEvent | None:
                     incident_id=p["incident_id"],
                     incident_type=IncidentType(p["incident_type"]),
                     service_name=p["service_name"],
-                    detected_at=datetime.fromisoformat(
-                        p["detected_at"].replace("Z", "+00:00")
-                    ),
+                    detected_at=datetime.fromisoformat(p["detected_at"].replace("Z", "+00:00")),
                     raw_payload=p.get("raw_payload") or {},
                 )
         return None
 
-    def append_log(self, service_name: str, message: str, timestamp: datetime | None = None) -> None:
+    def append_log(
+        self, service_name: str, message: str, timestamp: datetime | None = None
+    ) -> None:
         """Append a log line for the given service (for RCA)."""
         ts = timestamp or datetime.utcnow()
-        self._log_entries.append({
-            "service_name": service_name,
-            "timestamp": _iso(ts),
-            "message": message,
-        })
+        self._log_entries.append(
+            {
+                "service_name": service_name,
+                "timestamp": _iso(ts),
+                "message": message,
+            }
+        )
         self._save(_LOG_ENTRIES_FILE, self._log_entries)
 
     def get_logs_for_incident(self, incident: IncidentEvent, window_seconds: int = 3600) -> str:
@@ -135,19 +137,28 @@ def append_deployment(
     ) -> None:
         """Record a deployment event for a service."""
         ts = _iso(timestamp) if isinstance(timestamp, datetime) else str(timestamp)
-        self._deployments.append({
-            "service_name": service_name,
-            "version": version,
-            "timestamp": ts,
-            "status": status,
-        })
+        self._deployments.append(
+            {
+                "service_name": service_name,
+                "version": version,
+                "timestamp": ts,
+                "status": status,
+            }
+        )
         self._save(_DEPLOYMENTS_FILE, self._deployments)
 
     def get_deployment_history(self, service_name: str, limit: int = 5) -> list[dict]:
         """Return recent deployments for the service. Fallback to stub list if empty."""
         filtered = [d for d in self._deployments if d.get("service_name") == service_name]
         filtered.sort(key=lambda d: d.get("timestamp", ""), reverse=True)
-        result = [{"version": d.get("version"), "timestamp": d.get("timestamp"), "status": d.get("status")} for d in filtered[:limit]]
+        result = [
+            {
+                "version": d.get("version"),
+                "timestamp": d.get("timestamp"),
+                "status": d.get("status"),
+            }
+            for d in filtered[:limit]
+        ]
         if result:
             return result
         return STUB_DEPLOYMENTS.copy()
diff --git a/src/autosre/workflow.py b/src/autosre/workflow.py
@@ -5,25 +5,65 @@
     → Health verification → Slack post-mortem
 """
 
+import logging
 import time
 
 from autosre.config import get_settings
 from autosre.incident_detection import get_incident_stream
 from autosre.log_storage import LogStore
-from autosre.models import IncidentType, PostMortemReport, RecoveryStatus
+from autosre.models import Diagnosis, IncidentType, PostMortemReport, RecoveryStatus
 from autosre.planner import PlannerAgent
 from autosre.reasoning_agent import ReasoningAgent
+from autosre.reasoning_agent.agent import FALLBACK_DIAGNOSIS
 from autosre.recovery_verification import RecoveryMonitor
 from autosre.slack_reporter import SlackReporter
 from autosre.ui_automation import UIActionAgent
 
+logger = logging.getLogger(__name__)
+
+
+def _publish_report(slack: SlackReporter, report: PostMortemReport) -> None:
+    """Publish post-mortem to Slack; log and swallow errors so workflow does not crash."""
+    try:
+        slack.publish(report)
+    except Exception as e:
+        logger.warning("Slack publish failed: %s", e, exc_info=True)
+
+
+def _build_report(
+    incident_id: str,
+    detected_at: str,
+    diagnosis: Diagnosis,
+    recovery_seconds: float,
+    status: RecoveryStatus,
+    extra_timeline: list[str] | None = None,
+) -> PostMortemReport:
+    """Build a post-mortem report from incident, diagnosis, and verification result."""
+    timeline = [
+        f"Alert received: {detected_at}",
+        f"Root cause: {diagnosis.summary}",
+        f"Action: {diagnosis.recommended_action.value}",
+        f"Recovery: {status.value} in {recovery_seconds:.0f}s",
+    ]
+    if extra_timeline:
+        timeline.extend(extra_timeline)
+    return PostMortemReport(
+        incident_id=incident_id,
+        root_cause=diagnosis.summary,
+        action_taken=diagnosis.recommended_action.value,
+        recovery_time_seconds=recovery_seconds,
+        prevention_suggestion="Add memory profiling to CI pipeline",
+        timeline=timeline,
+    )
+
 
 def run_once(incident_type: IncidentType | None = None) -> bool:
     """
     Run one full cycle: detect one incident, diagnose, act, verify, report.
 
     For demo pass incident_type=IncidentType.LATENCY_SPIKE.
-    Returns True if the cycle completed successfully.
+    Returns True if the cycle completed successfully (recovered). On escalation,
+    UI failure, or verification failure still publishes a post-mortem when possible.
     """
     settings = get_settings()
     log_store = LogStore(data_dir=settings.log_storage_data_dir or None)
@@ -44,49 +84,86 @@ def run_once(incident_type: IncidentType | None = None) -> bool:
     stream = get_incident_stream(incident_type=incident_type)
     incident = next(stream, None)
     if not incident:
+        logger.warning("No incident received")
         return False
-    log_store.record_incident(incident)
 
-    # 2. Root cause analysis (Nova)
+    try:
+        log_store.record_incident(incident)
+    except Exception as e:
+        logger.warning("Failed to record incident: %s", e, exc_info=True)
+
+    # 2. Root cause analysis (Nova), with retries
     logs = log_store.get_logs_for_incident(incident)
     deployment_history = log_store.get_deployment_history(incident.service_name)
-    diagnosis = reasoning.analyze(incident, logs, deployment_history)
+    diagnosis: Diagnosis = FALLBACK_DIAGNOSIS
+    max_attempts = 1 + max(0, settings.reasoning_max_retries)
+    for attempt in range(max_attempts):
+        try:
+            result = reasoning.analyze(incident, logs, deployment_history)
+            if result is not None:
+                diagnosis = result
+                break
+        except Exception as e:
+            logger.warning("Reasoning attempt %s failed: %s", attempt + 1, e, exc_info=True)
+            if attempt == max_attempts - 1:
+                diagnosis = FALLBACK_DIAGNOSIS
 
     # 3. Plan actions
     actions = planner.plan(diagnosis)
     if not actions:
-        # e.g. ESCALATE
+        logger.info("No actions (e.g. escalate); publishing escalation report")
+        report = _build_report(
+            incident.incident_id,
+            incident.detected_at.isoformat(),
+            diagnosis,
+            0.0,
+            RecoveryStatus.UNKNOWN,
+            extra_timeline=["Escalated; no automated action taken."],
+        )
+        _publish_report(slack, report)
         return False
 
     # 4. UI automation (Nova Act)
     action_start_time = time.monotonic()
     success = ui_agent.execute(actions, service_name=incident.service_name)
     if not success:
+        logger.warning("UI automation failed; publishing report")
+        report = _build_report(
+            incident.incident_id,
+            incident.detected_at.isoformat(),
+            diagnosis,
+            0.0,
+            RecoveryStatus.NOT_RECOVERED,
+            extra_timeline=["UI action execution failed."],
+        )
+        _publish_report(slack, report)
         return False
 
     # 5. Recovery verification
-    status = monitor.verify(
-        incident.incident_id,
-        incident.service_name,
-        action_start_time=action_start_time,
-    )
-    recovery_seconds = monitor.get_recovery_time_seconds()
+    timeout = settings.recovery_verify_timeout_seconds
+    recovery_seconds: float = 0.0
+    try:
+        status = monitor.verify(
+            incident.incident_id,
+            incident.service_name,
+            timeout_seconds=timeout,
+            action_start_time=action_start_time,
+        )
+        recovery_seconds = monitor.get_recovery_time_seconds()
+    except Exception as e:
+        logger.warning("Recovery verification failed: %s", e, exc_info=True)
+        status = RecoveryStatus.NOT_RECOVERED
+        recovery_seconds = timeout
 
     # 6. Post-mortem to Slack
-    report = PostMortemReport(
-        incident_id=incident.incident_id,
-        root_cause=diagnosis.summary,
-        action_taken=f"{diagnosis.recommended_action.value}",
-        recovery_time_seconds=recovery_seconds,
-        prevention_suggestion="Add memory profiling to CI pipeline",
-        timeline=[
-            f"Alert received: {incident.detected_at.isoformat()}",
-            "Root cause: bad deployment v1.4.2",
-            "Action: rollback to v1.4.1",
-            f"Recovery: {status.value} in {recovery_seconds:.0f}s",
-        ],
+    report = _build_report(
+        incident.incident_id,
+        incident.detected_at.isoformat(),
+        diagnosis,
+        recovery_seconds,
+        status,
     )
-    slack.publish(report)
+    _publish_report(slack, report)
 
     return status == RecoveryStatus.RECOVERED
 
diff --git a/tests/test_reasoning_agent.py b/tests/test_reasoning_agent.py
@@ -15,7 +15,7 @@
 
 
 def test_parse_diagnosis_from_text_valid_json():
-    text = '''{"summary": "Memory leak in v1.4.2", "confidence": 0.9, "recommended_action": "rollback", "reasoning": "Logs point to deployment."}'''
+    text = """{"summary": "Memory leak in v1.4.2", "confidence": 0.9, "recommended_action": "rollback", "reasoning": "Logs point to deployment."}"""
     d = _parse_diagnosis_from_text(text)
     assert d is not None
     assert d.summary == "Memory leak in v1.4.2"
@@ -109,14 +109,12 @@ def test_reasoning_agent_bedrock_success(mock_get_client, sample_incident):
 
 
 @patch("autosre.reasoning_agent.agent._get_bedrock_client")
-def test_reasoning_agent_bedrock_invalid_response_returns_fallback(mock_get_client, sample_incident):
+def test_reasoning_agent_bedrock_invalid_response_returns_fallback(
+    mock_get_client, sample_incident
+):
     mock_client = MagicMock()
     mock_client.converse.return_value = {
-        "output": {
-            "message": {
-                "content": [{"text": "I'm not JSON, just prose."}]
-            }
-        }
+        "output": {"message": {"content": [{"text": "I'm not JSON, just prose."}]}}
     }
     mock_get_client.return_value = mock_client
 
diff --git a/tests/test_workflow.py b/tests/test_workflow.py
@@ -1,4 +1,4 @@
-"""Smoke test for the full workflow (stub components)."""
+"""Smoke test for the full workflow (stub components) and Phase 7 hardening."""
 
 from unittest.mock import MagicMock, patch
 
@@ -17,3 +17,61 @@ def test_run_once_latency_spike(mock_monitor_class):
     result = run_once(incident_type=IncidentType.LATENCY_SPIKE)
     assert result is True
     mock_monitor.verify.assert_called_once()
+
+
+@patch("autosre.workflow.SlackReporter")
+@patch("autosre.workflow.RecoveryMonitor")
+@patch("autosre.workflow.PlannerAgent")
+def test_run_once_no_actions_publishes_escalation_report(
+    mock_planner_class, mock_monitor_class, mock_slack_class
+):
+    """When planner returns no actions (e.g. ESCALATE), publish report and return False."""
+    mock_planner = MagicMock()
+    mock_planner.plan.return_value = []
+    mock_planner_class.return_value = mock_planner
+    mock_slack = MagicMock()
+    mock_slack_class.return_value = mock_slack
+
+    result = run_once(incident_type=IncidentType.LATENCY_SPIKE)
+    assert result is False
+    mock_slack.publish.assert_called_once()
+    report = mock_slack.publish.call_args[0][0]
+    assert any("Escalated" in line for line in report.timeline)
+
+
+@patch("autosre.workflow.SlackReporter")
+@patch("autosre.workflow.RecoveryMonitor")
+@patch("autosre.workflow.UIActionAgent")
+def test_run_once_ui_failure_publishes_report(mock_ui_class, mock_monitor_class, mock_slack_class):
+    """When UI agent fails, publish report and return False."""
+    mock_ui = MagicMock()
+    mock_ui.execute.return_value = False
+    mock_ui_class.return_value = mock_ui
+    mock_slack = MagicMock()
+    mock_slack_class.return_value = mock_slack
+
+    result = run_once(incident_type=IncidentType.LATENCY_SPIKE)
+    assert result is False
+    mock_slack.publish.assert_called_once()
+    report = mock_slack.publish.call_args[0][0]
+    assert report.recovery_time_seconds == 0.0
+    assert "UI action" in report.timeline[-1] or "failed" in report.timeline[-1].lower()
+
+
+@patch("autosre.workflow.SlackReporter")
+@patch("autosre.workflow.RecoveryMonitor")
+def test_run_once_verify_exception_returns_false_and_publishes(
+    mock_monitor_class, mock_slack_class
+):
+    """When recovery verify raises, still publish report and return False."""
+    mock_monitor = MagicMock()
+    mock_monitor.verify.side_effect = RuntimeError("network error")
+    mock_monitor_class.return_value = mock_monitor
+    mock_slack = MagicMock()
+    mock_slack_class.return_value = mock_slack
+
+    result = run_once(incident_type=IncidentType.LATENCY_SPIKE)
+    assert result is False
+    mock_slack.publish.assert_called_once()
+    report = mock_slack.publish.call_args[0][0]
+    assert report.recovery_time_seconds > 0  # timeout used as recovery_seconds