Address code review feedback in dora_monitor

edg-l · edg-l · commit 99caf8d83e97 · 2026-05-20T11:13:30.000+02:00
- make the slot-set trim guard explicit: only trim once last_known_head
  exceeds 10_000, so an unset head (0) never produces a negative cutoff
  (equivalent to the old `cutoff > 0` check; the intent is now stated)
- pick canonical fork by client majority instead of highest head_slot
  (a minority fork can briefly be ahead during a split)
- offline alert only on status=offline; synchronizing/optimistic are
  normal transient states and were over-paging
- split Slack messages on line boundaries when they exceed 3800 chars
  instead of letting Slack silently truncate
- distinguish Slack 429 in the error log
- cap /clients/execution HTML read at 512 KB to bound regex work and
  in-memory body size
- clearer error on unknown YAML keys (top-level and under checks:)
- minor: docstring noting heartbeat snapshots aren't atomic, simpler
  dry-run prefix closure, cleaner status check in DoraClient._get
diff --git a/dora_monitor/dora_monitor/checks.py b/dora_monitor/dora_monitor/checks.py
@@ -63,8 +63,13 @@ def check_client_head_forks(
     if not forks:
         return
 
-    # Canonical head = fork with the highest head_slot (majority assumption).
-    canonical_fork = max(forks, key=lambda f: int(f.get("head_slot", 0)))
+    # Canonical = fork followed by the most clients. Using head_slot would
+    # mis-identify a minority fork that's briefly ahead during a split.
+    # Tiebreak on highest head_slot just to be deterministic.
+    canonical_fork = max(
+        forks,
+        key=lambda f: (len(f.get("clients") or []), int(f.get("head_slot", 0))),
+    )
     canonical_slot = int(canonical_fork.get("head_slot", 0))
     canonical_root = canonical_fork.get("head_root", "")
     state.last_known_head = max(state.last_known_head, canonical_slot)
@@ -93,11 +98,14 @@ def check_client_head_forks(
                 "is_canonical_fork": is_canonical,
             }
 
-            if cfg.checks.offline and status and status != "online":
+            # Only `offline` is an actionable alert. `synchronizing` and
+            # `optimistic` are normal transient states (esp. at startup); we
+            # don't want to page on them. Use sync_lag for stuck-syncing nodes.
+            if cfg.checks.offline and status == "offline":
                 current_offline.add(name)
 
-            # Skip fork/lag judgement when the client isn't online; head_slot
-            # is stale and would produce noisy alerts.
+            # Skip fork/lag judgement when the client isn't fully online;
+            # head_slot is stale and would produce noisy alerts.
             if status != "online":
                 continue
 
@@ -184,12 +192,22 @@ def _format_heartbeat(
     dora: DoraClient,
     cfg: Config,
 ) -> str:
+    """Compose the heartbeat digest text.
+
+    Makes two separate HTTP requests (client_head_forks + slots) so the head
+    slot shown and the missed/orphaned counts are sampled at slightly
+    different instants. They may disagree by a slot or two; this is by
+    design, not a bug.
+    """
     payload = dora.client_head_forks()
     forks = payload.get("forks") or []
     canonical_slot = 0
     canonical_root = ""
     if forks:
-        canonical = max(forks, key=lambda f: int(f.get("head_slot", 0)))
+        canonical = max(
+            forks,
+            key=lambda f: (len(f.get("clients") or []), int(f.get("head_slot", 0))),
+        )
         canonical_slot = int(canonical.get("head_slot", 0))
         canonical_root = canonical.get("head_root", "")
 
@@ -283,7 +301,7 @@ def maybe_heartbeat(
         return
     now = time.time()
     interval_s = cfg.heartbeat_interval_minutes * 60
-    if state.last_heartbeat_ts and (now - state.last_heartbeat_ts) < interval_s:
+    if state.last_heartbeat_ts > 0 and (now - state.last_heartbeat_ts) < interval_s:
         return
     try:
         text = _format_heartbeat(dora, cfg)
@@ -322,7 +340,10 @@ def run_checks(
         log.exception("heartbeat failed: %s", e)
 
     # Trim reported-slots sets to keep state file from growing forever.
-    cutoff = state.last_known_head - 10_000
-    if cutoff > 0:
+    # Guard against last_known_head being 0 (e.g. all client_head_forks
+    # checks disabled or the check threw on every tick): without the guard,
+    # cutoff would go negative and the trim would silently be a no-op.
+    if state.last_known_head > 10_000:
+        cutoff = state.last_known_head - 10_000
         state.reported_missed_slots = {s for s in state.reported_missed_slots if s >= cutoff}
         state.reported_orphan_slots = {s for s in state.reported_orphan_slots if s >= cutoff}
diff --git a/dora_monitor/dora_monitor/config.py b/dora_monitor/dora_monitor/config.py
@@ -37,9 +37,15 @@ def load_config(path: str, require_slack: bool = True) -> Config:
         raw = yaml.safe_load(f) or {}
 
     checks_raw = raw.pop("checks", {}) or {}
-    checks = Checks(**checks_raw)
-
-    cfg = Config(checks=checks, **raw)
+    try:
+        checks = Checks(**checks_raw)
+    except TypeError as e:
+        raise ValueError(f"config: unknown key under `checks:` ({e})") from e
+
+    try:
+        cfg = Config(checks=checks, **raw)
+    except TypeError as e:
+        raise ValueError(f"config: unknown top-level key ({e})") from e
 
     env_hook = os.environ.get("SLACK_WEBHOOK_URL")
     if env_hook:
diff --git a/dora_monitor/dora_monitor/dora.py b/dora_monitor/dora_monitor/dora.py
@@ -21,8 +21,10 @@ def _get(self, path: str, params: dict[str, Any] | None = None) -> Any:
         r = self._session.get(url, params=params, timeout=self.timeout)
         r.raise_for_status()
         data = r.json()
-        if isinstance(data, dict) and data.get("status") and data["status"] != "OK":
-            raise RuntimeError(f"dora API error at {path}: {data}")
+        if isinstance(data, dict):
+            api_status = data.get("status")
+            if api_status and api_status != "OK":
+                raise RuntimeError(f"dora API error at {path}: {data}")
         return data
 
     def slots(self, limit: int = 64, with_orphaned: int = 1, with_missing: int = 1) -> list[dict]:
@@ -57,9 +59,14 @@ def execution_versions(self) -> dict[str, str]:
         which is empty for ethrex. So we parse the rendered table.
         """
         url = f"{self.base_url}/clients/execution"
-        r = self._session.get(url, timeout=self.timeout)
+        # Cap the read at 512 KB. Dora's real page is ~220 KB; this guards
+        # against a malformed/runaway response triggering pathological
+        # regex backtracking or a huge in-memory string.
+        r = self._session.get(url, timeout=self.timeout, stream=True)
         r.raise_for_status()
-        body = r.text
+        body = r.raw.read(512 * 1024, decode_content=True).decode(
+            r.encoding or "utf-8", errors="replace"
+        )
         out: dict[str, str] = {}
         for name, row in _ROW_RE.findall(body):
             m = _VERSION_RE.search(row)
diff --git a/dora_monitor/dora_monitor/main.py b/dora_monitor/dora_monitor/main.py
@@ -41,8 +41,9 @@ def cli() -> None:
     dora = DoraClient(cfg.dora_url, timeout=cfg.http_timeout)
     slack = SlackNotifier(cfg.slack_webhook_url, cfg.network_label, timeout=cfg.http_timeout)
     if args.dry_run:
-        def _dry_send(text: str, _orig=slack._prefix) -> None:
-            print(f"[DRY-RUN] {_orig()}{text}")
+        prefix = slack._prefix()
+        def _dry_send(text: str) -> None:
+            print(f"[DRY-RUN] {prefix}{text}")
         slack.send = _dry_send  # type: ignore[assignment]
 
     state = load_state(None if args.reset_state else cfg.state_file)
diff --git a/dora_monitor/dora_monitor/slack.py b/dora_monitor/dora_monitor/slack.py
@@ -4,6 +4,10 @@
 
 log = logging.getLogger(__name__)
 
+# Slack mrkdwn text limit per message is 4000 chars. Keep some headroom for
+# the network-label prefix and the "(i/n)" series marker we may append.
+_MAX_TEXT = 3800
+
 
 class SlackNotifier:
     def __init__(self, webhook_url: str, network_label: str = "", timeout: int = 10):
@@ -14,11 +18,54 @@ def __init__(self, webhook_url: str, network_label: str = "", timeout: int = 10)
     def _prefix(self) -> str:
         return f"[{self.network_label}] " if self.network_label else ""
 
-    def send(self, text: str) -> None:
-        body = {"text": f"{self._prefix()}{text}"}
+    def _post(self, text: str) -> None:
         try:
-            r = requests.post(self.webhook_url, json=body, timeout=self.timeout)
-            if r.status_code >= 300:
-                log.error("slack webhook failed: %s %s", r.status_code, r.text)
+            r = requests.post(self.webhook_url, json={"text": text}, timeout=self.timeout)
+            if r.status_code == 429:
+                retry = r.headers.get("Retry-After", "?")
+                log.error("slack rate-limited (429, retry-after=%s); alert dropped", retry)
+            elif r.status_code >= 300:
+                log.error("slack webhook failed: %s %s", r.status_code, r.text[:200])
         except requests.RequestException as e:
             log.error("slack webhook error: %s", e)
+
+    def send(self, text: str) -> None:
+        body = f"{self._prefix()}{text}"
+        if len(body) <= _MAX_TEXT:
+            self._post(body)
+            return
+
+        chunks = _split_on_lines(body, _MAX_TEXT)
+        total = len(chunks)
+        for i, chunk in enumerate(chunks, 1):
+            self._post(f"{chunk}\n_({i}/{total})_")
+
+
+def _split_on_lines(text: str, limit: int) -> list[str]:
+    """Split text on newline boundaries into chunks of at most `limit` chars.
+
+    Falls back to hard slicing for any single line longer than `limit` so a
+    pathological input still gets through rather than being dropped.
+    """
+    chunks: list[str] = []
+    buf: list[str] = []
+    buf_len = 0
+    for line in text.split("\n"):
+        # Hard-slice a single oversized line into limit-sized pieces.
+        if len(line) > limit:
+            if buf:
+                chunks.append("\n".join(buf))
+                buf, buf_len = [], 0
+            for i in range(0, len(line), limit):
+                chunks.append(line[i : i + limit])
+            continue
+        add = len(line) + (1 if buf else 0)
+        if buf_len + add > limit:
+            chunks.append("\n".join(buf))
+            buf, buf_len = [line], len(line)
+        else:
+            buf.append(line)
+            buf_len += add
+    if buf:
+        chunks.append("\n".join(buf))
+    return chunks