Skip to content

Commit 2e8e02a

Browse files
committed
Render heartbeat with Slack Block Kit and group healthy clients
- Post heartbeats via Block Kit (header / section / divider / context) instead of one mrkdwn blob; action alerts stay as plain-text posts.
- Add send_blocks() to SlackNotifier, with a plain-text fallback for notifications.
- Collapse online + canonical + distance=0 clients into one bucket; surface outliers (offline, synchronizing, non-canonical, lagging) above the healthy bucket with a status emoji per row.
- Status emojis: green / yellow / orange / red circles for online / synchronizing / optimistic / offline.
- Dry-run patches both send and send_blocks; --debug dumps the blocks JSON so it can be previewed in Slack's Block Kit Builder.
1 parent 99caf8d commit 2e8e02a

4 files changed

Lines changed: 169 additions & 37 deletions

File tree

dora_monitor/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ The process holds dedup state in `state_file` so restarts don't re-alert on alre
4646

4747
Recoveries (fork resolved, caught up, back online) are posted as well.
4848

49+
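The periodic heartbeat digest uses Slack Block Kit (`{"blocks": [...]}`) with a plain-text fallback for notifications. Action alerts (offline / fork / lag / version change / missed-block) use plain mrkdwn `text` posts. Clients with status `online`, on the canonical fork, and at `distance == 0` from the canonical head collapse into a single "online @ canonical" bucket, so the digest highlights outliers instead of repeating identical rows. Outliers (offline, synchronizing, optimistic, non-canonical, or lagging) get one row each, listed above that bucket. Clients matching `client_match` collapse only when every one of them is healthy; otherwise each matched client is listed on its own row. For the remaining clients, `heartbeat_other_clients` controls how the healthy bucket is shown: `detailed` (default) lists the healthy names, `summary` shows just a count, and `off` drops the "Other clients" section entirely, outliers included.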
50+
4951
## A note on what "client" means here
5052

5153
`/api/v1/network/client_head_forks` lists Dora's **beacon (CL)** clients; their names embed the paired EL (e.g. `lighthouse-ethrex-1` is the Lighthouse beacon paired with an ethrex EL). So the offline / fork / lag signals are observed on the beacon side. An ethrex-EL crash shows up indirectly: the paired beacon's head stops advancing (sync_lag) or its status flips to non-online (offline).

dora_monitor/dora_monitor/checks.py

Lines changed: 145 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -188,15 +188,59 @@ def check_version_drift(
188188
state.client_versions[name] = version
189189

190190

191-
def _format_heartbeat(
191+
# Slack emoji shortcode rendered next to each known client status.
_STATUS_EMOJI = dict(
    online=":large_green_circle:",
    synchronizing=":large_yellow_circle:",
    optimistic=":large_orange_circle:",
    offline=":red_circle:",
)


def _status_emoji(status: str) -> str:
    """Return the emoji shortcode for ``status``.

    Statuses missing from ``_STATUS_EMOJI`` map to a white circle.
    """
    if status in _STATUS_EMOJI:
        return _STATUS_EMOJI[status]
    return ":white_circle:"
201+
202+
203+
def _health_rank(entry: dict) -> int:
    """Rank a client entry by health; lower rank = surface higher.

    Serves as the sort key for ordering clients, and rank 4 marks the
    clients that are collapsed into the healthy bucket.

    Ranks:
        0 -- status is ``offline``
        1 -- any other status that is not ``online`` (``synchronizing``,
             ``optimistic``, or a status this module does not recognise)
        2 -- online but not on the canonical fork
        3 -- online and canonical, but ``distance`` > 0
        4 -- online, canonical, and at distance 0 (healthy)

    Args:
        entry: Client dict with ``status``, ``is_canonical_fork`` and
            ``distance`` keys.

    Returns:
        The integer rank, from 0 (worst) to 4 (healthy).
    """
    status = entry["status"]
    if status == "offline":
        return 0
    # Only an explicit "online" may reach the healthy bucket. Testing for
    # the known bad statuses instead would let an unrecognised status fall
    # through and be reported as "online @ canonical".
    if status != "online":
        return 1
    if not entry["is_canonical_fork"]:
        return 2
    if entry["distance"] > 0:
        return 3
    return 4
215+
216+
217+
def _client_line(entry: dict) -> str:
    """Render a single client as one mrkdwn row.

    The row always carries the status emoji, the client name and its head
    slot. A "behind" marker is appended when ``distance`` is positive, and
    a non-canonical marker when the client is off the canonical fork.
    """
    row = (
        f"{_status_emoji(entry['status'])} "
        f"`{entry['name']}` "
        f"head `{entry['head_slot']}`"
    )
    behind = entry["distance"]
    if behind > 0:
        row += f" · *{behind} behind*"
    if not entry["is_canonical_fork"]:
        row += " · :fork_and_knife: non-canonical"
    return row
229+
230+
231+
# Slack rejects a section block whose text exceeds 3000 characters.
_SECTION_TEXT_LIMIT = 3000
# Appended when text had to be shortened, so readers know rows are missing.
_SECTION_TRUNCATION_NOTE = "\n_… truncated_"


def _section(text: str) -> dict:
    """Wrap ``text`` in a Block Kit mrkdwn section block.

    Text within ``_SECTION_TEXT_LIMIT`` is passed through unchanged. Longer
    text is shortened to fit, because one oversized section would make
    Slack reject the entire message. The cut is made on a newline where
    possible, and a truncation note is appended.

    Args:
        text: mrkdwn content for the section.

    Returns:
        A ``{"type": "section", ...}`` block dict.
    """
    if len(text) > _SECTION_TEXT_LIMIT:
        kept = text[: _SECTION_TEXT_LIMIT - len(_SECTION_TRUNCATION_NOTE)]
        # Prefer a row boundary so no line is rendered half-finished.
        last_newline = kept.rfind("\n")
        if last_newline > 0:
            kept = kept[:last_newline]
        text = kept + _SECTION_TRUNCATION_NOTE
    return {"type": "section", "text": {"type": "mrkdwn", "text": text}}
233+
234+
235+
def _build_heartbeat(
192236
dora: DoraClient,
193237
cfg: Config,
194-
) -> str:
195-
"""Compose the heartbeat digest text.
238+
) -> tuple[list[dict], str]:
239+
"""Build a Block Kit heartbeat plus a plain-text fallback.
196240
197-
Makes two separate HTTP requests (client_head_forks + slots) so the head
198-
slot shown and the missed/orphaned counts are sampled at slightly
199-
different instants. They may disagree by a slot or two; this is by
241+
Makes two separate HTTP requests (client_head_forks + slots), so the
242+
head slot and the missed/orphaned counts are sampled at slightly
243+
different instants. They may disagree by a slot or two; that's by
200244
design, not a bug.
201245
"""
202246
payload = dora.client_head_forks()
@@ -247,48 +291,112 @@ def _format_heartbeat(
247291
elif st == "orphaned":
248292
orphaned += 1
249293

250-
lines: list[str] = []
251-
lines.append(
252-
f":bar_chart: *Heartbeat* — canonical head slot `{canonical_slot}` "
253-
f"(`{canonical_root[:14]}…`), {len(forks)} active fork(s)"
294+
label = f" — {cfg.network_label}" if cfg.network_label else ""
295+
blocks: list[dict] = []
296+
blocks.append({
297+
"type": "header",
298+
"text": {"type": "plain_text", "text": f"\U0001F4CA Heartbeat{label}"},
299+
})
300+
301+
# Network summary section.
302+
status_mix = " ".join(
303+
f"{_status_emoji(k)} {v}" for k, v in sorted(status_counts.items())
304+
) or "no clients"
305+
root_short = f"`{canonical_root[:14]}…`" if canonical_root else "`?`"
306+
summary_text = (
307+
f"Canonical head: slot `{canonical_slot}` · root {root_short}\n"
308+
f"Active forks: *{len(forks)}* · Status mix: {status_mix}"
254309
)
255-
status_summary = ", ".join(f"{k}:{v}" for k, v in sorted(status_counts.items())) or "no clients"
256-
lines.append(f"Network clients: {status_summary}")
310+
blocks.append(_section(summary_text))
311+
blocks.append({"type": "divider"})
257312

313+
# Matched (client_match) section.
258314
if matched:
259-
lines.append(f"*{cfg.client_match}* ({len(matched)} matched):")
260-
for e in sorted(matched, key=lambda x: x["name"]):
261-
mark = "" if e["is_canonical_fork"] else " :fork_and_knife:"
262-
lines.append(
263-
f" • `{e['name']}` status=`{e['status']}` head=`{e['head_slot']}` "
264-
f"distance=`{e['distance']}`{mark}"
265-
)
266-
lines.append(
267-
f" proposals in last {window} slots: {total_matched_proposals} "
268-
f"(missed={missed}, orphaned={orphaned})"
315+
matched_sorted = sorted(matched, key=lambda x: (_health_rank(x), x["name"]))
316+
matched_lines = [
317+
f":rocket: *{cfg.client_match}* ({len(matched_sorted)} matched)"
318+
]
319+
# Collapse the healthy bucket if every matched client is healthy.
320+
healthy = [e for e in matched_sorted if _health_rank(e) == 4]
321+
outliers = [e for e in matched_sorted if _health_rank(e) != 4]
322+
for e in outliers:
323+
matched_lines.append(_client_line(e))
324+
if healthy:
325+
if len(healthy) == len(matched_sorted):
326+
names = ", ".join(f"`{e['name']}`" for e in healthy)
327+
matched_lines.append(
328+
f"{_status_emoji('online')} *all online @ canonical* "
329+
f"({len(healthy)}): {names}"
330+
)
331+
else:
332+
for e in healthy:
333+
matched_lines.append(_client_line(e))
334+
matched_lines.append("")
335+
matched_lines.append(
336+
f"Proposals in last {window} slots: *{total_matched_proposals}* "
337+
f"(missed *{missed}*, orphaned *{orphaned}*)"
269338
)
339+
blocks.append(_section("\n".join(matched_lines)))
270340
else:
271-
lines.append(f"No clients matching `{cfg.client_match}` found.")
341+
blocks.append(_section(f":mag: No clients matching `{cfg.client_match}` found."))
272342

343+
# Other clients section (collapsed healthy bucket + per-client outliers).
273344
mode = (cfg.heartbeat_other_clients or "summary").lower()
274345
if others and mode != "off":
275-
if mode == "detailed":
276-
lines.append(f"Other clients ({len(others)}):")
277-
for e in sorted(others, key=lambda x: x["name"]):
278-
mark = "" if e["is_canonical_fork"] else " :fork_and_knife:"
346+
blocks.append({"type": "divider"})
347+
others_sorted = sorted(others, key=lambda x: (_health_rank(x), x["name"]))
348+
healthy = [e for e in others_sorted if _health_rank(e) == 4]
349+
outliers = [e for e in others_sorted if _health_rank(e) != 4]
350+
351+
lines = [f":desktop_computer: *Other clients* ({len(others_sorted)})"]
352+
for e in outliers:
353+
lines.append(_client_line(e))
354+
if healthy:
355+
if mode == "detailed":
356+
names = ", ".join(f"`{e['name']}`" for e in healthy)
279357
lines.append(
280-
f" • `{e['name']}` status=`{e['status']}` head=`{e['head_slot']}` "
281-
f"distance=`{e['distance']}`{mark}"
358+
f"{_status_emoji('online')} *online @ canonical* "
359+
f"({len(healthy)}): {names}"
282360
)
361+
else: # summary
362+
lines.append(
363+
f"{_status_emoji('online')} *online @ canonical*: "
364+
f"{len(healthy)} client(s)"
365+
)
366+
blocks.append(_section("\n".join(lines)))
367+
368+
# Footer context block (small grey).
369+
footer = (
370+
f"_Polling `{cfg.dora_url}` every {cfg.poll_interval}s · "
371+
f"matching `{cfg.client_match}`_"
372+
)
373+
blocks.append({"type": "context", "elements": [{"type": "mrkdwn", "text": footer}]})
374+
375+
# Plain-text fallback for notifications / non-Block-Kit clients.
376+
fb_status_mix = ", ".join(f"{k}:{v}" for k, v in sorted(status_counts.items())) or "no clients"
377+
fb_lines = [
378+
f"Heartbeat — canonical head {canonical_slot} ({len(forks)} fork(s), {fb_status_mix})",
379+
]
380+
if matched:
381+
unhealthy_matched = sum(1 for e in matched if _health_rank(e) != 4)
382+
if unhealthy_matched == 0:
383+
fb_lines.append(
384+
f"{cfg.client_match}: {len(matched)} client(s) all healthy @ {canonical_slot}; "
385+
f"{total_matched_proposals} proposals (missed {missed}, orphan {orphaned})"
386+
)
283387
else:
284-
non_canonical = [e for e in others if not e["is_canonical_fork"]]
285-
non_online = [e for e in others if e["status"] != "online"]
286-
lines.append(
287-
f"Other clients: {len(others)} total, "
288-
f"{len(non_online)} non-online, {len(non_canonical)} off-canonical"
388+
fb_lines.append(
389+
f"{cfg.client_match}: {unhealthy_matched}/{len(matched)} unhealthy; "
390+
f"{total_matched_proposals} proposals (missed {missed}, orphan {orphaned})"
289391
)
392+
if others:
393+
unhealthy_others = sum(1 for e in others if _health_rank(e) != 4)
394+
fb_lines.append(
395+
f"others: {len(others) - unhealthy_others}/{len(others)} healthy"
396+
)
397+
fallback = "\n".join(fb_lines)
290398

291-
return "\n".join(lines)
399+
return blocks, fallback
292400

293401

294402
def maybe_heartbeat(
@@ -304,11 +412,11 @@ def maybe_heartbeat(
304412
if state.last_heartbeat_ts > 0 and (now - state.last_heartbeat_ts) < interval_s:
305413
return
306414
try:
307-
text = _format_heartbeat(dora, cfg)
415+
blocks, fallback = _build_heartbeat(dora, cfg)
308416
except Exception as e:
309417
log.exception("heartbeat compose failed: %s", e)
310418
return
311-
slack.send(text)
419+
slack.send_blocks(blocks, fallback)
312420
state.last_heartbeat_ts = now
313421

314422

dora_monitor/dora_monitor/main.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,17 @@ def cli() -> None:
4141
dora = DoraClient(cfg.dora_url, timeout=cfg.http_timeout)
4242
slack = SlackNotifier(cfg.slack_webhook_url, cfg.network_label, timeout=cfg.http_timeout)
4343
if args.dry_run:
44+
import json as _json
4445
prefix = slack._prefix()
4546
def _dry_send(text: str) -> None:
4647
print(f"[DRY-RUN] {prefix}{text}")
48+
def _dry_send_blocks(blocks: list, fallback: str) -> None:
49+
print(f"[DRY-RUN] {prefix}{fallback}")
50+
if args.debug:
51+
print("[DRY-RUN blocks JSON]")
52+
print(_json.dumps(blocks, indent=2))
4753
slack.send = _dry_send # type: ignore[assignment]
54+
slack.send_blocks = _dry_send_blocks # type: ignore[assignment]
4855

4956
state = load_state(None if args.reset_state else cfg.state_file)
5057
if args.dry_run:

dora_monitor/dora_monitor/slack.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,21 @@ def send(self, text: str) -> None:
4040
for i, chunk in enumerate(chunks, 1):
4141
self._post(f"{chunk}\n_({i}/{total})_")
4242

43+
def send_blocks(self, blocks: list[dict], fallback: str) -> None:
    """Post a Slack Block Kit message, degrading to plain text if rejected.

    Failures are logged and never raised. When Slack answers HTTP 400 (the
    payload itself was refused), the plain-text ``fallback`` is resent
    through ``send()`` so the message is not lost. Rate limiting (429),
    other error statuses and transport errors are only logged.

    Args:
        blocks: Block Kit block dicts for the message body.
        fallback: Plain-text version shown in notifications and in clients
            that can't render blocks.
    """
    body = {"blocks": blocks, "text": f"{self._prefix()}{fallback}"}
    try:
        r = requests.post(self.webhook_url, json=body, timeout=self.timeout)
    except requests.RequestException as e:
        log.error("slack webhook (blocks) error: %s", e)
        return

    if r.status_code == 429:
        retry = r.headers.get("Retry-After", "?")
        log.error("slack rate-limited (429, retry-after=%s); blocks dropped", retry)
        return
    if r.status_code < 300:
        return

    log.error("slack webhook (blocks) failed: %s %s", r.status_code, r.text[:200])
    if r.status_code == 400:
        # The payload was refused, so a text-only post can still get through.
        # NOTE(review): assumes send() applies the network prefix itself, as
        # the dry-run stub in main.py suggests -- confirm against _post().
        self.send(fallback)
57+
4358

4459
def _split_on_lines(text: str, limit: int) -> list[str]:
4560
"""Split text on newline boundaries into chunks of at most `limit` chars.

0 commit comments

Comments
 (0)