Skip to content

Commit 37a3a34

Browse files
committed
Collapse missed-block spam into storm mode
When a single proposer crosses missed_burst_threshold misses inside missed_burst_window_minutes, per-slot alerts are suppressed in favor of one burst-start alert, periodic backoff updates (15/30/60/120 min by default, last value repeats), and a burst-resolved alert once the window clears. Burst state is persisted so restarts don't re-fire transitions.
1 parent 2bf66b4 commit 37a3a34

5 files changed

Lines changed: 142 additions & 3 deletions

File tree

dora_monitor/config.example.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,16 @@ heartbeat_slot_window: 256
5252
# detailed — per-client list with status / head / distance
5353
heartbeat_other_clients: "detailed"
5454

55+
# Missed-block storm mode. When a single proposer hits
56+
# `missed_burst_threshold` misses inside the last `missed_burst_window_minutes`,
57+
# we stop posting per-slot alerts and switch to one "burst started" alert
58+
# plus periodic "still bursting" updates whose interval backs off through
59+
# `missed_burst_update_schedule_minutes` (last value repeats indefinitely).
60+
# A "burst resolved" alert fires once no misses remain inside the window.
61+
missed_burst_threshold: 5
62+
missed_burst_window_minutes: 15
63+
missed_burst_update_schedule_minutes: [15, 30, 60, 120]
64+
5565
# Which checks to enable.
5666
checks:
5767
missed_blocks: true

dora_monitor/dora_monitor/checks.py

Lines changed: 111 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,70 @@ def _matches(name: str | None, needle: str) -> bool:
1717
return needle.lower() in name.lower()
1818

1919

20+
def _burst_update_interval_s(schedule_min: list[int], update_count: int) -> int:
    """Pick the next update interval (seconds) for a burst.

    The schedule is consumed in order; once past the end, the final value
    is reused indefinitely (the cap). Defaults give 15min → 30min → 1h → 2h
    and then 2h forever.

    Args:
        schedule_min: Backoff schedule in minutes. An empty schedule falls
            back to a fixed 2h interval.
        update_count: Number of "still bursting" updates already sent.
            Negative values are treated as 0.

    Returns:
        The interval in seconds, never less than 60.
    """
    if not schedule_min:
        return 120 * 60
    # Clamp both ends: a negative count (e.g. from a hand-edited or corrupted
    # state file) must not wrap around and index the schedule from the end.
    idx = min(max(update_count, 0), len(schedule_min) - 1)
    # Entries below one minute are raised to one minute.
    return max(int(schedule_min[idx]), 1) * 60
31+
32+
2033
def check_missed_blocks(
2134
dora: DoraClient,
2235
notifier: Notifier,
2336
cfg: Config,
2437
state: State,
2538
) -> None:
39+
now = time.time()
40+
window_s = max(cfg.missed_burst_window_minutes, 1) * 60
41+
threshold = max(cfg.missed_burst_threshold, 1)
42+
43+
# 1. Age out the recent-misses window for every known proposer; resolve
44+
# any burst whose window is now empty; fire backoff updates for the rest.
45+
for proposer in list(state.missed_recent.keys()):
46+
cutoff = now - window_s
47+
kept = [pair for pair in state.missed_recent[proposer] if pair[0] >= cutoff]
48+
if kept:
49+
state.missed_recent[proposer] = kept
50+
else:
51+
del state.missed_recent[proposer]
52+
53+
for proposer in list(state.burst_state.keys()):
54+
burst = state.burst_state[proposer]
55+
if proposer not in state.missed_recent:
56+
duration_min = max(int((now - float(burst.get("started_ts", now))) / 60), 0)
57+
notifier.send(
58+
f":white_check_mark: *Missed-block burst resolved* — `{proposer}`: "
59+
f"*{int(burst.get('total_misses', 0))}* missed blocks over {duration_min}min "
60+
f"(first `{int(burst.get('first_slot', 0))}`, last `{int(burst.get('last_slot', 0))}`)"
61+
)
62+
del state.burst_state[proposer]
63+
continue
64+
interval = _burst_update_interval_s(
65+
cfg.missed_burst_update_schedule_minutes,
66+
int(burst.get("update_count", 0)),
67+
)
68+
if now - float(burst.get("last_update_ts", 0.0)) >= interval:
69+
burst["last_update_ts"] = now
70+
burst["update_count"] = int(burst.get("update_count", 0)) + 1
71+
next_interval = _burst_update_interval_s(
72+
cfg.missed_burst_update_schedule_minutes,
73+
int(burst["update_count"]),
74+
)
75+
next_min = next_interval // 60
76+
notifier.send(
77+
f":fire: *Missed-block burst continues* — `{proposer}`: "
78+
f"*{int(burst.get('total_misses', 0))}* missed blocks since start "
79+
f"(latest slot `{int(burst.get('last_slot', 0))}`). "
80+
f"Next update in ~{next_min}min."
81+
)
82+
83+
# 2. Process new missed / orphaned slots from Dora.
2684
slots = dora.slots(limit=cfg.slot_scan_limit, with_orphaned=1, with_missing=1)
2785
for s in slots:
2886
proposer_name = s.get("proposer_name") or ""
@@ -32,9 +90,8 @@ def check_missed_blocks(
3290
status = (s.get("status") or "").lower()
3391
if status == "missing" and slot_num not in state.reported_missed_slots:
3492
state.reported_missed_slots.add(slot_num)
35-
notifier.send(
36-
f":warning: *Missed block* — slot `{slot_num}` "
37-
f"(epoch {s.get('epoch')}) proposer `{proposer_name}` (idx {s.get('proposer')})"
93+
_handle_missed_slot(
94+
notifier, state, cfg, proposer_name, slot_num, s, now, window_s, threshold
3895
)
3996
elif status == "orphaned" and slot_num not in state.reported_orphan_slots:
4097
state.reported_orphan_slots.add(slot_num)
@@ -44,6 +101,57 @@ def check_missed_blocks(
44101
)
45102

46103

104+
def _handle_missed_slot(
    notifier: Notifier,
    state: State,
    cfg: Config,
    proposer: str,
    slot: int,
    raw: dict,
    now: float,
    window_s: int,
    threshold: int,
) -> None:
    """Record one newly seen missed slot and send the matching alert.

    Exactly one of three things happens:

    * The proposer already has an active burst: the burst counters are
      updated and nothing is sent.
    * This miss brings the proposer's in-window count to ``threshold``:
      a burst record is created and one burst-start alert is sent.
    * Otherwise: the regular per-slot missed-block alert is sent.

    Args:
        notifier: Object whose ``send(str)`` posts the alert.
        state: Holds ``missed_recent`` and ``burst_state``; both are mutated.
        cfg: Supplies the burst window and update schedule used in the
            burst-start message.
        proposer: Proposer name used as the grouping key.
        slot: Slot number of the missed block.
        raw: Slot record the miss came from; ``epoch`` and ``proposer`` are
            read for the per-slot alert.
        now: Timestamp recorded for this miss (the time it was observed,
            not the slot's own time).
        window_s: Sliding-window length in seconds.
        threshold: In-window miss count that starts a burst.
    """
    # Record the miss, then drop everything that has aged out of the window.
    recent = state.missed_recent.get(proposer, [])
    recent.append([now, slot])
    cutoff = now - window_s
    recent = [pair for pair in recent if pair[0] >= cutoff]
    state.missed_recent[proposer] = recent

    burst = state.burst_state.get(proposer)
    if burst:
        # Already in storm mode: accumulate counters, suppress per-slot alert.
        burst["total_misses"] = int(burst.get("total_misses", 0)) + 1
        # Slots are not guaranteed to arrive in ascending order, so keep the
        # true extremes instead of overwriting with whatever came last.
        burst["first_slot"] = min(int(burst.get("first_slot", slot)), slot)
        burst["last_slot"] = max(int(burst.get("last_slot", slot)), slot)
        return

    if len(recent) >= threshold:
        window_slots = sorted(int(s) for _, s in recent)
        slots_csv = ", ".join(f"`{s}`" for s in window_slots)
        first_interval_min = _burst_update_interval_s(
            cfg.missed_burst_update_schedule_minutes, 0
        ) // 60
        state.burst_state[proposer] = {
            "started_ts": now,
            "last_update_ts": now,
            # Lowest / highest slot in the window, independent of the order
            # in which the misses were reported.
            "first_slot": window_slots[0],
            "last_slot": window_slots[-1],
            "total_misses": len(recent),
            "update_count": 0,
        }
        notifier.send(
            f":fire: *Missed-block burst* — `{proposer}`: *{len(recent)}* missed blocks "
            f"in last {cfg.missed_burst_window_minutes}min — slots {slots_csv}. "
            f"Further per-miss alerts suppressed; next update in ~{first_interval_min}min."
        )
        return

    notifier.send(
        f":warning: *Missed block* — slot `{slot}` "
        f"(epoch {raw.get('epoch')}) proposer `{proposer}` (idx {raw.get('proposer')})"
    )
154+
47155
def check_client_head_forks(
48156
dora: DoraClient,
49157
notifier: Notifier,

dora_monitor/dora_monitor/config.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,14 @@ class Config:
3434
heartbeat_slot_window: int = 256
3535
# "off" (skip), "summary" (one-line aggregate), "detailed" (per-client list)
3636
heartbeat_other_clients: str = "detailed"
37+
# Storm mode for missed-block alerts: once a proposer hits
38+
# `missed_burst_threshold` misses inside `missed_burst_window_minutes`,
39+
# per-slot alerts are suppressed and replaced by one burst-start alert
40+
# plus periodic "still bursting" updates whose interval backs off
41+
# through `missed_burst_update_schedule_minutes` (last value repeats).
42+
missed_burst_threshold: int = 5
43+
missed_burst_window_minutes: int = 15
44+
missed_burst_update_schedule_minutes: list[int] = field(default_factory=lambda: [15, 30, 60, 120])
3745
checks: Checks = field(default_factory=Checks)
3846

3947

dora_monitor/dora_monitor/discord.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
":turtle:": "\U0001f422",
2727
":zap:": "⚡",
2828
":package:": "\U0001f4e6",
29+
":fire:": "\U0001f525",
2930
":rocket:": "\U0001f680",
3031
":mag:": "\U0001f50d",
3132
":desktop_computer:": "\U0001f5a5️",

dora_monitor/dora_monitor/state.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,13 @@ class State:
1919
last_known_head: int = 0
2020
last_heartbeat_ts: float = 0.0
2121
client_versions: dict[str, str] = field(default_factory=dict)
22+
# Sliding window of recent missed-block timestamps + slot numbers per
23+
# proposer. Used to decide when a client's miss rate crosses the
24+
# storm-mode threshold.
25+
missed_recent: dict[str, list[list[float]]] = field(default_factory=dict)
26+
# Active missed-block burst per proposer:
27+
# {"started_ts", "last_update_ts", "first_slot", "last_slot", "total_misses", "update_count"}.
28+
burst_state: dict[str, dict] = field(default_factory=dict)
2229

2330
def to_json(self) -> dict:
2431
d = asdict(self)
@@ -44,6 +51,11 @@ def from_json(cls, d: dict) -> "State":
4451
last_heartbeat_ts=float(d.get("last_heartbeat_ts", 0.0)),
4552
client_versions=dict(d.get("client_versions", {})),
4653
pending_fork_ticks={k: int(v) for k, v in (d.get("pending_fork_ticks") or {}).items()},
54+
missed_recent={
55+
k: [[float(t), int(s)] for t, s in v]
56+
for k, v in (d.get("missed_recent") or {}).items()
57+
},
58+
burst_state={k: dict(v) for k, v in (d.get("burst_state") or {}).items()},
4759
)
4860

4961

0 commit comments

Comments
 (0)