Skip to content

Commit 3c85caa

Browse files
committed
Fix Previously up services are alerted as going back up if the master goes down #1
This removes the alert on unknown -> up; unknown -> down still alerts, by design.
1 parent 8638ab5 commit 3c85caa

2 files changed

Lines changed: 50 additions & 1 deletion

File tree

internal/alerts/dispatcher.go

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ func New(cluster *config.ClusterConfig, selfID string, logger *log.Logger) *Disp
2727

2828
// OnTransition is wired as checks.TransitionFn.
2929
func (d *Dispatcher) OnTransition(check *config.Check, from, to checks.State, snap checks.Snapshot) {
30-
if to == checks.StateUnknown {
30+
if !shouldAlert(from, to) {
3131
return
3232
}
3333
alerts := d.cluster.EffectiveAlertsFor(check)
@@ -77,6 +77,25 @@ func (d *Dispatcher) Test(alertID string) error {
7777
return d.dispatchOne(alert, msg)
7878
}
7979

80+
// shouldAlert decides whether a committed state transition warrants
81+
// firing the configured alert channels.
82+
//
83+
// A fresh master's aggregator starts every check at StateUnknown, so
84+
// the first successful evaluation always commits Unknown→Up. Without
85+
// filtering, every master failover (or daemon restart) would spam an
86+
// "is now UP" alert for every healthy check. We treat Unknown→Up as a
87+
// silent cold start; real recoveries (Down→Up) and any transition to
88+
// Down still alert.
89+
func shouldAlert(from, to checks.State) bool {
90+
if to == checks.StateUnknown {
91+
return false
92+
}
93+
if from == checks.StateUnknown && to == checks.StateUp {
94+
return false
95+
}
96+
return true
97+
}
98+
8099
func (d *Dispatcher) dispatchOne(a *config.Alert, msg Message) error {
81100
switch a.Type {
82101
case config.AlertSMTP:

internal/alerts/dispatcher_test.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
package alerts
2+
3+
import (
4+
"testing"
5+
6+
"git.cer.sh/axodouble/quptime/internal/checks"
7+
)
8+
9+
func TestShouldAlertFiltersColdStartUp(t *testing.T) {
10+
cases := []struct {
11+
name string
12+
from checks.State
13+
to checks.State
14+
want bool
15+
}{
16+
{"cold start to up (master failover / daemon restart)", checks.StateUnknown, checks.StateUp, false},
17+
{"cold start to down still alerts", checks.StateUnknown, checks.StateDown, true},
18+
{"real recovery alerts", checks.StateDown, checks.StateUp, true},
19+
{"regression alerts", checks.StateUp, checks.StateDown, true},
20+
{"stale (up to unknown) suppressed", checks.StateUp, checks.StateUnknown, false},
21+
{"stale (down to unknown) suppressed", checks.StateDown, checks.StateUnknown, false},
22+
}
23+
for _, c := range cases {
24+
t.Run(c.name, func(t *testing.T) {
25+
if got := shouldAlert(c.from, c.to); got != c.want {
26+
t.Errorf("shouldAlert(%s→%s) = %v, want %v", c.from, c.to, got, c.want)
27+
}
28+
})
29+
}
30+
}

0 commit comments

Comments
 (0)