@@ -211,3 +211,184 @@ func TestMembership_OnStateChange_DoesNotBlockMutation(t *testing.T) {
211211 close (releaseObserver )
212212 wg .Wait ()
213213}
214+
215+ // TestMembership_Mark_NoBumpOnSameState pins the
216+ // transition-only-bump contract: Mark() must NOT increment a node's
217+ // incarnation when the requested state matches the current state.
218+ // Without this rule, steady-state heartbeat success paths
219+ // (evaluateLiveness → Mark(peer, Alive)) inflate the counter once
220+ // per probe — operators saw incarnation values in the thousands
221+ // after a few hours of normal operation. Incarnation is owned by
222+ // the node itself in SWIM; observers should not churn it.
223+ func TestMembership_Mark_NoBumpOnSameState (t * testing.T ) {
224+ t .Parallel ()
225+
226+ m := NewMembership (NewRing ())
227+ m .Upsert (NewNode ("n1" , "127.0.0.1:7946" ))
228+
229+ before := m .List ()[0 ].Incarnation
230+
231+ // Repeat the same Mark a few times — this models the
232+ // "successful probe every second" pattern that drove the bug.
233+ for range 50 {
234+ m .Mark ("n1" , NodeAlive )
235+ }
236+
237+ after := m .List ()[0 ].Incarnation
238+ if after != before {
239+ t .Errorf ("incarnation churned on same-state Mark: before=%d after=%d (want stable)" ,
240+ before , after )
241+ }
242+ }
243+
244+ // TestMembership_Mark_SameStateIsFullNoOp pins the rest of the
245+ // no-op-Mark contract beyond just incarnation: when state matches
246+ // the current value, the membership version vector must NOT
247+ // advance and registered observers must NOT fire. Without this
248+ // rule, every successful heartbeat probe bumps the version
249+ // counter once per peer per interval — a 5-node cluster running
250+ // for a few hours showed MembershipVersion past 4,800 even though
251+ // no nodes had actually changed state. Cascading effects: gossip
252+ // fans out spurious "version went up" deltas, SSE consumers see
253+ // constant "members" event spam, and the metric stops being
254+ // useful as a real-membership-change indicator.
255+ func TestMembership_Mark_SameStateIsFullNoOp (t * testing.T ) {
256+ t .Parallel ()
257+
258+ m := NewMembership (NewRing ())
259+ m .Upsert (NewNode ("n1" , "127.0.0.1:7946" ))
260+
261+ var fired atomic.Int32
262+
263+ m .OnStateChange (func (_ NodeID , _ NodeState , _ uint64 ) {
264+ fired .Add (1 )
265+ })
266+
267+ // Capture the version baseline after the Upsert above.
268+ versionBefore := m .Version ()
269+
270+ // Pound on Mark with the existing state. Models the steady
271+ // "probe succeeded again" pattern that drove the bug.
272+ for range 100 {
273+ m .Mark ("n1" , NodeAlive )
274+ }
275+
276+ if got := m .Version (); got != versionBefore {
277+ t .Errorf ("Version drifted on same-state Mark: before=%d after=%d (want stable)" ,
278+ versionBefore , got )
279+ }
280+
281+ if got := fired .Load (); got != 0 {
282+ t .Errorf ("observer fired %d times for same-state Mark, want 0" , got )
283+ }
284+
285+ // Sanity: LastSeen still refreshes so the suspect-timeout
286+ // machinery sees probes. We can't assert exact wall-clock
287+ // values cleanly here, but we can assert it advanced past the
288+ // Upsert moment by being non-zero.
289+ if m .List ()[0 ].LastSeen .IsZero () {
290+ t .Errorf ("LastSeen wasn't refreshed by same-state Mark" )
291+ }
292+ }
293+
294+ // TestMembership_Mark_BumpsOnTransition guards the other side of the
295+ // contract: a genuine state transition (Alive→Suspect, Suspect→Alive,
296+ // etc.) MUST bump incarnation so the gossip-merge rule
297+ // "higher incarnation wins" propagates the change cluster-wide. If we
298+ // over-suppress, transitions would silently fail to propagate and a
299+ // peer briefly marked Suspect would stay Suspect on neighbouring
300+ // nodes forever.
301+ func TestMembership_Mark_BumpsOnTransition (t * testing.T ) {
302+ t .Parallel ()
303+
304+ m := NewMembership (NewRing ())
305+ m .Upsert (NewNode ("n1" , "127.0.0.1:7946" ))
306+
307+ v0 := m .List ()[0 ].Incarnation
308+
309+ m .Mark ("n1" , NodeSuspect )
310+
311+ v1 := m .List ()[0 ].Incarnation
312+ if v1 != v0 + 1 {
313+ t .Errorf ("Alive→Suspect: got incarnation %d, want %d" , v1 , v0 + 1 )
314+ }
315+
316+ m .Mark ("n1" , NodeAlive )
317+
318+ v2 := m .List ()[0 ].Incarnation
319+ if v2 != v1 + 1 {
320+ t .Errorf ("Suspect→Alive: got incarnation %d, want %d" , v2 , v1 + 1 )
321+ }
322+
323+ // Same-state again — must NOT bump even after recent transitions.
324+ m .Mark ("n1" , NodeAlive )
325+ m .Mark ("n1" , NodeAlive )
326+
327+ v3 := m .List ()[0 ].Incarnation
328+ if v3 != v2 {
329+ t .Errorf ("Alive→Alive after a transition burst: got %d, want stable at %d" , v3 , v2 )
330+ }
331+ }
332+
333+ // TestMembership_Refute_AlwaysBumps pins the SWIM self-refute
334+ // primitive: Refute() unconditionally increments incarnation, even
335+ // when the local view of the node is already Alive. Without this,
336+ // the refutation packet a node sends back to a peer that suspected
337+ // it would carry the SAME incarnation as the suspect claim — and
338+ // "higher incarnation wins" would refuse to overwrite, so the
339+ // suspect claim would stick even though the node refuted it.
340+ func TestMembership_Refute_AlwaysBumps (t * testing.T ) {
341+ t .Parallel ()
342+
343+ m := NewMembership (NewRing ())
344+ m .Upsert (NewNode ("self" , "127.0.0.1:7946" ))
345+
346+ v0 := m .List ()[0 ].Incarnation
347+
348+ // Local state is Alive — a transition-only rule would no-op
349+ // here, which would silently break refutation propagation.
350+ m .Refute ("self" )
351+
352+ v1 := m .List ()[0 ].Incarnation
353+ if v1 != v0 + 1 {
354+ t .Errorf ("first Refute: got incarnation %d, want %d" , v1 , v0 + 1 )
355+ }
356+
357+ // Each subsequent refute climbs one more — chained suspect
358+ // claims from different peers must each be answerable with a
359+ // strictly-higher incarnation.
360+ m .Refute ("self" )
361+ m .Refute ("self" )
362+
363+ v2 := m .List ()[0 ].Incarnation
364+ if v2 != v1 + 2 {
365+ t .Errorf ("chained Refute: got incarnation %d, want %d" , v2 , v1 + 2 )
366+ }
367+
368+ // State must end Alive regardless of intermediate values.
369+ if got := m .List ()[0 ].State ; got != NodeAlive {
370+ t .Errorf ("Refute state: got %v, want NodeAlive" , got )
371+ }
372+ }
373+
374+ // TestMembership_Refute_GhostReturnsFalse mirrors the
375+ // non-existent-node guard already present on Mark/Remove.
376+ func TestMembership_Refute_GhostReturnsFalse (t * testing.T ) {
377+ t .Parallel ()
378+
379+ m := NewMembership (NewRing ())
380+
381+ var fired atomic.Int32
382+
383+ m .OnStateChange (func (_ NodeID , _ NodeState , _ uint64 ) {
384+ fired .Add (1 )
385+ })
386+
387+ if m .Refute ("ghost" ) {
388+ t .Fatal ("Refute on non-existent node returned true" )
389+ }
390+
391+ if got := fired .Load (); got != 0 {
392+ t .Errorf ("observer fired %d times for ghost Refute, want 0" , got )
393+ }
394+ }
0 commit comments