@@ -75,7 +75,12 @@ type DistMemory struct {
7575 nodeID string
7676 seeds []string // static seed node addresses
7777
78- // heartbeat / failure detection (experimental)
78+ // heartbeat / failure detection. Phase E added SWIM
79+ // self-refutation (refuteIfSuspected) and HTTP gossip
80+ // dissemination, retiring the prior "experimental" marker —
81+ // the path now disseminates suspect/dead transitions across
82+ // the cluster and lets a falsely-accused node bump its
83+ // incarnation to clear suspicion.
7984 hbInterval time.Duration
8085 hbSuspectAfter time.Duration
8186 hbDeadAfter time.Duration
@@ -3023,19 +3028,32 @@ func (dm *DistMemory) runGossipTick() {
30233028 }
30243029
30253030 target := candidates [idxBig .Int64 ()]
3031+ transport := dm .loadTransport ()
3032+ snapshot := dm .membership .List ()
30263033
3027- ip , ok := dm .loadTransport ().(* InProcessTransport )
3028- if ! ok {
3029- return
3030- }
3034+ // In-process fast path: skip the wire and call acceptGossip
3035+ // directly. Pre-Phase-E this was the ONLY path; the function
3036+ // bailed for any other transport type, so cross-process
3037+ // clusters never disseminated membership / never refuted
3038+ // suspect claims. The fall-through below now uses the
3039+ // transport's Gossip method, which routes via HTTP for the
3040+ // auto-created DistHTTPTransport.
3041+ if ip , ok := transport .(* InProcessTransport ); ok {
3042+ if remote , ok2 := ip .backends [string (target .ID )]; ok2 {
3043+ remote .acceptGossip (snapshot )
3044+ }
30313045
3032- remote , ok2 := ip .backends [string (target .ID )]
3033- if ! ok2 {
30343046 return
30353047 }
30363048
3037- snapshot := dm .membership .List ()
3038- remote .acceptGossip (snapshot )
3049+ gossipErr := transport .Gossip (dm .lifeCtx , string (target .ID ), nodesToGossipMembers (snapshot ))
3050+ if gossipErr != nil {
3051+ dm .logger .Debug (
3052+ "gossip push failed" ,
3053+ slog .String ("peer_id" , string (target .ID )),
3054+ slog .Any ("err" , gossipErr ),
3055+ )
3056+ }
30393057}
30403058
30413059func (dm * DistMemory ) acceptGossip (nodes []* cluster.Node ) {
@@ -3045,6 +3063,8 @@ func (dm *DistMemory) acceptGossip(nodes []*cluster.Node) {
30453063
30463064 for _ , node := range nodes {
30473065 if node .ID == dm .localNode .ID {
3066+ dm .refuteIfSuspected (node )
3067+
30483068 continue
30493069 }
30503070
@@ -3079,6 +3099,41 @@ func (dm *DistMemory) acceptGossip(nodes []*cluster.Node) {
30793099 }
30803100}
30813101
3102+ // refuteIfSuspected handles the SWIM self-refute path: when a peer
3103+ // gossips that THIS node is Suspect or Dead at incarnation N, bump
3104+ // our local incarnation to N+1 and re-upsert ourselves as Alive.
3105+ // Higher-incarnation-wins propagation in `acceptGossip` ensures the
3106+ // next gossip tick disseminates the refutation cluster-wide.
3107+ //
3108+ // Pre-fix this path was a no-op (`continue` on local-ID match) — a
3109+ // node that fell briefly behind heartbeat would be marked Suspect by
3110+ // peers and could not undo it through gossip; only a fresh probe
3111+ // would clear suspicion. Self-refute closes the loop required for
3112+ // the heartbeat marker to drop its `experimental` qualifier.
3113+ func (dm * DistMemory ) refuteIfSuspected (claim * cluster.Node ) {
3114+ if claim == nil || dm .localNode == nil {
3115+ return
3116+ }
3117+
3118+ if claim .State == cluster .NodeAlive {
3119+ return // peer agrees we're alive — nothing to refute
3120+ }
3121+
3122+ // Only refute when the peer's claim is at >= our incarnation;
3123+ // older claims are stale and ignored.
3124+ if claim .Incarnation < dm .localNode .Incarnation {
3125+ return
3126+ }
3127+
3128+ dm .membership .Mark (dm .localNode .ID , cluster .NodeAlive )
3129+
3130+ dm .logger .Info (
3131+ "self-refuted suspect/dead claim from peer" ,
3132+ slog .Uint64 ("claim_incarnation" , claim .Incarnation ),
3133+ slog .String ("claim_state" , claim .State .String ()),
3134+ )
3135+ }
3136+
30823137// chooseNewer picks the item with higher version; on version tie uses lexicographically smaller Origin as winner.
30833138func (dm * DistMemory ) chooseNewer (itemA , itemB * cache.Item ) * cache.Item {
30843139 if itemA == nil {
@@ -3265,7 +3320,10 @@ func parseSeedSpec(raw string) seedSpec {
32653320 return seedSpec {id : id , addr : addr }
32663321}
32673322
3268- // heartbeatLoop probes peers and updates membership (best-effort experimental).
3323+ // heartbeatLoop probes peers and updates membership. SWIM-style
3324+ // indirect probes (Phase B.1) and self-refutation via gossip
3325+ // (Phase E) are wired into the surrounding helpers — this loop
3326+ // only schedules the per-tick work.
32693327func (dm * DistMemory ) heartbeatLoop (ctx context.Context , stopCh <- chan struct {}) { // reduced cognitive complexity via helpers
32703328 ticker := time .NewTicker (dm .hbInterval )
32713329 defer ticker .Stop ()
0 commit comments