@@ -2,12 +2,14 @@ package mpc
22
33import (
44 "fmt"
5+ "strings"
56 "sync"
67 "sync/atomic"
78 "time"
89
910 "github.com/fystack/mpcium/pkg/infra"
1011 "github.com/fystack/mpcium/pkg/logger"
12+ "github.com/fystack/mpcium/pkg/messaging"
1113 "github.com/hashicorp/consul/api"
1214 "github.com/samber/lo"
1315)
@@ -35,20 +37,23 @@ type registry struct {
3537 mu sync.RWMutex
3638 ready bool // ready is true when all peers are ready
3739
38- consulKV infra.ConsulKV
40+ consulKV infra.ConsulKV
41+ healthCheck messaging.DirectMessaging
3942}
4043
4144func NewRegistry (
4245 nodeID string ,
4346 peerNodeIDs []string ,
4447 consulKV infra.ConsulKV ,
48+ directMessaging messaging.DirectMessaging ,
4549) * registry {
4650 return & registry {
4751 consulKV : consulKV ,
4852 nodeID : nodeID ,
4953 peerNodeIDs : getPeerIDsExceptSelf (nodeID , peerNodeIDs ),
5054 readyMap : make (map [string ]bool ),
5155 readyCount : 1 , // self
56+ healthCheck : directMessaging ,
5257 }
5358}
5459
@@ -104,12 +109,22 @@ func (r *registry) Ready() error {
104109 return fmt .Errorf ("Put ready key failed: %w" , err )
105110 }
106111
112+ _ , err = r .healthCheck .Listen (r .composeHealthCheckTopic (r .nodeID ), func (data []byte ) {
113+ logger .Debug ("Health check" , "peerID" , string (data ))
114+ })
115+ if err != nil {
116+ return fmt .Errorf ("Listen health check failed: %w" , err )
117+ }
107118 return nil
108119}
109120
121+ func (r * registry ) composeHealthCheckTopic (nodeID string ) string {
122+ return fmt .Sprintf ("healthcheck:%s" , nodeID )
123+ }
124+
110125func (r * registry ) WatchPeersReady (callback func ()) {
111126 ticker := time .NewTicker (ReadinessCheckPeriod )
112- go r .logReadyStatus ()
127+ go r .checkPeersHeath ()
113128 // first tick is executed immediately
114129 for ; true ; <- ticker .C {
115130 pairs , _ , err := r .consulKV .List ("ready/" , nil )
@@ -146,12 +161,31 @@ func (r *registry) WatchPeersReady(callback func()) {
146161
147162}
148163
149- func (r * registry ) logReadyStatus () {
164+ func (r * registry ) checkPeersHeath () {
150165 for {
151166 time .Sleep (5 * time .Second )
152167 if ! r .ArePeersReady () {
153168 logger .Info ("Peers are not ready yet" , "ready" , r .GetReadyPeersCount (), "expected" , len (r .peerNodeIDs )+ 1 )
154169 }
170+
171+ pairs , _ , err := r .consulKV .List ("ready/" , nil )
172+ if err != nil {
173+ logger .Error ("List ready keys failed" , err )
174+ continue
175+ }
176+ readyPeerIDs := r .getReadyPeersFromKVStore (pairs )
177+ for _ , peerID := range readyPeerIDs {
178+ err := r .healthCheck .SendToOtherWithRetry (r .composeHealthCheckTopic (peerID ), []byte (peerID ), messaging.RetryConfig {
179+ RetryAttempt : 2 ,
180+ })
181+ if err != nil && strings .Contains (err .Error (), "no responders" ) {
182+ logger .Info ("No response from peer" , "peerID" , peerID )
183+ _ , err := r .consulKV .Delete (r .readyKey (peerID ), nil )
184+ if err != nil {
185+ logger .Error ("Delete ready key failed" , err )
186+ }
187+ }
188+ }
155189 }
156190}
157191
0 commit comments