2828
2929extern int ping_interval ;
3030extern int node_timeout ;
31- extern int ping_timeout ;
31+ extern long ping_timeout_us , ping_interval_us ;
3232
3333#define PING_REPLY_INTERVAL (_node ) \
34- ((_node)->last_ping .tv_sec*1000000 + (_node)->last_ping .tv_usec \
35- - (_node)->last_pong .tv_sec*1000000 - (_node)->last_pong .tv_usec)
34+ ((_node)->last_pong .tv_sec*1000000 + (_node)->last_pong .tv_usec \
35+ - (_node)->last_ping .tv_sec*1000000 - (_node)->last_ping .tv_usec)
3636
3737static int send_ping (node_info_t * node , int req_node_list )
3838{
@@ -184,7 +184,7 @@ static void do_action_trans_5(node_info_t *node, int *link_state_to_set,
184184void heartbeats_timer (void )
185185{
186186 struct timeval now ;
187- time_t last_ping_int , ping_reply_int ;
187+ time_t last_ping_int , last_sent_int , ping_reply_int ;
188188 cluster_info_t * clusters_it ;
189189 node_info_t * node ;
190190 int ev_actions_required [MAX_NO_CLUSTERS ] = {0 };
@@ -212,6 +212,7 @@ void heartbeats_timer(void)
212212 gettimeofday (& now , NULL );
213213 ping_reply_int = PING_REPLY_INTERVAL (node );
214214 last_ping_int = TIME_DIFF (node -> last_ping , now );
215+ last_sent_int = TIME_DIFF (node -> last_sent , now );
215216
216217 prev_ls = -1 ;
217218 new_ls = -1 ;
@@ -224,16 +225,21 @@ void heartbeats_timer(void)
224225 /* restart pinging sequence */
225226 do_action_trans_0 (node , & new_ls );
226227 } else if (node -> link_state == LS_RETRY_SEND_FAIL &&
227- last_ping_int >= (utime_t ) ping_timeout * 1000 ) {
228+ last_ping_int >= (time_t ) ping_timeout_us ) {
228229 CL_DBG ("case 1: RETRY_SEND_FAIL and timeout\n" );
229230 prev_ls = node -> link_state ;
230231 lock_release (node -> lock );
231232
232233 /* failed to send previous ping, retry */
233234 do_action_trans_1 (node , & new_ls );
234235 } else if ((node -> link_state == LS_UP || node -> link_state == LS_RESTARTED ) &&
235- (ping_reply_int >= (time_t )ping_timeout * 1000 ) &&
236- last_ping_int >= (utime_t )ping_timeout * 1000 ) {
236+ /* have yet to receive a ping reply, or it was unacceptably slow */
237+ (ping_reply_int <= 0 || ping_reply_int >= (time_t )ping_timeout_us ) &&
238+ /* ... and we're pinging or haven't sent a recent BIN packet */
239+ (node -> link_state == LS_RESTARTED
240+ || last_sent_int >= (time_t )ping_timeout_us ) &&
241+ /* ... and a new ping packet is due */
242+ last_ping_int >= (time_t )ping_timeout_us ) {
237243 CL_DBG ("case 2: LS_UP and timeout\n" );
238244 prev_ls = -2 ;
239245 lock_release (node -> lock );
@@ -242,24 +248,25 @@ void heartbeats_timer(void)
242248 do_action_trans_2 (node , & new_ls );
243249 ev_actions_required [no_clusters ] = 1 ;
244250 } else if (node -> link_state == LS_RETRYING &&
245- (ping_reply_int >= (time_t )ping_timeout * 1000 ) &&
246- last_ping_int >= (utime_t ) ping_timeout * 1000 ) {
251+ (ping_reply_int <= 0 || ping_reply_int >= (time_t )ping_timeout_us ) &&
252+ last_ping_int >= (time_t ) ping_timeout_us ) {
247253 CL_DBG ("case 3: LS_RETRYING and timeout\n" );
248254 prev_ls = node -> link_state ;
249255 lock_release (node -> lock );
250256
251257 /* previous ping retry not replied, continue to retry */
252258 do_action_trans_3 (node , & new_ls );
253259 } else if (node -> link_state == LS_DOWN &&
254- last_ping_int >= (utime_t )node_timeout * 1000000 ) {
260+ last_ping_int >= (time_t )node_timeout * 1000000 ) {
255261 CL_DBG ("case 4: LS_DOWN and timeout\n" );
256262 prev_ls = node -> link_state ;
257263 lock_release (node -> lock );
258264
259265 /* ping a failed node after node_timeout since last ping */
260266 do_action_trans_4 (node , & new_ls );
261267 } else if (node -> link_state == LS_UP &&
262- last_ping_int >= (utime_t )ping_interval * 1000000 ) {
268+ last_sent_int >= (time_t )ping_interval_us &&
269+ last_ping_int >= (time_t )ping_interval_us ) {
263270 CL_DBG ("case 5: LS_UP and timeout\n" );
264271 prev_ls = node -> link_state ;
265272 lock_release (node -> lock );
@@ -1346,29 +1353,39 @@ void handle_ping(bin_packet_t *received, node_info_t *src_node,
13461353void handle_pong (bin_packet_t * received , node_info_t * src_node ,
13471354 struct timeval rcv_time , int * ev_actions_required )
13481355{
1356+ time_t last_recv_int ;
13491357 int node_list [MAX_NO_NODES ], i , nr_nodes ;
13501358
13511359 bin_pop_int (received , & nr_nodes );
13521360 for (i = 0 ; i < nr_nodes ; i ++ )
13531361 bin_pop_int (received , & node_list [i ]);
13541362
1363+ last_recv_int = TIME_DIFF (src_node -> last_recv , rcv_time );
1364+
13551365 lock_get (src_node -> lock );
13561366
13571367 src_node -> last_pong = rcv_time ;
1368+ src_node -> last_recv = rcv_time ;
13581369
13591370 /* check possible races between setting the appropriate state
13601371 * after sending ping and receiving the reply */
13611372 if ((src_node -> link_state == LS_RESTART_PINGING ||
13621373 src_node -> link_state == LS_RETRY_SEND_FAIL ||
13631374 src_node -> link_state == LS_DOWN ) &&
13641375 src_node -> last_ping_state == 0 &&
1365- TIME_DIFF (src_node -> last_ping , rcv_time ) < (utime_t ) ping_timeout * 1000 )
1376+ TIME_DIFF (src_node -> last_ping , rcv_time ) < (time_t ) ping_timeout_us )
13661377 src_node -> link_state = LS_TEMP ;
13671378
13681379 /* if the node was retried and a reply was expected, it should be UP again */
1369- if (src_node -> link_state == LS_RESTARTED ||
1380+ if (( src_node -> link_state == LS_RESTARTED ||
13701381 src_node -> link_state == LS_RETRYING ||
1371- src_node -> link_state == LS_TEMP ) {
1382+ src_node -> link_state == LS_TEMP ) &&
1383+ /* if either this PONG wasn't too late or we received
1384+ * *any* other type of BIN packet in the mean time */
1385+ ((PING_REPLY_INTERVAL (src_node ) > 0 &&
1386+ PING_REPLY_INTERVAL (src_node ) < (time_t )ping_timeout_us )
1387+ || last_recv_int <= 0 || last_recv_int < (time_t )ping_timeout_us )) {
1388+
13721389 lock_release (src_node -> lock );
13731390
13741391 set_link_w_neigh_up (src_node , nr_nodes , node_list );
0 commit comments