2828
2929extern int ping_interval ;
3030extern int node_timeout ;
31- extern int ping_timeout ;
31+ extern long ping_timeout_us , ping_interval_us ;
3232extern int clusterer_enable_rerouting ;
3333
/*
 * PING_REPLY_INTERVAL(_node): signed difference, in microseconds
 * (tv_sec * 1000000 + tv_usec), between the node's last_ping and last_pong
 * timestamps.
 *
 * This definition is shown as a diff: the '-' lines are the previous form
 * (last_ping minus last_pong) and the '+' lines are the replacement
 * (last_pong minus last_ping). With the replacement, the result is positive
 * when the last pong is newer than the last ping; the callers visible in this
 * file treat a result <= 0 as "no reply to the latest ping yet".
 *
 * The argument is expanded four times, so it must be free of side effects.
 */
3434#define PING_REPLY_INTERVAL (_node ) \
35- ((_node)->last_ping .tv_sec*1000000 + (_node)->last_ping .tv_usec \
36- - (_node)->last_pong .tv_sec*1000000 - (_node)->last_pong .tv_usec)
35+ ((_node)->last_pong .tv_sec*1000000 + (_node)->last_pong .tv_usec \
36+ - (_node)->last_ping .tv_sec*1000000 - (_node)->last_ping .tv_usec)
3737
3838static int send_ping (node_info_t * node , int req_node_list )
3939{
@@ -185,7 +185,7 @@ static void do_action_trans_5(node_info_t *node, int *link_state_to_set,
185185void heartbeats_timer (void )
186186{
187187 struct timeval now ;
188- time_t last_ping_int , ping_reply_int ;
188+ time_t last_ping_int , last_sent_int , ping_reply_int ;
189189 cluster_info_t * clusters_it ;
190190 node_info_t * node ;
191191 int ev_actions_required [MAX_NO_CLUSTERS ] = {0 };
@@ -213,6 +213,7 @@ void heartbeats_timer(void)
213213 gettimeofday (& now , NULL );
214214 ping_reply_int = PING_REPLY_INTERVAL (node );
215215 last_ping_int = TIME_DIFF (node -> last_ping , now );
216+ last_sent_int = TIME_DIFF (node -> last_sent , now );
216217
217218 prev_ls = -1 ;
218219 new_ls = -1 ;
@@ -225,16 +226,21 @@ void heartbeats_timer(void)
225226 /* restart pinging sequence */
226227 do_action_trans_0 (node , & new_ls );
227228 } else if (node -> link_state == LS_RETRY_SEND_FAIL &&
228- last_ping_int >= (utime_t ) ping_timeout * 1000 ) {
229+ last_ping_int >= (time_t ) ping_timeout_us ) {
229230 CL_DBG ("case 1: RETRY_SEND_FAIL and timeout\n" );
230231 prev_ls = node -> link_state ;
231232 lock_release (node -> lock );
232233
233234 /* failed to send previous ping, retry */
234235 do_action_trans_1 (node , & new_ls );
235236 } else if ((node -> link_state == LS_UP || node -> link_state == LS_RESTARTED ) &&
236- (ping_reply_int >= (time_t )ping_timeout * 1000 ) &&
237- last_ping_int >= (utime_t )ping_timeout * 1000 ) {
237+ /* have yet to receive a ping reply, or it was unacceptably slow */
238+ (ping_reply_int <= 0 || ping_reply_int >= (time_t )ping_timeout_us ) &&
239+ /* ... and we're pinging or haven't sent a recent BIN packet */
240+ (node -> link_state == LS_RESTARTED
241+ || last_sent_int >= (time_t )ping_timeout_us ) &&
242+ /* ... and a new ping packet is due */
243+ last_ping_int >= (time_t )ping_timeout_us ) {
238244 CL_DBG ("case 2: LS_UP and timeout\n" );
239245 prev_ls = -2 ;
240246 lock_release (node -> lock );
@@ -243,24 +249,25 @@ void heartbeats_timer(void)
243249 do_action_trans_2 (node , & new_ls );
244250 ev_actions_required [no_clusters ] = 1 ;
245251 } else if (node -> link_state == LS_RETRYING &&
246- (ping_reply_int >= (time_t )ping_timeout * 1000 ) &&
247- last_ping_int >= (utime_t ) ping_timeout * 1000 ) {
252+ (ping_reply_int <= 0 || ping_reply_int >= (time_t )ping_timeout_us ) &&
253+ last_ping_int >= (time_t ) ping_timeout_us ) {
248254 CL_DBG ("case 3: LS_RETRYING and timeout\n" );
249255 prev_ls = node -> link_state ;
250256 lock_release (node -> lock );
251257
252258 /* previous ping retry not replied, continue to retry */
253259 do_action_trans_3 (node , & new_ls );
254260 } else if (node -> link_state == LS_DOWN &&
255- last_ping_int >= (utime_t )node_timeout * 1000000 ) {
261+ last_ping_int >= (time_t )node_timeout * 1000000 ) {
256262 CL_DBG ("case 4: LS_DOWN and timeout\n" );
257263 prev_ls = node -> link_state ;
258264 lock_release (node -> lock );
259265
260266 /* ping a failed node after node_timeout since last ping */
261267 do_action_trans_4 (node , & new_ls );
262268 } else if (node -> link_state == LS_UP &&
263- last_ping_int >= (utime_t )ping_interval * 1000000 ) {
269+ last_sent_int >= (time_t )ping_interval_us &&
270+ last_ping_int >= (time_t )ping_interval_us ) {
264271 CL_DBG ("case 5: LS_UP and timeout\n" );
265272 prev_ls = node -> link_state ;
266273 lock_release (node -> lock );
@@ -1357,29 +1364,39 @@ void handle_ping(bin_packet_t *received, node_info_t *src_node,
13571364void handle_pong (bin_packet_t * received , node_info_t * src_node ,
13581365 struct timeval rcv_time , int * ev_actions_required )
13591366{
1367+ time_t last_recv_int ;
13601368 int node_list [MAX_NO_NODES ], i , nr_nodes ;
13611369
13621370 bin_pop_int (received , & nr_nodes );
13631371 for (i = 0 ; i < nr_nodes ; i ++ )
13641372 bin_pop_int (received , & node_list [i ]);
13651373
1374+ last_recv_int = TIME_DIFF (src_node -> last_recv , rcv_time );
1375+
13661376 lock_get (src_node -> lock );
13671377
13681378 src_node -> last_pong = rcv_time ;
1379+ src_node -> last_recv = rcv_time ;
13691380
13701381 /* check possible races between setting the appropriate state
13711382 * after sending ping and receiving the reply */
13721383 if ((src_node -> link_state == LS_RESTART_PINGING ||
13731384 src_node -> link_state == LS_RETRY_SEND_FAIL ||
13741385 src_node -> link_state == LS_DOWN ) &&
13751386 src_node -> last_ping_state == 0 &&
1376- TIME_DIFF (src_node -> last_ping , rcv_time ) < (utime_t ) ping_timeout * 1000 )
1387+ TIME_DIFF (src_node -> last_ping , rcv_time ) < (time_t ) ping_timeout_us )
13771388 src_node -> link_state = LS_TEMP ;
13781389
13791390 /* if the node was retried and a reply was expected, it should be UP again */
1380- if (src_node -> link_state == LS_RESTARTED ||
1391+ if (( src_node -> link_state == LS_RESTARTED ||
13811392 src_node -> link_state == LS_RETRYING ||
1382- src_node -> link_state == LS_TEMP ) {
1393+ src_node -> link_state == LS_TEMP ) &&
1394+ /* if either this PONG wasn't too late or we received
1395+ * *any* other type of BIN packet in the mean time */
1396+ ((PING_REPLY_INTERVAL (src_node ) > 0 &&
1397+ PING_REPLY_INTERVAL (src_node ) < (time_t )ping_timeout_us )
1398+ || last_recv_int <= 0 || last_recv_int < (time_t )ping_timeout_us )) {
1399+
13831400 lock_release (src_node -> lock );
13841401
13851402 set_link_w_neigh_up (src_node , nr_nodes , node_list );
0 commit comments