Skip to content

Commit 8b85d70

Browse files
committed
perftest: fix premature exit when select() is interrupted by SIGALRM
The perftest framework makes extensive use of `alarm()` to control test duration (--duration) and to schedule periodic tasks. Functions such as `run_iter_bw()`, `run_iter_lat_send()`, and `run_iter_bi()` install a handler via `signal(SIGALRM, catch_alarm)` when the -D option is used, and then set an alarm. In `run_iter_bw_server()` and `run_iter_bi()`, a watchdog is also installed in iterations mode via `signal(SIGALRM, check_alive)` followed by `alarm(60)` to detect stalled tests. In the problematic case, `run_iter_bi()` with the -e option invokes `ctx_notify_send_recv_events()`, which performs a `select()` on two file descriptors: `ctx->recv_channel->fd` (the CQ receive completion channel) and `ctx->send_channel->fd` (the CQ send completion channel). When a completion event is generated, the kernel marks the corresponding file descriptor readable and `select()` returns. However, due to low processing speed on some NICs, no completion event may be generated within 60 seconds (i.e., the test case has not finished while running under a high-pressure test). The watchdog `alarm()` then fires, delivering SIGALRM, which interrupts the blocking `select()` call with EINTR. The function then exits with an error instead of retrying. This behavior exposes a robustness issue in perftest: SIGALRM in this context is meant only as a check-alive signal, not as a fatal condition. A `select()` call interrupted by SIGALRM should be restarted rather than causing an unexpected termination. This patch updates perftest to properly handle EINTR by retrying `select()` when it is interrupted by SIGALRM, ensuring correct behavior even under slow device processing conditions. — Signed-off-by: Ruizhe Zhou <zhouruizhe@resnics.com>
1 parent 07d75db commit 8b85d70

3 files changed

Lines changed: 33 additions & 9 deletions

File tree

src/perftest_communication.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1125,9 +1125,10 @@ int rdma_client_connect(struct pingpong_context *ctx,struct perftest_parameters
11251125
}
11261126

11271127
if (event->event != RDMA_CM_EVENT_ESTABLISHED) {
1128-
fprintf(stderr, "Unexpected CM event bl blka %d\n", event->event);
1128+
fprintf(stderr, "Unexpected CM event bl blka %s; error: %d.\n",
1129+
rdma_event_str(event->event), event->status);
11291130
rdma_ack_cm_event(event);
1130-
return FAILURE;
1131+
return FAILURE;
11311132
}
11321133

11331134
if (user_param->connection_type == UD) {

src/perftest_resources.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ static __always_inline int poll_completions(
108108
struct perftest_parameters* duration_param;
109109
struct check_alive_data check_alive_data;
110110

111+
volatile sig_atomic_t g_sigalarm_fired = 0;
111112

112113
/******************************************************************************
113114
* Beginning
@@ -5859,6 +5860,7 @@ uint16_t ctx_get_local_lid(struct ibv_context *context,int port)
58595860
******************************************************************************/
58605861
void catch_alarm(int sig)
58615862
{
5863+
g_sigalarm_fired = 1;
58625864
switch (duration_param->state) {
58635865
case START_STATE:
58645866
duration_param->state = SAMPLE_STATE;
@@ -5886,6 +5888,7 @@ void catch_alarm(int sig)
58865888

58875889
void check_alive(int sig)
58885890
{
5891+
g_sigalarm_fired = 1;
58895892
if (check_alive_data.current_totrcnt > check_alive_data.last_totrcnt) {
58905893
check_alive_data.last_totrcnt = check_alive_data.current_totrcnt;
58915894
alarm(60);

src/perftest_resources.h

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@
7272
#include <sys/socket.h>
7373
#include <netdb.h>
7474
#include <fcntl.h>
75+
#include <string.h>
76+
#include <errno.h>
77+
#include <signal.h>
7578
#include "perftest_parameters.h"
7679

7780
#define NUM_OF_RETRIES (10)
@@ -185,6 +188,7 @@ static inline uint64_t build_wr_id(uint32_t wr_index, uint16_t qp_index)
185188
return ((uint64_t)wr_index) | ((uint64_t)qp_index << WR_ID_QP_INDEX_OFFSET);
186189
}
187190

191+
extern volatile sig_atomic_t g_sigalarm_fired;
188192
/******************************************************************************
189193
* Perftest resources Structures and data types.
190194
******************************************************************************/
@@ -865,15 +869,31 @@ static __inline void increase_rem_addr(struct ibv_send_wr *wr,int size,uint64_t
865869
static __inline int ctx_notify_send_recv_events(struct pingpong_context *ctx)
866870
{
867871
fd_set rfds;
872+
int ret;
868873

869-
FD_ZERO(&rfds);
870-
FD_SET(ctx->recv_channel->fd, &rfds);
871-
FD_SET(ctx->send_channel->fd, &rfds);
874+
do {
875+
FD_ZERO(&rfds);
876+
FD_SET(ctx->recv_channel->fd, &rfds);
877+
FD_SET(ctx->send_channel->fd, &rfds);
872878

873-
if (select(MAX(ctx->recv_channel->fd,
874-
ctx->send_channel->fd) + 1,
875-
&rfds, NULL, NULL, NULL) == -1) {
876-
fprintf(stderr, "Failed to get completion events\n");
879+
g_sigalarm_fired = 0;
880+
881+
ret = select(MAX(ctx->recv_channel->fd,
882+
ctx->send_channel->fd) + 1,
883+
&rfds, NULL, NULL, NULL);
884+
885+
if (ret == -1 && errno == EINTR) {
886+
if (g_sigalarm_fired) {
887+
fprintf(stderr, "Confirmed: select() was interrupted by SIGALARM. Retrying...\n");
888+
} else {
889+
fprintf(stderr, "Warning: select() interrupted by another signal. Retrying...\n");
890+
}
891+
}
892+
893+
} while (ret == -1 && errno == EINTR);
894+
895+
if (ret == -1) {
896+
fprintf(stderr, "Failed to get completion events: %s\n", strerror(errno));
877897
return FAILURE;
878898
}
879899

0 commit comments

Comments
 (0)