Skip to content

Commit f8c1dbc

Browse files
authored
fix error checking against IB verbs API (#2265) (#3034)
* fix error checking against IB verbs API (#2265) Signed-off-by: Qun, Li <qun.li@zstack.io> * style and log changes according to comments Signed-off-by: Qun, Li <qun.li@zstack.io> --------- Signed-off-by: Qun, Li <qun.li@zstack.io>
1 parent 0e5d022 commit f8c1dbc

2 files changed

Lines changed: 47 additions & 31 deletions

File tree

src/brpc/rdma/rdma_endpoint.cpp

Lines changed: 41 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -878,10 +878,14 @@ ssize_t RdmaEndpoint::CutFromIOBufList(butil::IOBuf** from, size_t ndata) {
878878
}
879879

880880
ibv_send_wr* bad = NULL;
881-
if (ibv_post_send(_resource->qp, &wr, &bad) < 0) {
881+
int err = ibv_post_send(_resource->qp, &wr, &bad);
882+
if (err != 0) {
882883
// We use other way to guarantee the Send Queue is not full.
883884
// So we just consider this error as an unrecoverable error.
884-
PLOG(WARNING) << "Fail to ibv_post_send";
885+
LOG(WARNING) << "Fail to ibv_post_send: " << berror(err)
886+
<< ", window=" << window
887+
<< ", sq_current=" << _sq_current;
888+
errno = err;
885889
return -1;
886890
}
887891

@@ -920,10 +924,11 @@ int RdmaEndpoint::SendImm(uint32_t imm) {
920924
wr.send_flags |= IBV_SEND_SIGNALED;
921925

922926
ibv_send_wr* bad = NULL;
923-
if (ibv_post_send(_resource->qp, &wr, &bad) < 0) {
927+
int err = ibv_post_send(_resource->qp, &wr, &bad);
928+
if (err != 0) {
924929
// We use other way to guarantee the Send Queue is not full.
925930
// So we just consider this error as an unrecoverable error.
926-
PLOG(WARNING) << "Fail to ibv_post_send";
931+
LOG(WARNING) << "Fail to ibv_post_send: " << berror(err);
927932
return -1;
928933
}
929934
return 0;
@@ -1004,8 +1009,9 @@ int RdmaEndpoint::DoPostRecv(void* block, size_t block_size) {
10041009
wr.sg_list = &sge;
10051010

10061011
ibv_recv_wr* bad = NULL;
1007-
if (ibv_post_recv(_resource->qp, &wr, &bad) < 0) {
1008-
PLOG(WARNING) << "Fail to ibv_post_recv";
1012+
int err = ibv_post_recv(_resource->qp, &wr, &bad);
1013+
if (err != 0) {
1014+
LOG(WARNING) << "Fail to ibv_post_recv: " << berror(err);
10091015
return -1;
10101016
}
10111017
return 0;
@@ -1143,8 +1149,9 @@ int RdmaEndpoint::AllocateResources() {
11431149
return -1;
11441150
}
11451151

1146-
if (ibv_req_notify_cq(_resource->cq, 1) < 0) {
1147-
PLOG(WARNING) << "Fail to arm CQ comp channel";
1152+
int err = ibv_req_notify_cq(_resource->cq, 1);
1153+
if (err != 0) {
1154+
LOG(WARNING) << "Fail to arm CQ comp channel: " << berror(err);
11481155
return -1;
11491156
}
11501157
} else {
@@ -1186,12 +1193,13 @@ int RdmaEndpoint::BringUpQp(uint16_t lid, ibv_gid gid, uint32_t qp_num) {
11861193
attr.pkey_index = 0; // TODO: support more pkey use in future
11871194
attr.port_num = GetRdmaPortNum();
11881195
attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE;
1189-
if (IbvModifyQp(_resource->qp, &attr, (ibv_qp_attr_mask)(
1190-
IBV_QP_STATE |
1196+
int err = IbvModifyQp(_resource->qp, &attr, (ibv_qp_attr_mask)(
1197+
IBV_QP_STATE |
11911198
IBV_QP_PKEY_INDEX |
11921199
IBV_QP_PORT |
1193-
IBV_QP_ACCESS_FLAGS)) < 0) {
1194-
PLOG(WARNING) << "Fail to modify QP from RESET to INIT";
1200+
IBV_QP_ACCESS_FLAGS));
1201+
if (err != 0) {
1202+
LOG(WARNING) << "Fail to modify QP from RESET to INIT: " << berror(err);
11951203
return -1;
11961204
}
11971205

@@ -1217,15 +1225,16 @@ int RdmaEndpoint::BringUpQp(uint16_t lid, ibv_gid gid, uint32_t qp_num) {
12171225
attr.rq_psn = 0;
12181226
attr.max_dest_rd_atomic = 0;
12191227
attr.min_rnr_timer = 0; // We do not allow rnr error
1220-
if (IbvModifyQp(_resource->qp, &attr, (ibv_qp_attr_mask)(
1228+
err = IbvModifyQp(_resource->qp, &attr, (ibv_qp_attr_mask)(
12211229
IBV_QP_STATE |
12221230
IBV_QP_PATH_MTU |
12231231
IBV_QP_MIN_RNR_TIMER |
12241232
IBV_QP_AV |
12251233
IBV_QP_MAX_DEST_RD_ATOMIC |
12261234
IBV_QP_DEST_QPN |
1227-
IBV_QP_RQ_PSN)) < 0) {
1228-
PLOG(WARNING) << "Fail to modify QP from INIT to RTR";
1235+
IBV_QP_RQ_PSN));
1236+
if (err != 0) {
1237+
LOG(WARNING) << "Fail to modify QP from INIT to RTR: " << berror(err);
12291238
return -1;
12301239
}
12311240

@@ -1235,14 +1244,15 @@ int RdmaEndpoint::BringUpQp(uint16_t lid, ibv_gid gid, uint32_t qp_num) {
12351244
attr.rnr_retry = 0; // We do not allow rnr error
12361245
attr.sq_psn = 0;
12371246
attr.max_rd_atomic = 0;
1238-
if (IbvModifyQp(_resource->qp, &attr, (ibv_qp_attr_mask)(
1247+
err = IbvModifyQp(_resource->qp, &attr, (ibv_qp_attr_mask)(
12391248
IBV_QP_STATE |
12401249
IBV_QP_RNR_RETRY |
12411250
IBV_QP_RETRY_CNT |
12421251
IBV_QP_TIMEOUT |
12431252
IBV_QP_SQ_PSN |
1244-
IBV_QP_MAX_QP_RD_ATOMIC)) < 0) {
1245-
PLOG(WARNING) << "Fail to modify QP from RTR to RTS";
1253+
IBV_QP_MAX_QP_RD_ATOMIC));
1254+
if (err != 0) {
1255+
LOG(WARNING) << "Fail to modify QP from RTR to RTS: " << berror(err);
12461256
return -1;
12471257
}
12481258

@@ -1270,17 +1280,20 @@ void RdmaEndpoint::DeallocateResources() {
12701280
if (_resource->comp_channel) {
12711281
fd = _resource->comp_channel->fd;
12721282
}
1283+
int err;
12731284
if (!move_to_rdma_resource_list) {
12741285
if (_resource->qp) {
1275-
if (IbvDestroyQp(_resource->qp) < 0) {
1276-
PLOG(WARNING) << "Fail to destroy QP";
1286+
err = IbvDestroyQp(_resource->qp);
1287+
if (err != 0) {
1288+
LOG(WARNING) << "Fail to destroy QP: " << berror(err);
12771289
}
12781290
_resource->qp = NULL;
12791291
}
12801292
if (_resource->cq) {
12811293
IbvAckCqEvents(_resource->cq, _cq_events);
1282-
if (IbvDestroyCq(_resource->cq) < 0) {
1283-
PLOG(WARNING) << "Fail to destroy CQ";
1294+
err = IbvDestroyCq(_resource->cq);
1295+
if (err != 0) {
1296+
PLOG(WARNING) << "Fail to destroy CQ: " << berror(err);
12841297
}
12851298
_resource->cq = NULL;
12861299
}
@@ -1289,8 +1302,9 @@ void RdmaEndpoint::DeallocateResources() {
12891302
// so that we should remove it from epoll fd first
12901303
_socket->_io_event.RemoveConsumer(fd);
12911304
fd = -1;
1292-
if (IbvDestroyCompChannel(_resource->comp_channel) < 0) {
1293-
PLOG(WARNING) << "Fail to destroy CQ channel";
1305+
err = IbvDestroyCompChannel(_resource->comp_channel);
1306+
if (err != 0) {
1307+
LOG(WARNING) << "Fail to destroy CQ channel: " << berror(err);
12941308
}
12951309
_resource->comp_channel = NULL;
12961310
}
@@ -1328,7 +1342,7 @@ static const int MAX_CQ_EVENTS = 128;
13281342
int RdmaEndpoint::GetAndAckEvents() {
13291343
int events = 0; void* context = NULL;
13301344
while (1) {
1331-
if (IbvGetCqEvent(_resource->comp_channel, &_resource->cq, &context) < 0) {
1345+
if (IbvGetCqEvent(_resource->comp_channel, &_resource->cq, &context) != 0) {
13321346
if (errno != EAGAIN) {
13331347
return -1;
13341348
}
@@ -1392,7 +1406,8 @@ void RdmaEndpoint::PollCq(Socket* m) {
13921406
// that the event arrives after the poll but before the notify,
13931407
// we should re-poll the CQ once after the notify to check if
13941408
// there is an available CQE.
1395-
if (ibv_req_notify_cq(ep->_resource->cq, 1) < 0) {
1409+
errno = ibv_req_notify_cq(ep->_resource->cq, 1);
1410+
if (errno != 0) {
13961411
const int saved_errno = errno;
13971412
PLOG(WARNING) << "Fail to arm CQ comp channel: " << s->description();
13981413
s->SetFailed(saved_errno, "Fail to arm cq channel from %s: %s",

src/brpc/rdma/rdma_helper.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ void BlockDeallocate(void* buf) {
208208

209209
static void FindRdmaLid() {
210210
ibv_port_attr attr;
211-
if (IbvQueryPort(g_context, g_port_num, &attr) < 0) {
211+
if (IbvQueryPort(g_context, g_port_num, &attr) != 0) {
212212
return;
213213
}
214214
g_lid = attr.lid;
@@ -220,7 +220,7 @@ static bool FindRdmaGid(ibv_context* context) {
220220
bool found = false;
221221
for (int i = g_gid_tbl_len - 1; i >= 0; --i) {
222222
ibv_gid gid;
223-
if (IbvQueryGid(context, g_port_num, i, &gid) < 0) {
223+
if (IbvQueryGid(context, g_port_num, i, &gid) != 0) {
224224
continue;
225225
}
226226
if (gid.global.interface_id == 0) {
@@ -250,7 +250,7 @@ static void OnRdmaAsyncEvent(Socket* m) {
250250
int progress = Socket::PROGRESS_INIT;
251251
do {
252252
ibv_async_event event;
253-
if (IbvGetAsyncEvent(g_context, &event) < 0) {
253+
if (IbvGetAsyncEvent(g_context, &event) != 0) {
254254
break;
255255
}
256256
LOG(WARNING) << "rdma async event: " << IbvEventTypeStr(event.event_type);
@@ -416,7 +416,8 @@ static ibv_context* OpenDevice(int num_total, int* num_available_devices) {
416416
continue;
417417
}
418418
ibv_port_attr attr;
419-
if (IbvQueryPort(context.get(), uint8_t(FLAGS_rdma_port), &attr) < 0) {
419+
errno = IbvQueryPort(context.get(), uint8_t(FLAGS_rdma_port), &attr);
420+
if (errno != 0) {
420421
PLOG(WARNING) << "Fail to query port " << FLAGS_rdma_port << " on "
421422
<< dev_name;
422423
continue;
@@ -533,7 +534,7 @@ static void GlobalRdmaInitializeOrDieImpl() {
533534
}
534535

535536
ibv_device_attr attr;
536-
if (IbvQueryDevice(g_context, &attr) < 0) {
537+
if (IbvQueryDevice(g_context, &attr) != 0) {
537538
PLOG(ERROR) << "Fail to get the device information";
538539
ExitWithError();
539540
}

0 commit comments

Comments
 (0)