Skip to content

Commit e2e0afe

Browse files
committed
mctpd: implement retries through command submission logic
We have a number of separate retry loops when querying peer properties. Instead, use a common retry loop in the endpoint_query_addr path, which can be disabled by a new 'disable_retry' member on struct mctp_ctrl_cmd. This now enables retries for all commands, except the Get Endpoint ID probe during a recover. Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
1 parent 1e8f6f9 commit e2e0afe

1 file changed

Lines changed: 59 additions & 98 deletions

File tree

src/mctpd.c

Lines changed: 59 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,7 @@ struct mctp_ctrl_cmd {
326326
/* populated by caller */
327327
void *req;
328328
size_t req_len;
329+
bool disable_retry;
329330

330331
/* populated on response RX */
331332
void *resp;
@@ -1710,7 +1711,9 @@ static int endpoint_query_addr(struct ctx *ctx,
17101711
const struct sockaddr_mctp_ext *req_addr,
17111712
bool ext_addr, struct mctp_ctrl_cmd *cmd)
17121713
{
1714+
unsigned int max_retries = 4;
17131715
size_t req_addr_len;
1716+
unsigned int retry;
17141717
int sd = -1, val;
17151718
ssize_t rc;
17161719
size_t buf_size;
@@ -1719,6 +1722,8 @@ static int endpoint_query_addr(struct ctx *ctx,
17191722

17201723
cmd->resp = NULL;
17211724
cmd->resp_len = 0;
1725+
if (cmd->disable_retry)
1726+
max_retries = 1;
17221727

17231728
sd = mctp_ops.mctp.socket();
17241729
if (sd < 0) {
@@ -1748,37 +1753,47 @@ static int endpoint_query_addr(struct ctx *ctx,
17481753
rc = -EPROTO;
17491754
goto out;
17501755
}
1751-
rc = mctp_ops.mctp.sendto(sd, cmd->req, cmd->req_len, 0,
1752-
(struct sockaddr *)req_addr, req_addr_len);
1753-
if (rc < 0) {
1754-
rc = -errno;
1755-
if (ctx->verbose) {
1756-
warnx("%s: sendto(%s) %zu bytes failed. %s", __func__,
1757-
ext_addr_tostr(req_addr), cmd->req_len,
1758-
strerror(-rc));
1756+
1757+
for (retry = 0; retry < max_retries; retry++) {
1758+
rc = mctp_ops.mctp.sendto(sd, cmd->req, cmd->req_len, 0,
1759+
(struct sockaddr *)req_addr,
1760+
req_addr_len);
1761+
if (rc < 0) {
1762+
rc = -errno;
1763+
if (ctx->verbose) {
1764+
warnx("%s: sendto(%s) %zu bytes failed. %s",
1765+
__func__, ext_addr_tostr(req_addr),
1766+
cmd->req_len, strerror(-rc));
1767+
}
1768+
break;
1769+
}
1770+
if ((size_t)rc != cmd->req_len) {
1771+
bug_warn("incorrect sendto %zd, expected %zu", rc,
1772+
cmd->req_len);
1773+
rc = -EPROTO;
1774+
break;
17591775
}
1760-
goto out;
1761-
}
1762-
if ((size_t)rc != cmd->req_len) {
1763-
bug_warn("incorrect sendto %zd, expected %zu", rc,
1764-
cmd->req_len);
1765-
rc = -EPROTO;
1766-
goto out;
1767-
}
17681776

1769-
rc = wait_fd_timeout(sd, EPOLLIN, ctx->mctp_timeout);
1770-
if (rc < 0) {
1771-
if (rc == -ETIMEDOUT && ctx->verbose) {
1772-
warnx("%s: receive timed out from %s", __func__,
1777+
rc = wait_fd_timeout(sd, EPOLLIN, ctx->mctp_timeout);
1778+
if (rc == 0)
1779+
break;
1780+
1781+
if (rc == -ETIMEDOUT) {
1782+
warnx("receive timed out from %s, attempt %d/%d",
1783+
ext_addr_tostr(req_addr), retry + 1, max_retries);
1784+
} else {
1785+
warnx("receive error from %s",
17731786
ext_addr_tostr(req_addr));
1787+
break;
17741788
}
1775-
goto out;
17761789
}
17771790

1791+
if (rc || retry == max_retries)
1792+
goto out;
1793+
17781794
rc = read_message(ctx, sd, &buf, &buf_size, &cmd->resp_addr);
1779-
if (rc < 0) {
1795+
if (rc < 0)
17801796
goto out;
1781-
}
17821797

17831798
if (cmd->resp_addr.smctp_base.smctp_type !=
17841799
req_addr->smctp_base.smctp_type) {
@@ -2482,7 +2497,8 @@ static void set_berr(struct ctx *ctx, int errcode, sd_bus_error *berr)
24822497

24832498
static int query_get_endpoint_id(struct ctx *ctx, const dest_phys *dest,
24842499
mctp_eid_t *ret_eid, uint8_t *ret_ep_type,
2485-
uint8_t *ret_media_spec, struct peer *peer)
2500+
uint8_t *ret_media_spec, struct peer *peer,
2501+
bool retry)
24862502
{
24872503
struct mctp_ctrl_cmd_get_eid req = { 0 };
24882504
struct mctp_ctrl_resp_get_eid *resp = NULL;
@@ -2495,6 +2511,7 @@ static int query_get_endpoint_id(struct ctx *ctx, const dest_phys *dest,
24952511
mctp_ctrl_msg_hdr_init_req(&req.ctrl_hdr, iid,
24962512
MCTP_CTRL_CMD_GET_ENDPOINT_ID);
24972513
mctp_ctrl_cmd_init_from_req_type(&cmd, req);
2514+
cmd.disable_retry = !retry;
24982515

24992516
if (peer)
25002517
rc = endpoint_query_peer(peer, &cmd);
@@ -2535,7 +2552,7 @@ static int get_endpoint_peer(struct ctx *ctx, sd_bus_error *berr,
25352552

25362553
*ret_peer = NULL;
25372554
rc = query_get_endpoint_id(ctx, dest, &eid, &ep_type, &medium_spec,
2538-
/*peer=*/NULL);
2555+
/*peer=*/NULL, /*retry=*/true);
25392556
if (rc)
25402557
return rc;
25412558

@@ -2901,7 +2918,7 @@ static int method_setup_endpoint(sd_bus_message *call, void *data,
29012918

29022919
/* Get Endpoint ID */
29032920
rc = query_get_endpoint_id(ctx, dest, &eid, &ep_type, &medium_spec,
2904-
/*peer=*/NULL);
2921+
/*peer=*/NULL, true);
29052922
if (rc)
29062923
goto err;
29072924

@@ -3170,34 +3187,13 @@ static int method_learn_endpoint(sd_bus_message *call, void *data,
31703187
// and routable.
31713188
static int query_peer_properties(struct peer *peer)
31723189
{
3173-
const unsigned int max_retries = 4;
31743190
bool supports_vdm = false;
31753191
int rc;
31763192

3177-
for (unsigned int i = 0; i < max_retries; i++) {
3178-
rc = query_get_peer_msgtypes(peer);
3179-
3180-
// Success
3181-
if (rc == 0)
3182-
break;
3183-
3184-
// On timeout, retry
3185-
if (rc == -ETIMEDOUT) {
3186-
if (peer->ctx->verbose)
3187-
warnx("Retrying to get endpoint types for %s. Attempt %u",
3188-
peer_tostr(peer), i + 1);
3189-
rc = 0;
3190-
continue;
3191-
}
3192-
3193-
// On other errors, warn and ignore
3194-
if (rc < 0) {
3195-
if (peer->ctx->verbose)
3196-
warnx("Error getting endpoint types for %s. Ignoring error %d %s",
3197-
peer_tostr(peer), -rc, strerror(-rc));
3198-
rc = 0;
3199-
break;
3200-
}
3193+
rc = query_get_peer_msgtypes(peer);
3194+
if (rc < 0 && peer->ctx->verbose) {
3195+
errno = -rc;
3196+
warn("Error getting endpoint types for %s", peer_tostr(peer));
32013197
}
32023198

32033199
for (unsigned int i = 0; i < peer->num_message_types; i++) {
@@ -3209,54 +3205,18 @@ static int query_peer_properties(struct peer *peer)
32093205
}
32103206

32113207
if (supports_vdm) {
3212-
for (unsigned int i = 0; i < max_retries; i++) {
3213-
rc = query_get_peer_vdm_types(peer);
3214-
3215-
if (rc == 0)
3216-
break;
3217-
3218-
// On timeout, retry
3219-
if (rc == -ETIMEDOUT) {
3220-
if (peer->ctx->verbose)
3221-
warnx("Retrying to get vendor message types for %s. Attempt %u",
3222-
peer_tostr(peer), i + 1);
3223-
rc = 0;
3224-
continue;
3225-
}
3226-
3227-
if (rc < 0) {
3228-
warnx("Error getting vendor message types for %s. Ignoring error %d %s",
3229-
peer_tostr(peer), rc, strerror(-rc));
3230-
rc = 0;
3231-
break;
3232-
}
3208+
rc = query_get_peer_vdm_types(peer);
3209+
if (rc < 0 && peer->ctx->verbose) {
3210+
errno = -rc;
3211+
warn("Error getting vendor message types for %s",
3212+
peer_tostr(peer));
32333213
}
32343214
}
32353215

3236-
for (unsigned int i = 0; i < max_retries; i++) {
3237-
rc = query_get_peer_uuid(peer);
3238-
3239-
// Success
3240-
if (rc == 0)
3241-
break;
3242-
3243-
// On timeout, retry
3244-
if (rc == -ETIMEDOUT) {
3245-
if (peer->ctx->verbose)
3246-
warnx("Retrying to get peer UUID for %s. Attempt %u",
3247-
peer_tostr(peer), i + 1);
3248-
rc = 0;
3249-
continue;
3250-
}
3251-
3252-
// On other errors, warn and ignore
3253-
if (rc < 0) {
3254-
if (peer->ctx->verbose)
3255-
warnx("Error getting UUID for %s. Ignoring error %d %s",
3256-
peer_tostr(peer), -rc, strerror(-rc));
3257-
rc = 0;
3258-
break;
3259-
}
3216+
rc = query_get_peer_uuid(peer);
3217+
if (rc < 0 && peer->ctx->verbose) {
3218+
errno = -rc;
3219+
warn("Error getting UUID for %s", peer_tostr(peer));
32603220
}
32613221

32623222
// TODO: emit property changed? Though currently they are all const.
@@ -3556,7 +3516,8 @@ static int peer_endpoint_recover(sd_event_source *s, uint64_t usec,
35563516
*/
35573517
rc = query_get_endpoint_id(ctx, &peer->phys, &peer->recovery.eid,
35583518
&peer->recovery.endpoint_type,
3559-
&peer->recovery.medium_spec, /*peer=*/NULL);
3519+
&peer->recovery.medium_spec, /*peer=*/NULL,
3520+
/*retry=*/false);
35603521
if (rc < 0) {
35613522
goto reschedule;
35623523
}
@@ -3774,7 +3735,7 @@ static int method_net_learn_endpoint(sd_bus_message *call, void *data,
37743735
}
37753736

37763737
rc = query_get_endpoint_id(peer->ctx, &dest, &ret_eid, &ret_ep_type,
3777-
&ret_medium_spec, peer);
3738+
&ret_medium_spec, peer, true);
37783739
if (rc) {
37793740
warnx("Error getting endpoint id for %s. error %d %s",
37803741
peer_tostr(peer), rc, strerror(-rc));

0 commit comments

Comments
 (0)