Skip to content

Commit c17c48a

Browse files
committed
prov/shm: fix RMA read and write bandwidth regressions
Read fix: extend smr_do_fast_rma to use sender-side CMA for RMA reads at all sizes (not just > SMR_INJECT_SIZE). Delivery is inherently complete when process_vm_readv returns, because the data is already in the local buffer. This avoids the receiver-side CMA round-trip, which causes a 20-25% regression on Graviton and AMD at 1-4096B.

Write fix: allow FI_REMOTE_CQ_DATA through smr_rma_fast so that writedata uses sender-side CMA instead of the receiver-side IOV path with its expensive atomic return queue. The sender calls process_vm_writev directly, then posts ofi_op_write_async with cq_data. The receiver generates the remote CQ entry when it sees SMR_REMOTE_CQ_DATA in the write_async notification.

In both cases the target buffer is always a registered MR with pinned pages, making sender-side CMA safe for reads and writes.

Signed-off-by: Yin Li <yinliq@amazon.com>
1 parent 0518a42 commit c17c48a

2 files changed

Lines changed: 31 additions & 9 deletions

File tree

prov/shm/src/smr_progress.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1334,6 +1334,16 @@ static void smr_progress_cmd(struct smr_ep *ep)
13341334
ret = smr_progress_cmd_rma(ep, cmd);
13351335
break;
13361336
case ofi_op_write_async:
1337+
if (cmd->hdr.smr_flags & SMR_REMOTE_CQ_DATA) {
1338+
smr_complete_rx(ep, NULL, ofi_op_write,
1339+
smr_rx_cq_flags(0, cmd->hdr.smr_flags),
1340+
cmd->hdr.size, NULL,
1341+
cmd->hdr.rx_id, 0, cmd->hdr.cq_data);
1342+
} else {
1343+
ofi_ep_peer_rx_cntr_inc(&ep->util_ep,
1344+
cmd->hdr.op);
1345+
}
1346+
break;
13371347
case ofi_op_read_async:
13381348
ofi_ep_peer_rx_cntr_inc(&ep->util_ep, cmd->hdr.op);
13391349
break;

prov/shm/src/smr_rma.c

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -44,17 +44,20 @@ static void smr_add_rma_cmd(struct smr_region *peer_smr,
4444
static void smr_format_rma_resp(struct smr_cmd *cmd, int64_t peer_id,
4545
const struct fi_rma_iov *rma_iov, size_t count,
4646
size_t total_len, uint32_t op,
47-
uint64_t op_flags)
47+
uint64_t op_flags, uint64_t data)
4848
{
49-
smr_generic_format(cmd, 0, peer_id, op, 0, 0, op_flags);
49+
uint8_t smr_flags = 0;
50+
if (op_flags & FI_REMOTE_CQ_DATA)
51+
smr_flags |= SMR_REMOTE_CQ_DATA;
52+
smr_generic_format(cmd, 0, peer_id, op, 0, data, smr_flags);
5053
cmd->hdr.size = total_len;
5154
}
5255

5356
static ssize_t smr_rma_fast(struct smr_ep *ep, struct smr_region *peer_smr,
5457
const struct iovec *iov, size_t iov_count,
5558
const struct fi_rma_iov *rma_iov, size_t rma_count,
5659
void **desc, int rx_id, int tx_id, void *context,
57-
uint32_t op, uint64_t op_flags)
60+
uint32_t op, uint64_t op_flags, uint64_t data)
5861
{
5962
struct iovec vma_iovec[SMR_IOV_LIMIT], rma_iovec[SMR_IOV_LIMIT];
6063
struct ofi_xpmem_client *xpmem;
@@ -91,7 +94,7 @@ static ssize_t smr_rma_fast(struct smr_ep *ep, struct smr_region *peer_smr,
9194

9295
smr_format_rma_resp(&ce->cmd, rx_id, rma_iov, rma_count, total_len,
9396
(op == ofi_op_write) ? ofi_op_write_async :
94-
ofi_op_read_async, op_flags);
97+
ofi_op_read_async, op_flags, data);
9598

9699
smr_cmd_queue_commit(ce, pos);
97100

@@ -106,15 +109,24 @@ static ssize_t smr_rma_fast(struct smr_ep *ep, struct smr_region *peer_smr,
106109

107110
static inline bool smr_do_fast_rma(struct smr_ep *ep, uint64_t op_flags,
108111
size_t rma_count, size_t total_len,
109-
struct smr_region *peer_smr)
112+
struct smr_region *peer_smr, uint32_t op)
110113
{
111114
struct smr_domain *domain;
112115

113116
domain = container_of(ep->util_ep.domain, struct smr_domain,
114117
util_domain);
115118

116-
return domain->fast_rma && !(op_flags &
117-
(FI_REMOTE_CQ_DATA | FI_DELIVERY_COMPLETE)) &&
119+
/* For reads, sender-side CMA is safe at all sizes: delivery is
120+
* inherently complete when process_vm_readv returns (data is in
121+
* local buffer), and the target memory is always a registered MR
122+
* with pinned pages. */
123+
if (op == ofi_op_read_req && total_len <= SMR_INJECT_SIZE)
124+
return domain->fast_rma &&
125+
!(op_flags & FI_REMOTE_CQ_DATA) &&
126+
rma_count == 1 && smr_vma_enabled(ep, peer_smr);
127+
128+
return domain->fast_rma &&
129+
!(op_flags & FI_DELIVERY_COMPLETE) &&
118130
rma_count == 1 && smr_vma_enabled(ep, peer_smr) &&
119131
total_len > SMR_INJECT_SIZE;
120132

@@ -153,10 +165,10 @@ static ssize_t smr_generic_rma(
153165
goto unlock;
154166

155167
total_len = ofi_total_iov_len(iov, iov_count);
156-
if (smr_do_fast_rma(ep, op_flags, rma_count, total_len, peer_smr)) {
168+
if (smr_do_fast_rma(ep, op_flags, rma_count, total_len, peer_smr, op)) {
157169
ret = smr_rma_fast(ep, peer_smr, iov, iov_count, rma_iov,
158170
rma_count, desc, rx_id, tx_id, context, op,
159-
op_flags);
171+
op_flags, data);
160172
goto unlock;
161173
}
162174

0 commit comments

Comments
 (0)