Commit f0c4973

bnxt_re/lib: Optimize the inline data copy
Inline data is currently copied in 16-byte chunks to avoid ring-buffer wraparound. Wraparound is rare for typical post_send usage, so when it does not occur the whole inline payload can be copied in one shot. Handle the wraparound and non-wraparound cases separately.

Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
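As a worked example of the slot arithmetic in the new path (the numbers are illustrative, not from the commit): with que->depth = 64 slots of alsize = 16 bytes, a starting physical slot start_phys = 62, and an inline payload of t_len = 100 bytes, we get n_slots = ceil(100 / 16) = 7 but slots_nowrap = 64 - 62 = 2. Since 7 > 2, the wraparound path runs: nowrap_bytes = 2 * 16 = 32 bytes fill the last two slots at the end of the ring, and the remaining 68 bytes continue from que->va, the start of the ring.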
1 parent 8b9cdb7 · commit f0c4973

1 file changed: providers/bnxt_re/verbs.c (60 additions, 34 deletions)

@@ -2260,50 +2260,76 @@ static int bnxt_re_put_inline(struct bnxt_re_queue *que, uint32_t *idx,
 			      struct ibv_sge *sgl, uint32_t nsg,
 			      uint16_t max_ils)
 {
-	int len, t_len, offt = 0;
-	int t_cplen = 0, cplen;
-	bool pull_dst = true;
-	void *il_dst = NULL;
-	void *il_src = NULL;
-	int alsize;
+	const int alsize = sizeof(struct bnxt_re_sge);
+	uint32_t n_slots, start_phys, slots_nowrap;
+	uint32_t sge_idx, sge_off;
+	int copied, cplen, len;
+	int nowrap_bytes;
+	int t_len = 0;
+	void *il_dst;
 	int indx;
 
-	alsize = sizeof(struct bnxt_re_sge);
-
-	t_len = 0;
 	for (indx = 0; indx < nsg; indx++) {
-		len = sgl[indx].length;
-		il_src = (void *)(uintptr_t)(sgl[indx].addr);
-		t_len += len;
+		t_len += sgl[indx].length;
 		if (t_len > max_ils)
-			goto bad;
+			return -ENOMEM;
+	}
 
-		while (len) {
-			if (pull_dst) {
-				pull_dst = false;
-				il_dst = bnxt_re_get_hwqe(que, (*idx)++);
-				if (pbuf)
-					pbuf->wqe[*idx - 1] =
-						(uintptr_t)il_dst;
-				t_cplen = 0;
-				offt = 0;
-			}
-			cplen = MIN(len, alsize);
-			cplen = MIN(cplen, (alsize - offt));
-			memcpy(il_dst, il_src, cplen);
-			t_cplen += cplen;
-			il_src += cplen;
+	n_slots = (t_len + alsize - 1) / alsize;
+	start_phys = (que->tail + *idx) % que->depth;
+	slots_nowrap = que->depth - start_phys;
+
+	/* Record push buffer slots for all slots we use */
+	if (pbuf) {
+		for (indx = 0; indx < n_slots; indx++)
+			pbuf->wqe[*idx + indx] =
+				(uintptr_t)bnxt_re_get_hwqe(que, *idx + indx);
+	}
+
+	il_dst = bnxt_re_get_hwqe(que, *idx);
+
+	if (n_slots <= slots_nowrap) {
+		/* No wraparound: copy each SGE in one shot */
+		for (indx = 0; indx < nsg; indx++) {
+			memcpy(il_dst, (void *)(uintptr_t)sgl[indx].addr,
+			       sgl[indx].length);
+			il_dst += sgl[indx].length;
+		}
+	} else {
+		/* Wraparound: copy first part to end of ring, rest from start */
+		nowrap_bytes = slots_nowrap * alsize;
+		copied = 0;
+		sge_idx = 0;
+		sge_off = 0;
+		while (copied < nowrap_bytes) {
+			len = sgl[sge_idx].length - sge_off;
+			cplen = len <= (nowrap_bytes - copied) ?
+				len : (nowrap_bytes - copied);
+			memcpy(il_dst,
+			       (char *)(uintptr_t)sgl[sge_idx].addr + sge_off,
+			       cplen);
 			il_dst += cplen;
-			offt += cplen;
-			len -= cplen;
-			if (t_cplen == alsize)
-				pull_dst = true;
+			copied += cplen;
+			sge_off += cplen;
+			if (sge_off == sgl[sge_idx].length) {
+				sge_idx++;
+				sge_off = 0;
+			}
+		}
+		il_dst = que->va;
+		while (sge_idx < nsg) {
+			len = sgl[sge_idx].length - sge_off;
+			memcpy(il_dst,
+			       (char *)(uintptr_t)sgl[sge_idx].addr + sge_off,
+			       len);
+			il_dst += len;
+			sge_off = 0;
+			sge_idx++;
 		}
 	}
 
+	*idx += n_slots;
 	return t_len;
-bad:
-	return -ENOMEM;
 }
 
 static int bnxt_re_required_slots(struct bnxt_re_qp *qp, struct ibv_send_wr *wr,
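
Below is a minimal stand-alone sketch of the same copy strategy, compilable outside the driver tree. The types and names here (ring, sge, SLOT_SIZE, put_inline) are hypothetical stand-ins for bnxt_re_queue, ibv_sge, sizeof(struct bnxt_re_sge), and bnxt_re_put_inline; the max_ils length check and push-buffer bookkeeping are omitted.

/*
 * Stand-alone sketch of the wrap/no-wrap inline copy (illustrative only;
 * not the driver's actual types or API).
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SLOT_SIZE 16			/* bytes per WQE slot */

struct ring {
	uint8_t *va;			/* base of the ring buffer */
	uint32_t depth;			/* ring size in slots */
	uint32_t tail;			/* first free slot */
};

struct sge {
	const void *addr;
	uint32_t length;
};

/* Copy all SGEs into the ring at tail; return the number of slots used. */
static uint32_t put_inline(struct ring *r, const struct sge *sgl, uint32_t nsg)
{
	uint32_t t_len = 0, i;

	for (i = 0; i < nsg; i++)
		t_len += sgl[i].length;

	uint32_t n_slots = (t_len + SLOT_SIZE - 1) / SLOT_SIZE;
	uint32_t start = r->tail % r->depth;
	uint32_t slots_nowrap = r->depth - start;
	uint8_t *dst = r->va + (size_t)start * SLOT_SIZE;

	if (n_slots <= slots_nowrap) {
		/* Common case: one memcpy per SGE, no wraparound. */
		for (i = 0; i < nsg; i++) {
			memcpy(dst, sgl[i].addr, sgl[i].length);
			dst += sgl[i].length;
		}
	} else {
		/* Rare case: fill up to the end of the ring, then wrap. */
		uint32_t nowrap_bytes = slots_nowrap * SLOT_SIZE;
		uint32_t copied = 0, off = 0;

		i = 0;
		while (copied < nowrap_bytes) {
			uint32_t left = sgl[i].length - off;
			uint32_t chunk = left < nowrap_bytes - copied ?
					 left : nowrap_bytes - copied;

			memcpy(dst, (const uint8_t *)sgl[i].addr + off, chunk);
			dst += chunk;
			copied += chunk;
			off += chunk;
			if (off == sgl[i].length) {
				i++;
				off = 0;
			}
		}
		dst = r->va;		/* continue from the ring base */
		for (; i < nsg; i++) {
			memcpy(dst, (const uint8_t *)sgl[i].addr + off,
			       sgl[i].length - off);
			dst += sgl[i].length - off;
			off = 0;
		}
	}
	return n_slots;
}

int main(void)
{
	uint8_t buf[8 * SLOT_SIZE];		/* 8-slot ring */
	struct ring r = { buf, 8, 6 };		/* 2 free slots before the end */
	const char payload[48] = "0123456789abcdef0123456789abcdef0123456789abcde";
	struct sge sg = { payload, sizeof(payload) };	/* 48 bytes = 3 slots */

	/* 3 slots > 2 slots before the end of the ring, so the copy wraps. */
	printf("used %u slots; byte at ring base: '%c'\n",
	       (unsigned)put_inline(&r, &sg, 1), buf[0]);
	return 0;
}

The payoff of the split is visible in the fast path: the common, non-wrapping post issues a single memcpy per SGE instead of one per 16-byte slot, and only the rare wrapping post pays for the chunked bookkeeping.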
