Skip to content

Commit b87f5f0

Browse files
author
zc-read-impl-leader
committed
feat(zc_recv): M0+M1 zero-copy receive (kernel mp0 passthrough + user API)
Implements FSTACK_ZC_RECV per docs/zc_read_spec spec 12/13 (all gated by the FSTACK_ZC_RECV macro; default build unaffected). M0 (kernel): - kern_zc_recvit (freebsd/kern/uipc_syscalls.c): compact recvit sibling that passes a non-NULL mbuf out-param into soreceive, handing back the socket buffer chain without uiomove. Decl in freebsd/sys/syscallsubr.h. Switch: - lib/Makefile: FF_ZC_RECV -> -DFSTACK_ZC_RECV (mirrors FF_ZC_SEND). M1 (userspace, lib/): - ff_zc_recv (ff_syscall_wrapper.c): builds uio (resid only) + calls kern_zc_recvit; fills struct ff_zc_mbuf. - ff_zc_mbuf_read rewritten (ff_veth.c): real copy-out with intra-mbuf cursor (was empty stub; signature data const->mutable OUT). - ff_zc_mbuf_segment / ff_zc_recv_free (ff_veth.c): zero-copy traversal + chain release via m_freem (returns backing DPDK seg). - ff_api.h: declarations. Verified: lib builds clean under -Werror both with FF_ZC_RECV=1 and default. Runtime/integration tests (M2-M3) follow.
1 parent 81ad466 commit b87f5f0

7 files changed

Lines changed: 257 additions & 6 deletions

File tree

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# 20 · 实现执行计划(FSTACK_ZC_RECV)
2+
3+
> 依据 spec 10-19。本阶段开始写实现代码。所有改动 `#ifdef FSTACK_ZC_RECV` 门控,默认不启用 → 现有构建零影响。
4+
> 铁律:每步实际编译验证;改动以 spec 为蓝本但以实际代码/编译结果为准。
5+
6+
## 执行顺序(M0→M1→M2,M3-M5 后续)
7+
| # | 里程碑 | 改动文件 | 退出条件 |
8+
|---|---|---|---|
9+
| 1 | 构建基线 | —— | 确认 lib 当前可编译(或记录现状)|
10+
| 2 | M0-内核 | freebsd/kern/uipc_syscalls.c(+ 声明)| kern_zc_recvit 编译通过 |
11+
| 3 | 开关 | lib/Makefile | FF_ZC_RECV→FSTACK_ZC_RECV |
12+
| 4 | M1-API 声明 | lib/ff_api.h | ff_zc_recv/ff_zc_recv_free/ff_zc_mbuf_read 声明 |
13+
| 5 | M1-用户态 | lib/ff_syscall_wrapper.c(ff_zc_recv)、lib/ff_veth.c(read 重写 + free)| 编译通过 |
14+
| 6 | 构建验证 | —— | FSTACK_ZC_RECV=1 编译通过;默认编译不回归 |
15+
| 7 | M2/测试 | tests/ | 后续 |
16+
17+
## 防回归
18+
- 所有新增代码 `#ifdef FSTACK_ZC_RECV`
19+
- 不改 soreceive 核心、不改原 kern_recvit/soo_read;
20+
- 默认(不定义 FF_ZC_RECV)构建与改动前一致。

freebsd/kern/uipc_syscalls.c

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1046,6 +1046,68 @@ kern_recvit(struct thread *td, int s, struct msghdr *mp, enum uio_seg fromseg,
10461046
return (error);
10471047
}
10481048

1049+
#ifdef FSTACK_ZC_RECV
1050+
/*
1051+
* FSTACK_ZC_RECV: zero-copy receive.
1052+
*
1053+
* A compact sibling of kern_recvit that passes a non-NULL mbuf out-parameter
1054+
* (mp0) into soreceive(). Per the FreeBSD soreceive(9) contract, when mp0 is
1055+
* non-NULL the socket-buffer mbuf chain is handed back to the caller without
1056+
* a uiomove copy (only uio_resid is consulted). The caller (ff_zc_recv) owns
1057+
* the returned chain and must release it via m_freem()/ff_zc_recv_free().
1058+
*
1059+
* No address/control-message handling here: ZC receive targets the bulk data
1060+
* fast path. Boundaries that cannot be zero-copied (split mbuf, MSG_PEEK,
1061+
* OOB, TLS, UDP) are handled inside soreceive by falling back to a copy
1062+
* (m_copym) or by the dgram path; the returned chain stays correct either way.
1063+
*/
1064+
int
1065+
kern_zc_recvit(struct thread *td, int s, struct uio *uio, struct mbuf **mp0)
1066+
{
1067+
struct file *fp;
1068+
struct socket *so;
1069+
struct mbuf *zc_chain = NULL;
1070+
ssize_t len;
1071+
int error, flags = 0;
1072+
1073+
if (mp0 == NULL)
1074+
return (EINVAL);
1075+
*mp0 = NULL;
1076+
1077+
AUDIT_ARG_FD(s);
1078+
error = getsock(td, s, &cap_recv_rights, &fp);
1079+
if (error != 0)
1080+
return (error);
1081+
so = fp->f_data;
1082+
1083+
#ifdef MAC
1084+
error = mac_socket_check_receive(td->td_ucred, so);
1085+
if (error != 0) {
1086+
fdrop(fp, td);
1087+
return (error);
1088+
}
1089+
#endif
1090+
1091+
len = uio->uio_resid;
1092+
error = soreceive(so, NULL, uio, &zc_chain, NULL, &flags);
1093+
if (error != 0) {
1094+
if (uio->uio_resid != len && (error == ERESTART ||
1095+
error == EINTR || error == EWOULDBLOCK))
1096+
error = 0;
1097+
}
1098+
if (error == 0) {
1099+
td->td_retval[0] = len - uio->uio_resid;
1100+
*mp0 = zc_chain;
1101+
} else if (zc_chain != NULL) {
1102+
/* error after some mbufs were detached: free them */
1103+
m_freem(zc_chain);
1104+
}
1105+
1106+
fdrop(fp, td);
1107+
return (error);
1108+
}
1109+
#endif /* FSTACK_ZC_RECV */
1110+
10491111
static int
10501112
recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp)
10511113
{

freebsd/sys/syscallsubr.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,13 @@ int kern_readlinkat(struct thread *td, int fd, const char *path,
301301
int kern_readv(struct thread *td, int fd, struct uio *auio);
302302
int kern_recvit(struct thread *td, int s, struct msghdr *mp,
303303
enum uio_seg fromseg, struct mbuf **controlp);
304+
#ifdef FSTACK_ZC_RECV
305+
/* FSTACK_ZC_RECV: zero-copy receive variant — hands the socket-buffer mbuf
306+
* chain back to the caller via *mp0 (soreceive's mp0 out-parameter), avoiding
307+
* the soreceive->uiomove copy. Caller owns *mp0 and must m_freem() it. */
308+
int kern_zc_recvit(struct thread *td, int s, struct uio *uio,
309+
struct mbuf **mp0);
310+
#endif
304311
int kern_renameat(struct thread *td, int oldfd, const char *old, int newfd,
305312
const char *new, enum uio_seg pathseg);
306313
int kern_frmdirat(struct thread *td, int dfd, const char *path, int fd,

lib/Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,10 @@ ifdef FF_ZC_SEND
212212
CFLAGS+= -DFSTACK_ZC_SEND
213213
endif
214214

215+
ifdef FF_ZC_RECV
216+
CFLAGS+= -DFSTACK_ZC_RECV
217+
endif
218+
215219
# add for LVS tcp option toa, disabled by default
216220
# CFLAGS+= -DLVS_TCPOPT_TOA
217221

lib/ff_api.h

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -394,10 +394,45 @@ int ff_zc_mbuf_get(struct ff_zc_mbuf *m, int len);
394394
int ff_zc_mbuf_write(struct ff_zc_mbuf *m, const char *data, int len);
395395

396396
/*
397-
* Read data to the mbuf chain in 'sturct ff_zc_mbuf'.
398-
* not implemented now.
397+
* Read data out of the mbuf chain in 'struct ff_zc_mbuf' into the caller's
398+
* buffer, advancing the internal cursor (zm->bsd_mbuf_off / zm->off).
399+
*
400+
* NOTE: `data` is an OUT buffer (the chain is copied INTO it); the previous
401+
* 'const char *' signature was a not-implemented placeholder.
402+
*
403+
* @return bytes read this call (>0), 0 when the chain is exhausted, -1 error.
404+
*/
405+
int ff_zc_mbuf_read(struct ff_zc_mbuf *m, char *data, int len);
406+
407+
#ifdef FSTACK_ZC_RECV
408+
/*
409+
* FSTACK_ZC_RECV: zero-copy receive entry. Retrieves the socket-buffer mbuf
410+
* chain directly (data still points into the underlying DPDK mbuf), avoiding
411+
* the soreceive->uiomove copy. On success zm->bsd_mbuf holds the chain head,
412+
* zm->len the byte count, and the cursor is reset for ff_zc_mbuf_read /
413+
* ff_zc_mbuf_segment traversal.
414+
*
415+
* The caller OWNS the returned chain and MUST release it via
416+
* ff_zc_recv_free() once done — otherwise the backing DPDK mbufs leak.
417+
*
418+
* @return bytes received (>0), 0 on peer close, -1 on error (errno set).
419+
*/
420+
ssize_t ff_zc_recv(int fd, struct ff_zc_mbuf *zm, size_t nbytes);
421+
422+
/*
423+
* Zero-copy traversal: return the current segment's data pointer + length
424+
* (pointing into the mbuf, no copy) and advance the cursor.
425+
* @return seg bytes (>0), 0 when exhausted, -1 error.
426+
*/
427+
int ff_zc_mbuf_segment(struct ff_zc_mbuf *zm, void **seg_data, int *seg_len);
428+
429+
/*
430+
* Release a chain obtained from ff_zc_recv (m_freem the whole chain, which
431+
* returns each backing DPDK mbuf segment). Idempotent; zeroes zm. Must be
432+
* called exactly once per successful ff_zc_recv.
399433
*/
400-
int ff_zc_mbuf_read(struct ff_zc_mbuf *m, const char *data, int len);
434+
void ff_zc_recv_free(struct ff_zc_mbuf *zm);
435+
#endif /* FSTACK_ZC_RECV */
401436

402437
/*
403438
* M8: zero-copy send entry. Caller must pass the mbuf chain

lib/ff_syscall_wrapper.c

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1225,6 +1225,58 @@ ff_zc_send(int fd, const void *mb, size_t nbytes)
12251225
}
12261226
#endif /* FSTACK_ZC_SEND */
12271227

1228+
#ifdef FSTACK_ZC_RECV
1229+
/*
1230+
* FSTACK_ZC_RECV: zero-copy receive entry. Builds a uio carrying only the
1231+
* requested byte budget (uio_resid) and calls kern_zc_recvit, which passes a
1232+
* non-NULL mbuf out-parameter into soreceive so the socket-buffer mbuf chain
1233+
* is handed back without a uiomove copy. On success zm->bsd_mbuf holds the
1234+
* chain head; the caller must release it via ff_zc_recv_free().
1235+
*
1236+
* `zm` must be a valid 'struct ff_zc_mbuf *'. Do NOT pass a char buffer.
1237+
*/
1238+
ssize_t
1239+
ff_zc_recv(int fd, struct ff_zc_mbuf *zm, size_t nbytes)
1240+
{
1241+
struct uio auio;
1242+
struct iovec aiov;
1243+
struct mbuf *chain = NULL;
1244+
int rc;
1245+
1246+
if (zm == NULL || nbytes == 0 || nbytes > INT_MAX) {
1247+
rc = EINVAL;
1248+
goto kern_fail;
1249+
}
1250+
1251+
/* uio is only consulted for uio_resid when mp0 is non-NULL (soreceive(9));
1252+
* iov_base is unused on the ZC path but set for completeness. */
1253+
aiov.iov_base = NULL;
1254+
aiov.iov_len = nbytes;
1255+
auio.uio_iov = &aiov;
1256+
auio.uio_iovcnt = 1;
1257+
auio.uio_resid = nbytes;
1258+
auio.uio_segflg = UIO_SYSSPACE;
1259+
auio.uio_rw = UIO_READ;
1260+
auio.uio_td = curthread;
1261+
auio.uio_offset = 0;
1262+
1263+
if ((rc = kern_zc_recvit(curthread, fd, &auio, &chain)))
1264+
goto kern_fail;
1265+
1266+
rc = curthread->td_retval[0];
1267+
1268+
zm->bsd_mbuf = chain;
1269+
zm->bsd_mbuf_off = chain;
1270+
zm->off = 0;
1271+
zm->len = rc;
1272+
1273+
return (rc);
1274+
kern_fail:
1275+
ff_os_errno(rc);
1276+
return (-1);
1277+
}
1278+
#endif /* FSTACK_ZC_RECV */
1279+
12281280
ssize_t
12291281
ff_send(int s, const void *buf, size_t len, int flags)
12301282
{

lib/ff_veth.c

Lines changed: 74 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -356,11 +356,82 @@ ff_zc_mbuf_write(struct ff_zc_mbuf *zm, const char *data, int len)
356356
}
357357

358358
int
359-
ff_zc_mbuf_read(struct ff_zc_mbuf *m, const char *data, int len)
359+
ff_zc_mbuf_read(struct ff_zc_mbuf *zm, char *data, int len)
360360
{
361-
// DOTO: Support read zero copy
362-
return 0;
361+
struct mbuf *mb;
362+
int progress = 0, length, moff;
363+
364+
if (zm == NULL || data == NULL || len <= 0) {
365+
return -1;
366+
}
367+
368+
/* For RECV, bsd_mbuf_off is the current mbuf cursor and `off` is the
369+
* already-consumed offset WITHIN that mbuf. Copy out up to `len` bytes,
370+
* spanning mbufs as needed. */
371+
mb = (struct mbuf *)zm->bsd_mbuf_off;
372+
moff = zm->off;
373+
while (mb != NULL && progress < len) {
374+
length = min(mb->m_len - moff, len - progress);
375+
bcopy(mtod(mb, char *) + moff, data + progress, length);
376+
progress += length;
377+
moff += length;
378+
if (moff >= mb->m_len) {
379+
mb = mb->m_next;
380+
moff = 0;
381+
}
382+
}
383+
zm->bsd_mbuf_off = mb;
384+
zm->off = moff;
385+
386+
return progress;
387+
}
388+
389+
#ifdef FSTACK_ZC_RECV
390+
/*
391+
* FSTACK_ZC_RECV: return the current segment's data pointer + length without
392+
* copying (pointer aliases into the underlying DPDK mbuf), then advance the
393+
* cursor to the next mbuf. Valid until ff_zc_recv_free() is called.
394+
*/
395+
int
396+
ff_zc_mbuf_segment(struct ff_zc_mbuf *zm, void **seg_data, int *seg_len)
397+
{
398+
struct mbuf *mb;
399+
400+
if (zm == NULL || seg_data == NULL || seg_len == NULL) {
401+
return -1;
402+
}
403+
404+
mb = (struct mbuf *)zm->bsd_mbuf_off;
405+
if (mb == NULL) {
406+
return 0; /* chain exhausted */
407+
}
408+
409+
*seg_data = mtod(mb, void *);
410+
*seg_len = mb->m_len;
411+
zm->bsd_mbuf_off = mb->m_next;
412+
zm->off = 0;
413+
414+
return mb->m_len;
415+
}
416+
417+
/*
418+
* FSTACK_ZC_RECV: release the whole chain obtained from ff_zc_recv. m_freem
419+
* walks m_next and frees each mbuf; ext-mbuf segments trigger ff_mbuf_ext_free
420+
* which returns the backing DPDK mbuf seg (see docs/zc_read_spec). Idempotent.
421+
*/
422+
void
423+
ff_zc_recv_free(struct ff_zc_mbuf *zm)
424+
{
425+
if (zm == NULL || zm->bsd_mbuf == NULL) {
426+
return;
427+
}
428+
m_freem((struct mbuf *)zm->bsd_mbuf);
429+
zm->bsd_mbuf = NULL;
430+
zm->bsd_mbuf_off = NULL;
431+
zm->off = 0;
432+
zm->len = 0;
363433
}
434+
#endif /* FSTACK_ZC_RECV */
364435

365436
void *
366437
ff_mbuf_gethdr(void *pkt, uint16_t total, void *data,

0 commit comments

Comments
 (0)