Skip to content

Commit 005508c

Browse files
sjmiller609 and claude committed
uffd: serve firecracker page faults from a shared template mem-file
Adds lib/uffd, a userfaultfd page server that backs many concurrent fan-out forks against a single read-only template mem-file instead of letting each fork mmap it privately. Firecracker connects to a per-fork UDS, hands us its userfaultfd via SCM_RIGHTS along with a JSON mappings handshake, and the server then services UFFD_EVENT_PAGEFAULT events with UFFDIO_COPY reads from the template. The Linux hot path lives behind a build tag; non-Linux builds return ErrUnsupported so callers can fall back to MAP_PRIVATE. Cross-platform tests cover the handshake parser and the server lifecycle. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 7b799f7 commit 005508c

4 files changed

Lines changed: 703 additions & 0 deletions

File tree

lib/uffd/server_linux.go

Lines changed: 308 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,308 @@
1+
//go:build linux
2+
3+
package uffd
4+
5+
import (
6+
"context"
7+
"encoding/binary"
8+
"errors"
9+
"fmt"
10+
"io"
11+
"net"
12+
"os"
13+
"sync"
14+
"syscall"
15+
"unsafe"
16+
17+
"golang.org/x/sys/unix"
18+
)
19+
20+
// userfaultfd ioctl numbers and feature flags, mirrored from
// <linux/userfaultfd.h>.
//
// Every UFFDIO_* request is _IOWR(0xAA, nr, arg): direction (read|write = 3)
// in bits 30-31, the size of the argument struct in bits 16-29, the ioctl
// type 0xAA in bits 8-15 and the request number in bits 0-7.
const (
	uffdAPI        = 0xAA // UFFD_API: the API version passed in uffdio_api.api.
	uffdAPIFeature = 0x0  // we only need missing-page faults; no extra features.

	uffdioAPI       = 0xC018AA3F // _IOWR(0xAA, 0x3F, struct uffdio_api{24})
	uffdioRegister  = 0xC020AA00 // _IOWR(0xAA, 0x00, struct uffdio_register{32})
	uffdioCopyIoctl = 0xC028AA03 // _IOWR(0xAA, 0x03, struct uffdio_copy{40})
	uffdioZeropage  = 0xC020AA04 // _IOWR(0xAA, 0x04, struct uffdio_zeropage{32})

	uffdRegMissing = 1 << 0 // UFFDIO_REGISTER_MODE_MISSING

	uffdEventPagefault = 0x12 // UFFD_EVENT_PAGEFAULT

	// uffdEventPagefnt is the original, misspelled name of
	// uffdEventPagefault. It is kept as an alias so existing references
	// keep compiling; new code should use uffdEventPagefault.
	uffdEventPagefnt = uffdEventPagefault
)
34+
35+
// uffdMsg has the same field layout as struct uffd_msg in
// <linux/userfaultfd.h>: an 8-byte header followed by a 24-byte
// event-specific payload, 32 bytes in total. Only the page-fault form of
// the payload is modelled because that is the only event this package
// consumes.
type uffdMsg struct {
	// Header.
	Event uint8  // event code, e.g. UFFD_EVENT_PAGEFAULT
	_     uint8  // reserved1
	_     uint16 // reserved2
	_     uint32 // reserved3

	// Payload for page-fault events; begins at byte offset 8.
	Pagefault struct {
		Flags   uint64 // fault flags
		Address uint64 // faulting address; byte offset 16 of the message
		Ptid    uint32 // id of the faulting thread
		_       uint32 // pads the payload out to 24 bytes
	}
}
49+
50+
// uffdioAPIArg is the argument of the UFFDIO_API ioctl (struct uffdio_api):
// three consecutive 64-bit words, 24 bytes in total.
type uffdioAPIArg struct {
	API      uint64 // requested API version
	Features uint64 // requested feature bits
	Ioctls   uint64 // ioctl bitmask word of struct uffdio_api
}
56+
57+
// uffdioRegisterArg is the argument of the UFFDIO_REGISTER ioctl
// (struct uffdio_register): four consecutive 64-bit words, 32 bytes in
// total.
type uffdioRegisterArg struct {
	Start  uint64 // first address of the range to register
	Len    uint64 // length of the range in bytes
	Mode   uint64 // registration mode bits, e.g. uffdRegMissing
	Ioctls uint64 // ioctl bitmask word of struct uffdio_register
}
64+
65+
// uffdioCopyArg is the argument of the UFFDIO_COPY ioctl
// (struct uffdio_copy): five consecutive 64-bit words, 40 bytes in total.
type uffdioCopyArg struct {
	Dst  uint64 // destination address in the faulting address space
	Src  uint64 // address of the source bytes in the calling process
	Len  uint64 // number of bytes to copy
	Mode uint64 // copy mode flags; zero selects the default behaviour
	Copy int64  // signed result word of struct uffdio_copy
}
73+
74+
// startListener opens the per-fork UDS, accepts firecracker's connection,
75+
// receives the userfaultfd via SCM_RIGHTS plus the JSON handshake, and
76+
// then runs the page-fault loop. The returned closer stops accept,
77+
// signals the handler, and removes the socket file.
78+
func (s *Server) startListener(ctx context.Context, forkID string, socketPath string) (func() error, error) {
79+
// Remove any stale socket file from a prior run; UDS bind fails otherwise.
80+
_ = os.Remove(socketPath)
81+
ln, err := net.Listen("unix", socketPath)
82+
if err != nil {
83+
return nil, fmt.Errorf("uffd: listen %s: %w", socketPath, err)
84+
}
85+
86+
hctx, hcancel := context.WithCancel(ctx)
87+
88+
var (
89+
wg sync.WaitGroup
90+
mu sync.Mutex
91+
uffdFd int = -1
92+
closed bool
93+
)
94+
95+
closer := func() error {
96+
mu.Lock()
97+
if closed {
98+
mu.Unlock()
99+
wg.Wait()
100+
return nil
101+
}
102+
closed = true
103+
fd := uffdFd
104+
uffdFd = -1
105+
mu.Unlock()
106+
107+
hcancel()
108+
_ = ln.Close()
109+
if fd >= 0 {
110+
_ = unix.Close(fd)
111+
}
112+
wg.Wait()
113+
_ = os.Remove(socketPath)
114+
return nil
115+
}
116+
117+
wg.Add(1)
118+
go func() {
119+
defer wg.Done()
120+
conn, err := ln.Accept()
121+
if err != nil {
122+
return
123+
}
124+
defer conn.Close()
125+
126+
fd, regions, err := receiveHandshake(conn)
127+
if err != nil {
128+
return
129+
}
130+
mu.Lock()
131+
if closed {
132+
mu.Unlock()
133+
_ = unix.Close(fd)
134+
return
135+
}
136+
uffdFd = fd
137+
mu.Unlock()
138+
139+
if err := uffdAPIHandshake(fd); err != nil {
140+
return
141+
}
142+
for _, r := range regions {
143+
if err := uffdRegisterRegion(fd, r); err != nil {
144+
return
145+
}
146+
}
147+
148+
s.servePageFaults(hctx, fd, regions, forkID)
149+
}()
150+
151+
return closer, nil
152+
}
153+
154+
// receiveHandshake reads firecracker's JSON payload and the userfaultfd
155+
// over a single recvmsg(2) call. Firecracker sends them together; if the
156+
// kernel splits them across reads we loop until the fd arrives.
157+
func receiveHandshake(conn net.Conn) (int, []MemoryRegion, error) {
158+
uc, ok := conn.(*net.UnixConn)
159+
if !ok {
160+
return -1, nil, errors.New("uffd: connection is not a unix socket")
161+
}
162+
f, err := uc.File()
163+
if err != nil {
164+
return -1, nil, fmt.Errorf("uffd: get fd from unix conn: %w", err)
165+
}
166+
defer f.Close()
167+
168+
// Read until we have the SCM_RIGHTS fd. The JSON body is small, so
169+
// a 4 KiB buffer plus one OOB control message is plenty.
170+
buf := make([]byte, 4096)
171+
oob := make([]byte, unix.CmsgSpace(4))
172+
var (
173+
jsonBytes []byte
174+
fd int = -1
175+
)
176+
for fd < 0 {
177+
n, oobn, _, _, err := unix.Recvmsg(int(f.Fd()), buf, oob, 0)
178+
if err != nil {
179+
return -1, nil, fmt.Errorf("uffd: recvmsg: %w", err)
180+
}
181+
if n > 0 {
182+
jsonBytes = append(jsonBytes, buf[:n]...)
183+
}
184+
if oobn > 0 {
185+
scms, perr := unix.ParseSocketControlMessage(oob[:oobn])
186+
if perr != nil {
187+
return -1, nil, fmt.Errorf("uffd: parse cmsg: %w", perr)
188+
}
189+
for _, scm := range scms {
190+
fds, ferr := unix.ParseUnixRights(&scm)
191+
if ferr != nil {
192+
return -1, nil, fmt.Errorf("uffd: parse fds: %w", ferr)
193+
}
194+
if len(fds) > 0 {
195+
fd = fds[0]
196+
for _, extra := range fds[1:] {
197+
_ = unix.Close(extra)
198+
}
199+
}
200+
}
201+
}
202+
if n == 0 && oobn == 0 {
203+
return -1, nil, io.ErrUnexpectedEOF
204+
}
205+
}
206+
207+
hs, err := parseHandshake(jsonBytes)
208+
if err != nil {
209+
_ = unix.Close(fd)
210+
return -1, nil, err
211+
}
212+
return fd, hs.Mappings, nil
213+
}
214+
215+
func uffdAPIHandshake(fd int) error {
216+
api := uffdioAPIArg{API: uffdAPI, Features: uffdAPIFeature}
217+
if err := ioctl(fd, uffdioAPI, unsafe.Pointer(&api)); err != nil {
218+
return fmt.Errorf("uffd: UFFDIO_API: %w", err)
219+
}
220+
return nil
221+
}
222+
223+
func uffdRegisterRegion(fd int, r MemoryRegion) error {
224+
reg := uffdioRegisterArg{
225+
Start: uint64(r.BaseHostAddr),
226+
Len: r.Size,
227+
Mode: uffdRegMissing,
228+
}
229+
if err := ioctl(fd, uffdioRegister, unsafe.Pointer(&reg)); err != nil {
230+
return fmt.Errorf("uffd: UFFDIO_REGISTER: %w", err)
231+
}
232+
return nil
233+
}
234+
235+
// servePageFaults blocks reading uffd events on fd. For each
236+
// UFFD_EVENT_PAGEFAULT we look up the region containing the faulting
237+
// address, read a page from the template mem-file, and call UFFDIO_COPY
238+
// to satisfy the fault.
239+
func (s *Server) servePageFaults(ctx context.Context, fd int, regions []MemoryRegion, forkID string) {
240+
page := make([]byte, s.pageSize)
241+
var msg uffdMsg
242+
msgSize := int(unsafe.Sizeof(msg))
243+
rawBuf := make([]byte, msgSize)
244+
245+
for {
246+
if ctx.Err() != nil {
247+
return
248+
}
249+
n, err := unix.Read(fd, rawBuf)
250+
if err != nil {
251+
if errors.Is(err, syscall.EINTR) {
252+
continue
253+
}
254+
return
255+
}
256+
if n != msgSize {
257+
return
258+
}
259+
event := rawBuf[0]
260+
if event != uffdEventPagefnt {
261+
continue
262+
}
263+
// pagefault.address starts at offset 16 of uffd_msg.
264+
addr := binary.LittleEndian.Uint64(rawBuf[16:24])
265+
if err := s.copyPageForFault(fd, regions, addr, page); err != nil {
266+
return
267+
}
268+
}
269+
}
270+
271+
func (s *Server) copyPageForFault(fd int, regions []MemoryRegion, addr uint64, page []byte) error {
272+
pageSize := uint64(s.pageSize)
273+
pageStart := addr &^ (pageSize - 1)
274+
275+
for _, r := range regions {
276+
base := uint64(r.BaseHostAddr)
277+
if pageStart < base || pageStart >= base+r.Size {
278+
continue
279+
}
280+
offset := int64(r.MemFileOffset + (pageStart - base))
281+
if _, err := s.memFile.ReadAt(page, offset); err != nil && !errors.Is(err, io.EOF) {
282+
return fmt.Errorf("uffd: read template at %d: %w", offset, err)
283+
}
284+
copyArg := uffdioCopyArg{
285+
Dst: pageStart,
286+
Src: uint64(uintptr(unsafe.Pointer(&page[0]))),
287+
Len: pageSize,
288+
}
289+
if err := ioctl(fd, uffdioCopyIoctl, unsafe.Pointer(&copyArg)); err != nil {
290+
// Spurious/duplicate faults can race other vCPUs; treat
291+
// them as benign and keep serving.
292+
if errors.Is(err, syscall.EEXIST) || errors.Is(err, syscall.EAGAIN) {
293+
return nil
294+
}
295+
return fmt.Errorf("uffd: UFFDIO_COPY: %w", err)
296+
}
297+
return nil
298+
}
299+
return fmt.Errorf("uffd: fault addr 0x%x outside any registered region", addr)
300+
}
301+
302+
func ioctl(fd int, req uintptr, arg unsafe.Pointer) error {
303+
_, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(fd), req, uintptr(arg))
304+
if errno != 0 {
305+
return errno
306+
}
307+
return nil
308+
}

lib/uffd/server_other.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
//go:build !linux
2+
3+
package uffd
4+
5+
import "context"
6+
7+
// startListener returns ErrUnsupported on non-Linux platforms.
8+
// userfaultfd is a Linux-only kernel feature; callers should fall back
9+
// to letting firecracker mmap the mem-file privately.
10+
func (s *Server) startListener(ctx context.Context, forkID string, socketPath string) (func() error, error) {
11+
_ = ctx
12+
_ = forkID
13+
_ = socketPath
14+
return nil, ErrUnsupported
15+
}

0 commit comments

Comments
 (0)