Skip to content

Commit 579a72e

Browse files
sjmiller609claude
andcommitted
uffd: record warmup faults and prefetch them on later forks
Adds a hot-page recorder + prefetch primitive on top of the userfaultfd page server. During a template's first warmup fork the server can record every served page (Config.RecordHotPages); the resulting HotPageList is sorted, deduplicated, and saved to disk in a small binary format alongside the template. Later forks call Server.Prefetch(forkID, list) to issue UFFDIO_COPY for every recorded page against their userfaultfd before the guest unpauses, eliminating the fault round-trips on those addresses. The prefetcher is installed by the platform-specific listener once the fork's uffd has been received and registered, so callers can race Prefetch and the fault loop safely. EEXIST/EAGAIN are tolerated the same way the fault handler tolerates them, to absorb first-touch races with vCPUs. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 005508c commit 579a72e

4 files changed

Lines changed: 352 additions & 2 deletions

File tree

lib/uffd/hotpages.go

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
package uffd
2+
3+
import (
4+
"bufio"
5+
"encoding/binary"
6+
"errors"
7+
"fmt"
8+
"os"
9+
"path/filepath"
10+
"sort"
11+
"sync"
12+
)
13+
14+
// HotPage points at a single page-aligned location inside a registered
// memory region. Region is the index into the handshake's mappings list;
// PageOffset is the byte offset of the page within that region (always
// a multiple of the server's page size).
//
// HotPage is a plain comparable value: two HotPages are equal when both
// fields are equal, which is what Snapshot relies on for deduplication.
type HotPage struct {
	// Region is the index of the memory region that contains the page.
	Region uint32
	// PageOffset is the byte offset of the page from the start of the region.
	PageOffset uint64
}
22+
23+
// HotPageList is the persisted "what pages should we eagerly populate
24+
// before the guest unpauses" list. PR 8 records one of these during a
25+
// template's first fork warm-up and bakes it into Template.HotPagesPath;
26+
// later forks call Server.Prefetch with the loaded list to skip the
27+
// fault round-trips on those pages.
28+
//
29+
// Concurrent Add/Snapshot is safe; Save and Load are not — callers
30+
// generally Save once at the end of warmup and Load once at boot.
31+
type HotPageList struct {
32+
mu sync.Mutex
33+
pages []HotPage
34+
}
35+
36+
// hotPagesFileMagic prefixes saved files so we can refuse to load
// arbitrary garbage. The version byte exists so a future format change
// can be rejected loudly instead of silently misinterpreted.
//
// NOTE(review): no separate version byte is written by Save; presumably
// the trailing "1" of the magic is the version — confirm before adding
// a second format.
var hotPagesFileMagic = []byte("HPL1")
40+
41+
// Add records a single hot page. Duplicates are tolerated; Snapshot
42+
// dedups before returning.
43+
func (h *HotPageList) Add(p HotPage) {
44+
h.mu.Lock()
45+
h.pages = append(h.pages, p)
46+
h.mu.Unlock()
47+
}
48+
49+
// Len returns the number of recorded pages (with duplicates).
50+
func (h *HotPageList) Len() int {
51+
h.mu.Lock()
52+
defer h.mu.Unlock()
53+
return len(h.pages)
54+
}
55+
56+
// Snapshot returns a sorted, deduplicated copy of the recorded pages.
57+
// Sort order is (Region, PageOffset) so prefetch issues sequential
58+
// reads against the template mem-file.
59+
func (h *HotPageList) Snapshot() []HotPage {
60+
h.mu.Lock()
61+
src := make([]HotPage, len(h.pages))
62+
copy(src, h.pages)
63+
h.mu.Unlock()
64+
65+
sort.Slice(src, func(i, j int) bool {
66+
if src[i].Region != src[j].Region {
67+
return src[i].Region < src[j].Region
68+
}
69+
return src[i].PageOffset < src[j].PageOffset
70+
})
71+
out := src[:0]
72+
var last HotPage
73+
for i, p := range src {
74+
if i == 0 || p != last {
75+
out = append(out, p)
76+
last = p
77+
}
78+
}
79+
return out
80+
}
81+
82+
// Save atomically writes the deduplicated snapshot to path. The format
83+
// is: 4-byte magic ("HPL1"), uvarint count, then for each page a
84+
// uvarint region index and a uvarint page offset. Atomic via tmp+rename.
85+
func (h *HotPageList) Save(path string) error {
86+
pages := h.Snapshot()
87+
tmp := path + ".tmp"
88+
f, err := os.Create(tmp)
89+
if err != nil {
90+
return fmt.Errorf("uffd: create hot pages tmp: %w", err)
91+
}
92+
bw := bufio.NewWriter(f)
93+
if _, err := bw.Write(hotPagesFileMagic); err != nil {
94+
_ = f.Close()
95+
_ = os.Remove(tmp)
96+
return fmt.Errorf("uffd: write hot pages magic: %w", err)
97+
}
98+
var ibuf [binary.MaxVarintLen64]byte
99+
n := binary.PutUvarint(ibuf[:], uint64(len(pages)))
100+
if _, err := bw.Write(ibuf[:n]); err != nil {
101+
_ = f.Close()
102+
_ = os.Remove(tmp)
103+
return fmt.Errorf("uffd: write hot pages count: %w", err)
104+
}
105+
for _, p := range pages {
106+
n = binary.PutUvarint(ibuf[:], uint64(p.Region))
107+
if _, err := bw.Write(ibuf[:n]); err != nil {
108+
_ = f.Close()
109+
_ = os.Remove(tmp)
110+
return fmt.Errorf("uffd: write hot pages region: %w", err)
111+
}
112+
n = binary.PutUvarint(ibuf[:], p.PageOffset)
113+
if _, err := bw.Write(ibuf[:n]); err != nil {
114+
_ = f.Close()
115+
_ = os.Remove(tmp)
116+
return fmt.Errorf("uffd: write hot pages offset: %w", err)
117+
}
118+
}
119+
if err := bw.Flush(); err != nil {
120+
_ = f.Close()
121+
_ = os.Remove(tmp)
122+
return fmt.Errorf("uffd: flush hot pages: %w", err)
123+
}
124+
if err := f.Close(); err != nil {
125+
_ = os.Remove(tmp)
126+
return fmt.Errorf("uffd: close hot pages tmp: %w", err)
127+
}
128+
if err := os.Rename(tmp, path); err != nil {
129+
return fmt.Errorf("uffd: rename hot pages: %w", err)
130+
}
131+
return nil
132+
}
133+
134+
// LoadHotPageList reads a HotPageList from path. Returns an empty list
135+
// (not an error) when path does not exist; the absence of a baked
136+
// hot-page file simply means "don't prefetch."
137+
func LoadHotPageList(path string) (*HotPageList, error) {
138+
clean := filepath.Clean(path)
139+
data, err := os.ReadFile(clean)
140+
if err != nil {
141+
if errors.Is(err, os.ErrNotExist) {
142+
return &HotPageList{}, nil
143+
}
144+
return nil, fmt.Errorf("uffd: read hot pages: %w", err)
145+
}
146+
if len(data) < len(hotPagesFileMagic) {
147+
return nil, errors.New("uffd: hot pages file truncated")
148+
}
149+
if string(data[:len(hotPagesFileMagic)]) != string(hotPagesFileMagic) {
150+
return nil, errors.New("uffd: hot pages file has bad magic")
151+
}
152+
rest := data[len(hotPagesFileMagic):]
153+
count, n := binary.Uvarint(rest)
154+
if n <= 0 {
155+
return nil, errors.New("uffd: hot pages file has bad count")
156+
}
157+
rest = rest[n:]
158+
out := &HotPageList{pages: make([]HotPage, 0, count)}
159+
for i := uint64(0); i < count; i++ {
160+
region, n := binary.Uvarint(rest)
161+
if n <= 0 {
162+
return nil, fmt.Errorf("uffd: hot pages file truncated at entry %d (region)", i)
163+
}
164+
rest = rest[n:]
165+
offset, n := binary.Uvarint(rest)
166+
if n <= 0 {
167+
return nil, fmt.Errorf("uffd: hot pages file truncated at entry %d (offset)", i)
168+
}
169+
rest = rest[n:]
170+
out.pages = append(out.pages, HotPage{Region: uint32(region), PageOffset: offset})
171+
}
172+
if len(rest) != 0 {
173+
return nil, fmt.Errorf("uffd: hot pages file has %d trailing bytes", len(rest))
174+
}
175+
return out, nil
176+
}

lib/uffd/hotpages_test.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
package uffd
2+
3+
import (
4+
"os"
5+
"path/filepath"
6+
"testing"
7+
8+
"github.com/stretchr/testify/assert"
9+
"github.com/stretchr/testify/require"
10+
)
11+
12+
func TestHotPageList_SnapshotSortsAndDedups(t *testing.T) {
13+
var l HotPageList
14+
l.Add(HotPage{Region: 1, PageOffset: 8192})
15+
l.Add(HotPage{Region: 0, PageOffset: 4096})
16+
l.Add(HotPage{Region: 0, PageOffset: 4096}) // duplicate
17+
l.Add(HotPage{Region: 0, PageOffset: 0})
18+
19+
got := l.Snapshot()
20+
want := []HotPage{
21+
{Region: 0, PageOffset: 0},
22+
{Region: 0, PageOffset: 4096},
23+
{Region: 1, PageOffset: 8192},
24+
}
25+
assert.Equal(t, want, got)
26+
}
27+
28+
func TestHotPageList_SaveLoadRoundTrip(t *testing.T) {
29+
var l HotPageList
30+
l.Add(HotPage{Region: 0, PageOffset: 0})
31+
l.Add(HotPage{Region: 0, PageOffset: 4096})
32+
l.Add(HotPage{Region: 2, PageOffset: 1 << 20})
33+
34+
path := filepath.Join(t.TempDir(), "hot.bin")
35+
require.NoError(t, l.Save(path))
36+
37+
got, err := LoadHotPageList(path)
38+
require.NoError(t, err)
39+
assert.Equal(t, l.Snapshot(), got.Snapshot())
40+
}
41+
42+
func TestLoadHotPageList_MissingReturnsEmpty(t *testing.T) {
43+
got, err := LoadHotPageList(filepath.Join(t.TempDir(), "absent.bin"))
44+
require.NoError(t, err)
45+
assert.Equal(t, 0, got.Len())
46+
}
47+
48+
func TestLoadHotPageList_BadMagic(t *testing.T) {
49+
path := filepath.Join(t.TempDir(), "bad.bin")
50+
require.NoError(t, writeFile(path, []byte("XXXX\x00")))
51+
_, err := LoadHotPageList(path)
52+
assert.Error(t, err)
53+
}
54+
55+
func TestLoadHotPageList_TruncatedAtEntry(t *testing.T) {
56+
path := filepath.Join(t.TempDir(), "trunc.bin")
57+
// magic + count=2 + only one entry
58+
data := append([]byte("HPL1"), 0x02, 0x00, 0x00) // count=2, region=0, offset=0
59+
require.NoError(t, writeFile(path, data))
60+
_, err := LoadHotPageList(path)
61+
assert.Error(t, err)
62+
}
63+
64+
// writeFile is a test helper that writes data to path with
// owner-only read/write permissions (0o600).
func writeFile(path string, data []byte) error {
	const ownerReadWrite os.FileMode = 0o600
	return os.WriteFile(path, data, ownerReadWrite)
}

lib/uffd/server_linux.go

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,10 @@ func (s *Server) startListener(ctx context.Context, forkID string, socketPath st
145145
}
146146
}
147147

148+
s.installPrefetcher(forkID, func(list *HotPageList) error {
149+
return s.prefetchInto(fd, regions, list)
150+
})
151+
148152
s.servePageFaults(hctx, fd, regions, forkID)
149153
}()
150154

@@ -272,12 +276,13 @@ func (s *Server) copyPageForFault(fd int, regions []MemoryRegion, addr uint64, p
272276
pageSize := uint64(s.pageSize)
273277
pageStart := addr &^ (pageSize - 1)
274278

275-
for _, r := range regions {
279+
for idx, r := range regions {
276280
base := uint64(r.BaseHostAddr)
277281
if pageStart < base || pageStart >= base+r.Size {
278282
continue
279283
}
280-
offset := int64(r.MemFileOffset + (pageStart - base))
284+
regionOff := pageStart - base
285+
offset := int64(r.MemFileOffset + regionOff)
281286
if _, err := s.memFile.ReadAt(page, offset); err != nil && !errors.Is(err, io.EOF) {
282287
return fmt.Errorf("uffd: read template at %d: %w", offset, err)
283288
}
@@ -294,11 +299,56 @@ func (s *Server) copyPageForFault(fd int, regions []MemoryRegion, addr uint64, p
294299
}
295300
return fmt.Errorf("uffd: UFFDIO_COPY: %w", err)
296301
}
302+
if s.cfg.RecordHotPages {
303+
s.hotPages.Add(HotPage{Region: uint32(idx), PageOffset: regionOff})
304+
}
297305
return nil
298306
}
299307
return fmt.Errorf("uffd: fault addr 0x%x outside any registered region", addr)
300308
}
301309

310+
// prefetchInto walks list and issues a UFFDIO_COPY for each entry
311+
// against the supplied fork's userfaultfd. It tolerates EEXIST/EAGAIN
312+
// the same way the fault handler does so a racing first-touch fault
313+
// from a vCPU does not abort the whole prefetch.
314+
func (s *Server) prefetchInto(fd int, regions []MemoryRegion, list *HotPageList) error {
315+
if list == nil {
316+
return nil
317+
}
318+
pages := list.Snapshot()
319+
if len(pages) == 0 {
320+
return nil
321+
}
322+
page := make([]byte, s.pageSize)
323+
pageSize := uint64(s.pageSize)
324+
for _, hp := range pages {
325+
if int(hp.Region) >= len(regions) {
326+
return fmt.Errorf("uffd: prefetch entry refers to region %d (only %d registered)", hp.Region, len(regions))
327+
}
328+
r := regions[hp.Region]
329+
if hp.PageOffset+pageSize > r.Size {
330+
return fmt.Errorf("uffd: prefetch offset %d outside region %d size %d", hp.PageOffset, hp.Region, r.Size)
331+
}
332+
dst := uint64(r.BaseHostAddr) + hp.PageOffset
333+
src := int64(r.MemFileOffset + hp.PageOffset)
334+
if _, err := s.memFile.ReadAt(page, src); err != nil && !errors.Is(err, io.EOF) {
335+
return fmt.Errorf("uffd: prefetch read template at %d: %w", src, err)
336+
}
337+
copyArg := uffdioCopyArg{
338+
Dst: dst,
339+
Src: uint64(uintptr(unsafe.Pointer(&page[0]))),
340+
Len: pageSize,
341+
}
342+
if err := ioctl(fd, uffdioCopyIoctl, unsafe.Pointer(&copyArg)); err != nil {
343+
if errors.Is(err, syscall.EEXIST) || errors.Is(err, syscall.EAGAIN) {
344+
continue
345+
}
346+
return fmt.Errorf("uffd: prefetch UFFDIO_COPY: %w", err)
347+
}
348+
}
349+
return nil
350+
}
351+
302352
func ioctl(fd int, req uintptr, arg unsafe.Pointer) error {
303353
_, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(fd), req, uintptr(arg))
304354
if errno != 0 {

0 commit comments

Comments
 (0)