Skip to content

Commit 61931ef

Browse files
committed
fix(uffd): close read-vs-apply race with separate readSerial lock
A worker holding settleRequests.RLock must never block readEvents, because madvise(MADV_DONTNEED) blocks the producer until userspace reads the UFFD_EVENT_REMOVE — and the producer can be the FC balloon thread that other syscalls depend on. Use a dedicated readSerial mutex (not settleRequests) to serialize serve-loop iterations with snapshot-time Export, while keeping the existing settleRequests discipline (workers RLock, REMOVE batch Lock) intact so readEvents remains lock-free relative to workers. Restores liveness for TestNoMadviseDeadlockWithInflightCopy while closing the read-vs-apply race that motivated the prior buggy commit (345f7e9, now amended).
1 parent d2303ae commit 61931ef

1 file changed

Lines changed: 41 additions & 25 deletions

File tree

packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go

Lines changed: 41 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,14 @@ type Userfaultfd struct {
5858
// RLock for the lookup→install→SetRange sequence; the REMOVE batch takes
5959
// Lock so a concurrent worker can't overwrite a removed state.
6060
settleRequests sync.RWMutex
61+
62+
// readSerial serializes serve-loop iterations (read+apply) with snapshot-time
63+
// Export. Workers do NOT touch this lock — it must remain disjoint from
64+
// settleRequests so readEvents can always drain the kernel UFFD queue and
65+
// unblock madvise even when settleRequests.RLock is held by an in-flight
66+
// worker. See TestNoMadviseDeadlockWithInflightCopy.
67+
readSerial sync.Mutex
68+
6169
prefetchTracker *block.PrefetchTracker
6270

6371
// defaultCopyMode overrides UFFDIO_COPY mode for all faults when non-zero.
@@ -247,41 +255,49 @@ func (u *Userfaultfd) Serve(
247255
var pagefaults []*UffdPagefault
248256

249257
if hasEvent(uffdFd.Revents, unix.POLLIN) {
258+
// readSerial keeps Export from interleaving between read and
259+
// SetRange(Zero) in the same serve-loop iteration. It does
260+
// NOT couple readEvents to settleRequests — workers don't take
261+
// readSerial, so a worker holding settleRequests.RLock can
262+
// never block this read. See TestNoMadviseDeadlockWithInflightCopy.
263+
u.readSerial.Lock()
264+
250265
var err error
251266
removes, pagefaults, err = u.readEvents(ctx)
252267
if err != nil {
268+
u.readSerial.Unlock()
253269
u.logger.Error(ctx, "uffd: read error", zap.Error(err))
254270

255271
return fmt.Errorf("failed to read: %w", err)
256272
}
257-
} else {
258-
noDataCounter.Increase("POLLIN")
259-
}
260273

261-
// REMOVE batch under write lock so an in-flight worker's SetRange(Dirty)
262-
// can't overwrite the Zero state we are about to install.
263-
if len(removes) > 0 {
264-
u.settleRequests.Lock()
265-
for _, rm := range removes {
266-
// rm.start (inclusive) and rm.end (exclusive) are page-aligned
267-
// to u.pageSize for the registered VMA (UFFD invariant), so
268-
// startOff is a multiple of pageSize and length is an integer
269-
// number of pages — both divisions below are exact, and
270-
// SetRange's half-open [startIdx, endIdx) lines up with the
271-
// half-open [rm.start, rm.end).
272-
startOff, err := u.ma.GetOffset(uintptr(rm.start))
273-
if err != nil {
274-
u.logger.Error(ctx, "UFFD REMOVE: failed to map start address",
275-
zap.Uintptr("start", uintptr(rm.start)), zap.Error(err))
276-
277-
continue
274+
if len(removes) > 0 {
275+
u.settleRequests.Lock()
276+
for _, rm := range removes {
277+
// rm.start (inclusive) and rm.end (exclusive) are page-aligned
278+
// to u.pageSize for the registered VMA (UFFD invariant), so
279+
// startOff is a multiple of pageSize and length is an integer
280+
// number of pages — both divisions below are exact, and
281+
// SetRange's half-open [startIdx, endIdx) lines up with the
282+
// half-open [rm.start, rm.end).
283+
startOff, err := u.ma.GetOffset(uintptr(rm.start))
284+
if err != nil {
285+
u.logger.Error(ctx, "UFFD REMOVE: failed to map start address",
286+
zap.Uintptr("start", uintptr(rm.start)), zap.Error(err))
287+
288+
continue
289+
}
290+
291+
startIdx := uint64(header.BlockIdx(startOff, int64(u.pageSize)))
292+
endIdx := startIdx + uint64(rm.end-rm.start)/uint64(u.pageSize)
293+
u.pageTracker.SetRange(startIdx, endIdx, block.Zero)
278294
}
279-
280-
startIdx := uint64(header.BlockIdx(startOff, int64(u.pageSize)))
281-
endIdx := startIdx + uint64(rm.end-rm.start)/uint64(u.pageSize)
282-
u.pageTracker.SetRange(startIdx, endIdx, block.Zero)
295+
u.settleRequests.Unlock()
283296
}
284-
u.settleRequests.Unlock()
297+
298+
u.readSerial.Unlock()
299+
} else {
300+
noDataCounter.Increase("POLLIN")
285301
}
286302

287303
pagefaults = append(deferred.drain(), pagefaults...)

0 commit comments

Comments
 (0)