Skip to content

Commit 345f7e9

Browse files
committed
fix(uffd): hold settle Lock across read+SetRange in serve loop
The serve loop previously took the settle lock only around SetRange, after readEvents had already returned. A snapshot-path goroutine could win the Lock race in the gap between the read and SetRange, exporting an incomplete Removed bitmap before the serve loop applied the just-read events. Hold Lock across the read itself so that read+SetRange is atomic from any other goroutine's perspective.
1 parent d725d87 commit 345f7e9

1 file changed

Lines changed: 11 additions & 7 deletions

File tree

packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -262,21 +262,22 @@ func (u *Userfaultfd) Serve(
262262
var pagefaults []*UffdPagefault
263263

264264
if hasEvent(uffdFd.Revents, unix.POLLIN) {
265+
// Hold Lock across read+SetRange so any goroutine that later
266+
// acquires Lock (e.g. ExportPageStates) sees a tracker state
267+
// where every read() event has been applied. Without this,
268+
// the snapshot path could acquire Lock between read() and
269+
// the SetRange below and Export an incomplete Removed bitmap.
270+
u.settleRequests.Lock()
271+
265272
var err error
266273
removes, pagefaults, err = u.readEvents(ctx)
267274
if err != nil {
275+
u.settleRequests.Unlock()
268276
u.logger.Error(ctx, "uffd: read error", zap.Error(err))
269277

270278
return fmt.Errorf("failed to read: %w", err)
271279
}
272-
} else {
273-
noDataCounter.Increase("POLLIN")
274-
}
275280

276-
// REMOVE batch under write lock so an in-flight worker's SetRange(faulted)
277-
// can't overwrite the removed state we are about to install.
278-
if len(removes) > 0 {
279-
u.settleRequests.Lock()
280281
for _, rm := range removes {
281282
// rm.start (inclusive) and rm.end (exclusive) are page-aligned
282283
// to u.pageSize for the registered VMA (UFFD invariant), so
@@ -301,7 +302,10 @@ func (u *Userfaultfd) Serve(
301302
zap.Uint64("start_idx", startIdx), zap.Uint64("end_idx", endIdx), zap.Error(err))
302303
}
303304
}
305+
304306
u.settleRequests.Unlock()
307+
} else {
308+
noDataCounter.Increase("POLLIN")
305309
}
306310

307311
pagefaults = append(deferred.drain(), pagefaults...)

0 commit comments

Comments
 (0)