Skip to content

Commit 15305a6

Browse files
committed
Fix epoll_ctl dropping registrations in multi-threaded guests
In a multi-threaded guest, host_fd_ref_open() hands back a dup of the target fd that host_fd_ref_close() closes when the syscall returns. sys_epoll_ctl() used that transient dup as the kqueue knote ident, and the kernel drops a knote the moment its fd is closed -- so every epoll registration made while the guest was multi-threaded was torn down the instant epoll_ctl() returned, and epoll_pwait() never reported readiness again. Single-threaded guests borrow the raw fd (no dup, no close) and never hit it. Node's libuv DelayedTaskScheduler (eventfd + epoll backing uv_async_send) relied on this path and hung forever at process exit: the main thread blocked in WorkerThreadsTaskRunner::Shutdown -> uv_thread_join on a scheduler thread that could no longer be woken. Key the knote on the persistent host fd from the fd table. Take it from the same atomic fd_snapshot() that validates the fd, so the ident comes from the entry that was validated rather than a second fd_to_host() lookup that could race a concurrent close/reopen. Result mapping already uses udata (the guest fd), so the ident only needs to stay open and refer to the same open file description. Guard the close+reopen ABA with a per-slot generation stamp. fd_table entries now carry a generation, assigned from a global monotonic counter on every allocation; epoll registrations record it at ADD/MOD. If the guest closes a watched fd and reopens it (reusing the guest fd number), the kernel has already dropped the original knote, yet reg->active still looks live -- a later DEL/MOD would EV_DELETE the wrong knote on the reused host fd. A mismatched generation now marks the registration gone, so DEL/MOD report ENOENT (matching Linux's auto-removal on close) and ADD starts fresh. Also implement the FIONBIO / FIOCLEX / FIONCLEX ioctls, which were falling through to ENOTTY. libuv's uv_pipe_open() sets non-blocking mode via FIONBIO, so Node's console.log() to a pipe threw "open ENOTTY". FIONBIO maps to F_SETFL O_NONBLOCK (status flag, shared across the dup).
FIOCLEX/FIONCLEX mirror F_SETFD by toggling the fd_table cloexec bit rather than the host fd's FD_CLOEXEC, which is per-descriptor and lost on the dup. They need no host fd, so they are dispatched before host_fd_ref_open_regular_io(), which rejects O_PATH (FD_PATH) fds with EBADF even though Linux allows these ioctls (like F_SETFD) on O_PATH fds. They validate the slot and flip the flag in a single fd_lock section, so there is no validate-then-mutate window for a concurrent close/reuse to flip cloexec on a different file. Add tests/test-epoll-mt.c: a CLONE_THREAD sibling keeps the guest multi-threaded across epoll_ctl, then asserts a registered eventfd and pipe still deliver an EPOLLIN edge. It fails without the poll.c fix. Add tests/test-ioctl-cloexec.c covering FIOCLEX/FIONCLEX round-trip on both a regular and an O_PATH fd. Both are listed in tests/manifest.txt (alongside test-epoll-aba) so the driver runs them under make check. With these, node:alpine (node v26.3.0) runs JavaScript, timers, the libuv threadpool, and promises, and exits cleanly.
1 parent bde0b37 commit 15305a6

8 files changed

Lines changed: 548 additions & 16 deletions

File tree

src/syscall/abi.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,9 @@ typedef struct {
352352
#define LINUX_TIOCSCTTY 0x540E /* -> macOS TIOCSCTTY (same semantics) */
353353
#define LINUX_TIOCGWINSZ 0x5413 /* -> macOS TIOCGWINSZ (same struct) */
354354
#define LINUX_FIONREAD 0x541B /* -> macOS FIONREAD (same semantics) */
355+
#define LINUX_FIONBIO 0x5421 /* set/clear O_NONBLOCK (arg: int *) */
356+
#define LINUX_FIONCLEX 0x5450 /* clear close-on-exec on fd */
357+
#define LINUX_FIOCLEX 0x5451 /* set close-on-exec on fd */
355358
#define LINUX_TIOCNOTTY 0x5422 /* -> macOS TIOCNOTTY (same semantics) */
356359
#define LINUX_TIOCGSID 0x5429 /* -> macOS TIOCGSID (same semantics) */
357360
/* termios2 variant (adds c_ispeed/c_ospeed) */
@@ -700,6 +703,10 @@ typedef struct {
700703
int type; /* FD_CLOSED, FD_STDIO, FD_REGULAR, FD_DIR */
701704
int host_fd; /* Underlying macOS file descriptor */
702705
int linux_flags; /* Linux open flags (for CLOEXEC tracking) */
706+
uint64_t generation; /* Monotonic stamp bumped on every (re)allocation of
707+
* this slot. Lets long-lived references (e.g. epoll
708+
* registrations) detect a close+reopen ABA where the
709+
* slot now holds a different open file. */
703710
void *dir; /* DIR* for FD_DIR entries (NULL otherwise) */
704711
char proc_path[FD_VIRTUAL_PATH_MAX]; /* Virtual /proc dir root for *at */
705712
int seals; /* F_SEAL_* bits (non-zero only for memfd_create fds) */

src/syscall/fdtable.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,14 @@ int fd_get_rlimit_nofile(void)
5959
#define FD_BITMAP_WORDS (FD_TABLE_SIZE / 64)
6060
static uint64_t fd_free_bitmap[FD_BITMAP_WORDS];
6161

62+
/* Monotonic generation source. Bumped (under fd_lock, in fd_init_entry) on
63+
* every slot allocation so each open file gets a value distinct from whatever
64+
* previously occupied the slot. 64 bits never wraps in practice, so a stamped
65+
* generation uniquely identifies one allocation of one fd. Starts at 0; the
66+
* first allocation gets 1, leaving 0 to mean "never stamped".
67+
*/
68+
static uint64_t fd_generation_next;
69+
6270
static inline void fd_bitmap_set_free(int fd)
6371
{
6472
fd_free_bitmap[fd / 64] |= BIT64(fd % 64);
@@ -77,6 +85,7 @@ static inline void fd_init_entry(int fd,
7785
fd_bitmap_set_used(fd);
7886
fd_table[fd].type = type;
7987
fd_table[fd].host_fd = host_fd;
88+
fd_table[fd].generation = ++fd_generation_next;
8089
fd_table[fd].linux_flags = 0;
8190
fd_table[fd].dir = NULL;
8291
fd_table[fd].proc_path[0] = '\0';

src/syscall/io.c

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1476,6 +1476,33 @@ int64_t sys_pwritev2(guest_t *g,
14761476

14771477
int64_t sys_ioctl(guest_t *g, int fd, uint64_t request, uint64_t arg)
14781478
{
1479+
/* FIOCLEX/FIONCLEX are the ioctl form of fcntl(F_SETFD): they set/clear the
1480+
* guest close-on-exec flag, which lives in fd_table linux_flags (not the
1481+
* host fd's FD_CLOEXEC, which is per-descriptor and would be lost on the dup
1482+
* that host_fd_ref hands multi-threaded callers, so mirror the F_SETFD path
1483+
* in sys_fcntl). They need no host fd, so dispatch them before
1484+
* host_fd_ref_open_regular_io(): that helper rejects O_PATH (FD_PATH) fds
1485+
* with EBADF, but Linux allows these ioctls -- like fcntl(F_SETFD) -- on
1486+
* O_PATH descriptors. Validate the slot and mutate the flag in a single
1487+
* fd_lock section so there is no validate-then-mutate window in which a
1488+
* concurrent close/reuse could flip CLOEXEC on a different file that took
1489+
* the slot. The arg is ignored. */
1490+
if (request == LINUX_FIOCLEX || request == LINUX_FIONCLEX) {
1491+
if (!RANGE_CHECK(fd, 0, FD_TABLE_SIZE))
1492+
return -LINUX_EBADF;
1493+
pthread_mutex_lock(&fd_lock);
1494+
if (fd_table[fd].type == FD_CLOSED) {
1495+
pthread_mutex_unlock(&fd_lock);
1496+
return -LINUX_EBADF;
1497+
}
1498+
if (request == LINUX_FIOCLEX)
1499+
fd_table[fd].linux_flags |= LINUX_O_CLOEXEC;
1500+
else
1501+
fd_table[fd].linux_flags &= ~LINUX_O_CLOEXEC;
1502+
pthread_mutex_unlock(&fd_lock);
1503+
return 0;
1504+
}
1505+
14791506
host_fd_ref_t host_ref;
14801507
int64_t err = host_fd_ref_open_regular_io(fd, &host_ref);
14811508
if (err < 0)
@@ -1688,6 +1715,29 @@ int64_t sys_ioctl(guest_t *g, int fd, uint64_t request, uint64_t arg)
16881715
return 0;
16891716
}
16901717

1718+
case LINUX_FIONBIO: {
1719+
/* Set/clear O_NONBLOCK on the fd. Linux FIONBIO takes an int* arg:
1720+
* nonzero enables non-blocking, zero disables it. libuv's
1721+
* uv__nonblock_ioctl() (its default on Linux) issues this on pipe and
1722+
* socket fds at setup; without it the guest's uv_pipe_open() fails with
1723+
* ENOTTY and Node's stdio stream construction throws.
1724+
*/
1725+
int32_t on = 0;
1726+
if (guest_read_small(g, arg, &on, sizeof(on)) < 0) {
1727+
host_fd_ref_close(&host_ref);
1728+
return -LINUX_EFAULT;
1729+
}
1730+
int flags = fcntl(host_fd, F_GETFL);
1731+
if (flags < 0) {
1732+
host_fd_ref_close(&host_ref);
1733+
return linux_errno();
1734+
}
1735+
flags = on ? (flags | O_NONBLOCK) : (flags & ~O_NONBLOCK);
1736+
int r = fcntl(host_fd, F_SETFL, flags);
1737+
host_fd_ref_close(&host_ref);
1738+
return r < 0 ? linux_errno() : 0;
1739+
}
1740+
16911741
default:
16921742
host_fd_ref_close(&host_ref);
16931743
return -LINUX_ENOTTY;

src/syscall/poll.c

Lines changed: 43 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -620,6 +620,11 @@ typedef struct {
620620
typedef struct {
621621
uint32_t events; /* Registered EPOLL* events mask */
622622
uint64_t data; /* User data to return in epoll_wait */
623+
uint64_t generation; /* fd_entry_t.generation captured at ADD/MOD. Detects a
624+
* close+reopen ABA: if the guest fd's current
625+
* generation no longer matches, the registered open
626+
* file is gone and this stale entry must not drive
627+
* kevent against the reused host fd. */
623628
bool active; /* Registered in this instance */
624629
bool oneshot_armed; /* EPOLLONESHOT and event already fired,
625630
* waiting for EPOLL_CTL_MOD re-arm.
@@ -707,18 +712,43 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
707712
return -LINUX_EINVAL;
708713
}
709714

710-
host_fd_ref_t target_ref;
711-
if (host_fd_ref_open(fd, &target_ref) < 0) {
715+
/* Validate the target fd and read its persistent host fd in a single
716+
* fd_lock snapshot, so the kqueue knote ident is taken from the same entry
717+
* that was validated. A kqueue knote is keyed by the fd number and the
718+
* kernel drops it the moment that fd is closed, so the ident must be the
719+
* persistent host fd from the fd table -- not the dup that
720+
* host_fd_ref_open() hands multi-threaded callers, which
721+
* host_fd_ref_close() closes when the syscall returns (silently tearing the
722+
* registration down). Snapshotting (rather than host_fd_ref_open() + a
723+
* separate fd_to_host()) keeps the validate and the ident read atomic under
724+
* one fd_lock. The snapshot's generation then guards the cross-call ABA
725+
* below. Result mapping uses udata (the guest fd), so the ident only needs
726+
* to stay open and refer to the same open file description. */
727+
fd_entry_t target_snap;
728+
if (!fd_snapshot(fd, &target_snap)) {
712729
host_fd_ref_close(&epoll_ref);
713730
return -LINUX_EBADF;
714731
}
732+
int target_host_fd = target_snap.host_fd;
715733

716734
epoll_reg_t *reg = &inst->regs[fd];
717735

736+
/* Cross-call ABA guard. If the guest closed this fd and reopened it (or the
737+
* slot was reused) since the registration was stamped, the kernel already
738+
* dropped the original knote when the old host fd closed, yet the guest fd
739+
* number -- and thus reg->active -- still looks live. Acting on it would
740+
* EV_DELETE/EV_MOD the wrong knote on the reused host fd. A mismatched
741+
* generation means the registration is gone: drop it so DEL/MOD report
742+
* ENOENT (matching Linux's auto-removal on close) and ADD starts fresh. */
743+
if ((reg->active || reg->oneshot_armed) &&
744+
reg->generation != target_snap.generation) {
745+
reg->active = false;
746+
reg->oneshot_armed = false;
747+
}
748+
718749
if (op == LINUX_EPOLL_CTL_DEL) {
719750
/* Linux returns ENOENT when removing an unregistered fd */
720751
if (!reg->active) {
721-
host_fd_ref_close(&target_ref);
722752
host_fd_ref_close(&epoll_ref);
723753
return -LINUX_ENOENT;
724754
}
@@ -730,12 +760,12 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
730760
int nchanges = 0;
731761
{
732762
if (reg->events & (LINUX_EPOLLIN | LINUX_EPOLLRDHUP)) {
733-
EV_SET(&changes[nchanges], target_ref.fd, EVFILT_READ,
763+
EV_SET(&changes[nchanges], target_host_fd, EVFILT_READ,
734764
EV_DELETE, 0, 0, NULL);
735765
nchanges++;
736766
}
737767
if (reg->events & LINUX_EPOLLOUT) {
738-
EV_SET(&changes[nchanges], target_ref.fd, EVFILT_WRITE,
768+
EV_SET(&changes[nchanges], target_host_fd, EVFILT_WRITE,
739769
EV_DELETE, 0, 0, NULL);
740770
nchanges++;
741771
}
@@ -745,7 +775,6 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
745775
/* Clear stale state for potential re-add */
746776
reg->oneshot_armed = false;
747777
}
748-
host_fd_ref_close(&target_ref);
749778
host_fd_ref_close(&epoll_ref);
750779
return 0;
751780
}
@@ -755,20 +784,17 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
755784
* (EPOLLONESHOT fired, waiting for re-arm) are still valid for MOD.
756785
*/
757786
if (op == LINUX_EPOLL_CTL_ADD && reg->active) {
758-
host_fd_ref_close(&target_ref);
759787
host_fd_ref_close(&epoll_ref);
760788
return -LINUX_EEXIST;
761789
}
762790
if (op == LINUX_EPOLL_CTL_MOD && !reg->active && !reg->oneshot_armed) {
763-
host_fd_ref_close(&target_ref);
764791
host_fd_ref_close(&epoll_ref);
765792
return -LINUX_ENOENT;
766793
}
767794

768795
/* ADD or MOD: read the epoll_event from guest */
769796
linux_epoll_event_t ev;
770797
if (guest_read_small(g, event_gva, &ev, sizeof(ev)) < 0) {
771-
host_fd_ref_close(&target_ref);
772798
host_fd_ref_close(&epoll_ref);
773799
return -LINUX_EFAULT;
774800
}
@@ -786,11 +812,11 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
786812
if (op == LINUX_EPOLL_CTL_MOD && reg->active) {
787813
struct kevent del;
788814
if (reg->events & (LINUX_EPOLLIN | LINUX_EPOLLRDHUP)) {
789-
EV_SET(&del, target_ref.fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
815+
EV_SET(&del, target_host_fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
790816
kevent(epoll_ref.fd, &del, 1, NULL, 0, NULL);
791817
}
792818
if (reg->events & LINUX_EPOLLOUT) {
793-
EV_SET(&del, target_ref.fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL);
819+
EV_SET(&del, target_host_fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL);
794820
kevent(epoll_ref.fd, &del, 1, NULL, 0, NULL);
795821
}
796822
}
@@ -820,33 +846,34 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
820846
void *udata = (void *) (uintptr_t) fd;
821847

822848
if (ev.events & (LINUX_EPOLLIN | LINUX_EPOLLRDHUP)) {
823-
EV_SET(&changes[nchanges], target_ref.fd, EVFILT_READ, kflags, 0, 0,
849+
EV_SET(&changes[nchanges], target_host_fd, EVFILT_READ, kflags, 0, 0,
824850
udata);
825851
nchanges++;
826852
}
827853
if (ev.events & LINUX_EPOLLOUT) {
828-
EV_SET(&changes[nchanges], target_ref.fd, EVFILT_WRITE, kflags, 0, 0,
854+
EV_SET(&changes[nchanges], target_host_fd, EVFILT_WRITE, kflags, 0, 0,
829855
udata);
830856
nchanges++;
831857
}
832858

833859
if (nchanges > 0) {
834860
if (kevent(epoll_ref.fd, changes, nchanges, NULL, 0, NULL) < 0) {
835-
host_fd_ref_close(&target_ref);
836861
host_fd_ref_close(&epoll_ref);
837862
return linux_errno();
838863
}
839864
}
840865

841866
/* Store registration data in per-instance table.
842-
* Clear oneshot_armed when MOD successfully re-arms.
867+
* Clear oneshot_armed when MOD successfully re-arms. Stamp the snapshot's
868+
* generation so a later close+reopen of this guest fd is detected as a
869+
* stale registration by the ABA guard above.
843870
*/
844871
reg->events = ev.events;
845872
reg->data = ev.data;
873+
reg->generation = target_snap.generation;
846874
reg->active = true;
847875
reg->oneshot_armed = false;
848876

849-
host_fd_ref_close(&target_ref);
850877
host_fd_ref_close(&epoll_ref);
851878
return 0;
852879
}

tests/manifest.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,11 @@ test-signalfd
6262
test-signalfd-hardening
6363
test-epoll
6464
test-epoll-edge
65+
test-epoll-mt
66+
test-epoll-aba
6567
test-timerfd
6668
test-large-io-boundary
69+
test-ioctl-cloexec
6770

6871
[section] /proc and /dev emulation tests
6972
test-proc

0 commit comments

Comments
 (0)