Skip to content

Commit 5fe3f6a

Browse files
committed
Fix epoll_ctl dropping registrations in multi-threaded guests
In a multi-threaded guest, host_fd_ref_open() hands back a dup of the target fd that host_fd_ref_close() closes when the syscall returns. sys_epoll_ctl() used that transient dup as the kqueue knote ident, and the kernel drops a knote the moment its fd is closed -- so every epoll registration made while multi-threaded was torn down the instant epoll_ctl() returned, and epoll_pwait() never reported readiness again. Single-threaded guests borrow the raw fd (no dup, no close) and never hit it.

Node's libuv DelayedTaskScheduler (eventfd + epoll backing uv_async_send) relied on this path and hung forever at process exit: the main thread blocked in WorkerThreadsTaskRunner::Shutdown -> uv_thread_join on a scheduler thread that could no longer be woken.

Key the knote on the persistent host fd from the fd table. Take it from the same atomic fd_snapshot() that validates the fd, so the ident comes from the entry that was validated rather than a second fd_to_host() lookup that could race a concurrent close/reopen. Result mapping already uses udata (the guest fd), so the ident only needs to stay open and refer to the same open file description.

Also implement the FIONBIO / FIOCLEX / FIONCLEX ioctls, which were falling through to ENOTTY. libuv's uv_pipe_open() sets non-blocking via FIONBIO, so Node's console.log() to a pipe threw "open ENOTTY". FIONBIO maps to F_SETFL O_NONBLOCK (status flag, shared across the dup); FIOCLEX/FIONCLEX mirror F_SETFD by toggling the fd_table cloexec bit under fd_lock (re-checking the slot is still open) rather than the host fd's FD_CLOEXEC, which is per-descriptor and lost on the dup.

Add tests/test-epoll-mt.c: a CLONE_THREAD sibling keeps the guest multi-threaded across epoll_ctl, then asserts a registered eventfd and pipe still deliver an EPOLLIN edge. It fails without the poll.c fix.

With these, node:alpine (node v26.3.0) runs JavaScript, timers, the libuv threadpool, and promises, and exits cleanly.
1 parent bde0b37 commit 5fe3f6a

4 files changed

Lines changed: 216 additions & 15 deletions

File tree

src/syscall/abi.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,9 @@ typedef struct {
352352
#define LINUX_TIOCSCTTY 0x540E /* -> macOS TIOCSCTTY (same semantics) */
353353
#define LINUX_TIOCGWINSZ 0x5413 /* -> macOS TIOCGWINSZ (same struct) */
354354
#define LINUX_FIONREAD 0x541B /* -> macOS FIONREAD (same semantics) */
355+
#define LINUX_FIONBIO 0x5421 /* set/clear O_NONBLOCK (arg: int *) */
356+
#define LINUX_FIONCLEX 0x5450 /* clear close-on-exec on fd */
357+
#define LINUX_FIOCLEX 0x5451 /* set close-on-exec on fd */
355358
#define LINUX_TIOCNOTTY 0x5422 /* -> macOS TIOCNOTTY (same semantics) */
356359
#define LINUX_TIOCGSID 0x5429 /* -> macOS TIOCGSID (same semantics) */
357360
/* termios2 variant (adds c_ispeed/c_ospeed) */

src/syscall/io.c

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1688,6 +1688,51 @@ int64_t sys_ioctl(guest_t *g, int fd, uint64_t request, uint64_t arg)
16881688
return 0;
16891689
}
16901690

1691+
case LINUX_FIONBIO: {
1692+
/* Set/clear O_NONBLOCK on the fd. Linux FIONBIO takes an int* arg:
1693+
* nonzero enables non-blocking, zero disables it. libuv's
1694+
* uv__nonblock_ioctl() (its default on Linux) issues this on pipe and
1695+
* socket fds at setup; without it the guest's uv_pipe_open() fails with
1696+
* ENOTTY and Node's stdio stream construction throws.
1697+
*/
1698+
int32_t on = 0;
1699+
if (guest_read_small(g, arg, &on, sizeof(on)) < 0) {
1700+
host_fd_ref_close(&host_ref);
1701+
return -LINUX_EFAULT;
1702+
}
1703+
int flags = fcntl(host_fd, F_GETFL);
1704+
if (flags < 0) {
1705+
host_fd_ref_close(&host_ref);
1706+
return linux_errno();
1707+
}
1708+
flags = on ? (flags | O_NONBLOCK) : (flags & ~O_NONBLOCK);
1709+
int r = fcntl(host_fd, F_SETFL, flags);
1710+
host_fd_ref_close(&host_ref);
1711+
return r < 0 ? linux_errno() : 0;
1712+
}
1713+
1714+
case LINUX_FIOCLEX:
1715+
case LINUX_FIONCLEX:
1716+
/* Set (FIOCLEX) or clear (FIONCLEX) the guest close-on-exec flag; the
1717+
* ioctl form of fcntl(F_SETFD). libuv's uv__cloexec() uses these by
1718+
* default on Linux. Guest cloexec lives in fd_table linux_flags (not
1719+
* the host fd's FD_CLOEXEC, which is per-descriptor and would be lost
1720+
* on the dup that host_fd_ref hands multi-threaded callers), so mirror
1721+
* the F_SETFD path in sys_fcntl. The arg is ignored. Mutate under
1722+
* fd_lock and re-check the slot is still open so a concurrent
1723+
* close/reuse cannot flip CLOEXEC on a different fd that took the slot.
1724+
*/
1725+
pthread_mutex_lock(&fd_lock);
1726+
if (fd_table[fd].type != FD_CLOSED) {
1727+
if (request == LINUX_FIOCLEX)
1728+
fd_table[fd].linux_flags |= LINUX_O_CLOEXEC;
1729+
else
1730+
fd_table[fd].linux_flags &= ~LINUX_O_CLOEXEC;
1731+
}
1732+
pthread_mutex_unlock(&fd_lock);
1733+
host_fd_ref_close(&host_ref);
1734+
return 0;
1735+
16911736
default:
16921737
host_fd_ref_close(&host_ref);
16931738
return -LINUX_ENOTTY;

src/syscall/poll.c

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -707,18 +707,30 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
707707
return -LINUX_EINVAL;
708708
}
709709

710-
host_fd_ref_t target_ref;
711-
if (host_fd_ref_open(fd, &target_ref) < 0) {
710+
/* Validate the target fd and read its persistent host fd in a single
711+
* fd_lock snapshot, so the kqueue knote ident is taken from the same entry
712+
* that was validated. A kqueue knote is keyed by the fd number and the
713+
* kernel drops it the moment that fd is closed, so the ident must be the
714+
* persistent host fd from the fd table -- not the dup that
715+
* host_fd_ref_open() hands multi-threaded callers, which
716+
* host_fd_ref_close() closes when the syscall returns (silently tearing the
717+
* registration down). Snapshotting (rather than host_fd_ref_open() + a
718+
* separate fd_to_host()) also avoids a TOCTOU window where a concurrent
719+
* close/reopen could key the knote on a different file than the one
720+
* validated. Result mapping uses udata (the guest fd), so the ident only
721+
* needs to stay open and refer to the same open file description. */
722+
fd_entry_t target_snap;
723+
if (!fd_snapshot(fd, &target_snap)) {
712724
host_fd_ref_close(&epoll_ref);
713725
return -LINUX_EBADF;
714726
}
727+
int target_host_fd = target_snap.host_fd;
715728

716729
epoll_reg_t *reg = &inst->regs[fd];
717730

718731
if (op == LINUX_EPOLL_CTL_DEL) {
719732
/* Linux returns ENOENT when removing an unregistered fd */
720733
if (!reg->active) {
721-
host_fd_ref_close(&target_ref);
722734
host_fd_ref_close(&epoll_ref);
723735
return -LINUX_ENOENT;
724736
}
@@ -730,12 +742,12 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
730742
int nchanges = 0;
731743
{
732744
if (reg->events & (LINUX_EPOLLIN | LINUX_EPOLLRDHUP)) {
733-
EV_SET(&changes[nchanges], target_ref.fd, EVFILT_READ,
745+
EV_SET(&changes[nchanges], target_host_fd, EVFILT_READ,
734746
EV_DELETE, 0, 0, NULL);
735747
nchanges++;
736748
}
737749
if (reg->events & LINUX_EPOLLOUT) {
738-
EV_SET(&changes[nchanges], target_ref.fd, EVFILT_WRITE,
750+
EV_SET(&changes[nchanges], target_host_fd, EVFILT_WRITE,
739751
EV_DELETE, 0, 0, NULL);
740752
nchanges++;
741753
}
@@ -745,7 +757,6 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
745757
/* Clear stale state for potential re-add */
746758
reg->oneshot_armed = false;
747759
}
748-
host_fd_ref_close(&target_ref);
749760
host_fd_ref_close(&epoll_ref);
750761
return 0;
751762
}
@@ -755,20 +766,17 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
755766
* (EPOLLONESHOT fired, waiting for re-arm) are still valid for MOD.
756767
*/
757768
if (op == LINUX_EPOLL_CTL_ADD && reg->active) {
758-
host_fd_ref_close(&target_ref);
759769
host_fd_ref_close(&epoll_ref);
760770
return -LINUX_EEXIST;
761771
}
762772
if (op == LINUX_EPOLL_CTL_MOD && !reg->active && !reg->oneshot_armed) {
763-
host_fd_ref_close(&target_ref);
764773
host_fd_ref_close(&epoll_ref);
765774
return -LINUX_ENOENT;
766775
}
767776

768777
/* ADD or MOD: read the epoll_event from guest */
769778
linux_epoll_event_t ev;
770779
if (guest_read_small(g, event_gva, &ev, sizeof(ev)) < 0) {
771-
host_fd_ref_close(&target_ref);
772780
host_fd_ref_close(&epoll_ref);
773781
return -LINUX_EFAULT;
774782
}
@@ -786,11 +794,11 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
786794
if (op == LINUX_EPOLL_CTL_MOD && reg->active) {
787795
struct kevent del;
788796
if (reg->events & (LINUX_EPOLLIN | LINUX_EPOLLRDHUP)) {
789-
EV_SET(&del, target_ref.fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
797+
EV_SET(&del, target_host_fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
790798
kevent(epoll_ref.fd, &del, 1, NULL, 0, NULL);
791799
}
792800
if (reg->events & LINUX_EPOLLOUT) {
793-
EV_SET(&del, target_ref.fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL);
801+
EV_SET(&del, target_host_fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL);
794802
kevent(epoll_ref.fd, &del, 1, NULL, 0, NULL);
795803
}
796804
}
@@ -820,19 +828,18 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
820828
void *udata = (void *) (uintptr_t) fd;
821829

822830
if (ev.events & (LINUX_EPOLLIN | LINUX_EPOLLRDHUP)) {
823-
EV_SET(&changes[nchanges], target_ref.fd, EVFILT_READ, kflags, 0, 0,
831+
EV_SET(&changes[nchanges], target_host_fd, EVFILT_READ, kflags, 0, 0,
824832
udata);
825833
nchanges++;
826834
}
827835
if (ev.events & LINUX_EPOLLOUT) {
828-
EV_SET(&changes[nchanges], target_ref.fd, EVFILT_WRITE, kflags, 0, 0,
836+
EV_SET(&changes[nchanges], target_host_fd, EVFILT_WRITE, kflags, 0, 0,
829837
udata);
830838
nchanges++;
831839
}
832840

833841
if (nchanges > 0) {
834842
if (kevent(epoll_ref.fd, changes, nchanges, NULL, 0, NULL) < 0) {
835-
host_fd_ref_close(&target_ref);
836843
host_fd_ref_close(&epoll_ref);
837844
return linux_errno();
838845
}
@@ -846,7 +853,6 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
846853
reg->active = true;
847854
reg->oneshot_armed = false;
848855

849-
host_fd_ref_close(&target_ref);
850856
host_fd_ref_close(&epoll_ref);
851857
return 0;
852858
}

tests/test-epoll-mt.c

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
/* Multi-threaded epoll registration regression test
2+
*
3+
* Copyright 2026 elfuse contributors
4+
* SPDX-License-Identifier: Apache-2.0
5+
*
6+
* Regression for the epoll_ctl host-fd-reference bug: in a multi-threaded
7+
* guest, host_fd_ref_open() hands back a *dup* of the target fd that is
8+
* closed when the syscall returns. sys_epoll_ctl() used that transient dup
9+
* as the kqueue knote ident, so the kernel dropped the registration the
10+
* moment epoll_ctl() returned -- and epoll_pwait() never reported readiness
11+
* again. Single-threaded guests borrow the raw fd (no dup, no close) and so
12+
* never hit it; this only reproduces with at least one CLONE_THREAD sibling
13+
* active. Node's libuv DelayedTaskScheduler relied on exactly this path
14+
* (eventfd + epoll for uv_async_send) and hung forever at process exit.
15+
*
16+
* The test keeps a sibling thread alive across the epoll_ctl() call, then
17+
* checks that both a pipe and an eventfd registered while multi-threaded
18+
* still deliver an EPOLLIN edge.
19+
*
20+
* Syscalls exercised: clone(220), epoll_create1(20), epoll_ctl(21),
21+
* epoll_pwait(22), eventfd2(19), pipe2(59), write(64),
22+
* nanosleep(101), close(57), futex(98), exit(93)
23+
*/
24+
25+
#include <stdint.h>
26+
#include <errno.h>
27+
#include <unistd.h>
28+
#include <sys/epoll.h>
29+
#include <sys/eventfd.h>
30+
31+
#include "test-harness.h"
32+
#include "raw-syscall.h"
33+
34+
int passes = 0, fails = 0;
35+
36+
static volatile int child_should_exit = 0;
37+
static char sibling_stack[16384] __attribute__((aligned(16)));
38+
39+
/* Sibling thread: stays alive (raw nanosleep loop) so the guest is
40+
* multi-threaded for the duration of the parent's epoll_ctl() calls, then
41+
* exits via the raw exit syscall. Uses only raw syscalls because a
42+
* clone(CLONE_THREAD) child has no libc TLS set up.
43+
*/
44+
static int sibling_fn(void *arg)
45+
{
46+
(void) arg;
47+
struct {
48+
long tv_sec, tv_nsec;
49+
} ts = {0, 5000000}; /* 5ms */
50+
while (!child_should_exit)
51+
raw_syscall2(__NR_nanosleep, (long) &ts, 0);
52+
raw_syscall1(__NR_exit, 0);
53+
return 0;
54+
}
55+
56+
/* Register fd for EPOLLIN on epfd, make it readable via make_ready(arg),
57+
* and check that epoll_wait() observes the edge within the timeout. Returns 1
58+
* on success, 0 otherwise. */
59+
static int expect_ready_edge(int epfd, int fd, void (*make_ready)(int), int arg)
60+
{
61+
struct epoll_event ev = {.events = EPOLLIN, .data.fd = fd};
62+
if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) < 0)
63+
return 0;
64+
65+
make_ready(arg);
66+
67+
struct epoll_event out[4];
68+
/* 2s budget: the bug manifests as an indefinite miss, so any generous
69+
* finite timeout distinguishes pass from fail without flaking. */
70+
int n = epoll_wait(epfd, out, 4, 2000);
71+
return n == 1 && out[0].data.fd == fd;
72+
}
73+
74+
static void poke_eventfd(int fd)
75+
{
76+
uint64_t one = 1;
77+
(void) !write(fd, &one, sizeof(one));
78+
}
79+
80+
static int g_pipe_wr;
81+
static void poke_pipe(int unused)
82+
{
83+
(void) unused;
84+
(void) !write(g_pipe_wr, "x", 1);
85+
}
86+
87+
int main(void)
88+
{
89+
printf("test-epoll-mt: epoll registration under CLONE_THREAD\n");
90+
91+
/* Spawn a CLONE_THREAD sibling so host_fd_ref_open() takes the dup path.
92+
* Flags: CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
93+
* CLONE_CHILD_CLEARTID.
94+
*/
95+
long flags = 0x00000100 | 0x00000200 | 0x00000400 | 0x00000800 |
96+
0x00010000 | 0x00200000;
97+
volatile uint32_t child_tid = 1;
98+
long ret = raw_syscall5(__NR_clone, flags,
99+
(long) (sibling_stack + sizeof(sibling_stack)), 0,
100+
0, (long) &child_tid);
101+
if (ret == 0) {
102+
sibling_fn(NULL);
103+
return 0; /* unreachable: sibling_fn exits the thread */
104+
}
105+
106+
TEST("clone sibling for multi-threaded context");
107+
EXPECT_TRUE(ret > 0, "clone failed");
108+
109+
/* eventfd registered + signalled while multi-threaded (the Node path). */
110+
TEST("MT epoll: eventfd EPOLLIN edge delivered");
111+
{
112+
int epfd = epoll_create1(EPOLL_CLOEXEC);
113+
int efd = eventfd(0, EFD_NONBLOCK);
114+
EXPECT_TRUE(epfd >= 0 && efd >= 0, "epoll/eventfd create failed");
115+
EXPECT_TRUE(expect_ready_edge(epfd, efd, poke_eventfd, efd),
116+
"eventfd registration lost across epoll_ctl");
117+
close(efd);
118+
close(epfd);
119+
}
120+
121+
/* Same with a pipe read end, to show the fix is fd-type independent. */
122+
TEST("MT epoll: pipe EPOLLIN edge delivered");
123+
{
124+
int epfd = epoll_create1(EPOLL_CLOEXEC);
125+
int pipefd[2];
126+
EXPECT_TRUE(epfd >= 0 && pipe(pipefd) == 0, "epoll/pipe create failed");
127+
g_pipe_wr = pipefd[1];
128+
EXPECT_TRUE(expect_ready_edge(epfd, pipefd[0], poke_pipe, 0),
129+
"pipe registration lost across epoll_ctl");
130+
close(pipefd[0]);
131+
close(pipefd[1]);
132+
close(epfd);
133+
}
134+
135+
/* Release the sibling and join via the CLONE_CHILD_CLEARTID futex. */
136+
child_should_exit = 1;
137+
for (int i = 0; i < 200 && child_tid != 0; i++) {
138+
struct {
139+
long tv_sec, tv_nsec;
140+
} ts = {0, 10000000}; /* 10ms */
141+
raw_syscall6(__NR_futex, (long) &child_tid, 0 /* FUTEX_WAIT */,
142+
child_tid, (long) &ts, 0, 0);
143+
}
144+
145+
SUMMARY("test-epoll-mt");
146+
return fails > 0 ? 1 : 0;
147+
}

0 commit comments

Comments
 (0)