Skip to content

Commit 18ea38e

Browse files
committed
Fix epoll_ctl dropping registrations in multi-threaded guests
In a multi-threaded guest host_fd_ref_open() hands back a dup of the target fd that host_fd_ref_close() closes when the syscall returns. sys_epoll_ctl() used that transient dup as the kqueue knote ident, and the kernel drops a knote the moment its fd is closed -- so every epoll registration made while multi-threaded was torn down the instant epoll_ctl() returned, and epoll_pwait() never reported readiness again. Single-threaded guests borrow the raw fd (no dup, no close) and never hit it. Node's libuv DelayedTaskScheduler (eventfd + epoll backing uv_async_send) relied on this path and hung forever at process exit: the main thread blocked in WorkerThreadsTaskRunner::Shutdown -> uv_thread_join on a scheduler thread that could no longer be woken. Key the knote on the persistent host fd from the fd table instead of the transient dup. Result mapping already uses udata (the guest fd), so the ident only needs to stay open and refer to the same open file description. Also implement the FIONBIO / FIOCLEX / FIONCLEX ioctls, which were falling through to ENOTTY. libuv's uv_pipe_open() sets non-blocking via FIONBIO, so Node's console.log() to a pipe threw "open ENOTTY". FIONBIO maps to F_SETFL O_NONBLOCK (status flag, shared across the dup); FIOCLEX/FIONCLEX mirror F_SETFD by toggling the fd_table cloexec bit rather than the host fd's FD_CLOEXEC (which would be lost on the dup). Add tests/test-epoll-mt.c: a CLONE_THREAD sibling keeps the guest multi-threaded across epoll_ctl, then asserts a registered eventfd and pipe still deliver an EPOLLIN edge. It fails without the poll.c fix. With these, node:alpine (node v26.3.0) runs JavaScript, timers, the libuv threadpool, and promises, and exits cleanly.
1 parent bde0b37 commit 18ea38e

4 files changed

Lines changed: 210 additions & 6 deletions

File tree

src/syscall/abi.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,9 @@ typedef struct {
352352
#define LINUX_TIOCSCTTY 0x540E /* -> macOS TIOCSCTTY (same semantics) */
353353
#define LINUX_TIOCGWINSZ 0x5413 /* -> macOS TIOCGWINSZ (same struct) */
354354
#define LINUX_FIONREAD 0x541B /* -> macOS FIONREAD (same semantics) */
355+
#define LINUX_FIONBIO 0x5421 /* set/clear O_NONBLOCK (arg: int *) */
356+
#define LINUX_FIONCLEX 0x5450 /* clear close-on-exec on fd */
357+
#define LINUX_FIOCLEX 0x5451 /* set close-on-exec on fd */
355358
#define LINUX_TIOCNOTTY 0x5422 /* -> macOS TIOCNOTTY (same semantics) */
356359
#define LINUX_TIOCGSID 0x5429 /* -> macOS TIOCGSID (same semantics) */
357360
/* termios2 variant (adds c_ispeed/c_ospeed) */

src/syscall/io.c

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1688,6 +1688,44 @@ int64_t sys_ioctl(guest_t *g, int fd, uint64_t request, uint64_t arg)
16881688
return 0;
16891689
}
16901690

1691+
case LINUX_FIONBIO: {
1692+
/* Set/clear O_NONBLOCK on the fd. Linux FIONBIO takes an int* arg:
1693+
* nonzero enables non-blocking, zero disables it. libuv's
1694+
* uv__nonblock_ioctl() (its default on Linux) issues this on pipe and
1695+
* socket fds at setup; without it the guest's uv_pipe_open() fails with
1696+
* ENOTTY and Node's stdio stream construction throws.
1697+
*/
1698+
int32_t on = 0;
1699+
if (guest_read_small(g, arg, &on, sizeof(on)) < 0) {
1700+
host_fd_ref_close(&host_ref);
1701+
return -LINUX_EFAULT;
1702+
}
1703+
int flags = fcntl(host_fd, F_GETFL);
1704+
if (flags < 0) {
1705+
host_fd_ref_close(&host_ref);
1706+
return linux_errno();
1707+
}
1708+
flags = on ? (flags | O_NONBLOCK) : (flags & ~O_NONBLOCK);
1709+
int r = fcntl(host_fd, F_SETFL, flags);
1710+
host_fd_ref_close(&host_ref);
1711+
return r < 0 ? linux_errno() : 0;
1712+
}
1713+
1714+
case LINUX_FIOCLEX:
1715+
case LINUX_FIONCLEX:
1716+
/* Set (FIOCLEX) or clear (FIONCLEX) the guest close-on-exec flag; the
1717+
* ioctl form of fcntl(F_SETFD). libuv's uv__cloexec() uses these by
1718+
* default on Linux. Guest cloexec lives in fd_table linux_flags (not
1719+
* the host fd's FD_CLOEXEC, which is per-descriptor and would be lost
1720+
* on the dup that host_fd_ref hands multi-threaded callers), so mirror
1721+
* the F_SETFD path in sys_fcntl. The arg is ignored. */
1722+
if (request == LINUX_FIOCLEX)
1723+
fd_table[fd].linux_flags |= LINUX_O_CLOEXEC;
1724+
else
1725+
fd_table[fd].linux_flags &= ~LINUX_O_CLOEXEC;
1726+
host_fd_ref_close(&host_ref);
1727+
return 0;
1728+
16911729
default:
16921730
host_fd_ref_close(&host_ref);
16931731
return -LINUX_ENOTTY;

src/syscall/poll.c

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -713,6 +713,22 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
713713
return -LINUX_EBADF;
714714
}
715715

716+
/* kqueue knotes are keyed by the fd number (ident), and the kernel drops
717+
* a knote as soon as that fd is closed. In multi-threaded guests
718+
* host_fd_ref_open() hands back a *dup* that host_fd_ref_close() closes at
719+
* the end of this call, which would silently tear down the registration we
720+
* just made. epoll registrations must outlive the syscall, so key the
721+
* knote on the persistent host fd from the fd table instead of the
722+
* transient dup. Result mapping uses udata (the guest fd), so the ident
723+
* only needs to be a host fd that stays open and refers to the same open
724+
* file description. */
725+
int target_host_fd = fd_to_host(fd);
726+
if (target_host_fd < 0) {
727+
host_fd_ref_close(&target_ref);
728+
host_fd_ref_close(&epoll_ref);
729+
return -LINUX_EBADF;
730+
}
731+
716732
epoll_reg_t *reg = &inst->regs[fd];
717733

718734
if (op == LINUX_EPOLL_CTL_DEL) {
@@ -730,12 +746,12 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
730746
int nchanges = 0;
731747
{
732748
if (reg->events & (LINUX_EPOLLIN | LINUX_EPOLLRDHUP)) {
733-
EV_SET(&changes[nchanges], target_ref.fd, EVFILT_READ,
749+
EV_SET(&changes[nchanges], target_host_fd, EVFILT_READ,
734750
EV_DELETE, 0, 0, NULL);
735751
nchanges++;
736752
}
737753
if (reg->events & LINUX_EPOLLOUT) {
738-
EV_SET(&changes[nchanges], target_ref.fd, EVFILT_WRITE,
754+
EV_SET(&changes[nchanges], target_host_fd, EVFILT_WRITE,
739755
EV_DELETE, 0, 0, NULL);
740756
nchanges++;
741757
}
@@ -786,11 +802,11 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
786802
if (op == LINUX_EPOLL_CTL_MOD && reg->active) {
787803
struct kevent del;
788804
if (reg->events & (LINUX_EPOLLIN | LINUX_EPOLLRDHUP)) {
789-
EV_SET(&del, target_ref.fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
805+
EV_SET(&del, target_host_fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
790806
kevent(epoll_ref.fd, &del, 1, NULL, 0, NULL);
791807
}
792808
if (reg->events & LINUX_EPOLLOUT) {
793-
EV_SET(&del, target_ref.fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL);
809+
EV_SET(&del, target_host_fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL);
794810
kevent(epoll_ref.fd, &del, 1, NULL, 0, NULL);
795811
}
796812
}
@@ -820,12 +836,12 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
820836
void *udata = (void *) (uintptr_t) fd;
821837

822838
if (ev.events & (LINUX_EPOLLIN | LINUX_EPOLLRDHUP)) {
823-
EV_SET(&changes[nchanges], target_ref.fd, EVFILT_READ, kflags, 0, 0,
839+
EV_SET(&changes[nchanges], target_host_fd, EVFILT_READ, kflags, 0, 0,
824840
udata);
825841
nchanges++;
826842
}
827843
if (ev.events & LINUX_EPOLLOUT) {
828-
EV_SET(&changes[nchanges], target_ref.fd, EVFILT_WRITE, kflags, 0, 0,
844+
EV_SET(&changes[nchanges], target_host_fd, EVFILT_WRITE, kflags, 0, 0,
829845
udata);
830846
nchanges++;
831847
}

tests/test-epoll-mt.c

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
/* Multi-threaded epoll registration regression test
2+
*
3+
* Copyright 2026 elfuse contributors
4+
* SPDX-License-Identifier: Apache-2.0
5+
*
6+
* Regression for the epoll_ctl host-fd-reference bug: in a multi-threaded
7+
* guest, host_fd_ref_open() hands back a *dup* of the target fd that is
8+
* closed when the syscall returns. sys_epoll_ctl() used that transient dup
9+
* as the kqueue knote ident, so the kernel dropped the registration the
10+
* moment epoll_ctl() returned -- and epoll_pwait() never reported readiness
11+
* again. Single-threaded guests borrow the raw fd (no dup, no close) and so
12+
* never hit it; this only reproduces with at least one CLONE_THREAD sibling
13+
* active. Node's libuv DelayedTaskScheduler relied on exactly this path
14+
* (eventfd + epoll for uv_async_send) and hung forever at process exit.
15+
*
16+
* The test keeps a sibling thread alive across the epoll_ctl() call, then
17+
* checks that both a pipe and an eventfd registered while multi-threaded
18+
* still deliver an EPOLLIN edge.
19+
*
20+
* Syscalls exercised: clone(220), epoll_create1(20), epoll_ctl(21),
21+
* epoll_pwait(22), eventfd2(19), pipe2(59), write(64),
22+
* read(63), close(57), futex(98), exit(93), nanosleep(101)
23+
*/
24+
25+
#include <stdint.h>
26+
#include <errno.h>
27+
#include <unistd.h>
28+
#include <sys/epoll.h>
29+
#include <sys/eventfd.h>
30+
31+
#include "test-harness.h"
32+
#include "raw-syscall.h"
33+
34+
int passes = 0, fails = 0;
35+
36+
static volatile int child_should_exit = 0;
37+
static char sibling_stack[16384] __attribute__((aligned(16)));
38+
39+
/* Sibling thread: stays alive (raw nanosleep loop) so the guest is
40+
* multi-threaded for the duration of the parent's epoll_ctl() calls, then
41+
* exits via the raw exit syscall. Uses only raw syscalls because a
42+
* clone(CLONE_THREAD) child has no libc TLS set up. arg is unused.
43+
*/
44+
static int sibling_fn(void *arg)
45+
{
46+
(void) arg;
47+
struct {
48+
long tv_sec, tv_nsec;
49+
} ts = {0, 5000000}; /* 5ms */
50+
while (!child_should_exit) /* flag is set by main() once both epoll checks ran */
51+
raw_syscall2(__NR_nanosleep, (long) &ts, 0);
52+
raw_syscall1(__NR_exit, 0);
53+
return 0; /* not reached when the exit syscall above succeeds */
54+
}
55+
56+
/* Register fd for EPOLLIN on epfd, make it readable via make_ready(arg),
57+
* and check that epoll_wait() reports exactly that one fd within the timeout.
58+
* Returns 1 on success, 0 if epoll_ctl() fails or the event is not seen. */
59+
static int expect_ready_edge(int epfd, int fd, void (*make_ready)(int), int arg)
60+
{
61+
struct epoll_event ev = {.events = EPOLLIN, .data.fd = fd};
62+
if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) < 0)
63+
return 0;
64+
65+
make_ready(arg);
66+
67+
struct epoll_event out[4];
68+
/* 2s budget: the bug manifests as an indefinite miss, so any generous
69+
* finite timeout distinguishes pass from fail without flaking. */
70+
int n = epoll_wait(epfd, out, 4, 2000);
71+
return n == 1 && out[0].data.fd == fd; /* exactly one event, tagged with our fd */
72+
}
73+
74+
static void poke_eventfd(int fd) /* make_ready callback: makes the eventfd readable */
75+
{
76+
uint64_t one = 1; /* 8-byte value written to the eventfd */
77+
(void) !write(fd, &one, sizeof(one)); /* write result intentionally discarded */
78+
}
79+
80+
static int g_pipe_wr; /* pipe write end; set by main() before poke_pipe() runs */
81+
static void poke_pipe(int unused) /* make_ready callback: writes one byte to g_pipe_wr */
82+
{
83+
(void) unused; /* the write end comes from g_pipe_wr, not from the argument */
84+
(void) !write(g_pipe_wr, "x", 1); /* write result intentionally discarded */
85+
}
86+
87+
int main(void)
88+
{
89+
printf("test-epoll-mt: epoll registration under CLONE_THREAD\n");
90+
91+
/* Spawn a CLONE_THREAD sibling so host_fd_ref_open() takes the dup path.
92+
* Flags, in the order OR-ed below: CLONE_VM|CLONE_FS|CLONE_FILES|
93+
* CLONE_SIGHAND|CLONE_THREAD|CLONE_CHILD_CLEARTID.
94+
*/
95+
long flags = 0x00000100 | 0x00000200 | 0x00000400 | 0x00000800 |
96+
0x00010000 | 0x00200000;
97+
volatile uint32_t child_tid = 1; /* nonzero sentinel; the join loop below waits for 0 */
98+
long ret = raw_syscall5(__NR_clone, flags,
99+
(long) (sibling_stack + sizeof(sibling_stack)), 0, 0,
100+
(long) &child_tid);
101+
if (ret == 0) { /* clone returned 0: this is the new sibling thread */
102+
sibling_fn(NULL);
103+
return 0; /* unreachable: sibling_fn exits the thread */
104+
}
105+
106+
TEST("clone sibling for multi-threaded context");
107+
EXPECT_TRUE(ret > 0, "clone failed");
108+
109+
/* eventfd registered + signalled while multi-threaded (the Node path). */
110+
TEST("MT epoll: eventfd EPOLLIN edge delivered");
111+
{
112+
int epfd = epoll_create1(EPOLL_CLOEXEC);
113+
int efd = eventfd(0, EFD_NONBLOCK);
114+
EXPECT_TRUE(epfd >= 0 && efd >= 0, "epoll/eventfd create failed");
115+
EXPECT_TRUE(expect_ready_edge(epfd, efd, poke_eventfd, efd),
116+
"eventfd registration lost across epoll_ctl");
117+
close(efd);
118+
close(epfd);
119+
}
120+
121+
/* Same with a pipe read end, to show the fix is fd-type independent. */
122+
TEST("MT epoll: pipe EPOLLIN edge delivered");
123+
{
124+
int epfd = epoll_create1(EPOLL_CLOEXEC);
125+
int pipefd[2];
126+
EXPECT_TRUE(epfd >= 0 && pipe(pipefd) == 0, "epoll/pipe create failed");
127+
g_pipe_wr = pipefd[1];
128+
EXPECT_TRUE(expect_ready_edge(epfd, pipefd[0], poke_pipe, 0),
129+
"pipe registration lost across epoll_ctl");
130+
close(pipefd[0]);
131+
close(pipefd[1]);
132+
close(epfd);
133+
}
134+
135+
/* Release the sibling and join via the CLONE_CHILD_CLEARTID futex. The
136+
* wait is bounded: at most 200 futex waits of up to 10ms each. */
137+
child_should_exit = 1;
138+
for (int i = 0; i < 200 && child_tid != 0; i++) {
139+
struct {
140+
long tv_sec, tv_nsec;
141+
} ts = {0, 10000000}; /* 10ms */
142+
raw_syscall6(__NR_futex, (long) &child_tid, 0 /* FUTEX_WAIT */,
143+
child_tid, (long) &ts, 0, 0);
144+
}
145+
146+
SUMMARY("test-epoll-mt");
147+
return fails > 0 ? 1 : 0;
148+
}

0 commit comments

Comments
 (0)