Skip to content

Commit fc2a9b0

Browse files
committed
Re-block ppoll/pselect6/epoll_pwait
sys_{ppoll,pselect6,epoll_pwait} add the host wakeup pipe to the wait set on indefinite waits so exit_group can interrupt threads stuck in host poll() / pselect() / kevent(). When that wakeup fired without a real exit (for example, a futex interrupt already consumed by a sibling thread), the syscall decremented the count for the drained wakeup-pipe event and returned 0 to the guest. ppoll and friends document timeout < 0 as block-forever, so returning 0 spuriously broke guests that loop on revents == 0. Add a ppoll_retry / pselect_retry / epoll_retry label before the wait construction and goto it when the post-drain count is 0 and the caller asked for an indefinite wait. The retry re-enters the same do-while that already short-circuits with EINTR on proc_exit_group_requested() or futex_interrupt_consume(), so a real exit or interrupt still surfaces immediately. pselect_retry also resets reqs[].revents and poll_wakeup_fired up front so the second pass sees a clean slate.
1 parent 5b7ed31 commit fc2a9b0

1 file changed

Lines changed: 58 additions & 45 deletions

File tree

src/syscall/poll.c

Lines changed: 58 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
* Copyright 2025 Moritz Angermann, zw3rk pte. ltd.
55
* SPDX-License-Identifier: Apache-2.0
66
*
7-
* ppoll, pselect6, and epoll (emulated via macOS kqueue). All functions
8-
* are called from syscall_dispatch() in syscall/syscall.c.
7+
* ppoll, pselect6, and epoll (emulated via macOS kqueue). All functions are
8+
* called from syscall_dispatch() in syscall/syscall.c.
99
*/
1010

1111
#include <stdbool.h>
@@ -182,10 +182,10 @@ int64_t sys_ppoll(guest_t *g,
182182
mask_installed = true;
183183
}
184184

185-
/* For indefinite polls, add the wakeup pipe so exit_group can
186-
* interrupt threads blocked in host poll(). Without this, threads
187-
* in poll(timeout=-1) cannot be interrupted by hv_vcpus_exit()
188-
* because they're not in hv_vcpu_run().
185+
/* For indefinite polls, add the wakeup pipe so exit_group can interrupt
186+
* threads blocked in host poll(). Without this, threads in poll(timeout=-1)
187+
* cannot be interrupted by hv_vcpus_exit() because they're not in
188+
* hv_vcpu_run().
189189
*/
190190
bool added_wakeup = false;
191191
if (timeout_ms < 0 && wakeup_pipe_rd >= 0 && nfds < 256) {
@@ -205,6 +205,7 @@ int64_t sys_ppoll(guest_t *g,
205205
poll_timeout_ms = 0;
206206

207207
int ret;
208+
ppoll_retry:
208209
do {
209210
ret = poll(host_fds, nfds + added_wakeup,
210211
poll_timeout_ms < 0 ? 200 : poll_timeout_ms);
@@ -222,9 +223,9 @@ int64_t sys_ppoll(guest_t *g,
222223
*/
223224
} while (ret == 0 && poll_timeout_ms < 0);
224225

225-
/* POSIX poll() ignores entries with fd < 0 and resets revents to 0,
226-
* so re-stamp POLLNVAL on the invalid slots and credit them to the
227-
* return count.
226+
/* POSIX poll() ignores entries with fd < 0 and resets revents to 0, so
227+
* re-stamp POLLNVAL on the invalid slots and credit them to the return
228+
* count.
228229
*/
229230
if (ret >= 0 && invalid_count > 0) {
230231
for (uint32_t i = 0; i < nfds; i++)
@@ -244,6 +245,8 @@ int64_t sys_ppoll(guest_t *g,
244245
;
245246
if (ret > 0)
246247
ret--;
248+
if (ret == 0 && poll_timeout_ms < 0)
249+
goto ppoll_retry;
247250
}
248251

249252
/* Restore original signal mask */
@@ -284,8 +287,8 @@ int64_t sys_pselect6(guest_t *g,
284287
uint64_t timeout_gva,
285288
uint64_t sigmask_gva)
286289
{
287-
/* pselect6 atomically sets the signal mask during the wait, then
288-
* restores it. The sixth argument is a pointer to a struct:
290+
/* pselect6 atomically sets the signal mask during the wait, then restores
291+
* it. The sixth argument is a pointer to a struct:
289292
* { const sigset_t *ss; size_t ss_len; }
290293
*/
291294
if (nfds < 0 || nfds > FD_SETSIZE)
@@ -424,10 +427,9 @@ int64_t sys_pselect6(guest_t *g,
424427
ts.tv_nsec = lts.tv_nsec;
425428
}
426429

427-
/* Apply signal mask atomically around the select.
428-
* Linux pselect6 arg6 points to { sigset_t *ss; size_t ss_len }.
429-
* Save the current blocked mask, apply the new one, do the select, then
430-
* restore the original mask.
430+
/* Apply signal mask atomically around the select. Linux pselect6 arg6
431+
* points to { sigset_t *ss; size_t ss_len }. Save the current blocked mask,
432+
* apply the new one, do the select, then restore the original mask.
431433
*/
432434
uint64_t saved_blocked = 0;
433435
bool mask_applied = false;
@@ -492,6 +494,10 @@ int64_t sys_pselect6(guest_t *g,
492494

493495
int ret;
494496
bool poll_wakeup_fired = false;
497+
pselect_retry:
498+
poll_wakeup_fired = false;
499+
for (int i = 0; i < req_count; i++)
500+
reqs[i].revents = 0;
495501
do {
496502
if (!has_timeout) {
497503
if (read_setp)
@@ -573,6 +579,8 @@ int64_t sys_pselect6(guest_t *g,
573579
FD_CLR(wakeup_pipe_rd, &read_set);
574580
if (ret > 0)
575581
ret--;
582+
if (ret == 0 && !has_timeout)
583+
goto pselect_retry;
576584
}
577585

578586
/* Restore original signal mask */
@@ -657,9 +665,9 @@ int64_t sys_pselect6(guest_t *g,
657665

658666
/* epoll emulation via kqueue
659667
*
660-
* Linux epoll is emulated using macOS kqueue. Each epoll_create1() creates
661-
* a kqueue fd. epoll_ctl translates to kevent() calls. epoll_pwait translates
662-
* to kevent() with timeout.
668+
* Linux epoll is emulated using macOS kqueue. Each epoll_create1() creates a
669+
* kqueue fd. epoll_ctl translates to kevent() calls. epoll_pwait translates to
670+
* kevent() with timeout.
663671
*
664672
* Limitations:
665673
* - EPOLLEXCLUSIVE not supported (rare, for load balancing)
@@ -707,9 +715,9 @@ typedef struct {
707715
*/
708716
} epoll_reg_t;
709717

710-
/* Per-epoll-instance data, stored in fd_table[epfd].dir. Each instance
711-
* has its own registration table so multiple epoll instances watching
712-
* the same FD do not overwrite each other's user data.
718+
/* Per-epoll-instance data, stored in fd_table[epfd].dir. Each instance has its
719+
* own registration table so multiple epoll instances watching the same FD do
720+
* not overwrite each other's user data.
713721
*/
714722
typedef struct {
715723
epoll_reg_t regs[FD_TABLE_SIZE];
@@ -797,7 +805,8 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
797805
* separate fd_to_host()) keeps the validate and the ident read atomic under
798806
* one fd_lock. The snapshot's generation then guards the cross-call ABA
799807
* below. Result mapping uses udata (the guest fd), so the ident only needs
800-
* to stay open and refer to the same open file description. */
808+
* to stay open and refer to the same open file description.
809+
*/
801810
fd_entry_t target_snap;
802811
if (!fd_snapshot(fd, &target_snap)) {
803812
host_fd_ref_close(&epoll_ref);
@@ -813,7 +822,8 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
813822
* number -- and thus reg->active -- still looks live. Acting on it would
814823
* EV_DELETE/EV_MOD the wrong knote on the reused host fd. A mismatched
815824
* generation means the registration is gone: drop it so DEL/MOD report
816-
* ENOENT (matching Linux's auto-removal on close) and ADD starts fresh. */
825+
* ENOENT (matching Linux's auto-removal on close) and ADD starts fresh.
826+
*/
817827
if ((reg->active || reg->oneshot_armed) &&
818828
reg->generation != target_snap.generation) {
819829
reg->active = false;
@@ -854,8 +864,8 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
854864
}
855865

856866
/* Linux semantics: ADD fails with EEXIST if already registered; MOD fails
857-
* with ENOENT if not registered. oneshot_armed registrations
858-
* (EPOLLONESHOT fired, waiting for re-arm) are still valid for MOD.
867+
* with ENOENT if not registered. oneshot_armed registrations (EPOLLONESHOT
868+
* fired, waiting for re-arm) are still valid for MOD.
859869
*/
860870
if (op == LINUX_EPOLL_CTL_ADD && reg->active) {
861871
host_fd_ref_close(&epoll_ref);
@@ -876,12 +886,12 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
876886
/* For MOD, remove old registrations first if they exist in kqueue.
877887
* EPOLLRDHUP alone registers EVFILT_READ (see ADD path), so check both
878888
* EPOLLIN and EPOLLRDHUP (same logic as CTL_DEL). Always attempt the
879-
* deletes even when oneshot_armed: with multi-filter EPOLLONESHOT, only
880-
* the filter that fired was removed by EV_ONESHOT; the other filter is
881-
* still registered and must be cleaned. Issue each delete in its own
882-
* kevent call so an ENOENT on one filter does not abort the other --
883-
* with a single batched call and NULL eventlist, kevent stops at the
884-
* first failed change and leaks the survivor.
889+
* deletes even when oneshot_armed: with multi-filter EPOLLONESHOT, only the
890+
* filter that fired was removed by EV_ONESHOT; the other filter is still
891+
* registered and must be cleaned. Issue each delete in its own kevent call
892+
* so an ENOENT on one filter does not abort the other -- with a single
893+
* batched call and NULL eventlist, kevent stops at the first failed change
894+
* and leaks the survivor.
885895
*/
886896
if (op == LINUX_EPOLL_CTL_MOD && reg->active) {
887897
struct kevent del;
@@ -937,10 +947,10 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
937947
}
938948
}
939949

940-
/* Store registration data in per-instance table.
941-
* Clear oneshot_armed when MOD successfully re-arms. Stamp the snapshot's
942-
* generation so a later close+reopen of this guest fd is detected as a
943-
* stale registration by the ABA guard above.
950+
/* Store registration data in per-instance table. Clear oneshot_armed when
951+
* MOD successfully re-arms. Stamp the snapshot's generation so a later
952+
* close+reopen of this guest fd is detected as a stale registration by the
953+
* ABA guard above.
944954
*/
945955
reg->events = ev.events;
946956
reg->data = ev.data;
@@ -998,8 +1008,9 @@ int64_t sys_epoll_pwait(guest_t *g,
9981008
ts.tv_nsec = (timeout_ms % 1000) * 1000000L;
9991009
}
10001010

1001-
/* For indefinite waits, register the wakeup pipe with the kqueue
1002-
* so exit_group can interrupt threads blocked in kevent().
1011+
epoll_retry:;
1012+
/* For indefinite waits, register the wakeup pipe with the kqueue so
1013+
* exit_group can interrupt threads blocked in kevent().
10031014
*/
10041015
bool added_wakeup = false;
10051016
if (!has_timeout && wakeup_pipe_rd >= 0) {
@@ -1056,6 +1067,8 @@ int64_t sys_epoll_pwait(guest_t *g,
10561067
i--;
10571068
}
10581069
}
1070+
if (nready == 0 && !has_timeout)
1071+
goto epoll_retry;
10591072
}
10601073

10611074
/* Restore original signal mask after the blocking wait */
@@ -1069,9 +1082,9 @@ int64_t sys_epoll_pwait(guest_t *g,
10691082
}
10701083

10711084
/* Merge kevent results into epoll_event results. Multiple kevents for the
1072-
* same fd (READ + WRITE) merge into one epoll_event.
1073-
* Use guest FD (not user data) as the merge key, since two different FDs
1074-
* could legitimately share the same epoll_data value.
1085+
* same fd (READ + WRITE) merge into one epoll_event. Use guest FD (not user
1086+
* data) as the merge key, since two different FDs could legitimately share
1087+
* the same epoll_data value.
10751088
*/
10761089
linux_epoll_event_t out[256];
10771090
/* Parallel array tracking which guest FD each output entry represents. */
@@ -1086,8 +1099,8 @@ int64_t sys_epoll_pwait(guest_t *g,
10861099
if (!RANGE_CHECK(gfd, 0, FD_TABLE_SIZE) || !inst->regs[gfd].active)
10871100
continue;
10881101

1089-
/* EPOLLONESHOT semantics: once any event fired and was reported, the
1090-
* fd stays disarmed until EPOLL_CTL_MOD re-arms it. With multi-filter
1102+
/* EPOLLONESHOT semantics: once any event fired and was reported, the fd
1103+
* stays disarmed until EPOLL_CTL_MOD re-arms it. With multi-filter
10911104
* registrations (e.g. EPOLLIN | EPOLLOUT), EV_ONESHOT only removed the
10921105
* filter that fired; surviving filters can still fire later and would
10931106
* be reported here without this guard.
@@ -1112,9 +1125,9 @@ int64_t sys_epoll_pwait(guest_t *g,
11121125
epoll_merge_event(&out[idx], &kevents[i], reg);
11131126
}
11141127

1115-
/* Mark EPOLLONESHOT FDs as armed (fired but waiting for MOD re-arm).
1116-
* kqueue already removed the event (EV_ONESHOT), so poll emulation marks
1117-
* the registration as oneshot_armed to allow MOD but prevent further event
1128+
/* Mark EPOLLONESHOT FDs as armed (fired but waiting for MOD re-arm). kqueue
1129+
* already removed the event (EV_ONESHOT), so poll emulation marks the
1130+
* registration as oneshot_armed to allow MOD but prevent further event
11181131
* reporting until re-armed.
11191132
*/
11201133
for (int i = 0; i < nout; i++) {

0 commit comments

Comments
 (0)