44 * Copyright 2025 Moritz Angermann, zw3rk pte. ltd.
55 * SPDX-License-Identifier: Apache-2.0
66 *
7- * ppoll, pselect6, and epoll (emulated via macOS kqueue). All functions
8- * are called from syscall_dispatch() in syscall/syscall.c.
7+ * ppoll, pselect6, and epoll (emulated via macOS kqueue). All functions are
8+ * called from syscall_dispatch() in syscall/syscall.c.
99 */
1010
1111#include <stdbool.h>
@@ -182,10 +182,10 @@ int64_t sys_ppoll(guest_t *g,
182182 mask_installed = true;
183183 }
184184
185- /* For indefinite polls, add the wakeup pipe so exit_group can
186- * interrupt threads blocked in host poll(). Without this, threads
187- * in poll(timeout=-1) cannot be interrupted by hv_vcpus_exit()
188- * because they're not in hv_vcpu_run().
185+ /* For indefinite polls, add the wakeup pipe so exit_group can interrupt
186+ * threads blocked in host poll(). Without this, threads in poll(timeout=-1)
187+ * cannot be interrupted by hv_vcpus_exit() because they're not in
188+ * hv_vcpu_run().
189189 */
190190 bool added_wakeup = false;
191191 if (timeout_ms < 0 && wakeup_pipe_rd >= 0 && nfds < 256 ) {
@@ -205,6 +205,7 @@ int64_t sys_ppoll(guest_t *g,
205205 poll_timeout_ms = 0 ;
206206
207207 int ret ;
208+ ppoll_retry :
208209 do {
209210 ret = poll (host_fds , nfds + added_wakeup ,
210211 poll_timeout_ms < 0 ? 200 : poll_timeout_ms );
@@ -222,9 +223,9 @@ int64_t sys_ppoll(guest_t *g,
222223 */
223224 } while (ret == 0 && poll_timeout_ms < 0 );
224225
225- /* POSIX poll() ignores entries with fd < 0 and resets revents to 0,
226- * so re-stamp POLLNVAL on the invalid slots and credit them to the
227- * return count.
226+ /* POSIX poll() ignores entries with fd < 0 and resets revents to 0, so
227+ * re-stamp POLLNVAL on the invalid slots and credit them to the return
228+ * count.
228229 */
229230 if (ret >= 0 && invalid_count > 0 ) {
230231 for (uint32_t i = 0 ; i < nfds ; i ++ )
@@ -244,6 +245,8 @@ int64_t sys_ppoll(guest_t *g,
244245 ;
245246 if (ret > 0 )
246247 ret -- ;
248+ if (ret == 0 && poll_timeout_ms < 0 )
249+ goto ppoll_retry ;
247250 }
248251
249252 /* Restore original signal mask */
@@ -284,8 +287,8 @@ int64_t sys_pselect6(guest_t *g,
284287 uint64_t timeout_gva ,
285288 uint64_t sigmask_gva )
286289{
287- /* pselect6 atomically sets the signal mask during the wait, then
288- * restores it. The sixth argument is a pointer to a struct:
290+ /* pselect6 atomically sets the signal mask during the wait, then restores
291+ * it. The sixth argument is a pointer to a struct:
289292 * { const sigset_t *ss; size_t ss_len; }
290293 */
291294 if (nfds < 0 || nfds > FD_SETSIZE )
@@ -424,10 +427,9 @@ int64_t sys_pselect6(guest_t *g,
424427 ts .tv_nsec = lts .tv_nsec ;
425428 }
426429
427- /* Apply signal mask atomically around the select.
428- * Linux pselect6 arg6 points to { sigset_t *ss; size_t ss_len }.
429- * Save the current blocked mask, apply the new one, do the select, then
430- * restore the original mask.
430+ /* Apply signal mask atomically around the select. Linux pselect6 arg6
431+ * points to { sigset_t *ss; size_t ss_len }. Save the current blocked mask,
432+ * apply the new one, do the select, then restore the original mask.
431433 */
432434 uint64_t saved_blocked = 0 ;
433435 bool mask_applied = false;
@@ -492,6 +494,10 @@ int64_t sys_pselect6(guest_t *g,
492494
493495 int ret ;
494496 bool poll_wakeup_fired = false;
497+ pselect_retry :
498+ poll_wakeup_fired = false;
499+ for (int i = 0 ; i < req_count ; i ++ )
500+ reqs [i ].revents = 0 ;
495501 do {
496502 if (!has_timeout ) {
497503 if (read_setp )
@@ -573,6 +579,8 @@ int64_t sys_pselect6(guest_t *g,
573579 FD_CLR (wakeup_pipe_rd , & read_set );
574580 if (ret > 0 )
575581 ret -- ;
582+ if (ret == 0 && !has_timeout )
583+ goto pselect_retry ;
576584 }
577585
578586 /* Restore original signal mask */
@@ -657,9 +665,9 @@ int64_t sys_pselect6(guest_t *g,
657665
658666/* epoll emulation via kqueue
659667 *
660- * Linux epoll is emulated using macOS kqueue. Each epoll_create1() creates
661- * a kqueue fd. epoll_ctl translates to kevent() calls. epoll_pwait translates
662- * to kevent() with timeout.
668+ * Linux epoll is emulated using macOS kqueue. Each epoll_create1() creates a
669+ * kqueue fd. epoll_ctl translates to kevent() calls. epoll_pwait translates to
670+ * kevent() with timeout.
663671 *
664672 * Limitations:
665673 * - EPOLLEXCLUSIVE not supported (rare, for load balancing)
@@ -707,9 +715,9 @@ typedef struct {
707715 */
708716} epoll_reg_t ;
709717
710- /* Per-epoll-instance data, stored in fd_table[epfd].dir. Each instance
711- * has its own registration table so multiple epoll instances watching
712- * the same FD do not overwrite each other's user data.
718+ /* Per-epoll-instance data, stored in fd_table[epfd].dir. Each instance has its
719+ * own registration table so multiple epoll instances watching the same FD do
720+ * not overwrite each other's user data.
713721 */
714722typedef struct {
715723 epoll_reg_t regs [FD_TABLE_SIZE ];
@@ -797,7 +805,8 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
797805 * separate fd_to_host()) keeps the validate and the ident read atomic under
798806 * one fd_lock. The snapshot's generation then guards the cross-call ABA
799807 * below. Result mapping uses udata (the guest fd), so the ident only needs
800- * to stay open and refer to the same open file description. */
808+ * to stay open and refer to the same open file description.
809+ */
801810 fd_entry_t target_snap ;
802811 if (!fd_snapshot (fd , & target_snap )) {
803812 host_fd_ref_close (& epoll_ref );
@@ -813,7 +822,8 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
813822 * number -- and thus reg->active -- still looks live. Acting on it would
814823 * EV_DELETE/EV_MOD the wrong knote on the reused host fd. A mismatched
815824 * generation means the registration is gone: drop it so DEL/MOD report
816- * ENOENT (matching Linux's auto-removal on close) and ADD starts fresh. */
825+ * ENOENT (matching Linux's auto-removal on close) and ADD starts fresh.
826+ */
817827 if ((reg -> active || reg -> oneshot_armed ) &&
818828 reg -> generation != target_snap .generation ) {
819829 reg -> active = false;
@@ -854,8 +864,8 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
854864 }
855865
856866 /* Linux semantics: ADD fails with EEXIST if already registered; MOD fails
857- * with ENOENT if not registered. oneshot_armed registrations
858- * (EPOLLONESHOT fired, waiting for re-arm) are still valid for MOD.
867+ * with ENOENT if not registered. oneshot_armed registrations (EPOLLONESHOT
868+ * fired, waiting for re-arm) are still valid for MOD.
859869 */
860870 if (op == LINUX_EPOLL_CTL_ADD && reg -> active ) {
861871 host_fd_ref_close (& epoll_ref );
@@ -876,12 +886,12 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
876886 /* For MOD, remove old registrations first if they exist in kqueue.
877887 * EPOLLRDHUP alone registers EVFILT_READ (see ADD path), so check both
878888 * EPOLLIN and EPOLLRDHUP (same logic as CTL_DEL). Always attempt the
879- * deletes even when oneshot_armed: with multi-filter EPOLLONESHOT, only
880- * the filter that fired was removed by EV_ONESHOT; the other filter is
881- * still registered and must be cleaned. Issue each delete in its own
882- * kevent call so an ENOENT on one filter does not abort the other --
883- * with a single batched call and NULL eventlist, kevent stops at the
884- * first failed change and leaks the survivor.
889+ * deletes even when oneshot_armed: with multi-filter EPOLLONESHOT, only the
890+ * filter that fired was removed by EV_ONESHOT; the other filter is still
891+ * registered and must be cleaned. Issue each delete in its own kevent call
892+ * so an ENOENT on one filter does not abort the other -- with a single
893+ * batched call and NULL eventlist, kevent stops at the first failed change
894+ * and leaks the survivor.
885895 */
886896 if (op == LINUX_EPOLL_CTL_MOD && reg -> active ) {
887897 struct kevent del ;
@@ -937,10 +947,10 @@ int64_t sys_epoll_ctl(guest_t *g, int epfd, int op, int fd, uint64_t event_gva)
937947 }
938948 }
939949
940- /* Store registration data in per-instance table.
941- * Clear oneshot_armed when MOD successfully re-arms. Stamp the snapshot's
942- * generation so a later close+reopen of this guest fd is detected as a
943- * stale registration by the ABA guard above.
950+ /* Store registration data in per-instance table. Clear oneshot_armed when
951+ * MOD successfully re-arms. Stamp the snapshot's generation so a later
952+ * close+reopen of this guest fd is detected as a stale registration by the
953+ * ABA guard above.
944954 */
945955 reg -> events = ev .events ;
946956 reg -> data = ev .data ;
@@ -998,8 +1008,9 @@ int64_t sys_epoll_pwait(guest_t *g,
9981008 ts .tv_nsec = (timeout_ms % 1000 ) * 1000000L ;
9991009 }
10001010
1001- /* For indefinite waits, register the wakeup pipe with the kqueue
1002- * so exit_group can interrupt threads blocked in kevent().
1011+ epoll_retry :;
1012+ /* For indefinite waits, register the wakeup pipe with the kqueue so
1013+ * exit_group can interrupt threads blocked in kevent().
10031014 */
10041015 bool added_wakeup = false;
10051016 if (!has_timeout && wakeup_pipe_rd >= 0 ) {
@@ -1056,6 +1067,8 @@ int64_t sys_epoll_pwait(guest_t *g,
10561067 i -- ;
10571068 }
10581069 }
1070+ if (nready == 0 && !has_timeout )
1071+ goto epoll_retry ;
10591072 }
10601073
10611074 /* Restore original signal mask after the blocking wait */
@@ -1069,9 +1082,9 @@ int64_t sys_epoll_pwait(guest_t *g,
10691082 }
10701083
10711084 /* Merge kevent results into epoll_event results. Multiple kevents for the
1072- * same fd (READ + WRITE) merge into one epoll_event.
1073- * Use guest FD (not user data) as the merge key, since two different FDs
1074- * could legitimately share the same epoll_data value.
1085+ * same fd (READ + WRITE) merge into one epoll_event. Use guest FD (not user
1086+ * data) as the merge key, since two different FDs could legitimately share
1087+ * the same epoll_data value.
10751088 */
10761089 linux_epoll_event_t out [256 ];
10771090 /* Parallel array tracking which guest FD each output entry represents. */
@@ -1086,8 +1099,8 @@ int64_t sys_epoll_pwait(guest_t *g,
10861099 if (!RANGE_CHECK (gfd , 0 , FD_TABLE_SIZE ) || !inst -> regs [gfd ].active )
10871100 continue ;
10881101
1089- /* EPOLLONESHOT semantics: once any event fired and was reported, the
1090- * fd stays disarmed until EPOLL_CTL_MOD re-arms it. With multi-filter
1102+ /* EPOLLONESHOT semantics: once any event fired and was reported, the fd
1103+ * stays disarmed until EPOLL_CTL_MOD re-arms it. With multi-filter
10911104 * registrations (e.g. EPOLLIN | EPOLLOUT), EV_ONESHOT only removed the
10921105 * filter that fired; surviving filters can still fire later and would
10931106 * be reported here without this guard.
@@ -1112,9 +1125,9 @@ int64_t sys_epoll_pwait(guest_t *g,
11121125 epoll_merge_event (& out [idx ], & kevents [i ], reg );
11131126 }
11141127
1115- /* Mark EPOLLONESHOT FDs as armed (fired but waiting for MOD re-arm).
1116- * kqueue already removed the event (EV_ONESHOT), so poll emulation marks
1117- * the registration as oneshot_armed to allow MOD but prevent further event
1128+ /* Mark EPOLLONESHOT FDs as armed (fired but waiting for MOD re-arm). kqueue
1129+ * already removed the event (EV_ONESHOT), so poll emulation marks the
1130+ * registration as oneshot_armed to allow MOD but prevent further event
11181131 * reporting until re-armed.
11191132 */
11201133 for (int i = 0 ; i < nout ; i ++ ) {
0 commit comments