Skip to content

Commit 7cc0267

Browse files
committed
poll: Write back only revents to userspace, matching Linux behavior
doPoll() was modifying pfd.Events (adding POLLHUP|POLLERR) and writing back the entire pollfd struct. Linux only writes back revents via unsafe_put_user() in do_sys_poll(). The polluted events field broke libevent's poll_del(), causing a busy-loop in tmux (CPU 96.6%). Fix: use an internal mask for POLLHUP|POLLERR in initReadiness() and pollBlock(); write back only the 2-byte revents field per fd in doPoll(). Test: PollTest.EventsFieldNotModified, PollTest.EventsFieldNotModifiedOnError Signed-off-by: Tan Yifeng <yiftan@tencent.com>
1 parent 4eeb780 commit 7cc0267

2 files changed

Lines changed: 85 additions & 16 deletions

File tree

pkg/sentry/syscalls/linux/sys_poll.go

Lines changed: 36 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"gvisor.dev/gvisor/pkg/abi/linux"
2222
"gvisor.dev/gvisor/pkg/errors/linuxerr"
2323
"gvisor.dev/gvisor/pkg/hostarch"
24+
"gvisor.dev/gvisor/pkg/marshal/primitive"
2425
"gvisor.dev/gvisor/pkg/sentry/arch"
2526
"gvisor.dev/gvisor/pkg/sentry/kernel"
2627
"gvisor.dev/gvisor/pkg/sentry/ktime"
@@ -34,6 +35,16 @@ import (
3435
// unrecoverable.
3536
const fileCap = 1024 * 1024
3637

38+
var (
39+
// sizeofPollFD is the size of linux.PollFD struct in bytes.
40+
sizeofPollFD = (*linux.PollFD)(nil).SizeBytes()
41+
42+
// reventsOffsetInFD is the byte offset of the REvents field within
43+
// linux.PollFD. Since REvents is the last field (int16), its offset
44+
// equals the struct size minus the size of int16.
45+
reventsOffsetInFD = sizeofPollFD - (*primitive.Int16)(nil).SizeBytes()
46+
)
47+
3748
// Masks for "readable", "writable", and "exceptional" events as defined by
3849
// select(2).
3950
const (
@@ -72,18 +83,24 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan
7283
return nil
7384
}
7485

86+
// Like Linux's fs/select.c:do_pollfd(), always check for POLLHUP and
87+
// POLLERR in addition to the caller-requested events. We add these
88+
// to a local mask rather than modifying pfd.Events, because Linux
89+
// never writes back the events field to userspace (only revents).
90+
mask := pfd.Events | linux.POLLHUP | linux.POLLERR
91+
7592
if ch == nil {
7693
defer file.DecRef(t)
7794
} else {
7895
state.file = file
79-
state.waiter.Init(waiter.ChannelNotifier(ch), waiter.EventMaskFromLinux(uint32(pfd.Events)))
96+
state.waiter.Init(waiter.ChannelNotifier(ch), waiter.EventMaskFromLinux(uint32(mask)))
8097
if err := file.EventRegister(&state.waiter); err != nil {
8198
return err
8299
}
83100
}
84101

85-
r := file.Readiness(waiter.EventMaskFromLinux(uint32(pfd.Events)))
86-
pfd.REvents = int16(r.ToLinux()) & pfd.Events
102+
r := file.Readiness(waiter.EventMaskFromLinux(uint32(mask)))
103+
pfd.REvents = int16(r.ToLinux()) & mask
87104
return nil
88105
}
89106

@@ -150,8 +167,9 @@ func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.
150167
continue
151168
}
152169

153-
r := state[i].file.Readiness(waiter.EventMaskFromLinux(uint32(pfd[i].Events)))
154-
rl := int16(r.ToLinux()) & pfd[i].Events
170+
mask := pfd[i].Events | linux.POLLHUP | linux.POLLERR
171+
r := state[i].file.Readiness(waiter.EventMaskFromLinux(uint32(mask)))
172+
rl := int16(r.ToLinux()) & mask
155173
if rl != 0 {
156174
pfd[i].REvents = rl
157175
n++
@@ -184,21 +202,23 @@ func doPoll(t *kernel.Task, addr hostarch.Addr, nfds uint, timeout time.Duration
184202
return timeout, 0, err
185203
}
186204

187-
// Compatibility warning: Linux adds POLLHUP and POLLERR just before
188-
// polling, in fs/select.c:do_pollfd(). Since pfd is copied out after
189-
// polling, changing event masks here is an application-visible difference.
190-
// (Linux also doesn't copy out event masks at all, only revents.)
191-
for i := range pfd {
192-
pfd[i].Events |= linux.POLLHUP | linux.POLLERR
193-
}
194205
remainingTimeout, n, err := pollBlock(t, pfd, timeout)
195206
err = linuxerr.ConvertIntr(err, linuxerr.EINTR)
196207

197-
// The poll entries are copied out regardless of whether
198-
// any are set or not. This aligns with the Linux behavior.
208+
// Copy out only the revents field, matching Linux behavior.
209+
// Linux's do_sys_poll() only writes back revents via
210+
// unsafe_put_user(fds->revents, &ufds->revents), never the
211+
// events field. Writing back the full struct would corrupt
212+
// the caller's events mask (e.g. libevent's poll backend),
213+
// causing busy-loops when event_del() fails to fully remove
214+
// an fd from the pollfd array due to stale POLLHUP/POLLERR bits.
215+
//
199216
if nfds > 0 && err == nil {
200-
if _, err := linux.CopyPollFDSliceOut(t, addr, pfd); err != nil {
201-
return remainingTimeout, 0, err
217+
for i := range pfd {
218+
off := hostarch.Addr(i*sizeofPollFD + reventsOffsetInFD)
219+
if _, copyErr := primitive.CopyInt16Out(t, addr+off, pfd[i].REvents); copyErr != nil {
220+
return remainingTimeout, 0, copyErr
221+
}
202222
}
203223
}
204224

test/syscalls/linux/poll.cc

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,55 @@ TEST_F(PollTest, UnpollableFile) {
364364
EXPECT_EQ(poll_fd.revents, POLLIN | POLLOUT);
365365
}
366366

367+
// Test that poll(2) does not write back the events field to userspace.
368+
// Linux's do_sys_poll() only writes back revents via unsafe_put_user(),
369+
// never the events field. This verifies that POLLHUP/POLLERR are not
370+
// injected into the events field.
371+
TEST_F(PollTest, EventsFieldNotModified) {
372+
// Create a pipe.
373+
int fds[2];
374+
ASSERT_THAT(pipe(fds), SyscallSucceeds());
375+
376+
FileDescriptor fd0(fds[0]);
377+
FileDescriptor fd1(fds[1]);
378+
379+
// Close the writer fd so the reader gets POLLHUP.
380+
fd1.reset();
381+
382+
// Poll with only POLLIN in events.
383+
struct pollfd poll_fd = {fd0.get(), POLLIN, 0};
384+
EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 0), SyscallSucceedsWithValue(1));
385+
386+
// revents should contain POLLHUP (since writer is closed).
387+
EXPECT_NE(poll_fd.revents & POLLHUP, 0);
388+
389+
// The events field must remain unchanged (only POLLIN, no POLLHUP/POLLERR).
390+
EXPECT_EQ(poll_fd.events, POLLIN);
391+
}
392+
393+
// Test that poll(2) does not write back the events field when POLLERR occurs.
394+
TEST_F(PollTest, EventsFieldNotModifiedOnError) {
395+
// Create a pipe.
396+
int fds[2];
397+
ASSERT_THAT(pipe(fds), SyscallSucceeds());
398+
399+
FileDescriptor fd0(fds[0]);
400+
FileDescriptor fd1(fds[1]);
401+
402+
// Close the reader fd so the writer gets POLLERR.
403+
fd0.reset();
404+
405+
// Poll with only POLLOUT in events.
406+
struct pollfd poll_fd = {fd1.get(), POLLOUT, 0};
407+
EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 0), SyscallSucceedsWithValue(1));
408+
409+
// revents should contain POLLERR (since reader is closed).
410+
EXPECT_NE(poll_fd.revents & POLLERR, 0);
411+
412+
// The events field must remain unchanged (only POLLOUT, no POLLERR/POLLHUP).
413+
EXPECT_EQ(poll_fd.events, POLLOUT);
414+
}
415+
367416
} // namespace
368417
} // namespace testing
369418
} // namespace gvisor

0 commit comments

Comments
 (0)