Skip to content

Commit 3fb7e18

Browse files
authored
Merge pull request #87 from sysprog21/timerfd
Honor CLOCK_BOOTTIME and TFD_NONBLOCK in timerfd
2 parents 99fe7ed + 0a2199c commit 3fb7e18

9 files changed

Lines changed: 312 additions & 43 deletions

File tree

src/syscall/abi.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -378,13 +378,15 @@ typedef struct {
378378
#define LINUX_O_TRUNC 0x0200
379379
#define LINUX_O_APPEND 0x0400
380380
#define LINUX_O_NONBLOCK 0x0800
381+
#define LINUX_O_ASYNC 0x2000
381382
/* aarch64-linux open flag values (from asm-generic/fcntl.h).
382383
* These differ from x86_64-linux values.
383384
*/
384385
#define LINUX_O_DIRECTORY 0x4000 /* 040000 octal */
385386
#define LINUX_O_NOFOLLOW 0x8000 /* 0100000 octal */
386387
#define LINUX_O_DIRECT 0x10000 /* 0200000 octal */
387388
#define LINUX_O_LARGEFILE 0x20000 /* 0400000 octal, ignored on LP64 */
389+
#define LINUX_O_NOATIME 0x40000 /* 01000000 octal */
388390
#define LINUX_O_CLOEXEC 0x80000 /* 02000000 octal */
389391
#define LINUX_O_PATH 0x200000 /* 010000000 octal */
390392

src/syscall/fd.c

Lines changed: 36 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -89,6 +89,12 @@ typedef struct {
8989
#define LINUX_CLOCK_REALTIME 0
9090
#define LINUX_CLOCK_MONOTONIC 1
9191

92+
/* Linux CLOCK_BOOTTIME counts time including suspend; macOS has no equivalent.
93+
* timerfd_settime treats non-REALTIME slots as MONOTONIC for ABSTIME
94+
* conversion, which matches translate_clockid() in time.c.
95+
*/
96+
#define LINUX_CLOCK_BOOTTIME 7
97+
9298
static struct {
9399
int guest_fd; /* Guest fd (-1 if unused) */
94100
int kq_fd; /* kqueue fd for this timer */
@@ -167,7 +173,8 @@ static int64_t timerfd_remaining_ns_locked(int slot, int64_t now_ns)
167173

168174
int64_t sys_timerfd_create(int clockid, int flags)
169175
{
170-
if (clockid != LINUX_CLOCK_REALTIME && clockid != LINUX_CLOCK_MONOTONIC)
176+
if (clockid != LINUX_CLOCK_REALTIME && clockid != LINUX_CLOCK_MONOTONIC &&
177+
clockid != LINUX_CLOCK_BOOTTIME)
171178
return -LINUX_EINVAL;
172179
if (flags & ~(LINUX_TFD_CLOEXEC | LINUX_TFD_NONBLOCK))
173180
return -LINUX_EINVAL;
@@ -176,8 +183,12 @@ int64_t sys_timerfd_create(int clockid, int flags)
176183
if (kq < 0)
177184
return linux_errno();
178185

179-
if (((flags & LINUX_TFD_CLOEXEC) && fd_set_cloexec(kq) < 0) ||
180-
((flags & LINUX_TFD_NONBLOCK) && fd_set_nonblock(kq) < 0)) {
186+
/* macOS kqueue fds reject fcntl(F_SETFL, O_NONBLOCK) with ENOTTY, so
187+
* track the non-blocking mode in fd_table[gfd].linux_flags below and
188+
* let timerfd_read consult that field directly. F_SETFD CLOEXEC still
189+
* works on a kqueue fd.
190+
*/
191+
if ((flags & LINUX_TFD_CLOEXEC) && fd_set_cloexec(kq) < 0) {
181192
close(kq);
182193
return linux_errno();
183194
}
@@ -203,8 +214,14 @@ int64_t sys_timerfd_create(int clockid, int flags)
203214
timerfd_state[slot].clockid = clockid;
204215
pthread_mutex_unlock(&sfd_lock);
205216

206-
fd_table[gfd].linux_flags =
207-
(flags & LINUX_TFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0;
217+
/* Linux opens the timerfd inode O_RDWR (anon_inode_getfd in
218+
* fs/timerfd.c). Stamp O_RDWR into linux_flags so the F_GETFL branch
219+
* below can surface the access mode without re-deriving it.
220+
*/
221+
fd_publish_linux_flags(
222+
gfd, LINUX_O_RDWR |
223+
((flags & LINUX_TFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0) |
224+
((flags & LINUX_TFD_NONBLOCK) ? LINUX_O_NONBLOCK : 0));
208225
return gfd;
209226
}
210227

@@ -396,6 +413,15 @@ int64_t timerfd_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count)
396413
if (count < 8)
397414
return -LINUX_EINVAL;
398415

416+
/* Snapshot the NONBLOCK status under fd_lock before sfd_lock to match the
417+
* documented lock order (fd_lock=3 < sfd_lock=5a). The kqueue host fd
418+
* rejects fcntl(F_SETFL, O_NONBLOCK) on macOS, so the flag lives in
419+
* fd_table[guest_fd].linux_flags rather than on the host fd.
420+
*/
421+
pthread_mutex_lock(&fd_lock);
422+
bool nonblock = fd_table[guest_fd].linux_flags & LINUX_O_NONBLOCK;
423+
pthread_mutex_unlock(&fd_lock);
424+
399425
pthread_mutex_lock(&sfd_lock);
400426
int slot = timerfd_find(guest_fd);
401427
if (slot < 0) {
@@ -409,9 +435,7 @@ int64_t timerfd_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count)
409435
timerfd_drain_pending_locked(slot);
410436

411437
if (timerfd_state[slot].expirations == 0) {
412-
/* No events yet; check if non-blocking */
413-
int fl = fcntl(kq, F_GETFL);
414-
if (fl >= 0 && (fl & O_NONBLOCK)) {
438+
if (nonblock) {
415439
pthread_mutex_unlock(&sfd_lock);
416440
return -LINUX_EAGAIN;
417441
}
@@ -636,8 +660,8 @@ int64_t sys_eventfd2(unsigned int initval, int flags)
636660
eventfd_owner[gfd] = slot;
637661
pthread_mutex_unlock(&sfd_lock);
638662

639-
fd_table[gfd].linux_flags =
640-
(flags & LINUX_EFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0;
663+
fd_publish_linux_flags(gfd,
664+
(flags & LINUX_EFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0);
641665

642666
/* If initial counter > 0, make the pipe readable so poll sees it */
643667
if (initval > 0) {
@@ -1060,8 +1084,8 @@ int64_t sys_signalfd4(guest_t *g,
10601084
signalfd_state[slot].nonblock = (flags & LINUX_SFD_NONBLOCK) ? 1 : 0;
10611085
pthread_mutex_unlock(&sfd_lock);
10621086

1063-
fd_table[gfd].linux_flags =
1064-
(flags & LINUX_SFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0;
1087+
fd_publish_linux_flags(gfd,
1088+
(flags & LINUX_SFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0);
10651089

10661090
return gfd;
10671091
}

src/syscall/fdtable.c

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -404,6 +404,15 @@ int fd_get_type(int guest_fd)
404404
return type;
405405
}
406406

407+
void fd_publish_linux_flags(int guest_fd, int linux_flags)
408+
{
409+
if (!RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE))
410+
return;
411+
pthread_mutex_lock(&fd_lock);
412+
fd_table[guest_fd].linux_flags = linux_flags;
413+
pthread_mutex_unlock(&fd_lock);
414+
}
415+
407416
/* Sized to cover all FD_* constants in abi.h plus a small headroom. Indexed
408417
* by type. Each slot defaults to NULL (no per-type cleanup). Modules that
409418
* own a type call fd_register_cleanup() at init time; dup and fork-restore

src/syscall/fs.c

Lines changed: 99 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -501,10 +501,15 @@ static bool install_fd_alias_metadata_atomic(int dst_fd,
501501
int linux_flags,
502502
DIR *dir)
503503
{
504+
/* LINUX_O_NONBLOCK is a file-status flag preserved by dup(2)/dup2(2).
505+
* Required for FD_TIMERFD (and any other type that stores NONBLOCK in
506+
* linux_flags rather than on the host fd) so a duplicated non-blocking
507+
* timerfd does not silently turn blocking.
508+
*/
504509
int preserved_flags =
505510
src_snap->linux_flags &
506511
(LINUX_O_ACCMODE | LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW |
507-
LINUX_O_DIRECT | LINUX_O_LARGEFILE);
512+
LINUX_O_DIRECT | LINUX_O_LARGEFILE | LINUX_O_NONBLOCK);
508513
int final_flags = preserved_flags | linux_flags;
509514

510515
bool installed = false;
@@ -663,7 +668,16 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
663668
if (!RANGE_CHECK(fd, 0, FD_TABLE_SIZE))
664669
return -LINUX_EBADF;
665670

666-
int fd_type = fd_table[fd].type;
671+
/* Snapshot the slot under fd_lock once; readers use fd_snap below, and
672+
* writers reacquire fd_lock and revalidate against fd_snap.generation
673+
* so a close+reopen between the snapshot and the RMW returns EBADF
674+
* instead of mutating an unrelated fd.
675+
*/
676+
fd_entry_t fd_snap;
677+
if (!fd_snapshot(fd, &fd_snap))
678+
return -LINUX_EBADF;
679+
680+
int fd_type = fd_snap.type;
667681
bool fuse_fd = (fd_type == FD_FUSE_DEV || fd_type == FD_FUSE_FILE ||
668682
fd_type == FD_FUSE_DIR);
669683

@@ -676,7 +690,7 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
676690
if ((int) arg < 0) {
677691
return -LINUX_EINVAL;
678692
}
679-
int dup_flags = fd_table[fd].linux_flags & ~LINUX_O_CLOEXEC;
693+
int dup_flags = fd_snap.linux_flags & ~LINUX_O_CLOEXEC;
680694
if (cmd == 1030)
681695
dup_flags |= LINUX_O_CLOEXEC;
682696
int gfd = duplicate_guest_fd(fd, (int) arg, -1, false, dup_flags);
@@ -690,20 +704,38 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
690704
return gfd;
691705
}
692706
case 1: /* F_GETFD */
693-
return (fd_table[fd].linux_flags & LINUX_O_CLOEXEC) ? LINUX_FD_CLOEXEC
694-
: 0;
707+
return (fd_snap.linux_flags & LINUX_O_CLOEXEC) ? LINUX_FD_CLOEXEC : 0;
695708
case 2: /* F_SETFD */
709+
/* Hold fd_lock across the read-modify-write so the CLOEXEC flip is
710+
* atomic against a concurrent F_SETFL on the same shadow word and
711+
* against any fd_lock-protected reader. Revalidate against the
712+
* snapshot generation so a close+reopen returns EBADF.
713+
*/
714+
pthread_mutex_lock(&fd_lock);
715+
if (fd_table[fd].type == FD_CLOSED ||
716+
fd_table[fd].generation != fd_snap.generation) {
717+
pthread_mutex_unlock(&fd_lock);
718+
return -LINUX_EBADF;
719+
}
696720
if ((int) arg & LINUX_FD_CLOEXEC)
697721
fd_table[fd].linux_flags |= LINUX_O_CLOEXEC;
698722
else
699723
fd_table[fd].linux_flags &= ~LINUX_O_CLOEXEC;
724+
pthread_mutex_unlock(&fd_lock);
700725
return 0;
701726
case 3: { /* F_GETFL */
702727
if (fuse_fd)
703-
return fd_table[fd].linux_flags;
704-
fd_entry_t snap;
705-
if (!fd_snapshot(fd, &snap))
706-
return -LINUX_EBADF;
728+
return fd_snap.linux_flags;
729+
/* Linux timerfd F_GETFL reports O_RDWR plus the writable status bits
730+
* the kernel honors. Surface only those bits from the shadow rather
731+
* than echoing arbitrary linux_flags bits so stray F_SETFL args
732+
* cannot leak through here. O_ASYNC stays off because timerfd_fops
733+
* lacks ->fasync, so generic_setfl drops it.
734+
*/
735+
if (fd_type == FD_TIMERFD)
736+
return LINUX_O_RDWR |
737+
(fd_snap.linux_flags &
738+
(LINUX_O_APPEND | LINUX_O_NONBLOCK | LINUX_O_NOATIME));
707739
host_fd_ref_t host_ref;
708740
if (host_fd_ref_open(fd, &host_ref) < 0)
709741
return -LINUX_EBADF;
@@ -712,26 +744,72 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
712744
if (mac_fl < 0)
713745
return linux_errno();
714746
int linux_fl = mac_to_linux_status_flags(mac_fl);
715-
if (snap.type == FD_REGULAR || snap.type == FD_DIR ||
716-
snap.type == FD_PATH || snap.type == FD_URANDOM)
717-
linux_fl = (linux_fl & ~O_ACCMODE) | (snap.linux_flags & 3);
718-
linux_fl |= snap.linux_flags &
747+
if (fd_snap.type == FD_REGULAR || fd_snap.type == FD_DIR ||
748+
fd_snap.type == FD_PATH || fd_snap.type == FD_URANDOM)
749+
linux_fl = (linux_fl & ~O_ACCMODE) | (fd_snap.linux_flags & 3);
750+
linux_fl |= fd_snap.linux_flags &
719751
(LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW |
720752
LINUX_O_DIRECT | LINUX_O_LARGEFILE);
721753
return linux_fl;
722754
}
723755
case 4: /* F_SETFL */
724756
{
725757
if (fuse_fd) {
726-
int preserved =
727-
fd_table[fd].linux_flags &
728-
(LINUX_O_CLOEXEC | LINUX_O_PATH | LINUX_O_DIRECTORY |
729-
LINUX_O_NOFOLLOW | LINUX_O_DIRECT | LINUX_O_LARGEFILE);
758+
/* Preserve LINUX_O_ACCMODE: F_SETFL is not allowed to change the
759+
* access mode in the Linux kernel, and without preserving it
760+
* here a stray F_SETFL(0) would silently flip an O_RDWR FUSE
761+
* shadow to O_RDONLY, surfacing the wrong mode through F_GETFL.
762+
*
763+
* Hold fd_lock across the read-modify-write so the update is
764+
* atomic against a concurrent F_SETFD and any fd_lock-protected
765+
* reader. Revalidate against the snapshot generation so a
766+
* close+reopen returns EBADF.
767+
*/
768+
pthread_mutex_lock(&fd_lock);
769+
if (fd_table[fd].type != fd_type ||
770+
fd_table[fd].generation != fd_snap.generation) {
771+
pthread_mutex_unlock(&fd_lock);
772+
return -LINUX_EBADF;
773+
}
774+
int preserved = fd_table[fd].linux_flags &
775+
(LINUX_O_ACCMODE | LINUX_O_CLOEXEC | LINUX_O_PATH |
776+
LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW |
777+
LINUX_O_DIRECT | LINUX_O_LARGEFILE);
778+
fd_table[fd].linux_flags =
779+
preserved | ((int) arg & ~(LINUX_O_ACCMODE | LINUX_O_CLOEXEC |
780+
LINUX_O_PATH | LINUX_O_DIRECTORY |
781+
LINUX_O_NOFOLLOW | LINUX_O_DIRECT |
782+
LINUX_O_LARGEFILE));
783+
pthread_mutex_unlock(&fd_lock);
784+
return 0;
785+
}
786+
/* Timerfd: kqueue host fd rejects fcntl(F_SETFL), so mirror Linux's
787+
* file-status word in the linux_flags shadow. Of Linux's writable
788+
* status flags (O_APPEND, O_ASYNC, O_DIRECT, O_NOATIME, O_NONBLOCK)
789+
* the timerfd kernel object honors O_APPEND, O_NONBLOCK, and
790+
* O_NOATIME. O_ASYNC is silently dropped (timerfd_fops lacks
791+
* ->fasync). O_DIRECT returns -EINVAL because the inode lacks
792+
* FMODE_CAN_ODIRECT. Bits outside the writable set (access mode,
793+
* CLOEXEC, O_PATH/DIRECTORY/NOFOLLOW/etc.) are silently ignored,
794+
* matching how Linux F_SETFL drops them.
795+
*/
796+
if (fd_type == FD_TIMERFD) {
797+
const int setfl_mask =
798+
LINUX_O_APPEND | LINUX_O_NONBLOCK | LINUX_O_NOATIME;
799+
pthread_mutex_lock(&fd_lock);
800+
if (fd_table[fd].type != FD_TIMERFD ||
801+
fd_table[fd].generation != fd_snap.generation) {
802+
pthread_mutex_unlock(&fd_lock);
803+
return -LINUX_EBADF;
804+
}
805+
if ((int) arg & LINUX_O_DIRECT) {
806+
pthread_mutex_unlock(&fd_lock);
807+
return -LINUX_EINVAL;
808+
}
730809
fd_table[fd].linux_flags =
731-
preserved |
732-
((int) arg &
733-
~(LINUX_O_CLOEXEC | LINUX_O_PATH | LINUX_O_DIRECTORY |
734-
LINUX_O_NOFOLLOW | LINUX_O_DIRECT | LINUX_O_LARGEFILE));
810+
(fd_table[fd].linux_flags & ~setfl_mask) |
811+
((int) arg & setfl_mask);
812+
pthread_mutex_unlock(&fd_lock);
735813
return 0;
736814
}
737815
host_fd_ref_t host_ref;

src/syscall/fuse.c

Lines changed: 25 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1329,8 +1329,11 @@ int fuse_proc_open(int linux_flags)
13291329
errno = EMFILE;
13301330
return -1;
13311331
}
1332-
fd_table[guest_fd].linux_flags = linux_flags;
13331332
pthread_mutex_unlock(&fuse_lock);
1333+
/* Publish under fd_lock so the write is on the same lock domain as
1334+
* sys_fcntl(F_SETFL/F_SETFD), not stranded behind fuse_lock.
1335+
*/
1336+
fd_publish_linux_flags(guest_fd, linux_flags);
13341337
return guest_fd;
13351338
}
13361339

@@ -1897,8 +1900,11 @@ int64_t fuse_open_path(guest_t *g, const char *path, int linux_flags, int mode)
18971900
fd_mark_closed(guest_fd);
18981901
return -LINUX_EMFILE;
18991902
}
1900-
fd_table[guest_fd].linux_flags = linux_flags;
19011903
pthread_mutex_unlock(&fuse_lock);
1904+
/* Publish under fd_lock so the open's flags land on the same lock
1905+
* domain that sys_fcntl(F_SETFL/F_SETFD) uses.
1906+
*/
1907+
fd_publish_linux_flags(guest_fd, linux_flags);
19021908
return guest_fd;
19031909
}
19041910

@@ -2607,11 +2613,24 @@ int fuse_dup_fd(int src_fd,
26072613
}
26082614
}
26092615

2610-
int preserved_flags = fd_table[src_fd].linux_flags &
2611-
(LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW |
2612-
LINUX_O_DIRECT | LINUX_O_LARGEFILE);
2613-
fd_table[guest_fd].linux_flags = preserved_flags | linux_flags;
26142616
pthread_mutex_unlock(&fuse_lock);
2617+
2618+
/* O_NONBLOCK is a file-status flag preserved by dup(2)/dup2(2); without
2619+
* it a duplicated non-blocking FUSE fd would silently become blocking
2620+
* because nothing else carries the flag forward.
2621+
*
2622+
* Take fd_lock once for both the source read and the destination write
2623+
* so the dup snapshot is consistent with any concurrent F_SETFL on the
2624+
* source and so the destination publish cannot be overwritten by an
2625+
* early racing F_SETFL on the new slot.
2626+
*/
2627+
pthread_mutex_lock(&fd_lock);
2628+
int preserved_flags =
2629+
fd_table[src_fd].linux_flags &
2630+
(LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | LINUX_O_DIRECT |
2631+
LINUX_O_LARGEFILE | LINUX_O_NONBLOCK);
2632+
fd_table[guest_fd].linux_flags = preserved_flags | linux_flags;
2633+
pthread_mutex_unlock(&fd_lock);
26152634
return guest_fd;
26162635
}
26172636

src/syscall/inotify.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -402,7 +402,7 @@ int64_t sys_inotify_init1(int flags)
402402
memset(inst->watches, 0, sizeof(inst->watches));
403403
pthread_mutex_unlock(&inotify_lock);
404404

405-
fd_table[gfd].linux_flags = (flags & IN_CLOEXEC) ? LINUX_O_CLOEXEC : 0;
405+
fd_publish_linux_flags(gfd, (flags & IN_CLOEXEC) ? LINUX_O_CLOEXEC : 0);
406406

407407
return gfd;
408408
}

src/syscall/internal.h

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -109,6 +109,15 @@ int fd_snapshot_and_dup(int guest_fd, fd_entry_t *out);
109109
*/
110110
int fd_get_type(int guest_fd);
111111

112+
/* Publish linux_flags for a guest fd under fd_lock. Use after fd_alloc when
113+
* the creating syscall needs to set linux_flags atomically with respect to a
114+
* concurrent fcntl(F_SETFL/F_SETFD) on the same slot. The fd_alloc-then-
115+
* publish window is small (the new gfd is not communicated to other threads
116+
* until the syscall returns) but the lock removes the structural race and
117+
* keeps every linux_flags writer on one lock domain.
118+
*/
119+
void fd_publish_linux_flags(int guest_fd, int linux_flags);
120+
112121
/* Republish the EL1 urandom read fast-path bit for this fd from the current
113122
* fd_table type and access mode. Only readable /dev/urandom descriptors are
114123
* eligible for the bitmap.

0 commit comments

Comments
 (0)