@@ -501,10 +501,15 @@ static bool install_fd_alias_metadata_atomic(int dst_fd,
501501 int linux_flags ,
502502 DIR * dir )
503503{
504+ /* LINUX_O_NONBLOCK is a file-status flag preserved by dup(2)/dup2(2).
505+ * Required for FD_TIMERFD (and any other type that stores NONBLOCK in
506+ * linux_flags rather than on the host fd) so a duplicated non-blocking
507+ * timerfd does not silently turn blocking.
508+ */
504509 int preserved_flags =
505510 src_snap -> linux_flags &
506511 (LINUX_O_ACCMODE | LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW |
507- LINUX_O_DIRECT | LINUX_O_LARGEFILE );
512+ LINUX_O_DIRECT | LINUX_O_LARGEFILE | LINUX_O_NONBLOCK );
508513 int final_flags = preserved_flags | linux_flags ;
509514
510515 bool installed = false;
@@ -663,7 +668,16 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
663668 if (!RANGE_CHECK (fd , 0 , FD_TABLE_SIZE ))
664669 return - LINUX_EBADF ;
665670
666- int fd_type = fd_table [fd ].type ;
671+ /* Snapshot the slot under fd_lock once; readers use fd_snap below, and
672+ * writers reacquire fd_lock and revalidate against fd_snap.generation
673+ * so a close+reopen between the snapshot and the RMW returns EBADF
674+ * instead of mutating an unrelated fd.
675+ */
676+ fd_entry_t fd_snap ;
677+ if (!fd_snapshot (fd , & fd_snap ))
678+ return - LINUX_EBADF ;
679+
680+ int fd_type = fd_snap .type ;
667681 bool fuse_fd = (fd_type == FD_FUSE_DEV || fd_type == FD_FUSE_FILE ||
668682 fd_type == FD_FUSE_DIR );
669683
@@ -676,7 +690,7 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
676690 if ((int ) arg < 0 ) {
677691 return - LINUX_EINVAL ;
678692 }
679- int dup_flags = fd_table [ fd ] .linux_flags & ~LINUX_O_CLOEXEC ;
693+ int dup_flags = fd_snap .linux_flags & ~LINUX_O_CLOEXEC ;
680694 if (cmd == 1030 )
681695 dup_flags |= LINUX_O_CLOEXEC ;
682696 int gfd = duplicate_guest_fd (fd , (int ) arg , -1 , false, dup_flags );
@@ -690,20 +704,38 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
690704 return gfd ;
691705 }
692706 case 1 : /* F_GETFD */
693- return (fd_table [fd ].linux_flags & LINUX_O_CLOEXEC ) ? LINUX_FD_CLOEXEC
694- : 0 ;
707+ return (fd_snap .linux_flags & LINUX_O_CLOEXEC ) ? LINUX_FD_CLOEXEC : 0 ;
695708 case 2 : /* F_SETFD */
709+ /* Hold fd_lock across the read-modify-write so the CLOEXEC flip is
710+ * atomic against a concurrent F_SETFL on the same shadow word and
711+ * against any fd_lock-protected reader. Revalidate against the
712+ * snapshot generation so a close+reopen returns EBADF.
713+ */
714+ pthread_mutex_lock (& fd_lock );
715+ if (fd_table [fd ].type == FD_CLOSED ||
716+ fd_table [fd ].generation != fd_snap .generation ) {
717+ pthread_mutex_unlock (& fd_lock );
718+ return - LINUX_EBADF ;
719+ }
696720 if ((int ) arg & LINUX_FD_CLOEXEC )
697721 fd_table [fd ].linux_flags |= LINUX_O_CLOEXEC ;
698722 else
699723 fd_table [fd ].linux_flags &= ~LINUX_O_CLOEXEC ;
724+ pthread_mutex_unlock (& fd_lock );
700725 return 0 ;
701726 case 3 : { /* F_GETFL */
702727 if (fuse_fd )
703- return fd_table [fd ].linux_flags ;
704- fd_entry_t snap ;
705- if (!fd_snapshot (fd , & snap ))
706- return - LINUX_EBADF ;
728+ return fd_snap .linux_flags ;
729+ /* Linux timerfd F_GETFL reports O_RDWR plus the writable status bits
730+ * the kernel honors. Surface only those bits from the shadow rather
731+ * than echoing arbitrary linux_flags bits so stray F_SETFL args
732+ * cannot leak through here. O_ASYNC stays off because timerfd_fops
733+ * lacks ->fasync, so generic_setfl drops it.
734+ */
735+ if (fd_type == FD_TIMERFD )
736+ return LINUX_O_RDWR |
737+ (fd_snap .linux_flags &
738+ (LINUX_O_APPEND | LINUX_O_NONBLOCK | LINUX_O_NOATIME ));
707739 host_fd_ref_t host_ref ;
708740 if (host_fd_ref_open (fd , & host_ref ) < 0 )
709741 return - LINUX_EBADF ;
@@ -712,26 +744,72 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
712744 if (mac_fl < 0 )
713745 return linux_errno ();
714746 int linux_fl = mac_to_linux_status_flags (mac_fl );
715- if (snap .type == FD_REGULAR || snap .type == FD_DIR ||
716- snap .type == FD_PATH || snap .type == FD_URANDOM )
717- linux_fl = (linux_fl & ~O_ACCMODE ) | (snap .linux_flags & 3 );
718- linux_fl |= snap .linux_flags &
747+ if (fd_snap .type == FD_REGULAR || fd_snap .type == FD_DIR ||
748+ fd_snap .type == FD_PATH || fd_snap .type == FD_URANDOM )
749+ linux_fl = (linux_fl & ~O_ACCMODE ) | (fd_snap .linux_flags & 3 );
750+ linux_fl |= fd_snap .linux_flags &
719751 (LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW |
720752 LINUX_O_DIRECT | LINUX_O_LARGEFILE );
721753 return linux_fl ;
722754 }
723755 case 4 : /* F_SETFL */
724756 {
725757 if (fuse_fd ) {
726- int preserved =
727- fd_table [fd ].linux_flags &
728- (LINUX_O_CLOEXEC | LINUX_O_PATH | LINUX_O_DIRECTORY |
729- LINUX_O_NOFOLLOW | LINUX_O_DIRECT | LINUX_O_LARGEFILE );
758+ /* Preserve LINUX_O_ACCMODE: F_SETFL is not allowed to change the
759+ * access mode in the Linux kernel, and without preserving it
760+ * here a stray F_SETFL(0) would silently flip an O_RDWR FUSE
761+ * shadow to O_RDONLY, surfacing the wrong mode through F_GETFL.
762+ *
763+ * Hold fd_lock across the read-modify-write so the update is
764+ * atomic against a concurrent F_SETFD and any fd_lock-protected
765+ * reader. Revalidate against the snapshot generation so a
766+ * close+reopen returns EBADF.
767+ */
768+ pthread_mutex_lock (& fd_lock );
769+ if (fd_table [fd ].type != fd_type ||
770+ fd_table [fd ].generation != fd_snap .generation ) {
771+ pthread_mutex_unlock (& fd_lock );
772+ return - LINUX_EBADF ;
773+ }
774+ int preserved = fd_table [fd ].linux_flags &
775+ (LINUX_O_ACCMODE | LINUX_O_CLOEXEC | LINUX_O_PATH |
776+ LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW |
777+ LINUX_O_DIRECT | LINUX_O_LARGEFILE );
778+ fd_table [fd ].linux_flags =
779+ preserved | ((int ) arg & ~(LINUX_O_ACCMODE | LINUX_O_CLOEXEC |
780+ LINUX_O_PATH | LINUX_O_DIRECTORY |
781+ LINUX_O_NOFOLLOW | LINUX_O_DIRECT |
782+ LINUX_O_LARGEFILE ));
783+ pthread_mutex_unlock (& fd_lock );
784+ return 0 ;
785+ }
786+ /* Timerfd: kqueue host fd rejects fcntl(F_SETFL), so mirror Linux's
787+ * file-status word in the linux_flags shadow. Of Linux's writable
788+ * status flags (O_APPEND, O_ASYNC, O_DIRECT, O_NOATIME, O_NONBLOCK)
789+ * the timerfd kernel object honors O_APPEND, O_NONBLOCK, and
790+ * O_NOATIME. O_ASYNC is silently dropped (timerfd_fops lacks
791+ * ->fasync). O_DIRECT returns -EINVAL because the file's f_mode lacks
792+ * FMODE_CAN_ODIRECT. Bits outside the writable set (access mode,
793+ * CLOEXEC, O_PATH/DIRECTORY/NOFOLLOW/etc.) are silently ignored,
794+ * matching how Linux F_SETFL drops them.
795+ */
796+ if (fd_type == FD_TIMERFD ) {
797+ const int setfl_mask =
798+ LINUX_O_APPEND | LINUX_O_NONBLOCK | LINUX_O_NOATIME ;
799+ pthread_mutex_lock (& fd_lock );
800+ if (fd_table [fd ].type != FD_TIMERFD ||
801+ fd_table [fd ].generation != fd_snap .generation ) {
802+ pthread_mutex_unlock (& fd_lock );
803+ return - LINUX_EBADF ;
804+ }
805+ if ((int ) arg & LINUX_O_DIRECT ) {
806+ pthread_mutex_unlock (& fd_lock );
807+ return - LINUX_EINVAL ;
808+ }
730809 fd_table [fd ].linux_flags =
731- preserved |
732- ((int ) arg &
733- ~(LINUX_O_CLOEXEC | LINUX_O_PATH | LINUX_O_DIRECTORY |
734- LINUX_O_NOFOLLOW | LINUX_O_DIRECT | LINUX_O_LARGEFILE ));
810+ (fd_table [fd ].linux_flags & ~setfl_mask ) |
811+ ((int ) arg & setfl_mask );
812+ pthread_mutex_unlock (& fd_lock );
735813 return 0 ;
736814 }
737815 host_fd_ref_t host_ref ;
0 commit comments