Skip to content

Commit 0a2199c

Browse files
committed
Serialize linux_flags writes on fd_lock
Several callers wrote fd_table[gfd].linux_flags under different locks or none at all, so a concurrent fcntl(F_SETFL/F_SETFD) on fd_lock could race a creator's bare assignment. sys_fcntl read the slot's type and flags outside fd_lock and mutated them without revalidation, so a close+reopen between the read and the write could update an unrelated fd. This commit unifies both concerns under fd_lock. fd_publish_linux_flags helper New fdtable helper takes fd_lock around a single linux_flags write. Replaces bare assignments in sys_timerfd_create, sys_eventfd, sys_signalfd, sys_inotify_init1, the FUSE dev mount, and fuse_open. fuse_dup_fd takes fd_lock once for both the source read and the destination write so the preserved-flags snapshot stays consistent with a racing F_SETFL on either fd. The result: every write to fd_table[*].linux_flags is now serialized on the same lock, with no fuse_lock<->fd_lock nesting introduced. sys_fcntl snapshot-then-revalidate sys_fcntl now takes a single fd_snapshot at entry and uses it for F_GETFD, F_GETFL, and F_DUPFD source reads. F_SETFD and the FUSE / timerfd F_SETFL writers reacquire fd_lock and revalidate against fd_snap.generation before mutating linux_flags. fd_alloc bumps a monotonic generation counter per slot reuse, so close+reopen between snapshot and lock is caught and returns EBADF rather than mutating an unrelated fd. The timerfd F_SETFL O_DIRECT EINVAL check moves inside the lock so a stale-snapshot race cannot report EINVAL based on a fd that is no longer a timerfd; the revalidation returns EBADF first instead. A new test exercises the cross-cutting fd_lock RMW: F_SETFL stamps the writable status bits, then F_SETFD toggles CLOEXEC, and F_GETFL must still surface the status bits unperturbed.
1 parent b526ad7 commit 0a2199c

7 files changed

Lines changed: 124 additions & 26 deletions

File tree

src/syscall/fd.c

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -218,9 +218,10 @@ int64_t sys_timerfd_create(int clockid, int flags)
218218
* fs/timerfd.c). Stamp O_RDWR into linux_flags so the F_GETFL branch
219219
* below can surface the access mode without re-deriving it.
220220
*/
221-
fd_table[gfd].linux_flags =
222-
LINUX_O_RDWR | ((flags & LINUX_TFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0) |
223-
((flags & LINUX_TFD_NONBLOCK) ? LINUX_O_NONBLOCK : 0);
221+
fd_publish_linux_flags(
222+
gfd, LINUX_O_RDWR |
223+
((flags & LINUX_TFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0) |
224+
((flags & LINUX_TFD_NONBLOCK) ? LINUX_O_NONBLOCK : 0));
224225
return gfd;
225226
}
226227

@@ -659,8 +660,8 @@ int64_t sys_eventfd2(unsigned int initval, int flags)
659660
eventfd_owner[gfd] = slot;
660661
pthread_mutex_unlock(&sfd_lock);
661662

662-
fd_table[gfd].linux_flags =
663-
(flags & LINUX_EFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0;
663+
fd_publish_linux_flags(gfd,
664+
(flags & LINUX_EFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0);
664665

665666
/* If initial counter > 0, make the pipe readable so poll sees it */
666667
if (initval > 0) {
@@ -1083,8 +1084,8 @@ int64_t sys_signalfd4(guest_t *g,
10831084
signalfd_state[slot].nonblock = (flags & LINUX_SFD_NONBLOCK) ? 1 : 0;
10841085
pthread_mutex_unlock(&sfd_lock);
10851086

1086-
fd_table[gfd].linux_flags =
1087-
(flags & LINUX_SFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0;
1087+
fd_publish_linux_flags(gfd,
1088+
(flags & LINUX_SFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0);
10881089

10891090
return gfd;
10901091
}

src/syscall/fdtable.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,15 @@ int fd_get_type(int guest_fd)
404404
return type;
405405
}
406406

407+
void fd_publish_linux_flags(int guest_fd, int linux_flags)
408+
{
409+
if (!RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE))
410+
return;
411+
pthread_mutex_lock(&fd_lock);
412+
fd_table[guest_fd].linux_flags = linux_flags;
413+
pthread_mutex_unlock(&fd_lock);
414+
}
415+
407416
/* Sized to cover all FD_* constants in abi.h plus a small headroom. Indexed
408417
* by type. Each slot defaults to NULL (no per-type cleanup). Modules that
409418
* own a type call fd_register_cleanup() at init time; dup and fork-restore

src/syscall/fs.c

Lines changed: 53 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -668,7 +668,16 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
668668
if (!RANGE_CHECK(fd, 0, FD_TABLE_SIZE))
669669
return -LINUX_EBADF;
670670

671-
int fd_type = fd_table[fd].type;
671+
/* Snapshot the slot under fd_lock once; readers use fd_snap below, and
672+
* writers reacquire fd_lock and revalidate against fd_snap.generation
673+
* so a close+reopen between the snapshot and the RMW returns EBADF
674+
* instead of mutating an unrelated fd.
675+
*/
676+
fd_entry_t fd_snap;
677+
if (!fd_snapshot(fd, &fd_snap))
678+
return -LINUX_EBADF;
679+
680+
int fd_type = fd_snap.type;
672681
bool fuse_fd = (fd_type == FD_FUSE_DEV || fd_type == FD_FUSE_FILE ||
673682
fd_type == FD_FUSE_DIR);
674683

@@ -681,7 +690,7 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
681690
if ((int) arg < 0) {
682691
return -LINUX_EINVAL;
683692
}
684-
int dup_flags = fd_table[fd].linux_flags & ~LINUX_O_CLOEXEC;
693+
int dup_flags = fd_snap.linux_flags & ~LINUX_O_CLOEXEC;
685694
if (cmd == 1030)
686695
dup_flags |= LINUX_O_CLOEXEC;
687696
int gfd = duplicate_guest_fd(fd, (int) arg, -1, false, dup_flags);
@@ -695,17 +704,28 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
695704
return gfd;
696705
}
697706
case 1: /* F_GETFD */
698-
return (fd_table[fd].linux_flags & LINUX_O_CLOEXEC) ? LINUX_FD_CLOEXEC
699-
: 0;
707+
return (fd_snap.linux_flags & LINUX_O_CLOEXEC) ? LINUX_FD_CLOEXEC : 0;
700708
case 2: /* F_SETFD */
709+
/* Hold fd_lock across the read-modify-write so the CLOEXEC flip is
710+
* atomic against a concurrent F_SETFL on the same shadow word and
711+
* against any fd_lock-protected reader. Revalidate against the
712+
* snapshot generation so a close+reopen returns EBADF.
713+
*/
714+
pthread_mutex_lock(&fd_lock);
715+
if (fd_table[fd].type == FD_CLOSED ||
716+
fd_table[fd].generation != fd_snap.generation) {
717+
pthread_mutex_unlock(&fd_lock);
718+
return -LINUX_EBADF;
719+
}
701720
if ((int) arg & LINUX_FD_CLOEXEC)
702721
fd_table[fd].linux_flags |= LINUX_O_CLOEXEC;
703722
else
704723
fd_table[fd].linux_flags &= ~LINUX_O_CLOEXEC;
724+
pthread_mutex_unlock(&fd_lock);
705725
return 0;
706726
case 3: { /* F_GETFL */
707727
if (fuse_fd)
708-
return fd_table[fd].linux_flags;
728+
return fd_snap.linux_flags;
709729
/* Linux timerfd F_GETFL reports O_RDWR plus the writable status bits
710730
* the kernel honors. Surface only those bits from the shadow rather
711731
* than echoing arbitrary linux_flags bits so stray F_SETFL args
@@ -714,11 +734,8 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
714734
*/
715735
if (fd_type == FD_TIMERFD)
716736
return LINUX_O_RDWR |
717-
(fd_table[fd].linux_flags &
737+
(fd_snap.linux_flags &
718738
(LINUX_O_APPEND | LINUX_O_NONBLOCK | LINUX_O_NOATIME));
719-
fd_entry_t snap;
720-
if (!fd_snapshot(fd, &snap))
721-
return -LINUX_EBADF;
722739
host_fd_ref_t host_ref;
723740
if (host_fd_ref_open(fd, &host_ref) < 0)
724741
return -LINUX_EBADF;
@@ -727,10 +744,10 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
727744
if (mac_fl < 0)
728745
return linux_errno();
729746
int linux_fl = mac_to_linux_status_flags(mac_fl);
730-
if (snap.type == FD_REGULAR || snap.type == FD_DIR ||
731-
snap.type == FD_PATH || snap.type == FD_URANDOM)
732-
linux_fl = (linux_fl & ~O_ACCMODE) | (snap.linux_flags & 3);
733-
linux_fl |= snap.linux_flags &
747+
if (fd_snap.type == FD_REGULAR || fd_snap.type == FD_DIR ||
748+
fd_snap.type == FD_PATH || fd_snap.type == FD_URANDOM)
749+
linux_fl = (linux_fl & ~O_ACCMODE) | (fd_snap.linux_flags & 3);
750+
linux_fl |= fd_snap.linux_flags &
734751
(LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW |
735752
LINUX_O_DIRECT | LINUX_O_LARGEFILE);
736753
return linux_fl;
@@ -742,7 +759,18 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
742759
* access mode in the Linux kernel, and without preserving it
743760
* here a stray F_SETFL(0) would silently flip an O_RDWR FUSE
744761
* shadow to O_RDONLY, surfacing the wrong mode through F_GETFL.
762+
*
763+
* Hold fd_lock across the read-modify-write so the update is
764+
* atomic against a concurrent F_SETFD and any fd_lock-protected
765+
* reader. Revalidate against the snapshot generation so a
766+
* close+reopen returns EBADF.
745767
*/
768+
pthread_mutex_lock(&fd_lock);
769+
if (fd_table[fd].type != fd_type ||
770+
fd_table[fd].generation != fd_snap.generation) {
771+
pthread_mutex_unlock(&fd_lock);
772+
return -LINUX_EBADF;
773+
}
746774
int preserved = fd_table[fd].linux_flags &
747775
(LINUX_O_ACCMODE | LINUX_O_CLOEXEC | LINUX_O_PATH |
748776
LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW |
@@ -752,6 +780,7 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
752780
LINUX_O_PATH | LINUX_O_DIRECTORY |
753781
LINUX_O_NOFOLLOW | LINUX_O_DIRECT |
754782
LINUX_O_LARGEFILE));
783+
pthread_mutex_unlock(&fd_lock);
755784
return 0;
756785
}
757786
/* Timerfd: kqueue host fd rejects fcntl(F_SETFL), so mirror Linux's
@@ -765,13 +794,22 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
765794
* matching how Linux F_SETFL drops them.
766795
*/
767796
if (fd_type == FD_TIMERFD) {
768-
if ((int) arg & LINUX_O_DIRECT)
769-
return -LINUX_EINVAL;
770797
const int setfl_mask =
771798
LINUX_O_APPEND | LINUX_O_NONBLOCK | LINUX_O_NOATIME;
799+
pthread_mutex_lock(&fd_lock);
800+
if (fd_table[fd].type != FD_TIMERFD ||
801+
fd_table[fd].generation != fd_snap.generation) {
802+
pthread_mutex_unlock(&fd_lock);
803+
return -LINUX_EBADF;
804+
}
805+
if ((int) arg & LINUX_O_DIRECT) {
806+
pthread_mutex_unlock(&fd_lock);
807+
return -LINUX_EINVAL;
808+
}
772809
fd_table[fd].linux_flags =
773810
(fd_table[fd].linux_flags & ~setfl_mask) |
774811
((int) arg & setfl_mask);
812+
pthread_mutex_unlock(&fd_lock);
775813
return 0;
776814
}
777815
host_fd_ref_t host_ref;

src/syscall/fuse.c

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1329,8 +1329,11 @@ int fuse_proc_open(int linux_flags)
13291329
errno = EMFILE;
13301330
return -1;
13311331
}
1332-
fd_table[guest_fd].linux_flags = linux_flags;
13331332
pthread_mutex_unlock(&fuse_lock);
1333+
/* Publish under fd_lock so the write is on the same lock domain as
1334+
* sys_fcntl(F_SETFL/F_SETFD), not stranded behind fuse_lock.
1335+
*/
1336+
fd_publish_linux_flags(guest_fd, linux_flags);
13341337
return guest_fd;
13351338
}
13361339

@@ -1897,8 +1900,11 @@ int64_t fuse_open_path(guest_t *g, const char *path, int linux_flags, int mode)
18971900
fd_mark_closed(guest_fd);
18981901
return -LINUX_EMFILE;
18991902
}
1900-
fd_table[guest_fd].linux_flags = linux_flags;
19011903
pthread_mutex_unlock(&fuse_lock);
1904+
/* Publish under fd_lock so the open's flags land on the same lock
1905+
* domain that sys_fcntl(F_SETFL/F_SETFD) uses.
1906+
*/
1907+
fd_publish_linux_flags(guest_fd, linux_flags);
19021908
return guest_fd;
19031909
}
19041910

@@ -2607,16 +2613,24 @@ int fuse_dup_fd(int src_fd,
26072613
}
26082614
}
26092615

2616+
pthread_mutex_unlock(&fuse_lock);
2617+
26102618
/* O_NONBLOCK is a file-status flag preserved by dup(2)/dup2(2); without
26112619
* it a duplicated non-blocking FUSE fd would silently become blocking
26122620
* because nothing else carries the flag forward.
2621+
*
2622+
* Take fd_lock once for both the source read and the destination write
2623+
* so the dup snapshot is consistent with any concurrent F_SETFL on the
2624+
* source and so the destination publish cannot be overwritten by an
2625+
* early racing F_SETFL on the new slot.
26132626
*/
2627+
pthread_mutex_lock(&fd_lock);
26142628
int preserved_flags =
26152629
fd_table[src_fd].linux_flags &
26162630
(LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | LINUX_O_DIRECT |
26172631
LINUX_O_LARGEFILE | LINUX_O_NONBLOCK);
26182632
fd_table[guest_fd].linux_flags = preserved_flags | linux_flags;
2619-
pthread_mutex_unlock(&fuse_lock);
2633+
pthread_mutex_unlock(&fd_lock);
26202634
return guest_fd;
26212635
}
26222636

src/syscall/inotify.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ int64_t sys_inotify_init1(int flags)
402402
memset(inst->watches, 0, sizeof(inst->watches));
403403
pthread_mutex_unlock(&inotify_lock);
404404

405-
fd_table[gfd].linux_flags = (flags & IN_CLOEXEC) ? LINUX_O_CLOEXEC : 0;
405+
fd_publish_linux_flags(gfd, (flags & IN_CLOEXEC) ? LINUX_O_CLOEXEC : 0);
406406

407407
return gfd;
408408
}

src/syscall/internal.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,15 @@ int fd_snapshot_and_dup(int guest_fd, fd_entry_t *out);
109109
*/
110110
int fd_get_type(int guest_fd);
111111

112+
/* Publish linux_flags for a guest fd under fd_lock. Use after fd_alloc when
113+
* the creating syscall needs to set linux_flags atomically with respect to a
114+
* concurrent fcntl(F_SETFL/F_SETFD) on the same slot. The fd_alloc-then-
115+
* publish window is small (the new gfd is not communicated to other threads
116+
* until the syscall returns) but the lock removes the structural race and
117+
* keeps every linux_flags writer on one lock domain.
118+
*/
119+
void fd_publish_linux_flags(int guest_fd, int linux_flags);
120+
112121
/* Republish the EL1 urandom read fast-path bit for this fd from the current
113122
* fd_table type and access mode. Only readable /dev/urandom descriptors are
114123
* eligible for the bitmap.

tests/test-timerfd.c

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,33 @@ int main(void)
225225
FAIL("timerfd_create");
226226
}
227227

228+
/* F_SETFD and F_SETFL touch the same linux_flags shadow but operate on
229+
* disjoint bits. Toggling CLOEXEC via F_SETFD must not perturb the
230+
* status bits surfaced by F_GETFL. Targets the new fd_lock-serialized
231+
* RMW in both branches.
232+
*/
233+
TEST("F_SETFD does not perturb F_GETFL status bits");
234+
{
235+
int fd = timerfd_create(CLOCK_MONOTONIC, 0);
236+
if (fd >= 0) {
237+
int wanted = O_APPEND | O_NONBLOCK | O_NOATIME;
238+
EXPECT_TRUE(fcntl(fd, F_SETFL, wanted) == 0, "F_SETFL failed");
239+
240+
EXPECT_TRUE(fcntl(fd, F_SETFD, FD_CLOEXEC) == 0,
241+
"F_SETFD set failed");
242+
int fl = fcntl(fd, F_GETFL);
243+
EXPECT_TRUE(fl >= 0 && (fl & wanted) == wanted,
244+
"F_SETFD CLOEXEC perturbed status bits");
245+
246+
EXPECT_TRUE(fcntl(fd, F_SETFD, 0) == 0, "F_SETFD clear failed");
247+
fl = fcntl(fd, F_GETFL);
248+
EXPECT_TRUE(fl >= 0 && (fl & wanted) == wanted,
249+
"F_SETFD clear perturbed status bits");
250+
close(fd);
251+
} else
252+
FAIL("timerfd_create");
253+
}
254+
228255
/* dup must carry the linux_flags shadow's NONBLOCK bit through; without
229256
* the preserved-mask fix the new fd's F_GETFL would report blocking.
230257
* (The timerfd slot table is keyed by guest_fd with no alias support,

0 commit comments

Comments
 (0)