Skip to content

Commit 0000d31

Browse files
committed
Fix writable shadow ADDFD_AT race
try_writeback_shadow_open pre-allocated a fast-range FD slot via insert_fast, then used ADDFD_AT to inject a memfd at that number. This raced with concurrent close(2) CONTINUE: the supervisor removed its bookkeeping before the kernel replayed the close, so another thread could reuse that FD number and have the older close tear down the newly injected memfd (observable as EBADF on the next write). This fixes the race with a post-allocation pattern: 1. allocate_writable_shadow_fd scans the fast-shadow band and checks child_fd_is_open() via /proc/pid/fdinfo to skip slots with pending kernel-side closes. 2. ADDFD_AT injects the memfd at the validated slot. 3. The fd-table entry is published only after ADDFD_AT succeeds. The two post-ADDFD error paths (injected!=target_fd, insert_at failure) are unreachable by construction: SECCOMP_ADDFD_FLAG_SETFD guarantees exact-target-or-negative, and the allocator validates the slot is in-range and free. Replace the old misleading EMFILE recovery with abort so a broken kernel contract crashes loudly instead of leaking untracked tracee FD. Add remove_fd_table_entry_with_writeback so dup2, dup3, and exec paths sync writable shadow content back to LKL before clobbering fd-table entries. Change-Id: I3e72df3a9c0ba3f1d8b6e4c7a5290e1fbc834d60
1 parent 5321524 commit 0000d31

6 files changed

Lines changed: 146 additions & 30 deletions

File tree

.github/workflows/build-kbox.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,4 +171,5 @@ jobs:
171171

172172
- name: Stress tests
173173
run: ./scripts/run-stress.sh ./kbox alpine.ext4
174-
continue-on-error: true
174+
env:
175+
STRESS_TIMEOUT: 120

mk/tests.mk

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,7 @@ check-integration: $(TARGET) guest-bins stress-bins $(ROOTFS)
9191

9292
check-stress: $(TARGET) stress-bins $(ROOTFS)
9393
@echo " RUN check-stress"
94-
$(Q)./scripts/run-stress.sh ./$(TARGET) $(ROOTFS) || \
95-
echo "(stress test failures are non-blocking -- see TODO.md)"
94+
$(Q)./scripts/run-stress.sh ./$(TARGET) $(ROOTFS)
9695

9796
# ---- Guest / stress binaries (static, no ASAN) ----
9897
# These are cross-compiled on Linux and placed into the rootfs.

scripts/run-stress.sh

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,12 @@ export LSAN_OPTIONS="${LSAN_OPTIONS:+${LSAN_OPTIONS}:}${SUPP}"
1818

1919
KBOX="${1:-./kbox}"
2020
ROOTFS="${2:-rootfs.ext4}"
21+
22+
# Stress tests exercise threads (concurrent-io), interval timers
23+
# (signal-race), and rapid fork/wait (rapid-fork). These require
24+
# seccomp mode: trap/rewrite mode denies CLONE_THREAD by design and
25+
# has signal-handler re-entrancy constraints that break setitimer.
26+
KBOX_MODE_FLAGS="--syscall-mode=seccomp"
2127
PASS=0
2228
FAIL=0
2329
SKIP=0
@@ -44,7 +50,7 @@ run_stress_test()
4450
printf " %-40s " "$name"
4551

4652
# Check if the test binary exists in the rootfs.
47-
if ! "$KBOX" -S "$ROOTFS" -- /bin/sh -c "test -x '$guest_path'" 2> /dev/null; then
53+
if ! "$KBOX" $KBOX_MODE_FLAGS -S "$ROOTFS" -- /bin/sh -c "test -x '$guest_path'" 2> /dev/null; then
4854
printf "${YELLOW}SKIP${NC} (not in rootfs)\n"
4955
SKIP=$((SKIP + 1))
5056
return
@@ -54,13 +60,13 @@ run_stress_test()
5460

5561
RC=0
5662
if [ -n "$TIMEOUT_CMD" ]; then
57-
if "$TIMEOUT_CMD" "$TIMEOUT" "$KBOX" -S "$ROOTFS" -- "$guest_path" $guest_args > "$OUTPUT" 2>&1; then
63+
if "$TIMEOUT_CMD" "$TIMEOUT" "$KBOX" $KBOX_MODE_FLAGS -S "$ROOTFS" -- "$guest_path" $guest_args > "$OUTPUT" 2>&1; then
5864
RC=0
5965
else
6066
RC=$?
6167
fi
6268
else
63-
if "$KBOX" -S "$ROOTFS" -- "$guest_path" $guest_args > "$OUTPUT" 2>&1; then
69+
if "$KBOX" $KBOX_MODE_FLAGS -S "$ROOTFS" -- "$guest_path" $guest_args > "$OUTPUT" 2>&1; then
6470
RC=0
6571
else
6672
RC=$?

src/dispatch-exec.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -656,7 +656,7 @@ static struct kbox_dispatch trap_userspace_exec(
656656
/* Clean up CLOEXEC entries from the FD table, matching what a
657657
* real exec would do.
658658
*/
659-
kbox_fd_table_close_cloexec(ctx->fd_table, ctx->sysnrs);
659+
close_cloexec_with_writeback(ctx);
660660

661661
/* If the original launch used rewrite mode, re-apply binary rewriting to
662662
* the new binary. This patches syscall instructions in the newly loaded
@@ -1024,7 +1024,7 @@ struct kbox_dispatch forward_execve(const struct kbox_syscall_request *req,
10241024
* to keeping stale mappings alive across a successful exec, which misroutes
10251025
* future FD operations in the new image.
10261026
*/
1027-
kbox_fd_table_close_cloexec(ctx->fd_table, ctx->sysnrs);
1027+
close_cloexec_with_writeback(ctx);
10281028

10291029
/* Invalidate the cached /proc/pid/mem FD. After exec, the kernel may revoke
10301030
* access to the old FD even though the PID is the same (credential check

src/dispatch-internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,7 @@ int try_writeback_shadow_open(struct kbox_supervisor_ctx *ctx,
325325
struct kbox_dispatch *out);
326326
int sync_shadow_writeback(struct kbox_supervisor_ctx *ctx,
327327
struct kbox_fd_entry *entry);
328+
void close_cloexec_with_writeback(struct kbox_supervisor_ctx *ctx);
328329

329330
/* Handler functions: dispatch-net.c (networking syscalls). */
330331

src/seccomp-dispatch.c

Lines changed: 131 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1030,6 +1030,40 @@ long next_hostonly_fd_hint(const struct kbox_supervisor_ctx *ctx)
10301030
return fd;
10311031
}
10321032

1033+
static long allocate_writable_shadow_fd(struct kbox_supervisor_ctx *ctx)
1034+
{
1035+
long base_fd = KBOX_FD_FAST_BASE;
1036+
long end_fd = KBOX_FD_HOSTONLY_BASE;
1037+
long start_fd;
1038+
long fd;
1039+
1040+
if (!ctx || !ctx->fd_table)
1041+
return -1;
1042+
1043+
start_fd = ctx->fd_table->next_fast_fd;
1044+
if (start_fd < base_fd || start_fd >= end_fd)
1045+
start_fd = base_fd;
1046+
1047+
for (fd = start_fd; fd < end_fd; fd++) {
1048+
struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd);
1049+
1050+
if (entry && entry->lkl_fd == -1 && !child_fd_is_open(ctx, fd)) {
1051+
ctx->fd_table->next_fast_fd = fd + 1;
1052+
return fd;
1053+
}
1054+
}
1055+
for (fd = base_fd; fd < start_fd; fd++) {
1056+
struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd);
1057+
1058+
if (entry && entry->lkl_fd == -1 && !child_fd_is_open(ctx, fd)) {
1059+
ctx->fd_table->next_fast_fd = fd + 1;
1060+
return fd;
1061+
}
1062+
}
1063+
1064+
return -1;
1065+
}
1066+
10331067
int ensure_proc_self_fd_dir(struct kbox_supervisor_ctx *ctx)
10341068
{
10351069
if (!ctx)
@@ -1594,6 +1628,59 @@ void note_shadow_writeback_close(struct kbox_supervisor_ctx *ctx,
15941628
ctx->active_writeback_shadows--;
15951629
}
15961630

1631+
static void sync_cloexec_writebacks_in_range(struct kbox_supervisor_ctx *ctx,
1632+
struct kbox_fd_entry *entries,
1633+
long count)
1634+
{
1635+
long i;
1636+
1637+
if (!ctx || !entries)
1638+
return;
1639+
for (i = 0; i < count; i++) {
1640+
struct kbox_fd_entry *entry = &entries[i];
1641+
1642+
if (entry->lkl_fd != -1 && entry->cloexec) {
1643+
if (entry->lkl_fd >= 0)
1644+
invalidate_stat_cache_fd(ctx, entry->lkl_fd);
1645+
if (entry->shadow_writeback) {
1646+
(void) sync_shadow_writeback(ctx, entry);
1647+
note_shadow_writeback_close(ctx, entry);
1648+
}
1649+
}
1650+
}
1651+
}
1652+
1653+
void close_cloexec_with_writeback(struct kbox_supervisor_ctx *ctx)
1654+
{
1655+
if (!ctx || !ctx->fd_table)
1656+
return;
1657+
1658+
sync_cloexec_writebacks_in_range(ctx, ctx->fd_table->low_fds,
1659+
KBOX_LOW_FD_MAX);
1660+
sync_cloexec_writebacks_in_range(ctx, ctx->fd_table->mid_fds,
1661+
KBOX_MID_FD_MAX);
1662+
sync_cloexec_writebacks_in_range(ctx, ctx->fd_table->entries,
1663+
KBOX_FD_TABLE_MAX);
1664+
kbox_fd_table_close_cloexec(ctx->fd_table, ctx->sysnrs);
1665+
}
1666+
1667+
static long remove_fd_table_entry_with_writeback(
1668+
struct kbox_supervisor_ctx *ctx,
1669+
long fd)
1670+
{
1671+
struct kbox_fd_entry *entry;
1672+
1673+
if (!ctx || !ctx->fd_table)
1674+
return -1;
1675+
1676+
entry = fd_table_entry(ctx->fd_table, fd);
1677+
if (entry && entry->shadow_writeback) {
1678+
(void) sync_shadow_writeback(ctx, entry);
1679+
note_shadow_writeback_close(ctx, entry);
1680+
}
1681+
return kbox_fd_table_remove(ctx->fd_table, fd);
1682+
}
1683+
15971684
int try_writeback_shadow_open(struct kbox_supervisor_ctx *ctx,
15981685
const struct kbox_syscall_request *req,
15991686
long lkl_fd,
@@ -1603,8 +1690,8 @@ int try_writeback_shadow_open(struct kbox_supervisor_ctx *ctx,
16031690
{
16041691
struct kbox_fd_entry *entry;
16051692
int memfd;
1693+
long target_fd;
16061694
int injected;
1607-
long vfd;
16081695

16091696
if (!ctx || !req || !out || lkl_fd < 0 || !translated)
16101697
return 0;
@@ -1622,36 +1709,58 @@ int try_writeback_shadow_open(struct kbox_supervisor_ctx *ctx,
16221709
* sealed.
16231710
*/
16241711

1625-
vfd = kbox_fd_table_insert_fast(ctx->fd_table, lkl_fd, 0);
1626-
if (vfd < 0) {
1712+
target_fd = allocate_writable_shadow_fd(ctx);
1713+
if (target_fd < 0) {
16271714
close(memfd);
16281715
return 0;
16291716
}
16301717

1631-
injected = request_addfd_at(ctx, req, memfd, (int) vfd,
1718+
/* Install only in the fast-shadow band. Host-only FDs are allowed to
1719+
* close directly in BPF, which would bypass writeback, and kernel-picked
1720+
* ADDFD results can also exceed the fd-table range. Unlike the old
1721+
* preallocation approach, the fd-table entry is published only after
1722+
* ADDFD_AT succeeds and after child_fd_is_open() has verified that no
1723+
* pending close still owns this tracee fd number.
1724+
*/
1725+
injected = request_addfd_at(ctx, req, memfd, (int) target_fd,
16321726
(flags & O_CLOEXEC) ? O_CLOEXEC : 0);
16331727
if (injected < 0) {
1634-
kbox_fd_table_remove(ctx->fd_table, vfd);
16351728
close(memfd);
16361729
return 0;
16371730
}
16381731

1639-
entry = fd_table_entry(ctx->fd_table, vfd);
1640-
if (!entry) {
1641-
kbox_fd_table_remove(ctx->fd_table, vfd);
1642-
close(memfd);
1643-
return 0;
1732+
/* SECCOMP_ADDFD_FLAG_SETFD guarantees injected == target_fd on success.
1733+
* A mismatch means the kernel API contract broke; the tracee would hold
1734+
* an untracked FD we cannot revoke. Crash loudly instead of leaking.
1735+
*/
1736+
if (injected != target_fd) {
1737+
fprintf(stderr,
1738+
"kbox: ADDFD_AT returned %d, expected %ld -- aborting\n",
1739+
injected, target_fd);
1740+
abort();
1741+
}
1742+
1743+
/* allocate_writable_shadow_fd validated target_fd is in-range and free.
1744+
* insert_at must not fail here; if it does, the tracee holds a live FD
1745+
* with no supervisor bookkeeping.
1746+
*/
1747+
if (kbox_fd_table_insert_at(ctx->fd_table, injected, lkl_fd, 0) < 0) {
1748+
fprintf(stderr,
1749+
"kbox: fd_table_insert_at(%d) failed after ADDFD -- aborting\n",
1750+
injected);
1751+
abort();
16441752
}
16451753

1754+
entry = fd_table_entry(ctx->fd_table, injected);
16461755
entry->host_fd = KBOX_FD_HOST_SAME_FD_SHADOW;
16471756
entry->shadow_sp = memfd;
16481757
note_shadow_writeback_open(ctx, entry);
16491758
if (ctx->verbose) {
16501759
fprintf(stderr,
1651-
"kbox: writable shadow promote fd=%ld lkl_fd=%ld path=%s\n",
1652-
vfd, lkl_fd, translated);
1760+
"kbox: writable shadow promote fd=%d lkl_fd=%ld path=%s\n",
1761+
injected, lkl_fd, translated);
16531762
}
1654-
*out = kbox_dispatch_value((int64_t) vfd);
1763+
*out = kbox_dispatch_value((int64_t) injected);
16551764
return 1;
16561765
}
16571766

@@ -2689,16 +2798,16 @@ static void cleanup_replaced_fd_tracking(struct kbox_supervisor_ctx *ctx,
26892798

26902799
stale_lkl = kbox_fd_table_get_lkl(ctx->fd_table, fd);
26912800
if (stale_lkl >= 0) {
2801+
remove_fd_table_entry_with_writeback(ctx, fd);
26922802
lkl_close_and_invalidate(ctx, stale_lkl);
2693-
kbox_fd_table_remove(ctx->fd_table, fd);
26942803
} else if (stale_lkl == KBOX_LKL_FD_SHADOW_ONLY) {
2695-
kbox_fd_table_remove(ctx->fd_table, fd);
2804+
remove_fd_table_entry_with_writeback(ctx, fd);
26962805
}
26972806

26982807
shadow_vfd = kbox_fd_table_find_by_host_fd(ctx->fd_table, fd);
26992808
if (shadow_vfd >= 0) {
27002809
long shadow_lkl = kbox_fd_table_get_lkl(ctx->fd_table, shadow_vfd);
2701-
kbox_fd_table_remove(ctx->fd_table, shadow_vfd);
2810+
remove_fd_table_entry_with_writeback(ctx, shadow_vfd);
27022811
if (shadow_lkl >= 0) {
27032812
int ref = 0;
27042813
for (long j = 0; j < KBOX_FD_TABLE_MAX && !ref; j++)
@@ -2851,14 +2960,14 @@ static struct kbox_dispatch forward_dup2(const struct kbox_syscall_request *req,
28512960
/* Remove any stale mapping at newfd (virtual or shadow). */
28522961
long stale = kbox_fd_table_get_lkl(ctx->fd_table, newfd);
28532962
if (stale >= 0) {
2963+
remove_fd_table_entry_with_writeback(ctx, newfd);
28542964
lkl_close_and_invalidate(ctx, stale);
2855-
kbox_fd_table_remove(ctx->fd_table, newfd);
28562965
} else {
28572966
long sv =
28582967
kbox_fd_table_find_by_host_fd(ctx->fd_table, newfd);
28592968
if (sv >= 0) {
28602969
long sl = kbox_fd_table_get_lkl(ctx->fd_table, sv);
2861-
kbox_fd_table_remove(ctx->fd_table, sv);
2970+
remove_fd_table_entry_with_writeback(ctx, sv);
28622971
if (sl >= 0) {
28632972
int ref = 0;
28642973
for (long j = 0; j < KBOX_FD_TABLE_MAX; j++)
@@ -2914,7 +3023,7 @@ static struct kbox_dispatch forward_dup2(const struct kbox_syscall_request *req,
29143023
if (ret < 0)
29153024
return kbox_dispatch_errno((int) (-ret));
29163025

2917-
long existing = kbox_fd_table_remove(ctx->fd_table, newfd);
3026+
long existing = remove_fd_table_entry_with_writeback(ctx, newfd);
29183027
if (existing >= 0)
29193028
lkl_close_and_invalidate(ctx, existing);
29203029

@@ -2978,15 +3087,15 @@ static struct kbox_dispatch forward_dup3(const struct kbox_syscall_request *req,
29783087
/* Remove stale mapping at newfd (virtual or shadow). */
29793088
long stale3 = kbox_fd_table_get_lkl(ctx->fd_table, newfd);
29803089
if (stale3 >= 0) {
3090+
remove_fd_table_entry_with_writeback(ctx, newfd);
29813091
lkl_close_and_invalidate(ctx, stale3);
2982-
kbox_fd_table_remove(ctx->fd_table, newfd);
29833092
} else {
29843093
long sv3 =
29853094
kbox_fd_table_find_by_host_fd(ctx->fd_table, newfd);
29863095
if (sv3 >= 0) {
29873096
long sl3 =
29883097
kbox_fd_table_get_lkl(ctx->fd_table, sv3);
2989-
kbox_fd_table_remove(ctx->fd_table, sv3);
3098+
remove_fd_table_entry_with_writeback(ctx, sv3);
29903099
if (sl3 >= 0) {
29913100
int r3 = 0;
29923101
for (long j = 0; j < KBOX_FD_TABLE_MAX; j++)
@@ -3042,7 +3151,7 @@ static struct kbox_dispatch forward_dup3(const struct kbox_syscall_request *req,
30423151
if (ret < 0)
30433152
return kbox_dispatch_errno((int) (-ret));
30443153

3045-
long existing = kbox_fd_table_remove(ctx->fd_table, newfd);
3154+
long existing = remove_fd_table_entry_with_writeback(ctx, newfd);
30463155
if (existing >= 0)
30473156
lkl_close_and_invalidate(ctx, existing);
30483157

0 commit comments

Comments
 (0)