@@ -1030,6 +1030,40 @@ long next_hostonly_fd_hint(const struct kbox_supervisor_ctx *ctx)
10301030 return fd ;
10311031}
10321032
1033+ static long allocate_writable_shadow_fd (struct kbox_supervisor_ctx * ctx )
1034+ {
1035+ long base_fd = KBOX_FD_FAST_BASE ;
1036+ long end_fd = KBOX_FD_HOSTONLY_BASE ;
1037+ long start_fd ;
1038+ long fd ;
1039+
1040+ if (!ctx || !ctx -> fd_table )
1041+ return -1 ;
1042+
1043+ start_fd = ctx -> fd_table -> next_fast_fd ;
1044+ if (start_fd < base_fd || start_fd >= end_fd )
1045+ start_fd = base_fd ;
1046+
1047+ for (fd = start_fd ; fd < end_fd ; fd ++ ) {
1048+ struct kbox_fd_entry * entry = fd_table_entry (ctx -> fd_table , fd );
1049+
1050+ if (entry && entry -> lkl_fd == -1 && !child_fd_is_open (ctx , fd )) {
1051+ ctx -> fd_table -> next_fast_fd = fd + 1 ;
1052+ return fd ;
1053+ }
1054+ }
1055+ for (fd = base_fd ; fd < start_fd ; fd ++ ) {
1056+ struct kbox_fd_entry * entry = fd_table_entry (ctx -> fd_table , fd );
1057+
1058+ if (entry && entry -> lkl_fd == -1 && !child_fd_is_open (ctx , fd )) {
1059+ ctx -> fd_table -> next_fast_fd = fd + 1 ;
1060+ return fd ;
1061+ }
1062+ }
1063+
1064+ return -1 ;
1065+ }
1066+
10331067int ensure_proc_self_fd_dir (struct kbox_supervisor_ctx * ctx )
10341068{
10351069 if (!ctx )
@@ -1594,6 +1628,59 @@ void note_shadow_writeback_close(struct kbox_supervisor_ctx *ctx,
15941628 ctx -> active_writeback_shadows -- ;
15951629}
15961630
1631+ static void sync_cloexec_writebacks_in_range (struct kbox_supervisor_ctx * ctx ,
1632+ struct kbox_fd_entry * entries ,
1633+ long count )
1634+ {
1635+ long i ;
1636+
1637+ if (!ctx || !entries )
1638+ return ;
1639+ for (i = 0 ; i < count ; i ++ ) {
1640+ struct kbox_fd_entry * entry = & entries [i ];
1641+
1642+ if (entry -> lkl_fd != -1 && entry -> cloexec ) {
1643+ if (entry -> lkl_fd >= 0 )
1644+ invalidate_stat_cache_fd (ctx , entry -> lkl_fd );
1645+ if (entry -> shadow_writeback ) {
1646+ (void ) sync_shadow_writeback (ctx , entry );
1647+ note_shadow_writeback_close (ctx , entry );
1648+ }
1649+ }
1650+ }
1651+ }
1652+
1653+ void close_cloexec_with_writeback (struct kbox_supervisor_ctx * ctx )
1654+ {
1655+ if (!ctx || !ctx -> fd_table )
1656+ return ;
1657+
1658+ sync_cloexec_writebacks_in_range (ctx , ctx -> fd_table -> low_fds ,
1659+ KBOX_LOW_FD_MAX );
1660+ sync_cloexec_writebacks_in_range (ctx , ctx -> fd_table -> mid_fds ,
1661+ KBOX_MID_FD_MAX );
1662+ sync_cloexec_writebacks_in_range (ctx , ctx -> fd_table -> entries ,
1663+ KBOX_FD_TABLE_MAX );
1664+ kbox_fd_table_close_cloexec (ctx -> fd_table , ctx -> sysnrs );
1665+ }
1666+
1667+ static long remove_fd_table_entry_with_writeback (
1668+ struct kbox_supervisor_ctx * ctx ,
1669+ long fd )
1670+ {
1671+ struct kbox_fd_entry * entry ;
1672+
1673+ if (!ctx || !ctx -> fd_table )
1674+ return -1 ;
1675+
1676+ entry = fd_table_entry (ctx -> fd_table , fd );
1677+ if (entry && entry -> shadow_writeback ) {
1678+ (void ) sync_shadow_writeback (ctx , entry );
1679+ note_shadow_writeback_close (ctx , entry );
1680+ }
1681+ return kbox_fd_table_remove (ctx -> fd_table , fd );
1682+ }
1683+
15971684int try_writeback_shadow_open (struct kbox_supervisor_ctx * ctx ,
15981685 const struct kbox_syscall_request * req ,
15991686 long lkl_fd ,
@@ -1603,8 +1690,8 @@ int try_writeback_shadow_open(struct kbox_supervisor_ctx *ctx,
16031690{
16041691 struct kbox_fd_entry * entry ;
16051692 int memfd ;
1693+ long target_fd ;
16061694 int injected ;
1607- long vfd ;
16081695
16091696 if (!ctx || !req || !out || lkl_fd < 0 || !translated )
16101697 return 0 ;
@@ -1622,36 +1709,58 @@ int try_writeback_shadow_open(struct kbox_supervisor_ctx *ctx,
16221709 * sealed.
16231710 */
16241711
1625- vfd = kbox_fd_table_insert_fast (ctx -> fd_table , lkl_fd , 0 );
1626- if (vfd < 0 ) {
1712+ target_fd = allocate_writable_shadow_fd (ctx );
1713+ if (target_fd < 0 ) {
16271714 close (memfd );
16281715 return 0 ;
16291716 }
16301717
1631- injected = request_addfd_at (ctx , req , memfd , (int ) vfd ,
1718+ /* Install only in the fast-shadow band. Host-only FDs are allowed to
1719+ * close directly in BPF, which would bypass writeback, and kernel-picked
1720+ * ADDFD results can also exceed the fd-table range. Unlike the old
1721+ * preallocation approach, the fd-table entry is published only after
1722+ * ADDFD_AT succeeds and after child_fd_is_open() has verified that no
1723+ * pending close still owns this tracee fd number.
1724+ */
1725+ injected = request_addfd_at (ctx , req , memfd , (int ) target_fd ,
16321726 (flags & O_CLOEXEC ) ? O_CLOEXEC : 0 );
16331727 if (injected < 0 ) {
1634- kbox_fd_table_remove (ctx -> fd_table , vfd );
16351728 close (memfd );
16361729 return 0 ;
16371730 }
16381731
1639- entry = fd_table_entry (ctx -> fd_table , vfd );
1640- if (!entry ) {
1641- kbox_fd_table_remove (ctx -> fd_table , vfd );
1642- close (memfd );
1643- return 0 ;
1732+ /* SECCOMP_ADDFD_FLAG_SETFD guarantees injected == target_fd on success.
1733+ * A mismatch means the kernel API contract broke; the tracee would hold
1734+ * an untracked FD we cannot revoke. Crash loudly instead of leaking.
1735+ */
1736+ if (injected != target_fd ) {
1737+ fprintf (stderr ,
1738+ "kbox: ADDFD_AT returned %d, expected %ld -- aborting\n" ,
1739+ injected , target_fd );
1740+ abort ();
1741+ }
1742+
1743+ /* allocate_writable_shadow_fd validated target_fd is in-range and free.
1744+ * insert_at must not fail here; if it does, the tracee holds a live FD
1745+ * with no supervisor bookkeeping.
1746+ */
1747+ if (kbox_fd_table_insert_at (ctx -> fd_table , injected , lkl_fd , 0 ) < 0 ) {
1748+ fprintf (stderr ,
1749+ "kbox: fd_table_insert_at(%d) failed after ADDFD -- aborting\n" ,
1750+ injected );
1751+ abort ();
16441752 }
16451753
1754+ entry = fd_table_entry (ctx -> fd_table , injected );
16461755 entry -> host_fd = KBOX_FD_HOST_SAME_FD_SHADOW ;
16471756 entry -> shadow_sp = memfd ;
16481757 note_shadow_writeback_open (ctx , entry );
16491758 if (ctx -> verbose ) {
16501759 fprintf (stderr ,
1651- "kbox: writable shadow promote fd=%ld lkl_fd=%ld path=%s\n" ,
1652- vfd , lkl_fd , translated );
1760+ "kbox: writable shadow promote fd=%d lkl_fd=%ld path=%s\n" ,
1761+ injected , lkl_fd , translated );
16531762 }
1654- * out = kbox_dispatch_value ((int64_t ) vfd );
1763+ * out = kbox_dispatch_value ((int64_t ) injected );
16551764 return 1 ;
16561765}
16571766
@@ -2689,16 +2798,16 @@ static void cleanup_replaced_fd_tracking(struct kbox_supervisor_ctx *ctx,
26892798
26902799 stale_lkl = kbox_fd_table_get_lkl (ctx -> fd_table , fd );
26912800 if (stale_lkl >= 0 ) {
2801+ remove_fd_table_entry_with_writeback (ctx , fd );
26922802 lkl_close_and_invalidate (ctx , stale_lkl );
2693- kbox_fd_table_remove (ctx -> fd_table , fd );
26942803 } else if (stale_lkl == KBOX_LKL_FD_SHADOW_ONLY ) {
2695- kbox_fd_table_remove (ctx -> fd_table , fd );
2804+ remove_fd_table_entry_with_writeback (ctx , fd );
26962805 }
26972806
26982807 shadow_vfd = kbox_fd_table_find_by_host_fd (ctx -> fd_table , fd );
26992808 if (shadow_vfd >= 0 ) {
27002809 long shadow_lkl = kbox_fd_table_get_lkl (ctx -> fd_table , shadow_vfd );
2701- kbox_fd_table_remove (ctx -> fd_table , shadow_vfd );
2810+ remove_fd_table_entry_with_writeback (ctx , shadow_vfd );
27022811 if (shadow_lkl >= 0 ) {
27032812 int ref = 0 ;
27042813 for (long j = 0 ; j < KBOX_FD_TABLE_MAX && !ref ; j ++ )
@@ -2851,14 +2960,14 @@ static struct kbox_dispatch forward_dup2(const struct kbox_syscall_request *req,
28512960 /* Remove any stale mapping at newfd (virtual or shadow). */
28522961 long stale = kbox_fd_table_get_lkl (ctx -> fd_table , newfd );
28532962 if (stale >= 0 ) {
2963+ remove_fd_table_entry_with_writeback (ctx , newfd );
28542964 lkl_close_and_invalidate (ctx , stale );
2855- kbox_fd_table_remove (ctx -> fd_table , newfd );
28562965 } else {
28572966 long sv =
28582967 kbox_fd_table_find_by_host_fd (ctx -> fd_table , newfd );
28592968 if (sv >= 0 ) {
28602969 long sl = kbox_fd_table_get_lkl (ctx -> fd_table , sv );
2861- kbox_fd_table_remove (ctx -> fd_table , sv );
2970+ remove_fd_table_entry_with_writeback (ctx , sv );
28622971 if (sl >= 0 ) {
28632972 int ref = 0 ;
28642973 for (long j = 0 ; j < KBOX_FD_TABLE_MAX ; j ++ )
@@ -2914,7 +3023,7 @@ static struct kbox_dispatch forward_dup2(const struct kbox_syscall_request *req,
29143023 if (ret < 0 )
29153024 return kbox_dispatch_errno ((int ) (- ret ));
29163025
2917- long existing = kbox_fd_table_remove (ctx -> fd_table , newfd );
3026+ long existing = remove_fd_table_entry_with_writeback (ctx , newfd );
29183027 if (existing >= 0 )
29193028 lkl_close_and_invalidate (ctx , existing );
29203029
@@ -2978,15 +3087,15 @@ static struct kbox_dispatch forward_dup3(const struct kbox_syscall_request *req,
29783087 /* Remove stale mapping at newfd (virtual or shadow). */
29793088 long stale3 = kbox_fd_table_get_lkl (ctx -> fd_table , newfd );
29803089 if (stale3 >= 0 ) {
3090+ remove_fd_table_entry_with_writeback (ctx , newfd );
29813091 lkl_close_and_invalidate (ctx , stale3 );
2982- kbox_fd_table_remove (ctx -> fd_table , newfd );
29833092 } else {
29843093 long sv3 =
29853094 kbox_fd_table_find_by_host_fd (ctx -> fd_table , newfd );
29863095 if (sv3 >= 0 ) {
29873096 long sl3 =
29883097 kbox_fd_table_get_lkl (ctx -> fd_table , sv3 );
2989- kbox_fd_table_remove (ctx -> fd_table , sv3 );
3098+ remove_fd_table_entry_with_writeback (ctx , sv3 );
29903099 if (sl3 >= 0 ) {
29913100 int r3 = 0 ;
29923101 for (long j = 0 ; j < KBOX_FD_TABLE_MAX ; j ++ )
@@ -3042,7 +3151,7 @@ static struct kbox_dispatch forward_dup3(const struct kbox_syscall_request *req,
30423151 if (ret < 0 )
30433152 return kbox_dispatch_errno ((int ) (- ret ));
30443153
3045- long existing = kbox_fd_table_remove (ctx -> fd_table , newfd );
3154+ long existing = remove_fd_table_entry_with_writeback (ctx , newfd );
30463155 if (existing >= 0 )
30473156 lkl_close_and_invalidate (ctx , existing );
30483157
0 commit comments