2424#include <sys/socket.h>
2525#include <dirent.h> /* fdopendir, for DIR* reconstruction in child */
2626#include <sys/wait.h>
27+ #include <sys/clonefile.h> /* fclonefileat for CoW shm snapshots */
2728#include <mach-o/dyld.h>
2829
2930#include "hvutil.h"
@@ -1137,6 +1138,51 @@ static void *vm_clone_thread_run(void *arg)
11371138 return NULL ;
11381139}
11391140
1141+ /* Create an APFS block-level CoW clone of src_fd via fclonefileat (O(metadata),
1142+ * independent of the source once either side writes). Returns the clone fd on
1143+ * success, -1 with errno set on failure (non-APFS /tmp, ENOSYS, ENOSPC, ...).
1144+ * Callers that issue this snapshot are documented at the call site; the helper
1145+ * itself only owns the clone-path lifecycle.
1146+ */
1147+ static int fork_snapshot_shm_via_clonefile (int src_fd )
1148+ {
1149+ /* fclonefileat needs a destination path on the same APFS volume as the
1150+ * source. /tmp is APFS on every shipped macOS Apple Silicon configuration;
1151+ * if a user has remapped /tmp to a different filesystem the call fails
1152+ * and the caller drops back to the legacy path.
1153+ *
1154+ * The destination lives inside a fresh mkdtemp directory (mode 0700) so
1155+ * no other local user can race to claim the destination basename between
1156+ * path selection and fclonefileat: an earlier mkstemp + unlink +
1157+ * fclonefileat sequence left a window where /tmp was world-writable for
1158+ * that name and a concurrent process could DoS the fast path via EEXIST.
1159+ */
1160+ char tmpdir [] = "/tmp/elfuse-fork-XXXXXX" ;
1161+ if (mkdtemp (tmpdir ) == NULL )
1162+ return -1 ;
1163+ char clone_path [64 ];
1164+ snprintf (clone_path , sizeof (clone_path ), "%s/snap" , tmpdir );
1165+ if (fclonefileat (src_fd , AT_FDCWD , clone_path , 0 ) < 0 ) {
1166+ int saved_errno = errno ;
1167+ rmdir (tmpdir );
1168+ errno = saved_errno ;
1169+ return -1 ;
1170+ }
1171+ int clone_fd = open (clone_path , O_RDWR | O_CLOEXEC );
1172+ int saved_errno = errno ;
1173+ /* Best-effort cleanup: the clone fd alone keeps the inode alive, so any
1174+ * unlink/rmdir failure here is a directory-leak nuisance, not a
1175+ * correctness issue. Caller still gets the open fd.
1176+ */
1177+ (void ) unlink (clone_path );
1178+ (void ) rmdir (tmpdir );
1179+ if (clone_fd < 0 ) {
1180+ errno = saved_errno ;
1181+ return -1 ;
1182+ }
1183+ return clone_fd ;
1184+ }
1185+
11401186int64_t sys_clone (hv_vcpu_t vcpu ,
11411187 guest_t * g ,
11421188 uint64_t flags ,
@@ -1163,13 +1209,13 @@ int64_t sys_clone(hv_vcpu_t vcpu,
11631209 verbose );
11641210 }
11651211
1166- /* Rosetta fork takes the helper-process IPC path. The CoW shm fast-path
1167- * is gated off in use_shm because HVF caches VA->PA at hv_vm_map time and
1168- * the parent's MAP_SHARED mapping cannot be safely remapped under the
1169- * running vCPU. The TTBR1 kbuf tree, translator image, and kbuf bytes
1170- * ride along as primary-buffer used regions; the child restores
1171- * TCR_EL1 / TTBR1_EL1 from ipc_registers_t and recomputes kbuf_base
1172- * from kbuf_gpa.
1212+ /* Rosetta fork takes the helper-process IPC path. The parent cannot remap
1213+ * its live guest memory under the running vCPU because HVF caches VA->PA at
1214+ * hv_vm_map time; instead, the fork path snapshots shm with clonefile when
1215+ * available and otherwise falls back to region copy. The TTBR1 kbuf tree,
1216+ * translator image, and kbuf bytes ride along as primary-buffer used
1217+ * regions; the child restores TCR_EL1 / TTBR1_EL1 from ipc_registers_t and
1218+ * recomputes kbuf_base from kbuf_gpa.
11731219 */
11741220
11751221 /* elfuse only supports fork-like clone (SIGCHLD) and posix_spawn-like
@@ -1291,10 +1337,17 @@ int64_t sys_clone(hv_vcpu_t vcpu,
12911337 return - LINUX_ENOMEM ;
12921338 }
12931339
1294- /* The parent keeps only its end of the control channel. */
1340+ /* The parent keeps only its end of the control channel. Reset the closed
1341+ * write end to -1 so the fail_snapshot guarded close at the bottom of the
1342+ * function cannot double-close it. In a multithreaded guest, another vCPU
1343+ * could open a new fd between the two closes and get the same number,
1344+ * which the second close would then steal.
1345+ */
12951346 close (sock_fds [1 ]);
1296- if (vfork_notify_fds [1 ] >= 0 )
1347+ if (vfork_notify_fds [1 ] >= 0 ) {
12971348 close (vfork_notify_fds [1 ]);
1349+ vfork_notify_fds [1 ] = -1 ;
1350+ }
12981351 int ipc_sock = sock_fds [0 ];
12991352
13001353 /* Allocate guest PID before serialization so the child header carries its
@@ -1314,6 +1367,10 @@ int64_t sys_clone(hv_vcpu_t vcpu,
13141367 mmap_fork_anon_shared_txn_t * anon_shared_txn = NULL ;
13151368 guest_region_t * regions_snapshot = NULL ;
13161369 guest_region_t preannounced_snapshot [GUEST_MAX_PREANNOUNCED ];
1370+ /* APFS clone fd for the CoW snapshot sent to the child. Declared up front
1371+ * so early goto fail_snapshot exits do not read an uninitialized local.
1372+ */
1373+ int snapshot_shm_fd = -1 ;
13171374
13181375 /* Convert MAP_SHARED|MAP_ANONYMOUS regions that have no backing fd
13191376 * into memfd-backed overlay regions. The conversion seeds a private
@@ -1328,31 +1385,87 @@ int64_t sys_clone(hv_vcpu_t vcpu,
13281385 if (mmap_fork_prepare_anon_shared (g , & anon_shared_txn ) < 0 )
13291386 goto fail_snapshot ;
13301387
1331- /* Determine if elfuse can use the CoW (shm) fast path.
1332- * If shm_fd >= 0, elfuse freezes a snapshot via MAP_PRIVATE and sends the
1333- * shm fd to the child. Otherwise fall back to region-by-region copy .
1388+ /* CoW fast path: if shm_fd >= 0, send a snapshot of guest memory to the
1389+ * child instead of the per-region copy. The child maps that snapshot
1390+ * MAP_PRIVATE; subsequent writes on either side are private .
13341391 *
1335- * Rosetta guests are excluded from CoW even when shm-backed: rosetta's
1336- * JIT state (TLS slabs, code caches, indirect-call tables, block lists)
1337- * is process-local and corrupts when CoW-shared. The legacy region-copy
1338- * path preserves the parent's JIT state independently per child.
1339- */
1340- bool use_shm = (g -> shm_fd >= 0 ) && !g -> is_rosetta ;
1341-
1342- /* elfuse does not remap the parent to MAP_PRIVATE here. The parent
1343- * stays on MAP_SHARED; its vCPU continues writing to the shared file.
1344- * The child maps MAP_PRIVATE, getting a CoW snapshot.
1392+ * The parent's own mapping cannot be flipped to MAP_PRIVATE here: hv_vm_map
1393+ * caches the host VA->PA mapping, and a MAP_FIXED remap invalidates it
1394+ * (the parent then reads stale memory and writev returns EFAULT). So the
1395+ * parent stays on MAP_SHARED and the snapshot is what isolates the child.
13451396 *
1346- * This is safe because the IPC is synchronous: the child maps MAP_PRIVATE
1347- * before the parent's vCPU resumes. After that, the child's CoW pages are
1348- * frozen (child writes are private, parent writes to MAP_SHARED do not
1349- * affect CoW'd child pages).
1397+ * Two snapshot sources, in preference order (selected just below):
1398+ * 1. fclonefileat of g->shm_fd to an independent APFS clone. The clone
1399+ * shares blocks with the parent until either side writes, so the
1400+ * parent's subsequent writes never reach the child's backing.
1401+ * 2. The live g->shm_fd. Any page the child has not yet COW'd reads the
1402+ * parent's current bytes -- benign for typical guest state, but
1403+ * corrupts Rosetta's translator-internal structures (TLS slabs, code
1404+ * caches, indirect-call tables) on mid-update reads. Issue #45.
13501405 *
1351- * an earlier implementation tried remapping the parent to MAP_PRIVATE here,
1352- * but that breaks HVF: hv_vm_map caches the host VA->PA mapping, and
1353- * MAP_FIXED remap invalidates it. The parent's vCPU then reads stale
1354- * memory, causing corrupted syscall data (EFAULT on writev).
1406+ * Rosetta therefore requires path 1 and falls back to region copy if
1407+ * fclonefileat fails; native guests accept path 2 as a fallback so a
1408+ * non-APFS /tmp does not silently slow forks down to per-region copy cost.
13551409 */
1410+ bool use_shm = (g -> shm_fd >= 0 );
1411+
1412+ /* Overlay sync runs before the snapshot so the cloned file picks up the
1413+ * overlay-backed bytes. The parent's host VA for each overlay region maps
1414+ * the overlay file, not shm_fd, so shm_fd's contents at those offsets are
1415+ * stale (typically zero) until the pwrite below copies them in. Both the
1416+ * clone-fd path and the live-shm_fd fallback consume this sync.
1417+ */
1418+ if (use_shm ) {
1419+ for (int i = 0 ; i < g -> nregions ; i ++ ) {
1420+ const guest_region_t * r = & g -> regions [i ];
1421+ if (!r -> overlay_active )
1422+ continue ;
1423+ uint64_t len = r -> end - r -> start ;
1424+ const uint8_t * src = (const uint8_t * ) g -> host_base + r -> start ;
1425+ uint64_t off = r -> start ;
1426+ while (len > 0 ) {
1427+ size_t chunk = len > (uint64_t ) SSIZE_MAX ? (size_t ) SSIZE_MAX
1428+ : (size_t ) len ;
1429+ ssize_t nw = pwrite (g -> shm_fd , src , chunk , (off_t ) off );
1430+ if (nw < 0 ) {
1431+ if (errno == EINTR )
1432+ continue ;
1433+ log_error ("clone: shm overlay sync pwrite failed: %s" ,
1434+ strerror (errno ));
1435+ goto fail_snapshot ;
1436+ }
1437+ if (nw == 0 ) {
1438+ log_error ("clone: shm overlay sync pwrite returned 0" );
1439+ goto fail_snapshot ;
1440+ }
1441+ src += nw ;
1442+ off += (uint64_t ) nw ;
1443+ len -= (uint64_t ) nw ;
1444+ }
1445+ }
1446+ /* Attempt the APFS clone snapshot for every guest, not just Rosetta:
1447+ * the clone gives POSIX-style isolation at O(metadata) cost and avoids
1448+ * torn-snapshot reads in guests that snapshot their own state across
1449+ * fork (Redis BGSAVE, checkpointing runtimes). On failure the fallback
1450+ * differs per design above: Rosetta drops use_shm so the region-copy
1451+ * path runs; native guests keep use_shm and send the live g->shm_fd.
1452+ */
1453+ snapshot_shm_fd = fork_snapshot_shm_via_clonefile (g -> shm_fd );
1454+ if (snapshot_shm_fd < 0 ) {
1455+ if (g -> is_rosetta ) {
1456+ log_warn (
1457+ "clone: rosetta CoW snapshot via fclonefileat failed "
1458+ "(%s); falling back to region-copy path" ,
1459+ strerror (errno ));
1460+ use_shm = false;
1461+ } else {
1462+ log_debug (
1463+ "clone: CoW snapshot via fclonefileat failed (%s); "
1464+ "sending live shm fd as fallback" ,
1465+ strerror (errno ));
1466+ }
1467+ }
1468+ }
13561469
13571470 /* Snapshot of the semantic region array, populated after the memory dump
13581471 * but before sibling vCPUs resume. Declared up front so all goto paths to
@@ -1401,46 +1514,13 @@ int64_t sys_clone(hv_vcpu_t vcpu,
14011514 goto fail_snapshot ;
14021515 }
14031516
1404- /* CoW path: sync MAP_SHARED file overlays back into shm_fd before
1405- * sending it to the child. The parent's host VA at each overlay
1406- * region maps the overlay file, not shm_fd, so shm_fd's content at
1407- * those IPAs is stale (typically zero). The child's MAP_PRIVATE
1408- * snapshot would expose that stale data at the overlay IPAs. Copy
1409- * the live overlay bytes into shm_fd at the matching offsets so the
1410- * child snapshot reflects the parent's view at fork time. Live
1411- * cross-fork MAP_SHARED coherence (parent and child both seeing
1412- * subsequent writes through the same file) is left to the cross-fork
1413- * coherence TODO; this fix only avoids the stale-snapshot regression.
1517+ /* Send the snapshot fd if fclonefileat succeeded, otherwise the live
1518+ * g->shm_fd. The Rosetta-failure case already cleared use_shm above so it
1519+ * never reaches this branch with snapshot_shm_fd < 0.
14141520 */
14151521 if (use_shm ) {
1416- for (int i = 0 ; i < g -> nregions ; i ++ ) {
1417- const guest_region_t * r = & g -> regions [i ];
1418- if (!r -> overlay_active )
1419- continue ;
1420- uint64_t len = r -> end - r -> start ;
1421- const uint8_t * src = (const uint8_t * ) g -> host_base + r -> start ;
1422- uint64_t off = r -> start ;
1423- while (len > 0 ) {
1424- size_t chunk = len > (uint64_t ) SSIZE_MAX ? (size_t ) SSIZE_MAX
1425- : (size_t ) len ;
1426- ssize_t nw = pwrite (g -> shm_fd , src , chunk , (off_t ) off );
1427- if (nw < 0 ) {
1428- if (errno == EINTR )
1429- continue ;
1430- log_error ("clone: shm overlay sync pwrite failed: %s" ,
1431- strerror (errno ));
1432- goto fail_snapshot ;
1433- }
1434- if (nw == 0 ) {
1435- log_error ("clone: shm overlay sync pwrite returned 0" );
1436- goto fail_snapshot ;
1437- }
1438- src += nw ;
1439- off += (uint64_t ) nw ;
1440- len -= (uint64_t ) nw ;
1441- }
1442- }
1443- if (fork_ipc_send_fds (ipc_sock , & g -> shm_fd , 1 ) < 0 ) {
1522+ int fd_to_send = (snapshot_shm_fd >= 0 ) ? snapshot_shm_fd : g -> shm_fd ;
1523+ if (fork_ipc_send_fds (ipc_sock , & fd_to_send , 1 ) < 0 ) {
14441524 log_error ("clone: failed to send shm fd" );
14451525 goto fail_snapshot ;
14461526 }
@@ -1555,10 +1635,14 @@ int64_t sys_clone(hv_vcpu_t vcpu,
15551635 child_host_pid );
15561636
15571637 free (regions_snapshot );
1638+ if (snapshot_shm_fd >= 0 )
1639+ close (snapshot_shm_fd );
15581640 return child_guest_pid ;
15591641
15601642fail_snapshot :
15611643 free (regions_snapshot );
1644+ if (snapshot_shm_fd >= 0 )
1645+ close (snapshot_shm_fd );
15621646 /* Roll back the in-place anon-shared overlay conversion while
15631647 * siblings are still parked. A partial rollback failure (e.g.,
15641648 * region drift past the quiesce timeout) leaves the parent in a
@@ -1578,6 +1662,23 @@ int64_t sys_clone(hv_vcpu_t vcpu,
15781662 close (vfork_notify_fds [0 ]);
15791663 if (vfork_notify_fds [1 ] >= 0 )
15801664 close (vfork_notify_fds [1 ]);
1665+ /* posix_spawn at the top of sys_clone always succeeds before any goto
1666+ * fail_snapshot fires, so child_host_pid is a live process here. The
1667+ * IPC socket just closed; the child reads EOF on fork_ipc_read_all and
1668+ * returns nonzero from fork_child_main. Without an explicit waitpid the
1669+ * exited child becomes a zombie: proc_register_child only runs on the
1670+ * success path, so neither proc_reap_finished nor sys_wait4 will ever
1671+ * pick this PID up, and the guest's fork(2) already reported failure.
1672+ * Reap it here to keep host PIDs from accumulating across repeated
1673+ * failures.
1674+ */
1675+ pid_t reaped ;
1676+ do {
1677+ reaped = waitpid (child_host_pid , NULL , 0 );
1678+ } while (reaped < 0 && errno == EINTR );
1679+ if (reaped < 0 )
1680+ log_warn ("clone: failed to reap fork-child pid=%d: %s" ,
1681+ (int ) child_host_pid , strerror (errno ));
15811682 return - LINUX_ENOMEM ;
15821683}
15831684
0 commit comments