Skip to content

Commit b724c9d

Browse files
committed
Snapshot shm via APFS fclonefileat for safe CoW
Fork previously sent the parent's live shm_fd via SCM_RIGHTS and the child mapped it MAP_PRIVATE. The parent stayed on MAP_SHARED, so any page the child had not yet COW'd reflected the parent's current bytes. That was benign for typical aarch64 workloads but corrupted x86_64-via-Rosetta guests: translator-internal structures (TLS slabs, code caches, indirect-call tables, block lists) cross page boundaries, so the child could read them while the parent was mid-update. Issue #45 tracked the resulting fallback to a per-region byte copy through the IPC socket -- 10-16x slower per fork than the CoW path.

sys_clone now lifts the !g->is_rosetta gate and always asks fork_snapshot_shm_via_clonefile() for an APFS fclonefileat snapshot of the shm file. The clone shares blocks with the parent until either side writes, so the parent's later writes never reach the child's backing, and the existing guest_init_from_shm MAP_PRIVATE flow on the child consumes the snapshot unchanged.

The snapshot helper uses a mode 0700 mkdtemp directory (clone inside, then unlink + rmdir) rather than an earlier mkstemp + unlink + fclonefileat sequence whose freed /tmp basename gave a local-user TOCTOU window that could DoS the fast path via EEXIST.

Fallback differs by guest. Rosetta drops use_shm on clonefile failure and falls through to the legacy region-copy path; sending the live fd would re-introduce the issue #45 corruption. Native guests keep use_shm and send g->shm_fd directly, preserving the original CoW behavior so a non-APFS /tmp does not silently slow forks down to per-region copy cost.

Overlay sync (pwrite of file-backed MAP_SHARED overlay bytes into shm_fd) moves before the IPC header so the cloned file picks up overlay-backed bytes and the header has_shm field reflects the post-clonefile outcome.

guest_init_from_shm now closes shm_fd on the compute_infra_layout failure path so the take-ownership contract holds on every error, not just the post-mmap ones.
tests/bench-fork-cost.sh is added as the regression baseline. Per-fork wall-clock means from three back-to-back runs on M1 (subshell fork, no exec, per-fork numbers exclude startup via a 0-iter baseline subtraction):

    rss      aarch64 CoW     Rosetta (before / after)   ratio
    0 MiB    ~113 ms/fork    ~1058-1196 / ~113 ms       10x -> ~1x
    1 MiB    ~113 ms/fork    ~1090-1250 / ~117 ms       10x -> ~1x
    16 MiB   ~114 ms/fork    ~1125-1230 / ~120 ms       10x -> ~1x
    64 MiB   ~114 ms/fork    ~1400-1840 / ~220 ms       12-16x -> ~2x

The 64 MiB Rosetta residual is APFS clone metadata plus child-side MAP_PRIVATE materialization, not byte-copy bandwidth.

test-cow-fork (5/5), make check, and the 71-test make test-rosetta-all suite stay green.

Close #45
1 parent 0a46e4f commit b724c9d

3 files changed

Lines changed: 395 additions & 69 deletions

File tree

src/core/guest.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -458,8 +458,15 @@ int guest_init_from_shm(guest_t *g,
458458
g->interp_base = size - 0x100000000ULL;
459459
g->mmap_limit = size - 0x200000000ULL;
460460
g->overflow_ipa_next = size;
461-
if (compute_infra_layout(g) < 0)
461+
if (compute_infra_layout(g) < 0) {
462+
/* Layout computation may reject a malformed header (impossible
463+
* guest_size / ipa_bits combination) before the mapping is set up;
464+
* close the inherited shm fd here so the caller's contract -- this
465+
* function takes ownership of shm_fd -- holds on every error path.
466+
*/
467+
close(shm_fd);
462468
return -1;
469+
}
463470
g->pt_pool_next = g->pt_pool_base;
464471

465472
/* Map the shm fd MAP_PRIVATE: copy-on-write semantics. Reads see

src/runtime/forkipc.c

Lines changed: 169 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include <sys/socket.h>
2525
#include <dirent.h> /* fdopendir, for DIR* reconstruction in child */
2626
#include <sys/wait.h>
27+
#include <sys/clonefile.h> /* fclonefileat for CoW shm snapshots */
2728
#include <mach-o/dyld.h>
2829

2930
#include "hvutil.h"
@@ -1137,6 +1138,51 @@ static void *vm_clone_thread_run(void *arg)
11371138
return NULL;
11381139
}
11391140

1141+
/* Snapshot src_fd into an independent file with fclonefileat and hand back an
 * open descriptor for the copy.
 *
 * Returns the descriptor of the clone (opened O_RDWR | O_CLOEXEC) on success.
 * Returns -1 on failure with errno describing the step that failed: mkdtemp,
 * fclonefileat, or open. errno is preserved across the cleanup calls.
 *
 * The helper creates, and afterwards removes, a scratch directory plus the
 * clone's path name; the returned descriptor is the only thing the caller
 * receives and is responsible for closing.
 */
static int fork_snapshot_shm_via_clonefile(int src_fd)
{
    /* The clone needs a destination path; it is placed under /tmp, and the
     * caller is expected to cope with failure (for example when /tmp is not
     * a filesystem fclonefileat can clone onto).
     *
     * mkdtemp gives a freshly created private directory, so the fixed
     * basename "snap" inside it cannot already be taken by another local
     * user when fclonefileat runs.
     */
    char scratch_dir[] = "/tmp/elfuse-fork-XXXXXX";
    if (!mkdtemp(scratch_dir))
        return -1;

    char snap_path[64];
    snprintf(snap_path, sizeof(snap_path), "%s/snap", scratch_dir);

    int snap_fd = -1;
    int failure_errno;
    if (fclonefileat(src_fd, AT_FDCWD, snap_path, 0) < 0) {
        /* Nothing was created inside the directory; only it needs removing. */
        failure_errno = errno;
    } else {
        snap_fd = open(snap_path, O_RDWR | O_CLOEXEC);
        /* Capture errno before cleanup can overwrite it; only consulted when
         * open failed.
         */
        failure_errno = errno;
        /* Best effort: drop the name right away. On success the open
         * descriptor is what the caller uses, so a failed unlink only leaves
         * clutter behind.
         */
        (void) unlink(snap_path);
    }

    /* Best effort as well; fails harmlessly if the unlink above did not
     * empty the directory.
     */
    (void) rmdir(scratch_dir);

    if (snap_fd < 0) {
        errno = failure_errno;
        return -1;
    }
    return snap_fd;
}
1185+
11401186
int64_t sys_clone(hv_vcpu_t vcpu,
11411187
guest_t *g,
11421188
uint64_t flags,
@@ -1163,13 +1209,13 @@ int64_t sys_clone(hv_vcpu_t vcpu,
11631209
verbose);
11641210
}
11651211

1166-
/* Rosetta fork takes the helper-process IPC path. The CoW shm fast-path
1167-
* is gated off in use_shm because HVF caches VA->PA at hv_vm_map time and
1168-
* the parent's MAP_SHARED mapping cannot be safely remapped under the
1169-
* running vCPU. The TTBR1 kbuf tree, translator image, and kbuf bytes
1170-
* ride along as primary-buffer used regions; the child restores
1171-
* TCR_EL1 / TTBR1_EL1 from ipc_registers_t and recomputes kbuf_base
1172-
* from kbuf_gpa.
1212+
/* Rosetta fork takes the helper-process IPC path. The parent cannot remap
1213+
* its live guest memory under the running vCPU because HVF caches VA->PA at
1214+
* hv_vm_map time; instead, the fork path snapshots shm with clonefile when
1215+
* available and otherwise falls back to region copy. The TTBR1 kbuf tree,
1216+
* translator image, and kbuf bytes ride along as primary-buffer used
1217+
* regions; the child restores TCR_EL1 / TTBR1_EL1 from ipc_registers_t and
1218+
* recomputes kbuf_base from kbuf_gpa.
11731219
*/
11741220

11751221
/* elfuse only supports fork-like clone (SIGCHLD) and posix_spawn-like
@@ -1291,10 +1337,17 @@ int64_t sys_clone(hv_vcpu_t vcpu,
12911337
return -LINUX_ENOMEM;
12921338
}
12931339

1294-
/* The parent keeps only its end of the control channel. */
1340+
/* The parent keeps only its end of the control channel. Reset the closed
1341+
* write end to -1 so the fail_snapshot guarded close at the bottom of the
1342+
* function cannot double-close it. In a multithreaded guest, another vCPU
1343+
* could open a new fd between the two closes and get the same number,
1344+
* which the second close would then steal.
1345+
*/
12951346
close(sock_fds[1]);
1296-
if (vfork_notify_fds[1] >= 0)
1347+
if (vfork_notify_fds[1] >= 0) {
12971348
close(vfork_notify_fds[1]);
1349+
vfork_notify_fds[1] = -1;
1350+
}
12981351
int ipc_sock = sock_fds[0];
12991352

13001353
/* Allocate guest PID before serialization so the child header carries its
@@ -1314,6 +1367,10 @@ int64_t sys_clone(hv_vcpu_t vcpu,
13141367
mmap_fork_anon_shared_txn_t *anon_shared_txn = NULL;
13151368
guest_region_t *regions_snapshot = NULL;
13161369
guest_region_t preannounced_snapshot[GUEST_MAX_PREANNOUNCED];
1370+
/* APFS clone fd for the CoW snapshot sent to the child. Declared up front
1371+
* so early goto fail_snapshot exits do not read an uninitialized local.
1372+
*/
1373+
int snapshot_shm_fd = -1;
13171374

13181375
/* Convert MAP_SHARED|MAP_ANONYMOUS regions that have no backing fd
13191376
* into memfd-backed overlay regions. The conversion seeds a private
@@ -1328,31 +1385,87 @@ int64_t sys_clone(hv_vcpu_t vcpu,
13281385
if (mmap_fork_prepare_anon_shared(g, &anon_shared_txn) < 0)
13291386
goto fail_snapshot;
13301387

1331-
/* Determine if elfuse can use the CoW (shm) fast path.
1332-
* If shm_fd >= 0, elfuse freezes a snapshot via MAP_PRIVATE and sends the
1333-
* shm fd to the child. Otherwise fall back to region-by-region copy.
1388+
/* CoW fast path: if shm_fd >= 0, send a snapshot of guest memory to the
1389+
* child instead of the per-region copy. The child maps that snapshot
1390+
* MAP_PRIVATE; subsequent writes on either side are private.
13341391
*
1335-
* Rosetta guests are excluded from CoW even when shm-backed: rosetta's
1336-
* JIT state (TLS slabs, code caches, indirect-call tables, block lists)
1337-
* is process-local and corrupts when CoW-shared. The legacy region-copy
1338-
* path preserves the parent's JIT state independently per child.
1339-
*/
1340-
bool use_shm = (g->shm_fd >= 0) && !g->is_rosetta;
1341-
1342-
/* elfuse does not remap the parent to MAP_PRIVATE here. The parent
1343-
* stays on MAP_SHARED; its vCPU continues writing to the shared file.
1344-
* The child maps MAP_PRIVATE, getting a CoW snapshot.
1392+
* The parent's own mapping cannot be flipped to MAP_PRIVATE here: hv_vm_map
1393+
* caches the host VA->PA mapping, and a MAP_FIXED remap invalidates it
1394+
* (the parent then reads stale memory and writev returns EFAULT). So the
1395+
* parent stays on MAP_SHARED and the snapshot is what isolates the child.
13451396
*
1346-
* This is safe because the IPC is synchronous: the child maps MAP_PRIVATE
1347-
* before the parent's vCPU resumes. After that, the child's CoW pages are
1348-
* frozen (child writes are private, parent writes to MAP_SHARED do not
1349-
* affect CoW'd child pages).
1397+
* Two snapshot sources, in preference order (selected just below):
1398+
* 1. fclonefileat of g->shm_fd to an independent APFS clone. The clone
1399+
* shares blocks with the parent until either side writes, so the
1400+
* parent's subsequent writes never reach the child's backing.
1401+
* 2. The live g->shm_fd. Any page the child has not yet COW'd reads the
1402+
* parent's current bytes -- benign for typical guest state, but
1403+
* corrupts Rosetta's translator-internal structures (TLS slabs, code
1404+
* caches, indirect-call tables) on mid-update reads. Issue #45.
13501405
*
1351-
* an earlier implementation tried remapping the parent to MAP_PRIVATE here,
1352-
* but that breaks HVF: hv_vm_map caches the host VA->PA mapping, and
1353-
* MAP_FIXED remap invalidates it. The parent's vCPU then reads stale
1354-
* memory, causing corrupted syscall data (EFAULT on writev).
1406+
* Rosetta therefore requires path 1 and falls back to region copy if
1407+
* fclonefileat fails; native guests accept path 2 as a fallback so a
1408+
* non-APFS /tmp does not silently slow forks down to per-region copy cost.
13551409
*/
1410+
bool use_shm = (g->shm_fd >= 0);
1411+
1412+
/* Overlay sync runs before the snapshot so the cloned file picks up the
1413+
* overlay-backed bytes. The parent's host VA for each overlay region maps
1414+
* the overlay file, not shm_fd, so shm_fd's contents at those offsets are
1415+
* stale (typically zero) until the pwrite below copies them in. Both the
1416+
* clone-fd path and the live-shm_fd fallback consume this sync.
1417+
*/
1418+
if (use_shm) {
1419+
for (int i = 0; i < g->nregions; i++) {
1420+
const guest_region_t *r = &g->regions[i];
1421+
if (!r->overlay_active)
1422+
continue;
1423+
uint64_t len = r->end - r->start;
1424+
const uint8_t *src = (const uint8_t *) g->host_base + r->start;
1425+
uint64_t off = r->start;
1426+
while (len > 0) {
1427+
size_t chunk = len > (uint64_t) SSIZE_MAX ? (size_t) SSIZE_MAX
1428+
: (size_t) len;
1429+
ssize_t nw = pwrite(g->shm_fd, src, chunk, (off_t) off);
1430+
if (nw < 0) {
1431+
if (errno == EINTR)
1432+
continue;
1433+
log_error("clone: shm overlay sync pwrite failed: %s",
1434+
strerror(errno));
1435+
goto fail_snapshot;
1436+
}
1437+
if (nw == 0) {
1438+
log_error("clone: shm overlay sync pwrite returned 0");
1439+
goto fail_snapshot;
1440+
}
1441+
src += nw;
1442+
off += (uint64_t) nw;
1443+
len -= (uint64_t) nw;
1444+
}
1445+
}
1446+
/* Attempt the APFS clone snapshot for every guest, not just Rosetta:
1447+
* the clone gives POSIX-style isolation at O(metadata) cost and avoids
1448+
* torn-snapshot reads in guests that snapshot their own state across
1449+
* fork (Redis BGSAVE, checkpointing runtimes). On failure the fallback
1450+
* differs per design above: Rosetta drops use_shm so the region-copy
1451+
* path runs; native guests keep use_shm and send the live g->shm_fd.
1452+
*/
1453+
snapshot_shm_fd = fork_snapshot_shm_via_clonefile(g->shm_fd);
1454+
if (snapshot_shm_fd < 0) {
1455+
if (g->is_rosetta) {
1456+
log_warn(
1457+
"clone: rosetta CoW snapshot via fclonefileat failed "
1458+
"(%s); falling back to region-copy path",
1459+
strerror(errno));
1460+
use_shm = false;
1461+
} else {
1462+
log_debug(
1463+
"clone: CoW snapshot via fclonefileat failed (%s); "
1464+
"sending live shm fd as fallback",
1465+
strerror(errno));
1466+
}
1467+
}
1468+
}
13561469

13571470
/* Snapshot of the semantic region array, populated after the memory dump
13581471
* but before sibling vCPUs resume. Declared up front so all goto paths to
@@ -1401,46 +1514,13 @@ int64_t sys_clone(hv_vcpu_t vcpu,
14011514
goto fail_snapshot;
14021515
}
14031516

1404-
/* CoW path: sync MAP_SHARED file overlays back into shm_fd before
1405-
* sending it to the child. The parent's host VA at each overlay
1406-
* region maps the overlay file, not shm_fd, so shm_fd's content at
1407-
* those IPAs is stale (typically zero). The child's MAP_PRIVATE
1408-
* snapshot would expose that stale data at the overlay IPAs. Copy
1409-
* the live overlay bytes into shm_fd at the matching offsets so the
1410-
* child snapshot reflects the parent's view at fork time. Live
1411-
* cross-fork MAP_SHARED coherence (parent and child both seeing
1412-
* subsequent writes through the same file) is left to the cross-fork
1413-
* coherence TODO; this fix only avoids the stale-snapshot regression.
1517+
/* Send the snapshot fd if fclonefileat succeeded, otherwise the live
1518+
* g->shm_fd. The Rosetta-failure case already cleared use_shm above so it
1519+
* never reaches this branch with snapshot_shm_fd < 0.
14141520
*/
14151521
if (use_shm) {
1416-
for (int i = 0; i < g->nregions; i++) {
1417-
const guest_region_t *r = &g->regions[i];
1418-
if (!r->overlay_active)
1419-
continue;
1420-
uint64_t len = r->end - r->start;
1421-
const uint8_t *src = (const uint8_t *) g->host_base + r->start;
1422-
uint64_t off = r->start;
1423-
while (len > 0) {
1424-
size_t chunk = len > (uint64_t) SSIZE_MAX ? (size_t) SSIZE_MAX
1425-
: (size_t) len;
1426-
ssize_t nw = pwrite(g->shm_fd, src, chunk, (off_t) off);
1427-
if (nw < 0) {
1428-
if (errno == EINTR)
1429-
continue;
1430-
log_error("clone: shm overlay sync pwrite failed: %s",
1431-
strerror(errno));
1432-
goto fail_snapshot;
1433-
}
1434-
if (nw == 0) {
1435-
log_error("clone: shm overlay sync pwrite returned 0");
1436-
goto fail_snapshot;
1437-
}
1438-
src += nw;
1439-
off += (uint64_t) nw;
1440-
len -= (uint64_t) nw;
1441-
}
1442-
}
1443-
if (fork_ipc_send_fds(ipc_sock, &g->shm_fd, 1) < 0) {
1522+
int fd_to_send = (snapshot_shm_fd >= 0) ? snapshot_shm_fd : g->shm_fd;
1523+
if (fork_ipc_send_fds(ipc_sock, &fd_to_send, 1) < 0) {
14441524
log_error("clone: failed to send shm fd");
14451525
goto fail_snapshot;
14461526
}
@@ -1555,10 +1635,14 @@ int64_t sys_clone(hv_vcpu_t vcpu,
15551635
child_host_pid);
15561636

15571637
free(regions_snapshot);
1638+
if (snapshot_shm_fd >= 0)
1639+
close(snapshot_shm_fd);
15581640
return child_guest_pid;
15591641

15601642
fail_snapshot:
15611643
free(regions_snapshot);
1644+
if (snapshot_shm_fd >= 0)
1645+
close(snapshot_shm_fd);
15621646
/* Roll back the in-place anon-shared overlay conversion while
15631647
* siblings are still parked. A partial rollback failure (e.g.,
15641648
* region drift past the quiesce timeout) leaves the parent in a
@@ -1578,6 +1662,23 @@ int64_t sys_clone(hv_vcpu_t vcpu,
15781662
close(vfork_notify_fds[0]);
15791663
if (vfork_notify_fds[1] >= 0)
15801664
close(vfork_notify_fds[1]);
1665+
/* posix_spawn at the top of sys_clone always succeeds before any goto
1666+
* fail_snapshot fires, so child_host_pid is a live process here. The
1667+
* IPC socket just closed; the child reads EOF on fork_ipc_read_all and
1668+
* returns nonzero from fork_child_main. Without an explicit waitpid the
1669+
* exited child becomes a zombie: proc_register_child only runs on the
1670+
* success path, so neither proc_reap_finished nor sys_wait4 will ever
1671+
* pick this PID up, and the guest's fork(2) already reported failure.
1672+
* Reap it here to keep host PIDs from accumulating across repeated
1673+
* failures.
1674+
*/
1675+
pid_t reaped;
1676+
do {
1677+
reaped = waitpid(child_host_pid, NULL, 0);
1678+
} while (reaped < 0 && errno == EINTR);
1679+
if (reaped < 0)
1680+
log_warn("clone: failed to reap fork-child pid=%d: %s",
1681+
(int) child_host_pid, strerror(errno));
15811682
return -LINUX_ENOMEM;
15821683
}
15831684

0 commit comments

Comments
 (0)