Skip to content

Commit b1ce739

Browse files
authored
Merge pull request #60 from sysprog21/rosetta-fork
Snapshot shm via APFS fclonefileat for safe CoW
2 parents 0a46e4f + b724c9d commit b1ce739

3 files changed

Lines changed: 395 additions & 69 deletions

File tree

src/core/guest.c

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -458,8 +458,15 @@ int guest_init_from_shm(guest_t *g,
458458
g->interp_base = size - 0x100000000ULL;
459459
g->mmap_limit = size - 0x200000000ULL;
460460
g->overflow_ipa_next = size;
461-
if (compute_infra_layout(g) < 0)
461+
if (compute_infra_layout(g) < 0) {
462+
/* Layout computation may reject a malformed header (impossible
463+
* guest_size / ipa_bits combination) before the mapping is set up;
464+
* close the inherited shm fd here so the caller's contract -- this
465+
* function takes ownership of shm_fd -- holds on every error path.
466+
*/
467+
close(shm_fd);
462468
return -1;
469+
}
463470
g->pt_pool_next = g->pt_pool_base;
464471

465472
/* Map the shm fd MAP_PRIVATE: copy-on-write semantics. Reads see

src/runtime/forkipc.c

Lines changed: 169 additions & 68 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@
2424
#include <sys/socket.h>
2525
#include <dirent.h> /* fdopendir, for DIR* reconstruction in child */
2626
#include <sys/wait.h>
27+
#include <sys/clonefile.h> /* fclonefileat for CoW shm snapshots */
2728
#include <mach-o/dyld.h>
2829

2930
#include "hvutil.h"
@@ -1137,6 +1138,51 @@ static void *vm_clone_thread_run(void *arg)
11371138
return NULL;
11381139
}
11391140

1141+
/* Snapshot the file behind src_fd as an independent APFS clone.
 *
 * The clone is materialized with fclonefileat at a scratch path, opened, and
 * then unlinked again, so the returned descriptor is the only remaining
 * handle on it.
 *
 * Returns the clone's descriptor (opened O_RDWR | O_CLOEXEC) on success.
 * Returns -1 on failure, with errno reporting the step that failed (mkdtemp,
 * fclonefileat, or open); the cleanup calls never overwrite that value.
 * Deciding what to do on failure is left entirely to the caller.
 */
static int fork_snapshot_shm_via_clonefile(int src_fd)
{
    /* fclonefileat wants a destination path on the same APFS volume as the
     * source file, and the destination must not exist yet. Rather than
     * picking a name directly in world-writable /tmp (where another local
     * process could create that name first and force EEXIST), the clone is
     * placed inside a freshly created mkdtemp directory (mode 0700).
     */
    char scratch_dir[] = "/tmp/elfuse-fork-XXXXXX";
    char snap_path[64];
    int snap_fd = -1;
    int first_err;

    if (!mkdtemp(scratch_dir))
        return -1;
    snprintf(snap_path, sizeof(snap_path), "%s/snap", scratch_dir);

    if (fclonefileat(src_fd, AT_FDCWD, snap_path, 0) >= 0) {
        snap_fd = open(snap_path, O_RDWR | O_CLOEXEC);
        /* Capture errno before cleanup can clobber it; it is only consulted
         * when open() failed.
         */
        first_err = errno;
        /* Best effort: once the descriptor is open the name is no longer
         * needed, and a failed unlink merely leaves a stray file behind.
         */
        (void) unlink(snap_path);
    } else {
        /* Nothing was created at snap_path; remember why the clone failed. */
        first_err = errno;
    }

    /* Shared tail: the scratch directory is removed on every path, again as
     * best effort.
     */
    (void) rmdir(scratch_dir);

    if (snap_fd < 0) {
        errno = first_err;
        return -1;
    }
    return snap_fd;
}
1185+
11401186
int64_t sys_clone(hv_vcpu_t vcpu,
11411187
guest_t *g,
11421188
uint64_t flags,
@@ -1163,13 +1209,13 @@ int64_t sys_clone(hv_vcpu_t vcpu,
11631209
verbose);
11641210
}
11651211

1166-
/* Rosetta fork takes the helper-process IPC path. The CoW shm fast-path
1167-
* is gated off in use_shm because HVF caches VA->PA at hv_vm_map time and
1168-
* the parent's MAP_SHARED mapping cannot be safely remapped under the
1169-
* running vCPU. The TTBR1 kbuf tree, translator image, and kbuf bytes
1170-
* ride along as primary-buffer used regions; the child restores
1171-
* TCR_EL1 / TTBR1_EL1 from ipc_registers_t and recomputes kbuf_base
1172-
* from kbuf_gpa.
1212+
/* Rosetta fork takes the helper-process IPC path. The parent cannot remap
1213+
* its live guest memory under the running vCPU because HVF caches VA->PA at
1214+
* hv_vm_map time; instead, the fork path snapshots shm with clonefile when
1215+
* available and otherwise falls back to region copy. The TTBR1 kbuf tree,
1216+
* translator image, and kbuf bytes ride along as primary-buffer used
1217+
* regions; the child restores TCR_EL1 / TTBR1_EL1 from ipc_registers_t and
1218+
* recomputes kbuf_base from kbuf_gpa.
11731219
*/
11741220

11751221
/* elfuse only supports fork-like clone (SIGCHLD) and posix_spawn-like
@@ -1291,10 +1337,17 @@ int64_t sys_clone(hv_vcpu_t vcpu,
12911337
return -LINUX_ENOMEM;
12921338
}
12931339

1294-
/* The parent keeps only its end of the control channel. */
1340+
/* The parent keeps only its end of the control channel. Reset the closed
1341+
* write end to -1 so the fail_snapshot guarded close at the bottom of the
1342+
* function cannot double-close it. In a multithreaded guest, another vCPU
1343+
* could open a new fd between the two closes and get the same number,
1344+
* which the second close would then steal.
1345+
*/
12951346
close(sock_fds[1]);
1296-
if (vfork_notify_fds[1] >= 0)
1347+
if (vfork_notify_fds[1] >= 0) {
12971348
close(vfork_notify_fds[1]);
1349+
vfork_notify_fds[1] = -1;
1350+
}
12981351
int ipc_sock = sock_fds[0];
12991352

13001353
/* Allocate guest PID before serialization so the child header carries its
@@ -1314,6 +1367,10 @@ int64_t sys_clone(hv_vcpu_t vcpu,
13141367
mmap_fork_anon_shared_txn_t *anon_shared_txn = NULL;
13151368
guest_region_t *regions_snapshot = NULL;
13161369
guest_region_t preannounced_snapshot[GUEST_MAX_PREANNOUNCED];
1370+
/* APFS clone fd for the CoW snapshot sent to the child. Declared up front
1371+
* so early goto fail_snapshot exits do not read an uninitialized local.
1372+
*/
1373+
int snapshot_shm_fd = -1;
13171374

13181375
/* Convert MAP_SHARED|MAP_ANONYMOUS regions that have no backing fd
13191376
* into memfd-backed overlay regions. The conversion seeds a private
@@ -1328,31 +1385,87 @@ int64_t sys_clone(hv_vcpu_t vcpu,
13281385
if (mmap_fork_prepare_anon_shared(g, &anon_shared_txn) < 0)
13291386
goto fail_snapshot;
13301387

1331-
/* Determine if elfuse can use the CoW (shm) fast path.
1332-
* If shm_fd >= 0, elfuse freezes a snapshot via MAP_PRIVATE and sends the
1333-
* shm fd to the child. Otherwise fall back to region-by-region copy.
1388+
/* CoW fast path: if shm_fd >= 0, send a snapshot of guest memory to the
1389+
* child instead of the per-region copy. The child maps that snapshot
1390+
* MAP_PRIVATE; subsequent writes on either side are private.
13341391
*
1335-
* Rosetta guests are excluded from CoW even when shm-backed: rosetta's
1336-
* JIT state (TLS slabs, code caches, indirect-call tables, block lists)
1337-
* is process-local and corrupts when CoW-shared. The legacy region-copy
1338-
* path preserves the parent's JIT state independently per child.
1339-
*/
1340-
bool use_shm = (g->shm_fd >= 0) && !g->is_rosetta;
1341-
1342-
/* elfuse does not remap the parent to MAP_PRIVATE here. The parent
1343-
* stays on MAP_SHARED; its vCPU continues writing to the shared file.
1344-
* The child maps MAP_PRIVATE, getting a CoW snapshot.
1392+
* The parent's own mapping cannot be flipped to MAP_PRIVATE here: hv_vm_map
1393+
* caches the host VA->PA mapping, and a MAP_FIXED remap invalidates it
1394+
* (the parent then reads stale memory and writev returns EFAULT). So the
1395+
* parent stays on MAP_SHARED and the snapshot is what isolates the child.
13451396
*
1346-
* This is safe because the IPC is synchronous: the child maps MAP_PRIVATE
1347-
* before the parent's vCPU resumes. After that, the child's CoW pages are
1348-
* frozen (child writes are private, parent writes to MAP_SHARED do not
1349-
* affect CoW'd child pages).
1397+
* Two snapshot sources, in preference order (selected just below):
1398+
* 1. fclonefileat of g->shm_fd to an independent APFS clone. The clone
1399+
* shares blocks with the parent until either side writes, so the
1400+
* parent's subsequent writes never reach the child's backing.
1401+
* 2. The live g->shm_fd. Any page the child has not yet COW'd reads the
1402+
* parent's current bytes -- benign for typical guest state, but
1403+
* corrupts Rosetta's translator-internal structures (TLS slabs, code
1404+
* caches, indirect-call tables) on mid-update reads. Issue #45.
13501405
*
1351-
* an earlier implementation tried remapping the parent to MAP_PRIVATE here,
1352-
* but that breaks HVF: hv_vm_map caches the host VA->PA mapping, and
1353-
* MAP_FIXED remap invalidates it. The parent's vCPU then reads stale
1354-
* memory, causing corrupted syscall data (EFAULT on writev).
1406+
* Rosetta therefore requires path 1 and falls back to region copy if
1407+
* fclonefileat fails; native guests accept path 2 as a fallback so a
1408+
* non-APFS /tmp does not silently slow forks down to per-region copy cost.
13551409
*/
1410+
bool use_shm = (g->shm_fd >= 0);
1411+
1412+
/* Overlay sync runs before the snapshot so the cloned file picks up the
1413+
* overlay-backed bytes. The parent's host VA for each overlay region maps
1414+
* the overlay file, not shm_fd, so shm_fd's contents at those offsets are
1415+
* stale (typically zero) until the pwrite below copies them in. Both the
1416+
* clone-fd path and the live-shm_fd fallback consume this sync.
1417+
*/
1418+
if (use_shm) {
1419+
for (int i = 0; i < g->nregions; i++) {
1420+
const guest_region_t *r = &g->regions[i];
1421+
if (!r->overlay_active)
1422+
continue;
1423+
uint64_t len = r->end - r->start;
1424+
const uint8_t *src = (const uint8_t *) g->host_base + r->start;
1425+
uint64_t off = r->start;
1426+
while (len > 0) {
1427+
size_t chunk = len > (uint64_t) SSIZE_MAX ? (size_t) SSIZE_MAX
1428+
: (size_t) len;
1429+
ssize_t nw = pwrite(g->shm_fd, src, chunk, (off_t) off);
1430+
if (nw < 0) {
1431+
if (errno == EINTR)
1432+
continue;
1433+
log_error("clone: shm overlay sync pwrite failed: %s",
1434+
strerror(errno));
1435+
goto fail_snapshot;
1436+
}
1437+
if (nw == 0) {
1438+
log_error("clone: shm overlay sync pwrite returned 0");
1439+
goto fail_snapshot;
1440+
}
1441+
src += nw;
1442+
off += (uint64_t) nw;
1443+
len -= (uint64_t) nw;
1444+
}
1445+
}
1446+
/* Attempt the APFS clone snapshot for every guest, not just Rosetta:
1447+
* the clone gives POSIX-style isolation at O(metadata) cost and avoids
1448+
* torn-snapshot reads in guests that snapshot their own state across
1449+
* fork (Redis BGSAVE, checkpointing runtimes). On failure the fallback
1450+
* differs per design above: Rosetta drops use_shm so the region-copy
1451+
* path runs; native guests keep use_shm and send the live g->shm_fd.
1452+
*/
1453+
snapshot_shm_fd = fork_snapshot_shm_via_clonefile(g->shm_fd);
1454+
if (snapshot_shm_fd < 0) {
1455+
if (g->is_rosetta) {
1456+
log_warn(
1457+
"clone: rosetta CoW snapshot via fclonefileat failed "
1458+
"(%s); falling back to region-copy path",
1459+
strerror(errno));
1460+
use_shm = false;
1461+
} else {
1462+
log_debug(
1463+
"clone: CoW snapshot via fclonefileat failed (%s); "
1464+
"sending live shm fd as fallback",
1465+
strerror(errno));
1466+
}
1467+
}
1468+
}
13561469

13571470
/* Snapshot of the semantic region array, populated after the memory dump
13581471
* but before sibling vCPUs resume. Declared up front so all goto paths to
@@ -1401,46 +1514,13 @@ int64_t sys_clone(hv_vcpu_t vcpu,
14011514
goto fail_snapshot;
14021515
}
14031516

1404-
/* CoW path: sync MAP_SHARED file overlays back into shm_fd before
1405-
* sending it to the child. The parent's host VA at each overlay
1406-
* region maps the overlay file, not shm_fd, so shm_fd's content at
1407-
* those IPAs is stale (typically zero). The child's MAP_PRIVATE
1408-
* snapshot would expose that stale data at the overlay IPAs. Copy
1409-
* the live overlay bytes into shm_fd at the matching offsets so the
1410-
* child snapshot reflects the parent's view at fork time. Live
1411-
* cross-fork MAP_SHARED coherence (parent and child both seeing
1412-
* subsequent writes through the same file) is left to the cross-fork
1413-
* coherence TODO; this fix only avoids the stale-snapshot regression.
1517+
/* Send the snapshot fd if fclonefileat succeeded, otherwise the live
1518+
* g->shm_fd. The Rosetta-failure case already cleared use_shm above so it
1519+
* never reaches this branch with snapshot_shm_fd < 0.
14141520
*/
14151521
if (use_shm) {
1416-
for (int i = 0; i < g->nregions; i++) {
1417-
const guest_region_t *r = &g->regions[i];
1418-
if (!r->overlay_active)
1419-
continue;
1420-
uint64_t len = r->end - r->start;
1421-
const uint8_t *src = (const uint8_t *) g->host_base + r->start;
1422-
uint64_t off = r->start;
1423-
while (len > 0) {
1424-
size_t chunk = len > (uint64_t) SSIZE_MAX ? (size_t) SSIZE_MAX
1425-
: (size_t) len;
1426-
ssize_t nw = pwrite(g->shm_fd, src, chunk, (off_t) off);
1427-
if (nw < 0) {
1428-
if (errno == EINTR)
1429-
continue;
1430-
log_error("clone: shm overlay sync pwrite failed: %s",
1431-
strerror(errno));
1432-
goto fail_snapshot;
1433-
}
1434-
if (nw == 0) {
1435-
log_error("clone: shm overlay sync pwrite returned 0");
1436-
goto fail_snapshot;
1437-
}
1438-
src += nw;
1439-
off += (uint64_t) nw;
1440-
len -= (uint64_t) nw;
1441-
}
1442-
}
1443-
if (fork_ipc_send_fds(ipc_sock, &g->shm_fd, 1) < 0) {
1522+
int fd_to_send = (snapshot_shm_fd >= 0) ? snapshot_shm_fd : g->shm_fd;
1523+
if (fork_ipc_send_fds(ipc_sock, &fd_to_send, 1) < 0) {
14441524
log_error("clone: failed to send shm fd");
14451525
goto fail_snapshot;
14461526
}
@@ -1555,10 +1635,14 @@ int64_t sys_clone(hv_vcpu_t vcpu,
15551635
child_host_pid);
15561636

15571637
free(regions_snapshot);
1638+
if (snapshot_shm_fd >= 0)
1639+
close(snapshot_shm_fd);
15581640
return child_guest_pid;
15591641

15601642
fail_snapshot:
15611643
free(regions_snapshot);
1644+
if (snapshot_shm_fd >= 0)
1645+
close(snapshot_shm_fd);
15621646
/* Roll back the in-place anon-shared overlay conversion while
15631647
* siblings are still parked. A partial rollback failure (e.g.,
15641648
* region drift past the quiesce timeout) leaves the parent in a
@@ -1578,6 +1662,23 @@ int64_t sys_clone(hv_vcpu_t vcpu,
15781662
close(vfork_notify_fds[0]);
15791663
if (vfork_notify_fds[1] >= 0)
15801664
close(vfork_notify_fds[1]);
1665+
/* posix_spawn at the top of sys_clone always succeeds before any goto
1666+
* fail_snapshot fires, so child_host_pid is a live process here. The
1667+
* IPC socket just closed; the child reads EOF on fork_ipc_read_all and
1668+
* returns nonzero from fork_child_main. Without an explicit waitpid the
1669+
* exited child becomes a zombie: proc_register_child only runs on the
1670+
* success path, so neither proc_reap_finished nor sys_wait4 will ever
1671+
* pick this PID up, and the guest's fork(2) already reported failure.
1672+
* Reap it here to keep host PIDs from accumulating across repeated
1673+
* failures.
1674+
*/
1675+
pid_t reaped;
1676+
do {
1677+
reaped = waitpid(child_host_pid, NULL, 0);
1678+
} while (reaped < 0 && errno == EINTR);
1679+
if (reaped < 0)
1680+
log_warn("clone: failed to reap fork-child pid=%d: %s",
1681+
(int) child_host_pid, strerror(errno));
15811682
return -LINUX_ENOMEM;
15821683
}
15831684

0 commit comments

Comments
 (0)