Skip to content

Commit a2fb25a

Browse files
authored
Merge pull request #101 from sysprog21/nested-clone
Sync child TID and retain CoW across nested fork
2 parents e45b71c + b8e578c commit a2fb25a

7 files changed

Lines changed: 205 additions & 24 deletions

File tree

src/core/guest.c

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -441,12 +441,13 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
441441
int guest_init_from_shm(guest_t *g,
442442
int shm_fd,
443443
uint64_t size,
444-
uint32_t ipa_bits)
444+
uint32_t ipa_bits,
445+
bool retain_shared)
445446
{
446447
uint64_t t0;
447448

448449
memset(g, 0, sizeof(*g));
449-
g->shm_fd = -1; /* Child does not own the shm */
450+
g->shm_fd = -1; /* Child does not own the shm unless retain_shared */
450451
g->ipa_base = GUEST_IPA_BASE;
451452
g->elf_load_min = ELF_DEFAULT_BASE;
452453
g->brk_base = BRK_BASE_DEFAULT;
@@ -471,13 +472,21 @@ int guest_init_from_shm(guest_t *g,
471472
}
472473
g->pt_pool_next = g->pt_pool_base;
473474

474-
/* Map the shm fd MAP_PRIVATE: copy-on-write semantics. Reads see the
475-
* parent's frozen snapshot; writes are private to this process. macOS CoW
476-
* is page-granular: only modified pages are duplicated.
475+
/* Two mapping modes:
476+
* retain_shared: shm_fd is an independent APFS clone of the parent's
477+
* memory (already isolated from the parent). Map MAP_SHARED so the
478+
* child's writes land in the clone file, then keep the fd so the child
479+
* can fclonefileat it for its own nested CoW fork. guest_destroy closes
480+
* it.
481+
* otherwise: shm_fd may be the parent's live fd (clonefile fallback). Map
482+
* MAP_PRIVATE so writes stay private to this process, then close the
483+
* fd. macOS CoW is page-granular either way: only modified pages are
484+
* duplicated.
477485
*/
486+
int map_flags = retain_shared ? MAP_SHARED : MAP_PRIVATE;
478487
t0 = startup_trace_now_ns();
479488
g->host_base =
480-
mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, shm_fd, 0);
489+
mmap(NULL, size, PROT_READ | PROT_WRITE, map_flags, shm_fd, 0);
481490
startup_trace_step("shm_mmap", t0);
482491
if (g->host_base == MAP_FAILED) {
483492
perror("guest: mmap shm");
@@ -486,8 +495,10 @@ int guest_init_from_shm(guest_t *g,
486495
return -1;
487496
}
488497

489-
/* Close the shm fd; the mapping keeps the pages alive */
490-
close(shm_fd);
498+
if (retain_shared)
499+
g->shm_fd = shm_fd; /* Child owns the clone; guest_destroy closes it */
500+
else
501+
close(shm_fd); /* MAP_PRIVATE mapping keeps the pages alive */
491502

492503
/* Create HVF VM with the same IPA width as the parent */
493504
hv_return_t ret = HV_ERROR;
@@ -506,6 +517,10 @@ int guest_init_from_shm(guest_t *g,
506517
log_error("guest: hv_vm_create (shm) failed: %d", (int) ret);
507518
munmap(g->host_base, size);
508519
g->host_base = NULL;
520+
if (g->shm_fd >= 0) {
521+
close(g->shm_fd);
522+
g->shm_fd = -1;
523+
}
509524
return -1;
510525
}
511526

@@ -518,6 +533,10 @@ int guest_init_from_shm(guest_t *g,
518533
hv_vm_destroy();
519534
munmap(g->host_base, size);
520535
g->host_base = NULL;
536+
if (g->shm_fd >= 0) {
537+
close(g->shm_fd);
538+
g->shm_fd = -1;
539+
}
521540
return -1;
522541
}
523542

src/core/guest.h

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -774,16 +774,22 @@ static inline bool guest_addr_in_infra(const guest_t *g, uint64_t addr)
774774
*/
775775
int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits);
776776

777-
/* Initialize guest from a POSIX shared memory fd (CoW fork path). Maps shm_fd
778-
* MAP_PRIVATE (copy-on-write), creates HVF VM, maps to hypervisor. The child
779-
* gets an instant CoW snapshot of parent's guest memory without copying. shm_fd
780-
* is closed after mapping.
777+
/* Initialize guest from a shared memory fd (CoW fork path). Creates the HVF VM
778+
* and maps the fd to the hypervisor. The child gets an instant CoW snapshot of
779+
* the parent's guest memory without copying.
780+
*
781+
* retain_shared selects the mapping: when true, shm_fd is an independent APFS
782+
* clone, so it is mapped MAP_SHARED and retained in g->shm_fd (guest_destroy
783+
* closes it) so the child can fclonefileat it for nested CoW fork. When false,
784+
* shm_fd may be the parent's live fd, so it is mapped MAP_PRIVATE and closed
785+
* after mapping. This function takes ownership of shm_fd on every path.
781786
* Returns 0 on success, -1 on failure.
782787
*/
783788
int guest_init_from_shm(guest_t *g,
784789
int shm_fd,
785790
uint64_t size,
786-
uint32_t ipa_bits);
791+
uint32_t ipa_bits,
792+
bool retain_shared);
787793

788794
/* Tear down VM and free guest memory. */
789795
void guest_destroy(guest_t *g);

src/runtime/fork-state.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,22 @@ typedef struct {
6060
uint64_t rosetta_entry;
6161
uint64_t kbuf_gpa;
6262
uint64_t ttbr1;
63+
/* Clone TID-sync state for the fork path. glibc's fork wrapper passes
64+
* CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID so the child writes its new TID
65+
* into the TCB and clears it on exit. The posix_spawn child has no access
66+
* to the original clone() arguments, so the parent forwards them here:
67+
* clone_flags carries the CHILD_SETTID / CHILD_CLEARTID bits and ctid_gva
68+
* the guest address. Zero for callers (e.g. raw fork(2)) that pass neither.
69+
*/
70+
uint64_t clone_flags;
71+
uint64_t ctid_gva;
72+
/* Nonzero when the shm fd sent below is an independent fclonefileat clone
73+
* (not the parent's live fd). Only then may the child map it MAP_SHARED and
74+
* retain it for its own nested CoW fork; the live-fd fallback must stay
75+
* MAP_PRIVATE so the child does not share writes with the parent.
76+
*/
77+
uint32_t shm_is_clone;
78+
uint32_t _pad2;
6379
} ipc_header_t;
6480

6581
typedef struct {

src/runtime/forkipc.c

Lines changed: 42 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,18 @@
4848
#include "debug/log.h"
4949
#include "debug/syscall-hist.h"
5050

51+
/* Linux clone flags. Shared by the fork-child TID-sync emulation below and
52+
* sys_clone further down.
53+
*/
54+
#define LINUX_CLONE_VM 0x00000100
55+
#define LINUX_CLONE_VFORK 0x00004000
56+
#define LINUX_CLONE_THREAD 0x00010000
57+
#define LINUX_CLONE_SETTLS 0x00080000
58+
#define LINUX_CLONE_PARENT_SETTID 0x00100000
59+
#define LINUX_CLONE_CHILD_CLEARTID 0x00200000
60+
#define LINUX_CLONE_CHILD_SETTID 0x01000000
61+
/* LINUX_SIGCHLD defined in syscall_signal.h (included above) */
62+
5163
/* fork_child_main. */
5264

5365
static int fork_child_vfork_notify_fd = -1;
@@ -166,7 +178,8 @@ int fork_child_main(int ipc_fd,
166178
close(ipc_fd);
167179
return 1;
168180
}
169-
if (guest_init_from_shm(&g, shm_fd, hdr.guest_size, hdr.ipa_bits) < 0) {
181+
if (guest_init_from_shm(&g, shm_fd, hdr.guest_size, hdr.ipa_bits,
182+
hdr.shm_is_clone != 0) < 0) {
170183
log_error("fork-child: guest_init_from_shm failed");
171184
close(ipc_fd);
172185
return 1;
@@ -363,6 +376,30 @@ int fork_child_main(int ipc_fd,
363376
*/
364377
thread_register_main(vcpu, vexit, hdr.child_pid, regs.sp_el1);
365378

379+
/* Emulate CLONE_CHILD_SETTID for the fork child. glibc's fork wrapper
380+
* passes CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID so the child's TCB
381+
* caches its own TID; without the SETTID write the child keeps the parent's
382+
* cached TID and modern glibc trips stack-canary / TLS checks ("stack
383+
* smashing detected"). The write goes through guest memory, valid for both
384+
* the CoW and region-copy paths. A faulting ctid_gva is the guest's own bad
385+
* pointer: warn and continue, matching how the kernel ignores a
386+
* child_tidptr fault.
387+
*
388+
* CLONE_CHILD_CLEARTID is deliberately not honored here. The clear-and-wake
389+
* on exit only matters to an in-process joiner waiting on the futex (that
390+
* is how the worker-thread exit path serves pthread_join). A fork child is
391+
* a separate process with its own address space, so its ctid lives in
392+
* memory no other process can observe -- the parent reaps it via
393+
* wait4/SIGCHLD, not a cross-process futex. Registering clear_child_tid
394+
* would be inert.
395+
*/
396+
if (hdr.clone_flags & LINUX_CLONE_CHILD_SETTID) {
397+
int32_t tid32 = (int32_t) hdr.child_pid;
398+
if (guest_write_small(&g, hdr.ctid_gva, &tid32, sizeof(tid32)) < 0)
399+
log_warn("fork-child: CHILD_SETTID write to 0x%llx failed",
400+
(unsigned long long) hdr.ctid_gva);
401+
}
402+
366403
/* Re-publish identity into the child's shim-globals cache: the CoW / region
367404
* copy inherits the parent's pid/uid values, and the shim's identity fast
368405
* path would otherwise return the parent's pid to the child. Identity is
@@ -420,16 +457,6 @@ int fork_child_main(int ipc_fd,
420457

421458
/* sys_clone. */
422459

423-
/* Linux clone flags */
424-
#define LINUX_CLONE_VM 0x00000100
425-
#define LINUX_CLONE_VFORK 0x00004000
426-
#define LINUX_CLONE_THREAD 0x00010000
427-
#define LINUX_CLONE_SETTLS 0x00080000
428-
#define LINUX_CLONE_PARENT_SETTID 0x00100000
429-
#define LINUX_CLONE_CHILD_CLEARTID 0x00200000
430-
#define LINUX_CLONE_CHILD_SETTID 0x01000000
431-
/* LINUX_SIGCHLD defined in syscall_signal.h (included above) */
432-
433460
/* Namespace flags. elfuse implements no namespace isolation. Both sys_clone and
434461
* sys_clone3 reject them.
435462
*/
@@ -1528,6 +1555,10 @@ int64_t sys_clone(hv_vcpu_t vcpu,
15281555
.rosetta_entry = g->rosetta_entry,
15291556
.kbuf_gpa = g->kbuf_gpa,
15301557
.ttbr1 = g->ttbr1,
1558+
.clone_flags =
1559+
flags & (LINUX_CLONE_CHILD_SETTID | LINUX_CLONE_CHILD_CLEARTID),
1560+
.ctid_gva = ctid_gva,
1561+
.shm_is_clone = (snapshot_shm_fd >= 0) ? 1 : 0,
15311562
};
15321563
if (fork_ipc_write_all(ipc_sock, &hdr, sizeof(hdr)) < 0) {
15331564
log_error("clone: failed to send header");

tests/manifest.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ test-signal-thread
9595

9696
[section] Fork edge cases
9797
test-clone3 # diff=skip
98+
test-clone-childtid
9899
test-fork-exec $TESTDIR/echo-test
99100
test-fork-lowbase
100101

tests/test-clone-childtid.c

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
/* Test CLONE_CHILD_SETTID / CLONE_CHILD_CLEARTID on the fork (posix_spawn) path
2+
*
3+
* Copyright 2026 elfuse contributors
4+
* SPDX-License-Identifier: Apache-2.0
5+
*
6+
* Issue #99: glibc's fork wrapper clones with CLONE_CHILD_SETTID |
7+
* CLONE_CHILD_CLEARTID | SIGCHLD. The child's TID must be written into the
8+
* ctid address so glibc's TCB caches the right value. This calls clone()
9+
* directly with those exact flags (no CLONE_VM/THREAD/VFORK, so elfuse takes
10+
* the fork helper-process path) and checks the child observes its own TID at
11+
* the ctid slot -- glibc-version-independent, unlike the canary symptom.
12+
*
13+
* Raw syscall throughout: glibc's clone() wrapper requires a child function and
14+
* stack, so it cannot clone fork-style, and we want to exercise elfuse, not libc.
15+
*/
16+
17+
#include <stdio.h>
18+
#include <stdint.h>
19+
#include <unistd.h>
20+
#include <sched.h>
21+
#include <sys/syscall.h>
22+
#include <sys/wait.h>
23+
#include <linux/sched.h>
24+
25+
#ifndef CLONE_CHILD_CLEARTID
26+
#define CLONE_CHILD_CLEARTID 0x00200000
27+
#endif
28+
#ifndef CLONE_CHILD_SETTID
29+
#define CLONE_CHILD_SETTID 0x01000000
30+
#endif
31+
32+
static volatile int child_tid_slot;
33+
34+
int main(void)
35+
{
36+
/* aarch64 clone(2): clone(flags, stack, parent_tid, tls, child_tid). */
37+
unsigned long flags = CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD;
38+
long rc = syscall(SYS_clone, flags, (void *) 0, (void *) 0, (void *) 0,
39+
(void *) &child_tid_slot);
40+
if (rc < 0) {
41+
printf("test-clone-childtid: clone failed -- FAIL\n");
42+
return 1;
43+
}
44+
45+
if (rc == 0) {
46+
/* Child: the kernel (here, elfuse) must have written our TID into the
47+
* ctid slot before we resumed.
48+
*/
49+
pid_t tid = (pid_t) syscall(SYS_gettid);
50+
if (child_tid_slot != tid) {
51+
/* Cannot printf reliably from a possibly-confused child; encode the
52+
* result in the exit status instead.
53+
*/
54+
_exit(child_tid_slot == 0 ? 2 : 3);
55+
}
56+
57+
/* Nested clone: the child forks a grandchild with the same flags. This
58+
* exercises the child-side CoW shm retention (issue #99 part 2): the
59+
* child must be able to clone its own memory again, and the grandchild
60+
* must likewise see a fresh TID at its ctid slot.
61+
*/
62+
static volatile int grand_tid_slot;
63+
long grc = syscall(SYS_clone, flags, (void *) 0, (void *) 0, (void *) 0,
64+
(void *) &grand_tid_slot);
65+
if (grc < 0)
66+
_exit(4);
67+
if (grc == 0) {
68+
pid_t gtid = (pid_t) syscall(SYS_gettid);
69+
_exit(grand_tid_slot == gtid ? 0 : 5);
70+
}
71+
int gstatus;
72+
if (waitpid((pid_t) grc, &gstatus, 0) < 0)
73+
_exit(6);
74+
if (!WIFEXITED(gstatus) || WEXITSTATUS(gstatus) != 0)
75+
_exit(7);
76+
_exit(0);
77+
}
78+
79+
int status;
80+
if (waitpid((pid_t) rc, &status, 0) < 0) {
81+
printf("test-clone-childtid: waitpid failed -- FAIL\n");
82+
return 1;
83+
}
84+
if (!WIFEXITED(status)) {
85+
printf(
86+
"test-clone-childtid: child did not exit cleanly (0x%x) -- FAIL\n",
87+
status);
88+
return 1;
89+
}
90+
switch (WEXITSTATUS(status)) {
91+
case 0:
92+
printf("test-clone-childtid: child saw its TID at ctid -- PASS\n");
93+
return 0;
94+
case 2:
95+
printf(
96+
"test-clone-childtid: ctid slot still 0 (SETTID ignored) -- "
97+
"FAIL\n");
98+
return 1;
99+
case 3:
100+
printf("test-clone-childtid: ctid slot holds wrong TID -- FAIL\n");
101+
return 1;
102+
default:
103+
printf("test-clone-childtid: unexpected child exit %d -- FAIL\n",
104+
WEXITSTATUS(status));
105+
return 1;
106+
}
107+
}

tests/test-matrix.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,7 @@ run_unit_tests()
453453

454454
printf "\nProcess tests\n"
455455
test_check "$runner" "test-fork" "PASS" "$bindir/test-fork"
456+
test_check "$runner" "test-clone-childtid" "PASS" "$bindir/test-clone-childtid"
456457
test_check "$runner" "test-exec" "exec-works" "$bindir/test-exec" "$bindir/echo-test" exec-works
457458
test_check "$runner" "test-fork-exec" "PASS" "$bindir/test-fork-exec" "$bindir/echo-test"
458459
test_check "$runner" "test-cloexec" "PASS" "$bindir/test-cloexec"

0 commit comments

Comments
 (0)