Skip to content

Commit b8e578c

Browse files
committed
Sync child TID and retain CoW across nested fork
glibc's fork wrapper clones with CLONE_CHILD_{SETTID | CLEARTID} | SIGCHLD, but the posix_spawn fork path could not see the original clone arguments, so the child never wrote its new TID into the guest ctid address. The child kept the parent's cached TID and modern glibc tripped its stack-canary / TLS checks ("stack smashing detected"), which surfaced on nested forks. Forward the relevant clone flags and the ctid address through ipc_header_t. In fork_child_main, after the main thread is registered, honor CLONE_CHILD_SETTID by writing the child TID into ctid_gva. A faulting address is the guest's own bad pointer, so warn and continue, matching how the kernel ignores a child_tidptr fault. CLONE_CHILD_CLEARTID is intentionally not honored: a fork child is a separate process whose ctid no other process can observe, and the parent reaps it via wait4/SIGCHLD rather than a cross-process futex. A fork child also closed its inherited shm fd and mapped it MAP_PRIVATE, so any nested grandchild fork dropped off the copy-on-write fast path into the slow region-copy path. When the inherited fd is an independent fclonefileat clone (the new shm_is_clone header flag), map it MAP_SHARED and retain it in g->shm_fd so the child can clone it again for its own nested fork; guest_destroy closes it. The live-fd fallback keeps the MAP_PRIVATE behavior so the child does not share writes with the parent. guest_init_from_shm gains a retain_shared parameter and closes the inherited fd on every error path so the ownership contract holds. Close #99
1 parent e45b71c commit b8e578c

7 files changed

Lines changed: 205 additions & 24 deletions

File tree

src/core/guest.c

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -441,12 +441,13 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
441441
int guest_init_from_shm(guest_t *g,
442442
int shm_fd,
443443
uint64_t size,
444-
uint32_t ipa_bits)
444+
uint32_t ipa_bits,
445+
bool retain_shared)
445446
{
446447
uint64_t t0;
447448

448449
memset(g, 0, sizeof(*g));
449-
g->shm_fd = -1; /* Child does not own the shm */
450+
g->shm_fd = -1; /* Child does not own the shm unless retain_shared */
450451
g->ipa_base = GUEST_IPA_BASE;
451452
g->elf_load_min = ELF_DEFAULT_BASE;
452453
g->brk_base = BRK_BASE_DEFAULT;
@@ -471,13 +472,21 @@ int guest_init_from_shm(guest_t *g,
471472
}
472473
g->pt_pool_next = g->pt_pool_base;
473474

474-
/* Map the shm fd MAP_PRIVATE: copy-on-write semantics. Reads see the
475-
* parent's frozen snapshot; writes are private to this process. macOS CoW
476-
* is page-granular: only modified pages are duplicated.
475+
/* Two mapping modes:
476+
* retain_shared: shm_fd is an independent APFS clone of the parent's
477+
* memory (already isolated from the parent). Map MAP_SHARED so the
478+
* child's writes land in the clone file, then keep the fd so the child
479+
* can fclonefileat it for its own nested CoW fork. guest_destroy closes
480+
* it.
481+
* otherwise: shm_fd may be the parent's live fd (clonefile fallback). Map
482+
* MAP_PRIVATE so writes stay private to this process, then close the
483+
* fd. macOS CoW is page-granular either way: only modified pages are
484+
* duplicated.
477485
*/
486+
int map_flags = retain_shared ? MAP_SHARED : MAP_PRIVATE;
478487
t0 = startup_trace_now_ns();
479488
g->host_base =
480-
mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, shm_fd, 0);
489+
mmap(NULL, size, PROT_READ | PROT_WRITE, map_flags, shm_fd, 0);
481490
startup_trace_step("shm_mmap", t0);
482491
if (g->host_base == MAP_FAILED) {
483492
perror("guest: mmap shm");
@@ -486,8 +495,10 @@ int guest_init_from_shm(guest_t *g,
486495
return -1;
487496
}
488497

489-
/* Close the shm fd; the mapping keeps the pages alive */
490-
close(shm_fd);
498+
if (retain_shared)
499+
g->shm_fd = shm_fd; /* Child owns the clone; guest_destroy closes it */
500+
else
501+
close(shm_fd); /* MAP_PRIVATE mapping keeps the pages alive */
491502

492503
/* Create HVF VM with the same IPA width as the parent */
493504
hv_return_t ret = HV_ERROR;
@@ -506,6 +517,10 @@ int guest_init_from_shm(guest_t *g,
506517
log_error("guest: hv_vm_create (shm) failed: %d", (int) ret);
507518
munmap(g->host_base, size);
508519
g->host_base = NULL;
520+
if (g->shm_fd >= 0) {
521+
close(g->shm_fd);
522+
g->shm_fd = -1;
523+
}
509524
return -1;
510525
}
511526

@@ -518,6 +533,10 @@ int guest_init_from_shm(guest_t *g,
518533
hv_vm_destroy();
519534
munmap(g->host_base, size);
520535
g->host_base = NULL;
536+
if (g->shm_fd >= 0) {
537+
close(g->shm_fd);
538+
g->shm_fd = -1;
539+
}
521540
return -1;
522541
}
523542

src/core/guest.h

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -774,16 +774,22 @@ static inline bool guest_addr_in_infra(const guest_t *g, uint64_t addr)
774774
*/
775775
int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits);
776776

777-
/* Initialize guest from a POSIX shared memory fd (CoW fork path). Maps shm_fd
778-
* MAP_PRIVATE (copy-on-write), creates HVF VM, maps to hypervisor. The child
779-
* gets an instant CoW snapshot of parent's guest memory without copying. shm_fd
780-
* is closed after mapping.
777+
/* Initialize guest from a shared memory fd (CoW fork path). Creates the HVF VM
778+
* and maps the fd to the hypervisor. The child gets an instant CoW snapshot of
779+
* the parent's guest memory without copying.
780+
*
781+
* retain_shared selects the mapping: when true, shm_fd is an independent APFS
782+
* clone, so it is mapped MAP_SHARED and retained in g->shm_fd (guest_destroy
783+
* closes it) so the child can fclonefileat it for nested CoW fork. When false,
784+
* shm_fd may be the parent's live fd, so it is mapped MAP_PRIVATE and closed
785+
* after mapping. This function takes ownership of shm_fd on every path.
781786
* Returns 0 on success, -1 on failure.
782787
*/
783788
int guest_init_from_shm(guest_t *g,
784789
int shm_fd,
785790
uint64_t size,
786-
uint32_t ipa_bits);
791+
uint32_t ipa_bits,
792+
bool retain_shared);
787793

788794
/* Tear down VM and free guest memory. */
789795
void guest_destroy(guest_t *g);

src/runtime/fork-state.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,22 @@ typedef struct {
6060
uint64_t rosetta_entry;
6161
uint64_t kbuf_gpa;
6262
uint64_t ttbr1;
63+
/* Clone TID-sync state for the fork path. glibc's fork wrapper passes
64+
* CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID so the child writes its new TID
65+
* into the TCB and clears it on exit. The posix_spawn child has no access
66+
* to the original clone() arguments, so the parent forwards them here:
67+
* clone_flags carries the CHILD_SETTID / CHILD_CLEARTID bits and ctid_gva
68+
* the guest address. Zero for callers (e.g. raw fork(2)) that pass neither.
69+
*/
70+
uint64_t clone_flags;
71+
uint64_t ctid_gva;
72+
/* Nonzero when the shm fd sent below is an independent fclonefileat clone
73+
* (not the parent's live fd). Only then may the child map it MAP_SHARED and
74+
* retain it for its own nested CoW fork; the live-fd fallback must stay
75+
* MAP_PRIVATE so the child does not share writes with the parent.
76+
*/
77+
uint32_t shm_is_clone;
78+
uint32_t _pad2;
6379
} ipc_header_t;
6480

6581
typedef struct {

src/runtime/forkipc.c

Lines changed: 42 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,18 @@
4848
#include "debug/log.h"
4949
#include "debug/syscall-hist.h"
5050

51+
/* Linux clone flags. Shared by the fork-child TID-sync emulation below and
52+
* sys_clone further down.
53+
*/
54+
#define LINUX_CLONE_VM 0x00000100
55+
#define LINUX_CLONE_VFORK 0x00004000
56+
#define LINUX_CLONE_THREAD 0x00010000
57+
#define LINUX_CLONE_SETTLS 0x00080000
58+
#define LINUX_CLONE_PARENT_SETTID 0x00100000
59+
#define LINUX_CLONE_CHILD_CLEARTID 0x00200000
60+
#define LINUX_CLONE_CHILD_SETTID 0x01000000
61+
/* LINUX_SIGCHLD defined in syscall_signal.h (included above) */
62+
5163
/* fork_child_main. */
5264

5365
static int fork_child_vfork_notify_fd = -1;
@@ -166,7 +178,8 @@ int fork_child_main(int ipc_fd,
166178
close(ipc_fd);
167179
return 1;
168180
}
169-
if (guest_init_from_shm(&g, shm_fd, hdr.guest_size, hdr.ipa_bits) < 0) {
181+
if (guest_init_from_shm(&g, shm_fd, hdr.guest_size, hdr.ipa_bits,
182+
hdr.shm_is_clone != 0) < 0) {
170183
log_error("fork-child: guest_init_from_shm failed");
171184
close(ipc_fd);
172185
return 1;
@@ -363,6 +376,30 @@ int fork_child_main(int ipc_fd,
363376
*/
364377
thread_register_main(vcpu, vexit, hdr.child_pid, regs.sp_el1);
365378

379+
/* Emulate CLONE_CHILD_SETTID for the fork child. glibc's fork wrapper
380+
* passes CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID so the child's TCB
381+
* caches its own TID; without the SETTID write the child keeps the parent's
382+
* cached TID and modern glibc trips stack-canary / TLS checks ("stack
383+
* smashing detected"). The write goes through guest memory, valid for both
384+
* the CoW and region-copy paths. A faulting ctid_gva is the guest's own bad
385+
* pointer: warn and continue, matching how the kernel ignores a
386+
* child_tidptr fault.
387+
*
388+
* CLONE_CHILD_CLEARTID is deliberately not honored here. The clear-and-wake
389+
* on exit only matters to an in-process joiner waiting on the futex (that
390+
* is how the worker-thread exit path serves pthread_join). A fork child is
391+
* a separate process with its own address space, so its ctid lives in
392+
* memory no other process can observe -- the parent reaps it via
393+
* wait4/SIGCHLD, not a cross-process futex. Registering clear_child_tid
394+
* would be inert.
395+
*/
396+
if (hdr.clone_flags & LINUX_CLONE_CHILD_SETTID) {
397+
int32_t tid32 = (int32_t) hdr.child_pid;
398+
if (guest_write_small(&g, hdr.ctid_gva, &tid32, sizeof(tid32)) < 0)
399+
log_warn("fork-child: CHILD_SETTID write to 0x%llx failed",
400+
(unsigned long long) hdr.ctid_gva);
401+
}
402+
366403
/* Re-publish identity into the child's shim-globals cache: the CoW / region
367404
* copy inherits the parent's pid/uid values, and the shim's identity fast
368405
* path would otherwise return the parent's pid to the child. Identity is
@@ -420,16 +457,6 @@ int fork_child_main(int ipc_fd,
420457

421458
/* sys_clone. */
422459

423-
/* Linux clone flags */
424-
#define LINUX_CLONE_VM 0x00000100
425-
#define LINUX_CLONE_VFORK 0x00004000
426-
#define LINUX_CLONE_THREAD 0x00010000
427-
#define LINUX_CLONE_SETTLS 0x00080000
428-
#define LINUX_CLONE_PARENT_SETTID 0x00100000
429-
#define LINUX_CLONE_CHILD_CLEARTID 0x00200000
430-
#define LINUX_CLONE_CHILD_SETTID 0x01000000
431-
/* LINUX_SIGCHLD defined in syscall_signal.h (included above) */
432-
433460
/* Namespace flags. elfuse implements no namespace isolation. Both sys_clone and
434461
* sys_clone3 reject them.
435462
*/
@@ -1528,6 +1555,10 @@ int64_t sys_clone(hv_vcpu_t vcpu,
15281555
.rosetta_entry = g->rosetta_entry,
15291556
.kbuf_gpa = g->kbuf_gpa,
15301557
.ttbr1 = g->ttbr1,
1558+
.clone_flags =
1559+
flags & (LINUX_CLONE_CHILD_SETTID | LINUX_CLONE_CHILD_CLEARTID),
1560+
.ctid_gva = ctid_gva,
1561+
.shm_is_clone = (snapshot_shm_fd >= 0) ? 1 : 0,
15311562
};
15321563
if (fork_ipc_write_all(ipc_sock, &hdr, sizeof(hdr)) < 0) {
15331564
log_error("clone: failed to send header");

tests/manifest.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ test-signal-thread
9595

9696
[section] Fork edge cases
9797
test-clone3 # diff=skip
98+
test-clone-childtid
9899
test-fork-exec $TESTDIR/echo-test
99100
test-fork-lowbase
100101

tests/test-clone-childtid.c

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
/* Test CLONE_CHILD_SETTID / CLONE_CHILD_CLEARTID on the fork (posix_spawn) path
2+
*
3+
* Copyright 2026 elfuse contributors
4+
* SPDX-License-Identifier: Apache-2.0
5+
*
6+
* Issue #99: glibc's fork wrapper clones with CLONE_CHILD_SETTID |
7+
* CLONE_CHILD_CLEARTID | SIGCHLD. The child's TID must be written into the
8+
* ctid address so glibc's TCB caches the right value. This calls clone()
9+
* directly with those exact flags (no CLONE_VM/THREAD/VFORK, so elfuse takes
10+
* the fork helper-process path) and checks the child observes its own TID at
11+
* the ctid slot -- glibc-version-independent, unlike the canary symptom.
12+
*
13+
* Raw syscall throughout: glibc's clone() wrapper needs a child function and
14+
* stack, so it cannot do a fork-style clone; this also tests elfuse, not libc.
15+
*/
16+
17+
#include <stdio.h>
18+
#include <stdint.h>
19+
#include <unistd.h>
20+
#include <sched.h>
21+
#include <sys/syscall.h>
22+
#include <sys/wait.h>
23+
#include <linux/sched.h>
24+
25+
#ifndef CLONE_CHILD_CLEARTID
26+
#define CLONE_CHILD_CLEARTID 0x00200000
27+
#endif
28+
#ifndef CLONE_CHILD_SETTID
29+
#define CLONE_CHILD_SETTID 0x01000000
30+
#endif
31+
32+
static volatile int child_tid_slot;
33+
34+
int main(void)
35+
{
36+
/* aarch64 clone(2): clone(flags, stack, parent_tid, tls, child_tid). */
37+
unsigned long flags = CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD;
38+
long rc = syscall(SYS_clone, flags, (void *) 0, (void *) 0, (void *) 0,
39+
(void *) &child_tid_slot);
40+
if (rc < 0) {
41+
printf("test-clone-childtid: clone failed -- FAIL\n");
42+
return 1;
43+
}
44+
45+
if (rc == 0) {
46+
/* Child: the kernel (here, elfuse) must have written our TID into the
47+
* ctid slot before we resumed.
48+
*/
49+
pid_t tid = (pid_t) syscall(SYS_gettid);
50+
if (child_tid_slot != tid) {
51+
/* Cannot printf reliably from a possibly-confused child; encode the
52+
* result in the exit status instead.
53+
*/
54+
_exit(child_tid_slot == 0 ? 2 : 3);
55+
}
56+
57+
/* Nested clone: the child forks a grandchild with the same flags. This
58+
* exercises the child-side CoW shm retention (issue #99 part 2): the
59+
* child must be able to clone its own memory again, and the grandchild
60+
* must likewise see a fresh TID at its ctid slot.
61+
*/
62+
static volatile int grand_tid_slot;
63+
long grc = syscall(SYS_clone, flags, (void *) 0, (void *) 0, (void *) 0,
64+
(void *) &grand_tid_slot);
65+
if (grc < 0)
66+
_exit(4);
67+
if (grc == 0) {
68+
pid_t gtid = (pid_t) syscall(SYS_gettid);
69+
_exit(grand_tid_slot == gtid ? 0 : 5);
70+
}
71+
int gstatus;
72+
if (waitpid((pid_t) grc, &gstatus, 0) < 0)
73+
_exit(6);
74+
if (!WIFEXITED(gstatus) || WEXITSTATUS(gstatus) != 0)
75+
_exit(7);
76+
_exit(0);
77+
}
78+
79+
int status;
80+
if (waitpid((pid_t) rc, &status, 0) < 0) {
81+
printf("test-clone-childtid: waitpid failed -- FAIL\n");
82+
return 1;
83+
}
84+
if (!WIFEXITED(status)) {
85+
printf(
86+
"test-clone-childtid: child did not exit cleanly (0x%x) -- FAIL\n",
87+
status);
88+
return 1;
89+
}
90+
switch (WEXITSTATUS(status)) {
91+
case 0:
92+
printf("test-clone-childtid: child saw its TID at ctid -- PASS\n");
93+
return 0;
94+
case 2:
95+
printf(
96+
"test-clone-childtid: ctid slot still 0 (SETTID ignored) -- "
97+
"FAIL\n");
98+
return 1;
99+
case 3:
100+
printf("test-clone-childtid: ctid slot holds wrong TID -- FAIL\n");
101+
return 1;
102+
default:
103+
printf("test-clone-childtid: unexpected child exit %d -- FAIL\n",
104+
WEXITSTATUS(status));
105+
return 1;
106+
}
107+
}

tests/test-matrix.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,7 @@ run_unit_tests()
453453

454454
printf "\nProcess tests\n"
455455
test_check "$runner" "test-fork" "PASS" "$bindir/test-fork"
456+
test_check "$runner" "test-clone-childtid" "PASS" "$bindir/test-clone-childtid"
456457
test_check "$runner" "test-exec" "exec-works" "$bindir/test-exec" "$bindir/echo-test" exec-works
457458
test_check "$runner" "test-fork-exec" "PASS" "$bindir/test-fork-exec" "$bindir/echo-test"
458459
test_check "$runner" "test-cloexec" "PASS" "$bindir/test-cloexec"

0 commit comments

Comments
 (0)