Skip to content

Commit a24fc53

Browse files
committed
Speed up vDSO clock_gettime via CNTVCT and amortize urandom reads
vDSO clock_gettime drops from a 1256 ns SVC trap to 2.5 ns via a CNTVCT-based fast path (493x speedup, 20x under the sub-50 ns design target). The trampoline emits a 28-instruction A64 sequence that reads CNTVCT_EL0, LDAR-acquires the vvar initialized flag, and interpolates wall clock from the anchor as delta * 125 / 3 (Apple Silicon CNTFRQ = 24 MHz), falling back to SVC on first call or CNTVCT regression. The first SVC seeds the vvar via a three-state CAS (0 -> 2 -> 1) so concurrent first calls cannot tear the anchor fields. The seed is gated on ELR_EL1 matching the trampoline's svc_fallback PC so an unrelated raw clock_gettime syscall cannot poison the anchor from arbitrary X9.

/dev/urandom 1-byte reads drop from 5688 ns uncached to 2054 ns (2.77x) via a new per-fd entropy cache: an arc4random_buf-refilled 4 KiB buffer per FD_URANDOM slot. The cache is zeroed on close via a type-to-cleanup registry that also closes pre-existing dup and fork-state race windows for every synthetic fd type.

eventfd dup shares state across aliases per the Linux contract (refcounted slot plus eventfd_owner[FD_TABLE_SIZE] table). The dup path holds fd_lock and sfd_lock together for the bind commit so racing close cannot leak the refcount; the source identity is pinned via snapshotted host fd so a racing close-and-rebind of the source cannot bind to the wrong slot. tests/test-eventfd-dup pins the shared-state contract.

fork_ipc_send_fd_table filters eventfd, signalfd, timerfd, inotify, netlink, pidfd, and epoll out of the SCM_RIGHTS payload. macOS rejects kqueue fds across SCM_RIGHTS and per-class side-table state is not transferable, so a clean drop is the only honest contract. tests/test-fork-synthetic-fd pins it.

Startup decomposition: ELFUSE_STARTUP_TRACE=1 emits per-step wall time for VM bring-up (17 steps on test-hello, dominated by hv_vcpu_create and guest_init at roughly 0.9 ms each). Zero overhead when unset.
1 parent b065c43 commit a24fc53

34 files changed

Lines changed: 2056 additions & 167 deletions

src/core/bootstrap.c

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "core/bootstrap.h"
2222
#include "core/rosetta.h"
2323
#include "core/stack.h"
24+
#include "core/startup-trace.h"
2425
#include "core/vdso.h"
2526

2627
#include "runtime/thread.h"
@@ -334,14 +335,17 @@ int guest_bootstrap_prepare(guest_t *g,
334335
mem_region_t regions[MAX_BOOT_REGIONS];
335336
int nregions = 0;
336337
uint64_t native_vdso;
338+
uint64_t t0;
337339

338340
memset(boot, 0, sizeof(*boot));
339341
*guest_initialized = false;
340342

343+
t0 = startup_trace_now_ns();
341344
if (elf_load(elf_host_path, &boot->elf_info) < 0) {
342345
log_error("failed to load ELF: %s", elf_host_path);
343346
return -1;
344347
}
348+
startup_trace_step("elf_load", t0);
345349

346350
bool want_rosetta = false;
347351
if (boot->elf_info.e_machine == EM_X86_64) {
@@ -374,10 +378,12 @@ int guest_bootstrap_prepare(guest_t *g,
374378
* the request is non-fatal in either direction.
375379
*/
376380
uint32_t req_ipa = want_rosetta ? 48 : 0;
381+
t0 = startup_trace_now_ns();
377382
if (guest_init(g, 0, req_ipa) < 0) {
378383
log_error("failed to initialize guest");
379384
return -1;
380385
}
386+
startup_trace_step("guest_init", t0);
381387
*guest_initialized = true;
382388
g->is_rosetta = want_rosetta;
383389
proc_set_rosetta_active(want_rosetta);
@@ -405,11 +411,13 @@ int guest_bootstrap_prepare(guest_t *g,
405411
} else {
406412
boot->elf_load_base =
407413
(boot->elf_info.e_type == ET_DYN) ? PIE_LOAD_BASE : 0;
414+
t0 = startup_trace_now_ns();
408415
if (elf_map_segments(&boot->elf_info, elf_host_path, g->host_base,
409416
g->guest_size, boot->elf_load_base) < 0) {
410417
log_error("failed to map ELF segments");
411418
return -1;
412419
}
420+
startup_trace_step("elf_map_segments", t0);
413421

414422
/* Track the lowest loaded ELF address so the legacy fork IPC path
415423
* copies low-linked ET_EXECs (e.g. linked at 0x200000) in full.
@@ -427,15 +435,18 @@ int guest_bootstrap_prepare(guest_t *g,
427435
g->stack_top = STACK_TOP_DEFAULT;
428436
g->stack_base = g->stack_top - STACK_SIZE;
429437

438+
t0 = startup_trace_now_ns();
430439
if (!load_interpreter(g, sysroot, boot))
431440
return -1;
441+
startup_trace_step("load_interpreter", t0);
432442
}
433443

434444
if (shim_bin_len > BLOCK_2MIB) {
435445
log_error("shim binary too large (%zu bytes)", shim_bin_len);
436446
return -1;
437447
}
438448

449+
t0 = startup_trace_now_ns();
439450
memcpy((uint8_t *) g->host_base + g->shim_base, shim_bin, shim_bin_len);
440451
log_debug("shim loaded at offset 0x%llx (%zu bytes)",
441452
(unsigned long long) g->shim_base, shim_bin_len);
@@ -448,37 +459,45 @@ int guest_bootstrap_prepare(guest_t *g,
448459
}
449460
sys_icache_invalidate((uint8_t *) g->host_base + g->shim_base,
450461
shim_bin_len);
462+
startup_trace_step("shim_load_icache", t0);
451463

464+
t0 = startup_trace_now_ns();
452465
if (!build_boot_regions(regions, &nregions, g, boot, shim_bin_len)) {
453466
log_error("too many memory regions (%d >= %d)", nregions,
454467
MAX_BOOT_REGIONS);
455468
return -1;
456469
}
470+
startup_trace_step("build_boot_regions", t0);
457471

458472
/* Rosetta path: append the rosetta image as a non-identity region so the
459473
* page-table builder maps VA 0x800000000000 -> primary buffer GPA.
460474
* rosetta_prepare also initialises the TTBR1 kbuf (page-table pages come
461475
* from the same pool that guest_build_page_tables is about to consume).
462476
*/
463477
if (want_rosetta) {
478+
t0 = startup_trace_now_ns();
464479
if (rosetta_prepare(g, elf_host_path, regions, &nregions,
465480
MAX_BOOT_REGIONS, verbose, &rr) < 0) {
466481
log_error("rosetta_prepare failed for %s", elf_guest_path);
467482
return -1;
468483
}
484+
startup_trace_step("rosetta_prepare", t0);
469485
}
470486

487+
t0 = startup_trace_now_ns();
471488
boot->ttbr0 = guest_build_page_tables(g, regions, nregions);
472489
if (!boot->ttbr0) {
473490
log_error("failed to build page tables");
474491
return -1;
475492
}
493+
startup_trace_step("guest_build_page_tables", t0);
476494
/* No TLBI request here: the shim's _start does TLBI VMALLE1IS before
477495
* enabling the MMU (src/core/shim.S), and the per-vCPU accumulator is the
478496
* wrong place to stage a bring-up flush -- bootstrap may run on a thread
479497
* whose slot is later consumed by an unrelated syscall.
480498
*/
481499

500+
t0 = startup_trace_now_ns();
482501
if (want_rosetta) {
483502
/* /proc/self/maps for a rosetta guest reports the rosetta translator
484503
* as a single anonymous region covering [VA, VA+size). The original
@@ -505,12 +524,14 @@ int guest_bootstrap_prepare(guest_t *g,
505524
}
506525

507526
register_runtime_regions(g, shim_bin_len);
527+
startup_trace_step("register_regions", t0);
508528

509529
log_debug("TTBR0=0x%llx, IPA base=0x%llx", (unsigned long long) boot->ttbr0,
510530
(unsigned long long) g->ipa_base);
511531
if (verbose)
512532
log_initial_page_tables(g, boot->ttbr0);
513533

534+
t0 = startup_trace_now_ns();
514535
syscall_init();
515536
proc_init();
516537

@@ -526,6 +547,7 @@ int guest_bootstrap_prepare(guest_t *g,
526547
proc_set_elf_path(elf_guest_path);
527548
if (sysroot)
528549
proc_set_sysroot(sysroot);
550+
startup_trace_step("runtime_init", t0);
529551

530552
/* rosetta_finalize pre-opens the x86_64 binary at fd 3, constructs the
531553
* binfmt_misc argv ([ROSETTA_PATH, binary, original_argv[1..]]), refreshes
@@ -536,25 +558,30 @@ int guest_bootstrap_prepare(guest_t *g,
536558
int rosetta_argc = 0;
537559
const char **rosetta_argv = NULL;
538560
if (want_rosetta) {
561+
t0 = startup_trace_now_ns();
539562
if (rosetta_finalize(g, 0, elf_host_path, elf_host_path_temp,
540563
elf_guest_path, guest_argc, guest_argv, &rr,
541564
verbose, &rosetta_argc, &rosetta_argv, NULL) < 0) {
542565
log_error("rosetta_finalize failed");
543566
return -1;
544567
}
568+
startup_trace_step("rosetta_finalize", t0);
545569
} else {
546570
proc_set_cmdline(guest_argc, guest_argv);
547571
}
548572
proc_set_environ((const char **) environ);
549573

574+
t0 = startup_trace_now_ns();
550575
native_vdso = vdso_build(g);
576+
startup_trace_step("vdso_build", t0);
551577
linux_stack_auxv_t auxv;
552578
const elf_info_t *stack_elf =
553579
want_rosetta ? &rr.rosetta_info : &boot->elf_info;
554580
uint64_t stack_elf_load_base = want_rosetta ? 0 : boot->elf_load_base;
555581
uint64_t stack_interp_base = want_rosetta ? 0 : boot->interp_base;
556582
int stack_argc = want_rosetta ? rosetta_argc : guest_argc;
557583
const char **stack_argv = want_rosetta ? rosetta_argv : guest_argv;
584+
t0 = startup_trace_now_ns();
558585
boot->stack_pointer = build_linux_stack(
559586
g, g->stack_top, stack_argc, stack_argv, (const char **) environ,
560587
stack_elf, stack_elf_load_base, stack_interp_base, native_vdso, -1,
@@ -564,6 +591,7 @@ int guest_bootstrap_prepare(guest_t *g,
564591
free(rosetta_argv);
565592
return -1;
566593
}
594+
startup_trace_step("build_linux_stack", t0);
567595
/* rosetta_argv was copied into the guest stack; the host allocation is
568596
* no longer needed. The strings themselves are constants (ROSETTA_PATH)
569597
* or owned by the caller (binary_path, guest_argv entries) so freeing
@@ -599,6 +627,7 @@ int guest_bootstrap_create_vcpu(guest_t *g,
599627
{
600628
uint64_t sctlr;
601629
uint64_t sctlr_with_mmu;
630+
uint64_t t0;
602631
/* Rosetta needs TTBR1 walks enabled and TBI1=1 so the kbuf window at
603632
* KBUF_VA_BASE (bits-63-set) resolves and TaggedPointer extraction keeps
604633
* working. Aarch64 guests stay on the EPD1=1 variant which keeps the
@@ -613,14 +642,17 @@ int guest_bootstrap_create_vcpu(guest_t *g,
613642
hv_vcpu_t vcpu;
614643
hv_vcpu_exit_t *vexit;
615644

645+
t0 = startup_trace_now_ns();
616646
HV_CHECK(hv_vcpu_create(&vcpu, &vexit, NULL));
647+
startup_trace_step("hv_vcpu_create", t0);
617648
g->vcpu = vcpu;
618649
g->exit = vexit;
619650
*out_vcpu = vcpu;
620651
*out_vexit = vexit;
621652

622653
thread_register_main(vcpu, vexit, proc_get_pid(), el1_sp);
623654

655+
t0 = startup_trace_now_ns();
624656
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_VBAR_EL1, shim_ipa + 0x800));
625657
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_MAIR_EL1, 0xFF00));
626658
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TCR_EL1, tcr_value));
@@ -632,6 +664,12 @@ int guest_bootstrap_create_vcpu(guest_t *g,
632664
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL0, sp_ipa));
633665
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL1, el1_sp));
634666

667+
/* CNTKCTL_EL1.EL0VCTEN | EL0PCTEN: allow EL0 to read CNTVCT_EL0 /
668+
* CNTPCT_EL0. Required by the vDSO clock_gettime fast path (and is the
669+
* default on native Linux), without which the guest gets 0 back from MRS.
670+
*/
671+
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_CNTKCTL_EL1, 0x3ULL));
672+
635673
HV_CHECK(hv_vcpu_get_sys_reg(vcpu, HV_SYS_REG_SCTLR_EL1, &sctlr));
636674
log_debug("SCTLR_EL1 default=0x%llx", (unsigned long long) sctlr);
637675

@@ -645,6 +683,7 @@ int guest_bootstrap_create_vcpu(guest_t *g,
645683
sctlr_with_mmu = SCTLR_RES1 | SCTLR_M | SCTLR_C | SCTLR_I | SCTLR_DZE |
646684
SCTLR_UCT | SCTLR_UCI;
647685
HV_CHECK(hv_vcpu_set_reg(vcpu, HV_REG_X0, sctlr_with_mmu));
686+
startup_trace_step("hv_vcpu_configure", t0);
648687

649688
log_debug(
650689
"vCPU configured: PC=0x%llx SCTLR=0x%llx VBAR=0x%llx TTBR0=0x%llx "

src/core/guest.c

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include <unistd.h>
3939

4040
#include "core/guest.h"
41+
#include "core/startup-trace.h"
4142
#include "debug/log.h"
4243
#include "utils.h"
4344
#include "runtime/thread.h" /* thread_destroy_all_vcpus */
@@ -202,6 +203,8 @@ static uint64_t *pt_at(const guest_t *g, uint64_t gpa)
202203

203204
int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
204205
{
206+
uint64_t t0;
207+
205208
memset(g, 0, sizeof(*g));
206209
g->shm_fd = -1;
207210
g->ipa_base = GUEST_IPA_BASE;
@@ -257,6 +260,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
257260
* seconds max wait) to handle this gracefully.
258261
*/
259262
hv_return_t ret = HV_ERROR;
263+
t0 = startup_trace_now_ns();
260264
for (int attempt = 0; attempt < 30; attempt++) {
261265
hv_vm_config_t config = hv_vm_config_create();
262266
hv_vm_config_set_ipa_size(config, vm_ipa);
@@ -266,6 +270,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
266270
break;
267271
usleep(500000); /* 500ms between attempts */
268272
}
273+
startup_trace_step("hv_vm_create", t0);
269274
if (ret != HV_SUCCESS) {
270275
log_error("guest: hv_vm_create failed: %d (ipa_bits=%u)", (int) ret,
271276
vm_ipa);
@@ -307,8 +312,10 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
307312
* physical memory. Do NOT memset because that would touch every
308313
* page and defeat demand paging.
309314
*/
315+
t0 = startup_trace_now_ns();
310316
g->host_base = mmap(NULL, try_size, PROT_READ | PROT_WRITE,
311317
MAP_ANON | MAP_PRIVATE, -1, 0);
318+
startup_trace_step("primary_mmap", t0);
312319
if (g->host_base == MAP_FAILED) {
313320
perror("guest: mmap");
314321
g->host_base = NULL;
@@ -320,6 +327,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
320327
* path instead of SCM_RIGHTS fd passing.
321328
*/
322329
char tmppath[] = "/tmp/elfuse-XXXXXX";
330+
t0 = startup_trace_now_ns();
323331
int sfd = mkstemp(tmppath);
324332
if (sfd >= 0) {
325333
unlink(tmppath); /* Unlink immediately; fd keeps file alive */
@@ -335,9 +343,12 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
335343
close(sfd);
336344
}
337345
}
346+
startup_trace_step("cow_shm_upgrade", t0);
338347

348+
t0 = startup_trace_now_ns();
339349
ret = hv_vm_map(g->host_base, GUEST_IPA_BASE, try_size,
340350
HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC);
351+
startup_trace_step("hv_vm_map", t0);
341352
if (ret == HV_SUCCESS) {
342353
mapped_size = try_size;
343354
mapped = true;
@@ -380,6 +391,8 @@ int guest_init_from_shm(guest_t *g,
380391
uint64_t size,
381392
uint32_t ipa_bits)
382393
{
394+
uint64_t t0;
395+
383396
memset(g, 0, sizeof(*g));
384397
g->shm_fd = -1; /* Child does not own the shm */
385398
g->ipa_base = GUEST_IPA_BASE;
@@ -403,8 +416,10 @@ int guest_init_from_shm(guest_t *g,
403416
* the parent's frozen snapshot; writes are private to this process.
404417
* macOS CoW is page-granular: only modified pages are duplicated.
405418
*/
419+
t0 = startup_trace_now_ns();
406420
g->host_base =
407421
mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, shm_fd, 0);
422+
startup_trace_step("shm_mmap", t0);
408423
if (g->host_base == MAP_FAILED) {
409424
perror("guest: mmap shm");
410425
g->host_base = NULL;
@@ -417,6 +432,7 @@ int guest_init_from_shm(guest_t *g,
417432

418433
/* Create HVF VM with the same IPA width as the parent */
419434
hv_return_t ret = HV_ERROR;
435+
t0 = startup_trace_now_ns();
420436
for (int attempt = 0; attempt < 30; attempt++) {
421437
hv_vm_config_t config = hv_vm_config_create();
422438
hv_vm_config_set_ipa_size(config, ipa_bits);
@@ -426,15 +442,18 @@ int guest_init_from_shm(guest_t *g,
426442
break;
427443
usleep(500000);
428444
}
445+
startup_trace_step("hv_vm_create_shm", t0);
429446
if (ret != HV_SUCCESS) {
430447
log_error("guest: hv_vm_create (shm) failed: %d", (int) ret);
431448
munmap(g->host_base, size);
432449
g->host_base = NULL;
433450
return -1;
434451
}
435452

453+
t0 = startup_trace_now_ns();
436454
ret = hv_vm_map(g->host_base, GUEST_IPA_BASE, size,
437455
HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC);
456+
startup_trace_step("hv_vm_map_shm", t0);
438457
if (ret != HV_SUCCESS) {
439458
log_error("guest: hv_vm_map (shm) failed: %d", (int) ret);
440459
hv_vm_destroy();

src/core/rosetta.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -469,7 +469,7 @@ int rosetta_finalize(guest_t *g,
469469
* goto fail must be introduced below, or the fail handler would
470470
* double-close it.
471471
*/
472-
int bin_guest_fd = fd_alloc_at(3, FD_REGULAR, bin_host_fd);
472+
int bin_guest_fd = fd_alloc_at(3, FD_REGULAR, bin_host_fd, NULL);
473473
if (bin_guest_fd < 0) {
474474
log_error("rosetta_finalize: fd_alloc_at(3) failed");
475475
goto fail;

0 commit comments

Comments
 (0)