Skip to content

Commit ed1811b

Browse files
authored
Merge pull request #52 from sysprog21/vdso
Add vDSO seqlock refresh and fast paths
2 parents 16050e0 + f5b3e21 commit ed1811b

11 files changed

Lines changed: 1867 additions & 243 deletions

File tree

Makefile

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,25 @@ $(BUILD_DIR)/test-lowbase-mem-300000: tests/test-lowbase-mem.c | $(BUILD_DIR)
221221
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -no-pie \
222222
-Wl,-Ttext-segment=0x300000 -o $@ $<
223223

224+
# bench-hot-guard-glibc is the dynamic-glibc twin of bench-hot-guard.
225+
# Built only when the cross-glibc toolchain ships its own sysroot
226+
# (so a host without that toolchain can still run the rest of the
227+
# suite). Linked without -static so glibc resolves time / urandom
228+
# syscalls through the vDSO trampoline -- which is exactly what the
229+
# guardrail script verifies against the 50 ns / 200 ns ceilings.
230+
ifneq ($(wildcard $(LINUX_TOOLCHAIN)/aarch64-unknown-linux-gnu/sysroot/.),)
231+
# -DGUARD_USE_LIBC_CG switches the bench's clock_gettime case from a
232+
# direct vDSO trampoline call to the libc wrapper, so the dynamic-glibc
233+
# build measures glibc's actual routing decision. A regression in the
234+
# NT_GNU_ABI_TAG note or LINUX_2.6.39 versioning would push this
235+
# measurement from ~7 ns up to SVC time (~2000 ns) and fail the
236+
# guardrail.
237+
$(BUILD_DIR)/bench-hot-guard-glibc: tests/bench-hot-guard.c | $(BUILD_DIR)
238+
@echo " CROSS $< (dynamic glibc)"
239+
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -DGUARD_USE_LIBC_CG=1 -O2 \
240+
-o $@ $<
241+
endif
242+
224243
endif
225244

226245
include mk/tests.mk

mk/tests.mk

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,28 @@ check: $(ELFUSE_BIN) $(TEST_DEPS) check-syscall-coverage
5151
@$(MAKE) --no-print-directory test-timeout-disable
5252
@printf "\n$(BLUE)━━━ rosetta CLI gating ━━━$(RESET)\n"
5353
@$(MAKE) --no-print-directory test-rosetta-cli
54+
@printf "\n$(BLUE)━━━ hot-syscall guardrail ━━━$(RESET)\n"
55+
@$(MAKE) --no-print-directory test-bench-guardrail
56+
57+
## Hot-syscall performance guardrail: ensure getpid, libc clock_gettime,
58+
## and 1-byte /dev/urandom reads stay under their configured ns/op ceilings.
59+
## Builds the dynamic-glibc variant opportunistically; the script skips
60+
## that arm when the cross-toolchain sysroot is missing.
61+
BENCH_GUARDRAIL_DEPS := $(ELFUSE_BIN)
62+
BENCH_GUARDRAIL_REQUIRE_STATIC := 0
63+
ifndef GUEST_TEST_BINARIES
64+
BENCH_GUARDRAIL_DEPS += $(BUILD_DIR)/bench-hot-guard
65+
BENCH_GUARDRAIL_REQUIRE_STATIC := 1
66+
ifneq ($(wildcard $(LINUX_TOOLCHAIN)/aarch64-unknown-linux-gnu/sysroot/.),)
67+
BENCH_GUARDRAIL_DEPS += $(BUILD_DIR)/bench-hot-guard-glibc
68+
endif
69+
endif
70+
test-bench-guardrail: $(BENCH_GUARDRAIL_DEPS)
71+
@ELFUSE="$(ELFUSE_BIN)" \
72+
BENCH_GUARDRAIL_DIR="$(TEST_DIR)" \
73+
BENCH_GUARDRAIL_REQUIRE_STATIC="$(BENCH_GUARDRAIL_REQUIRE_STATIC)" \
74+
LINUX_TOOLCHAIN="$(LINUX_TOOLCHAIN)" \
75+
bash tests/test-bench-guardrail.sh
5476

5577
test-sysroot-rename: $(ELFUSE_BIN) $(BUILD_DIR)/test-sysroot-rename
5678
@set -e; \

src/core/elf.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
#define PT_LOAD 1
4646
#define PT_DYNAMIC 2
4747
#define PT_INTERP 3
48+
#define PT_NOTE 4
4849

4950
/* Program header flags */
5051
#define PF_X 1

src/core/vdso.c

Lines changed: 855 additions & 177 deletions
Large diffs are not rendered by default.

src/core/vdso.h

Lines changed: 40 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,23 @@
3535
*/
3636
uint64_t vdso_build(guest_t *g);
3737

38-
/* If the vvar anchor has not been seeded yet, install the supplied cntvct as
39-
* the guest-frame anchor paired with the given monotonic and realtime
40-
* wall_clock values. Idempotent: subsequent calls with initialized==1 are
41-
* no-ops. Used by sys_clock_gettime to upgrade the first
42-
* __kernel_clock_gettime SVC fallback into a permanent vvar fast path that
43-
* serves both CLOCK_MONOTONIC and CLOCK_REALTIME.
38+
/* Publish a new vvar anchor under the seqlock. Handles both the initial
39+
* seed (seq 0 -> 1 -> 2) and refresh (seq 2K -> 2K+1 -> 2K+2) atomically
40+
* through one CAS-then-release-store sequence. Concurrent publishers
41+
* either lose the CAS or observe an odd seq and bail without blocking;
42+
* trampoline readers detect mid-write tearing via their own LDAR
43+
* snapshot/recheck. Callers (sys_clock_gettime / sys_gettimeofday) must
44+
* invoke this only when an SVC trap from the vDSO trampoline carries a
45+
* trustworthy guest CNTVCT in X9.
46+
*
47+
* Overflow invariant: this function, the trampoline math, and
48+
* vdso_realtime_drift_exceeded all depend on VDSO_ANCHOR_AGE_SHIFT == 22
49+
* capping the per-call CNTVCT delta below 2^22. That bound keeps
50+
* (delta * 699050666) below 2^52 (no uint64 overflow) and keeps
51+
* anchor_nsec + delta_ns below 2e9 (so the trampoline's sub-1e9 carry
52+
* collapses to a single SUBS + CSEL + CINC instead of a UDIV). The
53+
* host-side drift check must apply the same formula and the same cap;
54+
* any divergence lets the trampoline interpolate from a stale anchor.
4455
*/
4556
void vdso_seed_anchor(guest_t *g,
4657
uint64_t guest_cntvct,
@@ -56,12 +67,13 @@ void vdso_seed_anchor(guest_t *g,
5667
* + 4, so callers compare ELR_EL1 against that.
5768
*/
5869
uint64_t vdso_clock_gettime_svc_pc(void);
70+
uint64_t vdso_gettimeofday_svc_pc(void);
5971

60-
/* Returns true once the vvar anchor has been published (initialized==1) and
61-
* the fast path can never be reseeded. Lets the post-SVC handler in
62-
* sys_clock_gettime skip the ELR_EL1 + X9 HVF reads it otherwise needs for
63-
* the seeding gate, since the second-call onward gate is moot once seeded.
64-
* Uses acquire ordering paired with vdso_seed_anchor's release store.
72+
/* Returns true when the seqlock counter is at a stable (nonzero, even)
73+
* generation, i.e. the anchor is currently publishable. Uses acquire
74+
* ordering paired with vdso_seed_anchor's release store of the next
75+
* even generation. Callers use this to gate the age / drift checks
76+
* that decide whether to publish a refresh.
6577
*/
6678
bool vdso_anchor_is_seeded(guest_t *g);
6779

@@ -72,3 +84,20 @@ bool vdso_anchor_is_seeded(guest_t *g);
7284
*/
7385
void vdso_attention_or(guest_t *g, uint32_t bits);
7486
void vdso_attention_and(guest_t *g, uint32_t mask);
87+
88+
/* True iff the anchor is currently stable AND (current_cntvct -
89+
* anchor_cntvct) has exceeded the trampoline's age cap. The host uses
90+
* this with a freshly-sampled CNTVCT to decide whether to publish a
91+
* refresh through vdso_seed_anchor.
92+
*/
93+
bool vdso_anchor_age_exceeded(guest_t *g, uint64_t current_cntvct);
94+
95+
/* True iff the anchor is seeded AND the wall-clock value predicted from
96+
* the anchor + CNTVCT delta differs from the supplied freshly-sampled
97+
* REALTIME (real_sec, real_nsec) by more than VDSO_ANCHOR_MAX_DRIFT_NS.
98+
* Catches macOS NTP steps that shift wall time without bumping CNTVCT.
99+
*/
100+
bool vdso_realtime_drift_exceeded(guest_t *g,
101+
uint64_t current_cntvct,
102+
int64_t real_sec,
103+
int64_t real_nsec);

src/syscall/time.c

Lines changed: 107 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,27 @@ typedef struct {
221221

222222
/* Time/timer syscall handlers. */
223223

224+
#define LINUX_COARSE_CLOCK_RES_NS 1000000
225+
226+
static bool linux_clock_getres_fixed(int clockid, linux_timespec_t *ts)
227+
{
228+
switch (clockid) {
229+
case 0: /* CLOCK_REALTIME */
230+
case 1: /* CLOCK_MONOTONIC */
231+
case 4: /* CLOCK_MONOTONIC_RAW */
232+
case 7: /* CLOCK_BOOTTIME */
233+
*ts = (linux_timespec_t) {.tv_sec = 0, .tv_nsec = 1};
234+
return true;
235+
case 5: /* CLOCK_REALTIME_COARSE */
236+
case 6: /* CLOCK_MONOTONIC_COARSE */
237+
*ts = (linux_timespec_t) {.tv_sec = 0,
238+
.tv_nsec = LINUX_COARSE_CLOCK_RES_NS};
239+
return true;
240+
default:
241+
return false;
242+
}
243+
}
244+
224245
int64_t sys_clock_getres(guest_t *g, int clockid, uint64_t tp_gva)
225246
{
226247
int mac_clockid = translate_clockid(clockid);
@@ -231,9 +252,16 @@ int64_t sys_clock_getres(guest_t *g, int clockid, uint64_t tp_gva)
231252
if (!tp_gva)
232253
return 0;
233254

234-
struct timespec ts;
235-
if (clock_getres(mac_clockid, &ts) < 0)
236-
return linux_errno();
255+
linux_timespec_t ts;
256+
if (!linux_clock_getres_fixed(clockid, &ts)) {
257+
struct timespec host_ts;
258+
if (clock_getres(mac_clockid, &host_ts) < 0)
259+
return linux_errno();
260+
ts = (linux_timespec_t) {
261+
.tv_sec = host_ts.tv_sec,
262+
.tv_nsec = host_ts.tv_nsec,
263+
};
264+
}
237265

238266
if (guest_write_small(g, tp_gva, &ts, sizeof(ts)) < 0)
239267
return -LINUX_EFAULT;
@@ -247,64 +275,46 @@ int64_t sys_clock_gettime(guest_t *g, int clockid, uint64_t tp_gva)
247275
if (mac_clockid < 0)
248276
return -LINUX_EINVAL;
249277

250-
/* If this trap came from the __kernel_clock_gettime vDSO svc_fallback,
251-
* the trampoline parked the guest's CNTVCT_EL0 read in X9 before
252-
* issuing SVC, and ELR_EL1 holds the address immediately after that
253-
* SVC. Pair X9 with both the MONOTONIC and REALTIME wall_clocks and
254-
* seed the vvar so subsequent calls hit the fast path for either
255-
* clockid. Skip the seed for any other trap (raw
256-
* syscall(SYS_clock_gettime, ...) from guest code, etc.): X9 is
257-
* then arbitrary guest state, and seeding from it would poison the
258-
* anchor and break every later fast-path call.
259-
*
260-
* Skip the gate entirely once the anchor is published: vdso_seed_anchor
261-
* is a one-shot CAS that can never fire again, so the HVF reads of
262-
* ELR_EL1 and X9 below would be pure waste on every subsequent trap.
263-
* Both clockid 0 (REALTIME) and clockid 1 (MONOTONIC) take the vDSO
264-
* fast path, so either may be the first caller; either way both
265-
* anchor pairs are seeded from a single set of host clock_gettime
266-
* calls.
278+
/* When the trap came from the __kernel_clock_gettime vDSO
279+
* svc_fallback, the trampoline parked the guest's CNTVCT_EL0 read in
280+
* X9 before SVC, and ELR_EL1 holds SVC_PC + 4. Use X9 to seed (or
281+
* refresh) the vvar anchor so subsequent calls hit the fast path.
282+
* Reject any other trap: X9 would then be arbitrary guest state and
283+
* seeding from it would poison the anchor.
267284
*
268-
* Order matters: read X9 first, then sample both host wall clocks
269-
* back-to-back, then write to guest and seed. Sampling host clocks
270-
* before checking X9 would bake a permanent positive bias (~50-200 ns)
271-
* into the anchor because every host call ages the X9 timestamp by
272-
* the seeding gate's HVF round-trip. The back-to-back wall-clock
273-
* reads minimize MONO/REAL skew within the anchor.
285+
* Order matters: read X9 first, then sample host wall clocks
286+
* back-to-back, then write the guest result and seed. Sampling host
287+
* clocks before checking X9 would bake a permanent positive bias
288+
* into the anchor from the HVF round-trip in the seeding gate.
274289
*/
275-
bool seed_eligible = (clockid == 0 /* CLOCK_REALTIME */ ||
276-
clockid == 1 /* CLOCK_MONOTONIC */) &&
277-
current_thread && !vdso_anchor_is_seeded(g);
290+
bool from_trampoline = (clockid == 0 /* CLOCK_REALTIME */ ||
291+
clockid == 1 /* CLOCK_MONOTONIC */) &&
292+
current_thread;
278293

279294
uint64_t guest_cntvct = 0;
280-
if (seed_eligible) {
295+
if (from_trampoline) {
281296
uint64_t elr = 0;
282297
if (hv_vcpu_get_sys_reg(current_thread->vcpu, HV_SYS_REG_ELR_EL1,
283298
&elr) != HV_SUCCESS ||
284299
elr != vdso_clock_gettime_svc_pc() + 4 ||
285300
hv_vcpu_get_reg(current_thread->vcpu, HV_REG_X9, &guest_cntvct) !=
286301
HV_SUCCESS ||
287-
guest_cntvct == 0) {
288-
/* Trap came from a path other than the vDSO trampoline; X9 is
289-
* arbitrary, fall through to the non-seeding path.
290-
*/
291-
seed_eligible = false;
292-
}
302+
guest_cntvct == 0)
303+
from_trampoline = false;
293304
}
294305

295306
struct timespec ts;
296307
if (clock_gettime(mac_clockid, &ts) < 0)
297308
return linux_errno();
298309

299-
/* For the seeding path, sample the OTHER clockid back-to-back so both
300-
* anchor pairs reflect roughly the same host moment. If the second
301-
* clock_gettime fails (unreachable on macOS but defensive), skip
302-
* seeding rather than fail the user's request: the user already has
303-
* the value they asked for.
310+
/* Sample the OTHER clockid back-to-back so both anchor pairs reflect
311+
* roughly the same host moment. If the second clock_gettime fails
312+
* (defensive; unreachable on macOS), skip seeding rather than fail
313+
* the user's request.
304314
*/
305315
struct timespec ts_other;
306316
bool can_seed = false;
307-
if (seed_eligible) {
317+
if (from_trampoline) {
308318
int other_mac = (clockid == 1) ? CLOCK_REALTIME : CLOCK_MONOTONIC;
309319
if (clock_gettime(other_mac, &ts_other) == 0)
310320
can_seed = true;
@@ -316,8 +326,17 @@ int64_t sys_clock_gettime(guest_t *g, int clockid, uint64_t tp_gva)
316326
if (can_seed) {
317327
const struct timespec *ts_mono = (clockid == 1) ? &ts : &ts_other;
318328
const struct timespec *ts_real = (clockid == 0) ? &ts : &ts_other;
319-
vdso_seed_anchor(g, guest_cntvct, ts_mono->tv_sec, ts_mono->tv_nsec,
320-
ts_real->tv_sec, ts_real->tv_nsec);
329+
330+
/* Publish when the vvar is unseeded, has aged out, or has
331+
* drifted relative to the freshly-sampled REALTIME (catches
332+
* macOS NTP steps).
333+
*/
334+
if (!vdso_anchor_is_seeded(g) ||
335+
vdso_anchor_age_exceeded(g, guest_cntvct) ||
336+
vdso_realtime_drift_exceeded(g, guest_cntvct, ts_real->tv_sec,
337+
ts_real->tv_nsec))
338+
vdso_seed_anchor(g, guest_cntvct, ts_mono->tv_sec, ts_mono->tv_nsec,
339+
ts_real->tv_sec, ts_real->tv_nsec);
321340
}
322341

323342
return 0;
@@ -391,13 +410,55 @@ int64_t sys_clock_nanosleep(guest_t *g,
391410

392411
int64_t sys_gettimeofday(guest_t *g, uint64_t tv_gva, uint64_t tz_gva)
393412
{
394-
(void) tz_gva; /* timezone is obsolete */
413+
bool from_trampoline = current_thread;
414+
uint64_t guest_cntvct = 0;
415+
if (from_trampoline) {
416+
uint64_t elr = 0;
417+
if (hv_vcpu_get_sys_reg(current_thread->vcpu, HV_SYS_REG_ELR_EL1,
418+
&elr) != HV_SUCCESS ||
419+
elr != vdso_gettimeofday_svc_pc() + 4 ||
420+
hv_vcpu_get_reg(current_thread->vcpu, HV_REG_X9, &guest_cntvct) !=
421+
HV_SUCCESS ||
422+
guest_cntvct == 0)
423+
from_trampoline = false;
424+
}
425+
395426
struct timeval tv;
396427
if (gettimeofday(&tv, NULL) < 0)
397428
return linux_errno();
398429

399-
if (tv_gva && guest_write_small(g, tv_gva, &tv, sizeof(tv)) < 0)
430+
struct timespec ts_mono;
431+
struct timespec ts_real;
432+
bool can_seed = false;
433+
if (from_trampoline && clock_gettime(CLOCK_MONOTONIC, &ts_mono) == 0 &&
434+
clock_gettime(CLOCK_REALTIME, &ts_real) == 0)
435+
can_seed = true;
436+
437+
linux_timeval_t ltv = {
438+
.tv_sec = tv.tv_sec,
439+
.tv_usec = tv.tv_usec,
440+
};
441+
if (tv_gva && guest_write_small(g, tv_gva, &ltv, sizeof(ltv)) < 0)
400442
return -LINUX_EFAULT;
443+
444+
/* tz is obsolete on Linux but the kernel still fills a non-null
445+
* pointer from sys_tz, normally all zero (struct timezone has two int32 fields, 8 bytes total).
446+
* Matching the vDSO fast path's `str xzr, [tz]` here keeps SVC and
447+
* fast-path callers observationally identical.
448+
*/
449+
if (tz_gva) {
450+
const uint64_t tz_zero = 0;
451+
if (guest_write_small(g, tz_gva, &tz_zero, sizeof(tz_zero)) < 0)
452+
return -LINUX_EFAULT;
453+
}
454+
455+
if (can_seed && (!vdso_anchor_is_seeded(g) ||
456+
vdso_anchor_age_exceeded(g, guest_cntvct) ||
457+
vdso_realtime_drift_exceeded(
458+
g, guest_cntvct, ts_real.tv_sec, ts_real.tv_nsec)))
459+
vdso_seed_anchor(g, guest_cntvct, ts_mono.tv_sec, ts_mono.tv_nsec,
460+
ts_real.tv_sec, ts_real.tv_nsec);
461+
401462
return 0;
402463
}
403464

0 commit comments

Comments
 (0)