Skip to content

Commit a00cc3d

Browse files
committed
Bring HVF round-trip floor band into the shim
This closes the per-call shim path: a diagnostic counter array so every fast-path bail is attributed, inline getpgid(0) / getsid(0) using two new identity slots, and a 256-byte urandom / getrandom inline copy with 4 KiB ring-wrap split. The bulk copy uses ldp/stp with a tbz-driven 1..15 byte tail; the ring lock uses LSE swpal; the slow-path refill runs arc4random_buf outside the lock; the second AT probe is skipped when buf and buf+len-1 share a host page. PGSID_PUBLISH reads (pgid, sid) under session_lock via a new proc_snapshot_pgsid so a concurrent setsid on a sibling vCPU cannot publish a torn pair. Measured (Apple M1, 100k iter): read(urandom, 1) 133 -> 100 ns (-25 %); read(urandom, 64) 224 -> 163 ns (-27 %); read(urandom, 256) 528 -> 355 ns (-33 %); getrandom(1) 128 -> 95 ns (-26 %); getrandom(256) 534 -> 366 ns (-31 %). CB_URANDOM_RING_WRAP is retained for ABI stability but stays at zero now that wrap is served inline; a non-zero reading flags a regression.
1 parent b1ce739 commit a00cc3d

13 files changed

Lines changed: 828 additions & 76 deletions

File tree

src/core/bootstrap.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -705,10 +705,12 @@ int guest_bootstrap_create_vcpu(guest_t *g,
705705
* identity.
706706
*/
707707
shim_globals_init(g);
708+
shim_globals_publish_stats_gate(g);
708709
shim_globals_set_trace_enabled(g, verbose);
709710
shim_globals_publish_pid(g, proc_get_pid(), proc_get_ppid());
710711
shim_globals_publish_creds(g, proc_get_uid(), proc_get_euid(),
711712
proc_get_gid(), proc_get_egid());
713+
proc_publish_pgsid_snapshot(g);
712714
/* Pre-fill the entropy ring so the first read(/dev/urandom) from the guest
713715
* is served by the shim fast path with no cold-start HVC for refill.
714716
*/

src/core/shim-globals.c

Lines changed: 151 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
* src/core/shim.S.
1010
*/
1111

12+
#include <pthread.h>
1213
#include <stdint.h>
14+
#include <stdio.h>
1315
#include <stdlib.h>
1416
#include <string.h>
1517
#include <sched.h>
@@ -46,6 +48,11 @@
4648
* fast path; if they drift here the shim reads from the wrong
4749
* place. Catch the drift at compile time.
4850
*/
51+
_Static_assert(SHIM_GLOBALS_OFF_STATS_EN == 0x04,
52+
"shim.S COUNTER_INC hard-codes STATS_EN byte off 0x04");
53+
_Static_assert(SHIM_GLOBALS_OFF_STATS_EN >= 4 &&
54+
SHIM_GLOBALS_OFF_STATS_EN < SHIM_IDENTITY_BASE,
55+
"STATS_EN byte must sit in the attention/identity padding");
4956
_Static_assert(SHIM_URANDOM_OFF_BITMAP == 0x38,
5057
"shim.S urandom fast path hard-codes BITMAP off 0x38");
5158
_Static_assert(SHIM_URANDOM_OFF_RING_HEAD == 0xB8,
@@ -60,6 +67,35 @@ _Static_assert(SHIM_URANDOM_OFF_RING_LOCK == 0x10C0,
6067
"shim.S urandom fast path hard-codes RING_LOCK off 0x10C0");
6168
_Static_assert(FD_TABLE_SIZE == 1024,
6269
"shim.S urandom fast path hard-codes FD_TABLE_SIZE 1024");
70+
_Static_assert(SHIM_URANDOM_INLINE_LIMIT == 256,
71+
"shim.S urandom/getrandom fast path hard-codes 256-byte cap");
72+
73+
/* shim.S COUNTER_INC macro hardcodes (SHIM_COUNTERS_OFF & 0xFFF) and the
74+
* 0x1, lsl #12 carry. Keep the literal in sync so a layout shift fails
75+
* the build rather than silently routing increments to the wrong slot.
76+
*/
77+
_Static_assert(SHIM_COUNTERS_OFF == 0x10C8,
78+
"shim.S COUNTER_INC hard-codes SHIM_COUNTERS_OFF=0x10C8");
79+
/* shim.S splits SHIM_COUNTERS_OFF into a shifted-add carry (0x1000) plus
80+
* an imm12 load/store offset (0xC8 + slot byte). Pin the split so any
81+
* future layout shift fails the build instead of silently routing
82+
* increments to the wrong slot.
83+
*/
84+
_Static_assert((SHIM_COUNTERS_OFF & 0xFFF) == 0xC8,
85+
"shim.S SHIM_COUNTERS_OFF_LO12 hard-coded to 0xC8");
86+
_Static_assert((SHIM_COUNTERS_OFF & ~0xFFF) == 0x1000,
87+
"shim.S SHIM_COUNTERS_OFF_HI hard-coded to 0x1000");
88+
_Static_assert(SHIM_IDENTITY_OFF_PGID == 0x1148,
89+
"shim.S getpgid fast path hard-codes PGID off 0x1148");
90+
_Static_assert(SHIM_IDENTITY_OFF_SID == 0x1150,
91+
"shim.S getsid fast path hard-codes SID off 0x1150");
92+
_Static_assert(SHIM_GLOBALS_SIZE >= SHIM_IDENTITY_OFF_SID + 8,
93+
"SHIM_GLOBALS_SIZE must cover the PGID/SID slots");
94+
_Static_assert(SHIM_GLOBALS_SIZE <= BLOCK_2MIB,
95+
"SHIM_GLOBALS_SIZE must fit inside the 2 MiB shim_data block");
96+
_Static_assert(SHIM_COUNTERS_OFF + SHIM_COUNTERS_N * 8 <=
97+
SHIM_IDENTITY_OFF_PGID,
98+
"counter array must not overlap the PGID slot");
6399

64100
static uint8_t *cache_base(const guest_t *g)
65101
{
@@ -114,6 +150,13 @@ void shim_globals_publish_creds(guest_t *g,
114150
store_u64(page, SHIM_IDENTITY_OFF_EGID, egid);
115151
}
116152

153+
/* Publish the process-group id and session id into the shim cache so the
 * inline getpgid(0) / getsid(0) service can answer from these slots.
 *
 * g     guest whose shim cache page receives the values.
 * pgid  process-group id, stored at SHIM_IDENTITY_OFF_PGID.
 * sid   session id, stored at SHIM_IDENTITY_OFF_SID.
 *
 * Both ids are widened to 64 bits with a plain cast before being handed
 * to store_u64. The two slots are written one after the other (PGID
 * first), not as a single unit.
 * NOTE(review): the ordering / atomicity of each store is whatever
 * store_u64 provides -- confirm against its definition.
 */
void shim_globals_publish_pgsid(guest_t *g, int64_t pgid, int64_t sid)
{
    uint8_t *const globals = cache_base(g);
    const uint64_t pgid_bits = (uint64_t) pgid;
    const uint64_t sid_bits = (uint64_t) sid;

    store_u64(globals, SHIM_IDENTITY_OFF_PGID, pgid_bits);
    store_u64(globals, SHIM_IDENTITY_OFF_SID, sid_bits);
}
159+
117160
uint64_t shim_globals_gva(const guest_t *g)
118161
{
119162
return g->shim_data_base;
@@ -242,9 +285,18 @@ void shim_globals_rebuild_urandom_bitmap(void)
242285
}
243286

244287
/* arc4random_buf is documented as deadlock-free and re-entrant. Used
245-
* by both the initial fill at bootstrap and by the slow-path refill
246-
* that runs from sys_read when the shim's fast path falls through due
247-
* to an empty ring.
288+
* by the initial fill at bootstrap and by the slow-path refill that
289+
* runs from sys_read/sys_getrandom when the shim's fast path falls
290+
* through due to an empty ring.
291+
*
292+
* Entropy is generated OUTSIDE the ring_lock: arc4random_buf can take
293+
* microseconds, and any sibling vCPU that hits the fast path while the
294+
* lock is held spins (yield) until release. Generate up to a full ring
295+
* into a stack scratch buffer, then take the lock only to re-read
296+
* head/fill and copy the publishable prefix into the ring. The recheck
297+
* after lock acquire matters: a concurrent fast path may have advanced
298+
* head while entropy was being generated, raising the publishable
299+
* count beyond the pre-lock estimate.
248300
*/
249301
void shim_globals_refill_urandom_ring(guest_t *g)
250302
{
@@ -254,13 +306,31 @@ void shim_globals_refill_urandom_ring(guest_t *g)
254306
uint32_t *lock_p = (uint32_t *) (base + SHIM_URANDOM_OFF_RING_LOCK);
255307
uint8_t *ring = base + SHIM_URANDOM_OFF_RING;
256308

309+
/* Pre-lock estimate: skip the arc4random_buf + lock when the ring
310+
* is already full. Both cursors are read RELAXED so a torn snapshot
311+
* (head_pre observed past a producer step but tail_pre observed
312+
* before it) can make tail_pre - head_pre wrap to a huge unsigned
313+
* value. A loose ">= RING_SIZE" check would treat that garbage as
314+
* "already full" and skip a genuinely-needed refill. Only the exact
315+
* == RING_SIZE value is a safe full-detection; any other (valid or
316+
* torn) reading falls through to the lock-held recheck below.
317+
*/
318+
uint32_t head_pre = __atomic_load_n(head_p, __ATOMIC_RELAXED);
319+
uint32_t tail_pre = __atomic_load_n(tail_p, __ATOMIC_RELAXED);
320+
uint32_t fill_pre = tail_pre - head_pre;
321+
if (fill_pre == SHIM_URANDOM_RING_SIZE)
322+
return;
323+
324+
uint8_t scratch[SHIM_URANDOM_RING_SIZE];
325+
arc4random_buf(scratch, sizeof(scratch));
326+
257327
urandom_ring_lock(lock_p);
258328

259329
uint32_t head = __atomic_load_n(head_p, __ATOMIC_ACQUIRE);
260330
uint32_t tail = __atomic_load_n(tail_p, __ATOMIC_RELAXED);
261331
uint32_t fill = tail - head;
262332
if (fill >= SHIM_URANDOM_RING_SIZE)
263-
goto out; /* already full */
333+
goto out; /* concurrent refill caught up */
264334
uint32_t to_fill = SHIM_URANDOM_RING_SIZE - fill;
265335

266336
/* Producer writes from ring[tail & (SIZE-1)] forward, wrapping
@@ -270,9 +340,9 @@ void shim_globals_refill_urandom_ring(guest_t *g)
270340
uint32_t first = SHIM_URANDOM_RING_SIZE - pos;
271341
if (first > to_fill)
272342
first = to_fill;
273-
arc4random_buf(ring + pos, first);
343+
memcpy(ring + pos, scratch, first);
274344
if (to_fill > first)
275-
arc4random_buf(ring, to_fill - first);
345+
memcpy(ring, scratch + first, to_fill - first);
276346

277347
/* Release-store the new tail so any fast-path consumer that loads
278348
* tail with an acquiring read sees the bytes already in the ring.
@@ -359,3 +429,78 @@ void shim_globals_set_trace_enabled(guest_t *g, bool enabled)
359429
else
360430
shim_globals_attn_and(g, ~ATTN_BIT_TRACE);
361431
}
432+
433+
static const char *const counter_names[SHIM_COUNTERS_N] = {
434+
[SHIM_COUNTER_ATTN_BAIL] = "ATTN_BAIL",
435+
[SHIM_COUNTER_URANDOM_FD_OOR] = "URANDOM_FD_OOR",
436+
[SHIM_COUNTER_URANDOM_FD_BMISS] = "URANDOM_FD_BMISS",
437+
[SHIM_COUNTER_URANDOM_LEN_ZERO] = "URANDOM_LEN_ZERO",
438+
[SHIM_COUNTER_URANDOM_LEN_OVER] = "URANDOM_LEN_OVER",
439+
[SHIM_COUNTER_URANDOM_RING_LOW] = "URANDOM_RING_LOW",
440+
[SHIM_COUNTER_URANDOM_RING_WRAP] = "URANDOM_RING_WRAP",
441+
[SHIM_COUNTER_URANDOM_PROBE_FAIL] = "URANDOM_PROBE_FAIL",
442+
[SHIM_COUNTER_IDENTITY_HIT] = "IDENTITY_HIT",
443+
[SHIM_COUNTER_URANDOM_HIT] = "URANDOM_HIT",
444+
[SHIM_COUNTER_GETRANDOM_HIT] = "GETRANDOM_HIT",
445+
[SHIM_COUNTER_PGSID_HIT] = "PGSID_HIT",
446+
/* Slots 12..15 (SHIM_COUNTERS_N == 16) are intentionally unnamed;
447+
* the dump prints "(reserved)" so they appear in the output when
448+
* non-zero, which would flag an out-of-band increment. Bind a name
449+
* here when a future EL1 service claims one of these slots.
450+
*/
451+
};
452+
453+
uint64_t shim_globals_counter_get(const guest_t *g, unsigned slot)
454+
{
455+
if (slot >= SHIM_COUNTERS_N)
456+
return 0;
457+
const uint8_t *page = (const uint8_t *) g->host_base + g->shim_data_base;
458+
const uint64_t *slot_p =
459+
(const uint64_t *) (page + SHIM_COUNTERS_OFF) + slot;
460+
return __atomic_load_n(slot_p, __ATOMIC_RELAXED);
461+
}
462+
463+
void shim_globals_counters_dump(const guest_t *g)
464+
{
465+
fprintf(stderr, "shim-stats (pid=%lld)\n", (long long) proc_get_pid());
466+
for (unsigned i = 0; i < SHIM_COUNTERS_N; i++) {
467+
const char *name = counter_names[i];
468+
uint64_t v = shim_globals_counter_get(g, i);
469+
if (!name && v == 0)
470+
continue;
471+
fprintf(stderr, " %-20s %llu\n", name ? name : "(reserved)",
472+
(unsigned long long) v);
473+
}
474+
}
475+
476+
/* One-shot state for the ELFUSE_SHIM_STATS gate: stats_once guards the
 * single environment lookup and stats_enabled_cache holds its result.
 */
static pthread_once_t stats_once = PTHREAD_ONCE_INIT;
static bool stats_enabled_cache;

/* pthread_once callback. Reads ELFUSE_SHIM_STATS and records the gate as
 * enabled when the variable is set, non-empty, and not exactly "0".
 */
static void stats_resolve(void)
{
    const char *env = getenv("ELFUSE_SHIM_STATS");
    bool on = false;

    if (env != NULL && env[0] != '\0')
        on = strcmp(env, "0") != 0;
    stats_enabled_cache = on;
}

/* Report whether shim statistics are enabled.
 *
 * The environment is consulted once, on the first call; every later call
 * returns the cached answer even if ELFUSE_SHIM_STATS changes afterwards.
 */
bool shim_globals_stats_enabled(void)
{
    pthread_once(&stats_once, stats_resolve);
    return stats_enabled_cache;
}
490+
491+
/* Write the stats gate byte at SHIM_GLOBALS_OFF_STATS_EN in the shim
 * cache: 1 when shim_globals_stats_enabled() reports true, 0 otherwise.
 *
 * g  guest whose shim cache receives the gate byte.
 */
void shim_globals_publish_stats_gate(guest_t *g)
{
    uint8_t *gate_p = cache_base(g) + SHIM_GLOBALS_OFF_STATS_EN;
    const uint8_t gate = (uint8_t) shim_globals_stats_enabled();

    /* Bring-up publish, done once per shim_globals_init. Every caller
     * (bootstrap, fork-child receive, execve) runs before the guest
     * vCPU starts executing, so what makes the shim see this value is
     * the host-side ordering between the store and the first
     * hv_vcpu_run. The release ordering below is conservative rather
     * than load-bearing.
     *
     * If a runtime setter ever changes the gate after guest entry, the
     * shim must also upgrade its ldrb to ldarb (or gate the read on the
     * attention flag): a release-store alone does not synchronize with
     * a plain ldrb on the same address.
     */
    __atomic_store_n(gate_p, gate, __ATOMIC_RELEASE);
}

src/core/shim-globals.h

Lines changed: 117 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,23 @@
6161
*/
6262
#define SHIM_GLOBALS_OFF_ATTN 0x00
6363

64+
/* Stats gate: single byte at offset 0x04 (inside the natural 4-byte pad
65+
* between the uint32 attention flag and the 8-byte-aligned identity
66+
* slots at 0x08). Nonzero enables the COUNTER_INC body in the EL1 shim;
67+
* zero (the default) makes COUNTER_INC a single ldrb + cbz so disabled
68+
* stats cost one cache-hot byte load instead of the full
69+
* load-add-store-to-shared-counter-line that was paid on every fast
70+
* path before. The byte lives in the same 64-byte cache line as the
71+
* attention flag, so the gate load piggybacks on the line the shim's
72+
* LDAR already pulls into L1 -- no extra coherence traffic in the
73+
* common case.
74+
*
75+
* Plain release-store on publish: the gate is read once per fast-path
76+
* tail with relaxed semantics, and we accept a window after env-var
77+
* resolution where an in-flight syscall still sees the old value.
78+
*/
79+
#define SHIM_GLOBALS_OFF_STATS_EN 0x04
80+
6481
/* Attention is a bitmask, not a boolean. Splitting it by owner lets the
6582
* HVC #5 epilogue's recompute (which polls signal/itimer state) coexist
6683
* with the cred-publish bracket without clobbering it. The shim still
@@ -128,7 +145,63 @@
128145
#define SHIM_URANDOM_RING_SIZE 4096
129146
#define SHIM_URANDOM_OFF_RING_LOCK 0x10C0
130147

131-
#define SHIM_GLOBALS_SIZE 0x10C4
148+
/* Upper bound on the per-call byte count served by the shim's
149+
* urandom/getrandom fast paths. The probe coverage assumes the buffer
150+
* spans at most two host pages so a first+last byte AT probe suffices;
151+
* 256 fits comfortably within both 4 KiB and 16 KiB page sizes. The
152+
* shim itself hardcodes the literal; a static_assert in shim-globals.c
153+
* pins the C macro to the assembly. Ring wraps are handled inline by
154+
* splitting the byte copy at the 4 KiB boundary, so this cap is bounded
155+
* only by probe coverage and per-call ring-fill cost (256 keeps the
156+
* 4 KiB ring serviceable for 16 sequential reads before host refill).
157+
*/
158+
#define SHIM_URANDOM_INLINE_LIMIT 256
159+
160+
/* Fast-path hit / miss counters.
161+
*
162+
* 16 uint64 slots placed after the urandom ring lock. The shim's
163+
* identity_class_fast and urandom_read_fast paths bump the relevant
164+
* slot on every entry and at every bail point so the host can attribute
165+
* fast-path activity instead of guessing. Counters are non-atomic plain
166+
* load-add-store -- under multi-vCPU concurrent bails a small fraction
167+
* of increments race and are lost, which is acceptable for diagnostic
168+
* ratios. Slots 0..7 cover the eight bail reasons the shim distinguishes
169+
* (sticky attention, fd out of range, fd not in urandom bitmap, len zero,
170+
* len over inline cap, ring fill below request, ring wrap (slot retained for ABI stability; stays zero now that wraps are served inline), EL0 buffer
171+
* probe failure). Slots 8..11 record fast-path hits so bail rates can be
172+
* computed against a hit denominator. Slots 12..15 are reserved.
173+
*
174+
* The shim hardcodes the byte offset of each slot; the static_asserts
175+
* in shim-globals.c keep the C-side macros and the assembly in sync.
176+
*/
177+
#define SHIM_COUNTERS_OFF 0x10C8
178+
#define SHIM_COUNTERS_N 16
179+
180+
#define SHIM_COUNTER_ATTN_BAIL 0
181+
#define SHIM_COUNTER_URANDOM_FD_OOR 1
182+
#define SHIM_COUNTER_URANDOM_FD_BMISS 2
183+
#define SHIM_COUNTER_URANDOM_LEN_ZERO 3
184+
#define SHIM_COUNTER_URANDOM_LEN_OVER 4
185+
#define SHIM_COUNTER_URANDOM_RING_LOW 5
186+
#define SHIM_COUNTER_URANDOM_RING_WRAP 6
187+
#define SHIM_COUNTER_URANDOM_PROBE_FAIL 7
188+
#define SHIM_COUNTER_IDENTITY_HIT 8
189+
#define SHIM_COUNTER_URANDOM_HIT 9
190+
#define SHIM_COUNTER_GETRANDOM_HIT 10
191+
#define SHIM_COUNTER_PGSID_HIT 11
192+
193+
/* Extended identity slots: pgid and sid.
194+
*
195+
* getpgid(0) and getsid(0) are pure cache reads when the argument is
196+
* zero; the shim serves them out of these slots whenever X0 == 0 and
197+
* the syscall number matches. The host re-publishes after setpgid /
198+
* setsid / exec / fork so the slots match guest_pgid / guest_sid in
199+
* proc-identity.c.
200+
*/
201+
#define SHIM_IDENTITY_OFF_PGID 0x1148
202+
#define SHIM_IDENTITY_OFF_SID 0x1150
203+
204+
#define SHIM_GLOBALS_SIZE 0x1158
132205

133206
/* Initialize the cache region to all-zero. Called once per process at
134207
* the same time the shim_data block is set up (initial bootstrap and
@@ -158,6 +231,21 @@ void shim_globals_publish_creds(guest_t *g,
158231
uint32_t gid,
159232
uint32_t egid);
160233

234+
/* Publish pgid + sid so the shim's getpgid(0) / getsid(0) inline service
235+
* sees the current session/process-group state. Call from process init,
236+
* fork-child receive, exec, setsid, and setpgid. Slot writes are
237+
* independent 64-bit atomic release stores.
238+
*
239+
* No attention bit guards this publish: setpgid / setsid are infrequent
240+
* and the model accepts a brief window in which a concurrent
241+
* getpgid(0) / getsid(0) on a sibling vCPU observes the pre-publish
242+
* value (consistent with Linux's lockless session lookups). Session
243+
* mutators and cache-initialization callers publish through proc-identity
244+
* while holding session_lock, so successful setpgid / setsid calls cannot
245+
* overwrite the cache out of order.
246+
*/
247+
void shim_globals_publish_pgsid(guest_t *g, int64_t pgid, int64_t sid);
248+
161249
/* GVA of the cache base. Equal to g->shim_data_base. Exposed so the
162250
* TPIDR_EL1 setup site and tests can reference one source of truth.
163251
*/
@@ -306,3 +394,31 @@ void shim_globals_rebuild_urandom_bitmap(void);
306394
* forced through the host SVC.
307395
*/
308396
void shim_globals_refill_urandom_ring(guest_t *g);
397+
398+
/* Counter access for diagnostics. shim_globals_counter_get returns the
399+
* cumulative slot value (lossy under multi-vCPU bail contention; see the
400+
* comment block on SHIM_COUNTERS_OFF). slot must be in [0, SHIM_COUNTERS_N).
401+
* shim_globals_counters_dump writes a one-line-per-slot summary to stderr
402+
* with the SHIM_COUNTER_* names and current values; intended for use at
403+
* process exit when ELFUSE_SHIM_STATS is set.
404+
*/
405+
uint64_t shim_globals_counter_get(const guest_t *g, unsigned slot);
406+
void shim_globals_counters_dump(const guest_t *g);
407+
408+
/* ELFUSE_SHIM_STATS env-var gate (idempotent / cached). When enabled the
409+
* exit path dumps the counter table to stderr so a single bench run
410+
* attributes every fast-path bail without rebuilds. Mirrors the
411+
* ELFUSE_STARTUP_TRACE pattern in core/startup-trace.h.
412+
*/
413+
bool shim_globals_stats_enabled(void);
414+
415+
/* Publish the stats gate byte at SHIM_GLOBALS_OFF_STATS_EN based on
416+
* shim_globals_stats_enabled(). The EL1 shim's COUNTER_INC loads this
417+
* byte and skips the counter increment when zero, so an unset
418+
* ELFUSE_SHIM_STATS pays only a single cache-hot ldrb on each fast-path
419+
* tail instead of a full load-add-store on a shared counter line.
420+
* Call after every shim_globals_init: bootstrap, fork-child receive,
421+
* and execve. The byte stays zero on a fresh shim_globals_init unless
422+
* this publisher runs.
423+
*/
424+
void shim_globals_publish_stats_gate(guest_t *g);

0 commit comments

Comments
 (0)