Skip to content

Commit 75fb59b

Browse files
authored
Merge pull request #48 from sysprog21/perf
Speedup vDSO CNTVCT and amortized urandom
2 parents b065c43 + 7642bee commit 75fb59b

55 files changed

Lines changed: 5386 additions & 296 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

Makefile

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ SRCS := \
2323
core/elf.c \
2424
core/stack.c \
2525
core/vdso.c \
26+
core/shim-globals.c \
2627
core/bootstrap.c \
2728
core/rosetta.c \
2829
core/sysroot.c \
@@ -160,6 +161,24 @@ $(BUILD_DIR)/test-pthread: tests/test-pthread.c | $(BUILD_DIR)
160161
@echo " CROSS $< (with -lpthread)"
161162
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread
162163

164+
# test-shim-cred-race spawns a pthread reader while the main thread
165+
# toggles setresuid; the reader spins on the identity fast path.
166+
$(BUILD_DIR)/test-shim-cred-race: tests/test-shim-cred-race.c | $(BUILD_DIR)
167+
@echo " CROSS $< (with -lpthread)"
168+
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread
169+
170+
# test-shim-urandom-smp spawns N pthreads racing on a shared FD_URANDOM
171+
# slot to exercise the shim's LDXR/STXR head-advance under contention.
172+
$(BUILD_DIR)/test-shim-urandom-smp: tests/test-shim-urandom-smp.c | $(BUILD_DIR)
173+
@echo " CROSS $< (with -lpthread)"
174+
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread
175+
176+
# test-shim-urandom-toctou races mprotect(PROT_NONE) against urandom
177+
# reads to exercise the EL1 data abort recovery path. Needs pthreads.
178+
$(BUILD_DIR)/test-shim-urandom-toctou: tests/test-shim-urandom-toctou.c | $(BUILD_DIR)
179+
@echo " CROSS $< (with -lpthread)"
180+
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread
181+
163182
# test-fuse-basic runs a guest daemon thread and consumer in one process
164183
$(BUILD_DIR)/test-fuse-basic: tests/test-fuse-basic.c | $(BUILD_DIR)
165184
@echo " CROSS $< (with -lpthread)"

src/core/bootstrap.c

Lines changed: 124 additions & 24 deletions
Large diffs are not rendered by default.

src/core/elf.c

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -208,8 +208,16 @@ int elf_map_segments(const elf_info_t *info,
208208
const char *path,
209209
void *guest_base,
210210
uint64_t guest_size,
211-
uint64_t load_base)
211+
uint64_t load_base,
212+
uint64_t infra_lo,
213+
uint64_t infra_hi)
212214
{
215+
/* The infra checks below are half-open interval intersection tests
216+
* against [infra_lo, infra_hi). When infra_lo >= infra_hi the caller
217+
* opted out (early bring-up before guest_t is wired up); the host-side
218+
* writes that follow still get the existing guest_size bound check.
219+
*/
220+
bool infra_active = infra_lo < infra_hi;
213221
FILE *f = fopen(path, "rb");
214222
if (!f) {
215223
perror(path);
@@ -264,6 +272,17 @@ int elf_map_segments(const elf_info_t *info,
264272
fclose(f);
265273
return -1;
266274
}
275+
if (infra_active && phdr_dest < infra_hi &&
276+
phdr_dest + ph_total > infra_lo) {
277+
log_error(
278+
"%s: program headers at 0x%llx overlap infra reserve "
279+
"[0x%llx, 0x%llx)",
280+
path, (unsigned long long) phdr_dest, (unsigned long long) infra_lo,
281+
(unsigned long long) infra_hi);
282+
free(ph_buf);
283+
fclose(f);
284+
return -1;
285+
}
267286
memcpy((uint8_t *) guest_base + phdr_dest, ph_buf, ph_total);
268287

269288
/* Copy PT_LOAD contents after AT_PHDR is in place; ET_DYN segments are
@@ -308,15 +327,34 @@ int elf_map_segments(const elf_info_t *info,
308327
return -1;
309328
}
310329

311-
/* Zero the full page-aligned segment extent, not only p_memsz.
312-
* Linux guarantees zero-filled tail bytes in the last mapped page,
313-
* and some dynamic linkers allocate from that page tail before they
314-
* request more memory. Leaving stale bytes there leaks state across
315-
* execve and corrupts the new image.
330+
/* The host memset zeros PAGE_ALIGN_UP(memsz) bytes, not just memsz,
331+
* so the infra-overlap check has to use the same rounded extent.
332+
* Without the rounding here, a segment that ends just below
333+
* infra_lo passes the check and still spills up to PAGE_SIZE-1
334+
* bytes of zero into the infra reserve via the page tail.
316335
*/
317336
uint64_t zero_len = PAGE_ALIGN_UP(memsz);
318337
if (gpa + zero_len > guest_size)
319338
zero_len = guest_size - gpa;
339+
if (infra_active && gpa < infra_hi && gpa + zero_len > infra_lo) {
340+
log_error(
341+
"%s: segment at 0x%llx+0x%llx (zero-extent 0x%llx) overlaps "
342+
"infra reserve [0x%llx, 0x%llx)",
343+
path, (unsigned long long) gpa, (unsigned long long) memsz,
344+
(unsigned long long) zero_len, (unsigned long long) infra_lo,
345+
(unsigned long long) infra_hi);
346+
free(ph_buf);
347+
fclose(f);
348+
return -1;
349+
}
350+
351+
/* Zero the full page-aligned segment extent (zero_len computed above
352+
* with guest_size and infra_reserve checks). Linux guarantees
353+
* zero-filled tail bytes in the last mapped page, and some dynamic
354+
* linkers allocate from that page tail before they request more
355+
* memory. Leaving stale bytes there leaks state across execve and
356+
* corrupts the new image.
357+
*/
320358
memset((uint8_t *) guest_base + gpa, 0, zero_len);
321359

322360
/* Overlay initialized bytes after zeroing so BSS and page tail remain

src/core/elf.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,13 +109,20 @@ int elf_load(const char *path, elf_info_t *info);
109109
* Also copies program headers into guest memory for AT_PHDR.
110110
* load_base is added to all virtual addresses (0 for ET_EXEC at link addr,
111111
* non-zero for ET_DYN loaded at a chosen base).
112+
* infra_lo and infra_hi delimit the runtime infra reserve (page-table pool,
113+
* shim text, shim_data, vDSO). Any PT_LOAD or PT_PHDR copy whose destination
114+
* intersects [infra_lo, infra_hi) is rejected: those writes go through
115+
* host_base directly and would otherwise bypass the EL1-only page-table
116+
* protection on shim_data. Pass 0,0 only when the guest_t is not yet built.
112117
* Returns 0 on success, -1 on failure.
113118
*/
114119
int elf_map_segments(const elf_info_t *info,
115120
const char *path,
116121
void *guest_base,
117122
uint64_t guest_size,
118-
uint64_t load_base);
123+
uint64_t load_base,
124+
uint64_t infra_lo,
125+
uint64_t infra_hi);
119126

120127
/* Resolve a PT_INTERP path against a sysroot directory.
121128
* Tries three strategies:

src/core/guest.c

Lines changed: 64 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include <unistd.h>
3939

4040
#include "core/guest.h"
41+
#include "core/startup-trace.h"
4142
#include "debug/log.h"
4243
#include "utils.h"
4344
#include "runtime/thread.h" /* thread_destroy_all_vcpus */
@@ -60,6 +61,7 @@ static void guest_region_clear(guest_t *g);
6061
#define PT_UXN (1ULL << 54) /* Unprivileged Execute Never */
6162
#define PT_PXN (1ULL << 53) /* Privileged Execute Never */
6263
#define PT_AP_RW_EL0 (1ULL << 6) /* AP[2:1]=01: RW at EL1, RW at EL0 */
64+
#define PT_AP_RW_EL1 (0ULL << 6) /* AP[2:1]=00: RW at EL1, no access EL0 */
6365
#define PT_AP_RO (3ULL << 6) /* AP[2:1]=11: RO at EL1, RO at EL0 */
6466

6567
/* PAGE_SIZE / ALIGN_2MB_* live in utils.h; BLOCK_2MIB lives in core/guest.h. */
@@ -202,6 +204,8 @@ static uint64_t *pt_at(const guest_t *g, uint64_t gpa)
202204

203205
int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
204206
{
207+
uint64_t t0;
208+
205209
memset(g, 0, sizeof(*g));
206210
g->shm_fd = -1;
207211
g->ipa_base = GUEST_IPA_BASE;
@@ -257,6 +261,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
257261
* seconds max wait) to handle this gracefully.
258262
*/
259263
hv_return_t ret = HV_ERROR;
264+
t0 = startup_trace_now_ns();
260265
for (int attempt = 0; attempt < 30; attempt++) {
261266
hv_vm_config_t config = hv_vm_config_create();
262267
hv_vm_config_set_ipa_size(config, vm_ipa);
@@ -266,6 +271,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
266271
break;
267272
usleep(500000); /* 500ms between attempts */
268273
}
274+
startup_trace_step("hv_vm_create", t0);
269275
if (ret != HV_SUCCESS) {
270276
log_error("guest: hv_vm_create failed: %d (ipa_bits=%u)", (int) ret,
271277
vm_ipa);
@@ -307,8 +313,10 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
307313
* physical memory. Do NOT memset because that would touch every
308314
* page and defeat demand paging.
309315
*/
316+
t0 = startup_trace_now_ns();
310317
g->host_base = mmap(NULL, try_size, PROT_READ | PROT_WRITE,
311318
MAP_ANON | MAP_PRIVATE, -1, 0);
319+
startup_trace_step("primary_mmap", t0);
312320
if (g->host_base == MAP_FAILED) {
313321
perror("guest: mmap");
314322
g->host_base = NULL;
@@ -320,6 +328,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
320328
* path instead of SCM_RIGHTS fd passing.
321329
*/
322330
char tmppath[] = "/tmp/elfuse-XXXXXX";
331+
t0 = startup_trace_now_ns();
323332
int sfd = mkstemp(tmppath);
324333
if (sfd >= 0) {
325334
unlink(tmppath); /* Unlink immediately; fd keeps file alive */
@@ -335,9 +344,12 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
335344
close(sfd);
336345
}
337346
}
347+
startup_trace_step("cow_shm_upgrade", t0);
338348

349+
t0 = startup_trace_now_ns();
339350
ret = hv_vm_map(g->host_base, GUEST_IPA_BASE, try_size,
340351
HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC);
352+
startup_trace_step("hv_vm_map", t0);
341353
if (ret == HV_SUCCESS) {
342354
mapped_size = try_size;
343355
mapped = true;
@@ -380,6 +392,8 @@ int guest_init_from_shm(guest_t *g,
380392
uint64_t size,
381393
uint32_t ipa_bits)
382394
{
395+
uint64_t t0;
396+
383397
memset(g, 0, sizeof(*g));
384398
g->shm_fd = -1; /* Child does not own the shm */
385399
g->ipa_base = GUEST_IPA_BASE;
@@ -403,8 +417,10 @@ int guest_init_from_shm(guest_t *g,
403417
* the parent's frozen snapshot; writes are private to this process.
404418
* macOS CoW is page-granular: only modified pages are duplicated.
405419
*/
420+
t0 = startup_trace_now_ns();
406421
g->host_base =
407422
mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, shm_fd, 0);
423+
startup_trace_step("shm_mmap", t0);
408424
if (g->host_base == MAP_FAILED) {
409425
perror("guest: mmap shm");
410426
g->host_base = NULL;
@@ -417,6 +433,7 @@ int guest_init_from_shm(guest_t *g,
417433

418434
/* Create HVF VM with the same IPA width as the parent */
419435
hv_return_t ret = HV_ERROR;
436+
t0 = startup_trace_now_ns();
420437
for (int attempt = 0; attempt < 30; attempt++) {
421438
hv_vm_config_t config = hv_vm_config_create();
422439
hv_vm_config_set_ipa_size(config, ipa_bits);
@@ -426,15 +443,18 @@ int guest_init_from_shm(guest_t *g,
426443
break;
427444
usleep(500000);
428445
}
446+
startup_trace_step("hv_vm_create_shm", t0);
429447
if (ret != HV_SUCCESS) {
430448
log_error("guest: hv_vm_create (shm) failed: %d", (int) ret);
431449
munmap(g->host_base, size);
432450
g->host_base = NULL;
433451
return -1;
434452
}
435453

454+
t0 = startup_trace_now_ns();
436455
ret = hv_vm_map(g->host_base, GUEST_IPA_BASE, size,
437456
HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC);
457+
startup_trace_step("hv_vm_map_shm", t0);
438458
if (ret != HV_SUCCESS) {
439459
log_error("guest: hv_vm_map (shm) failed: %d", (int) ret);
440460
hv_vm_destroy();
@@ -1106,6 +1126,16 @@ static int gva_translate_perm(const guest_t *g,
11061126
return -1;
11071127

11081128
int perms = desc_to_perms(l3[l3_idx]);
1129+
/* EL1-only pages (shim_data) are inaccessible to guest EL0 in the
1130+
* page tables; the host accessors that act on a guest-supplied GVA
1131+
* must refuse them too, otherwise a guest could pass a shim_data
1132+
* GVA as a syscall buffer and have the host write into the identity
1133+
* cache or entropy ring on its behalf. The host's own publishers
1134+
* use direct host_base+shim_data_base arithmetic and bypass this
1135+
* walker entirely.
1136+
*/
1137+
if (perms & MEM_PERM_EL1_ONLY)
1138+
return -1;
11091139
if ((perms & required_perms) != required_perms)
11101140
return -1;
11111141

@@ -1136,6 +1166,12 @@ static int gva_translate_perm(const guest_t *g,
11361166

11371167
/* L2 block descriptor: 2MiB granularity. */
11381168
int perms = desc_to_perms(l2[l2_idx]);
1169+
/* See the L3 page-descriptor branch above: EL1-only blocks are
1170+
* inaccessible to host-on-behalf-of-guest accesses for the same
1171+
* reason. shim_data is mapped as a 2MiB EL1-only block at boot.
1172+
*/
1173+
if (perms & MEM_PERM_EL1_ONLY)
1174+
return -1;
11391175
if ((perms & required_perms) != required_perms)
11401176
return -1;
11411177

@@ -2079,10 +2115,20 @@ static uint64_t make_block_desc(uint64_t gpa, int perms)
20792115
}
20802116

20812117
/* Write permissions via AP bits:
2118+
* AP[2:1]=00 -> RW for EL1 only (no EL0 access)
20822119
* AP[2:1]=01 -> RW for EL1 and EL0
20832120
* AP[2:1]=11 -> RO for EL1 and EL0
2121+
* MEM_PERM_EL1_ONLY drops EL0 access entirely; used for shim_data
2122+
* so the guest cannot directly read or store to the cache, ring,
2123+
* bitmap, or attention flag.
20842124
*/
2085-
if (perms & MEM_PERM_W) {
2125+
if (perms & MEM_PERM_EL1_ONLY) {
2126+
desc |= PT_AP_RW_EL1;
2127+
/* EL1-only data: never EL0-executable (already set above if
2128+
* MEM_PERM_X is unset, but assert defensively).
2129+
*/
2130+
desc |= PT_UXN | PT_PXN;
2131+
} else if (perms & MEM_PERM_W) {
20862132
desc |= PT_AP_RW_EL0;
20872133
} else {
20882134
desc |= PT_AP_RO;
@@ -2513,22 +2559,35 @@ static uint64_t make_page_desc(uint64_t pa, int perms)
25132559
if (!(perms & MEM_PERM_X))
25142560
desc |= PT_UXN | PT_PXN;
25152561

2516-
if (perms & MEM_PERM_W)
2562+
if (perms & MEM_PERM_EL1_ONLY) {
2563+
desc |= PT_AP_RW_EL1;
2564+
desc |= PT_UXN | PT_PXN; /* EL1-only data never executes */
2565+
} else if (perms & MEM_PERM_W) {
25172566
desc |= PT_AP_RW_EL0;
2518-
else
2567+
} else {
25192568
desc |= PT_AP_RO;
2569+
}
25202570

25212571
return desc;
25222572
}
25232573

2524-
/* Extract MEM_PERM_* flags from a page table descriptor (block or page). */
2574+
/* Extract MEM_PERM_* flags from a page table descriptor (block or page).
2575+
* The AP[2:1] field encodes the EL1/EL0 access matrix; map 00 to
2576+
* MEM_PERM_RW | MEM_PERM_EL1_ONLY so callers see the privileged-only
2577+
* shim_data slots correctly instead of treating them as read-only.
2578+
*/
25252579
static int desc_to_perms(uint64_t desc)
25262580
{
25272581
int perms = MEM_PERM_R;
25282582
if (!(desc & PT_UXN))
25292583
perms |= MEM_PERM_X;
2530-
if ((desc & (3ULL << 6)) == PT_AP_RW_EL0)
2584+
uint64_t ap = desc & (3ULL << 6);
2585+
if (ap == PT_AP_RW_EL0) {
25312586
perms |= MEM_PERM_W;
2587+
} else if (ap == PT_AP_RW_EL1) {
2588+
perms |= MEM_PERM_W | MEM_PERM_EL1_ONLY;
2589+
}
2590+
/* PT_AP_RO (11) stays MEM_PERM_R only. */
25322591
return perms;
25332592
}
25342593

src/core/guest.h

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -127,20 +127,28 @@
127127
#define MEM_PERM_R (1 << 0)
128128
#define MEM_PERM_W (1 << 1)
129129
#define MEM_PERM_X (1 << 2)
130+
/* AP[2:1]=00: privileged-only (no EL0 read/write). Combine with MEM_PERM_R/W.
131+
* Used for shim_data so the guest cannot directly read or store to the identity
132+
* cache, urandom bitmap, ring, or attention flag. The EL1 shim still has full
133+
* RW. EL0 reads/writes fault to the EL0-fault path (SIGSEGV in the guest),
134+
* matching what Linux does for kernel-only pages exposed in /proc/self/maps.
135+
*/
136+
#define MEM_PERM_EL1_ONLY (1 << 3)
130137
#define MEM_PERM_RX (MEM_PERM_R | MEM_PERM_X)
131138
#define MEM_PERM_RW (MEM_PERM_R | MEM_PERM_W)
139+
#define MEM_PERM_RW_EL1_ONLY (MEM_PERM_R | MEM_PERM_W | MEM_PERM_EL1_ONLY)
132140

133141
/* A contiguous region of guest memory to be mapped in page tables.
134142
*
135-
* Default mode (va_base == 0): identity-mapped, VA == GPA. Used by every
136-
* boot region (shim, vDSO, brk, stack) and every aarch64 ELF segment.
143+
* Default mode (va_base == 0): identity-mapped, VA == GPA. Used by every boot
144+
* region (shim, vDSO, brk, stack) and every aarch64 ELF segment.
137145
*
138-
* Rosetta segments use va_base != 0 to install a non-identity mapping:
139-
* the rosetta ELF is statically linked at 0x800000000000 (128 TiB) but its
140-
* bytes live in the primary buffer at a low GPA. Page-table entries are
141-
* indexed by va_base + (offset within region) and emit a block descriptor
142-
* whose output address is gpa_start + (offset within region). This is the
143-
* only place in elfuse where guest VA diverges from guest GPA.
146+
* Rosetta segments use va_base != 0 to install a non-identity mapping: the
147+
* rosetta ELF is statically linked at 0x800000000000 (128 TiB) but its bytes
148+
* live in the primary buffer at a low GPA. Page-table entries are indexed by
149+
* va_base + (offset within region) and emit a block descriptor whose output
150+
* address is gpa_start + (offset within region). This is the only place in
151+
* elfuse where guest VA diverges from guest GPA.
144152
*/
145153
typedef struct {
146154
uint64_t gpa_start; /* Output GPA / IPA (2MiB aligned) */

0 commit comments

Comments
 (0)