3838#include <unistd.h>
3939
4040#include "core/guest.h"
41+ #include "core/startup-trace.h"
4142#include "debug/log.h"
4243#include "utils.h"
4344#include "runtime/thread.h" /* thread_destroy_all_vcpus */
@@ -60,6 +61,7 @@ static void guest_region_clear(guest_t *g);
6061#define PT_UXN (1ULL << 54) /* Unprivileged Execute Never */
6162#define PT_PXN (1ULL << 53) /* Privileged Execute Never */
6263#define PT_AP_RW_EL0 (1ULL << 6) /* AP[2:1]=01: RW at EL1, RW at EL0 */
64+ #define PT_AP_RW_EL1 (0ULL << 6) /* AP[2:1]=00: RW at EL1, no access EL0 */
6365#define PT_AP_RO (3ULL << 6) /* AP[2:1]=11: RO at EL1, RO at EL0 */
6466
6567/* PAGE_SIZE / ALIGN_2MB_* live in utils.h; BLOCK_2MIB lives in core/guest.h. */
@@ -202,6 +204,8 @@ static uint64_t *pt_at(const guest_t *g, uint64_t gpa)
202204
203205int guest_init (guest_t * g , uint64_t size , uint32_t ipa_bits )
204206{
207+ uint64_t t0 ;
208+
205209 memset (g , 0 , sizeof (* g ));
206210 g -> shm_fd = -1 ;
207211 g -> ipa_base = GUEST_IPA_BASE ;
@@ -257,6 +261,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
257261 * seconds max wait) to handle this gracefully.
258262 */
259263 hv_return_t ret = HV_ERROR ;
264+ t0 = startup_trace_now_ns ();
260265 for (int attempt = 0 ; attempt < 30 ; attempt ++ ) {
261266 hv_vm_config_t config = hv_vm_config_create ();
262267 hv_vm_config_set_ipa_size (config , vm_ipa );
@@ -266,6 +271,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
266271 break ;
267272 usleep (500000 ); /* 500ms between attempts */
268273 }
274+ startup_trace_step ("hv_vm_create" , t0 );
269275 if (ret != HV_SUCCESS ) {
270276 log_error ("guest: hv_vm_create failed: %d (ipa_bits=%u)" , (int ) ret ,
271277 vm_ipa );
@@ -307,8 +313,10 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
307313 * physical memory. Do NOT memset because that would touch every
308314 * page and defeat demand paging.
309315 */
316+ t0 = startup_trace_now_ns ();
310317 g -> host_base = mmap (NULL , try_size , PROT_READ | PROT_WRITE ,
311318 MAP_ANON | MAP_PRIVATE , -1 , 0 );
319+ startup_trace_step ("primary_mmap" , t0 );
312320 if (g -> host_base == MAP_FAILED ) {
313321 perror ("guest: mmap" );
314322 g -> host_base = NULL ;
@@ -320,6 +328,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
320328 * path instead of SCM_RIGHTS fd passing.
321329 */
322330 char tmppath [] = "/tmp/elfuse-XXXXXX" ;
331+ t0 = startup_trace_now_ns ();
323332 int sfd = mkstemp (tmppath );
324333 if (sfd >= 0 ) {
325334 unlink (tmppath ); /* Unlink immediately; fd keeps file alive */
@@ -335,9 +344,12 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
335344 close (sfd );
336345 }
337346 }
347+ startup_trace_step ("cow_shm_upgrade" , t0 );
338348
349+ t0 = startup_trace_now_ns ();
339350 ret = hv_vm_map (g -> host_base , GUEST_IPA_BASE , try_size ,
340351 HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC );
352+ startup_trace_step ("hv_vm_map" , t0 );
341353 if (ret == HV_SUCCESS ) {
342354 mapped_size = try_size ;
343355 mapped = true;
@@ -380,6 +392,8 @@ int guest_init_from_shm(guest_t *g,
380392 uint64_t size ,
381393 uint32_t ipa_bits )
382394{
395+ uint64_t t0 ;
396+
383397 memset (g , 0 , sizeof (* g ));
384398 g -> shm_fd = -1 ; /* Child does not own the shm */
385399 g -> ipa_base = GUEST_IPA_BASE ;
@@ -403,8 +417,10 @@ int guest_init_from_shm(guest_t *g,
403417 * the parent's frozen snapshot; writes are private to this process.
404418 * macOS CoW is page-granular: only modified pages are duplicated.
405419 */
420+ t0 = startup_trace_now_ns ();
406421 g -> host_base =
407422 mmap (NULL , size , PROT_READ | PROT_WRITE , MAP_PRIVATE , shm_fd , 0 );
423+ startup_trace_step ("shm_mmap" , t0 );
408424 if (g -> host_base == MAP_FAILED ) {
409425 perror ("guest: mmap shm" );
410426 g -> host_base = NULL ;
@@ -417,6 +433,7 @@ int guest_init_from_shm(guest_t *g,
417433
418434 /* Create HVF VM with the same IPA width as the parent */
419435 hv_return_t ret = HV_ERROR ;
436+ t0 = startup_trace_now_ns ();
420437 for (int attempt = 0 ; attempt < 30 ; attempt ++ ) {
421438 hv_vm_config_t config = hv_vm_config_create ();
422439 hv_vm_config_set_ipa_size (config , ipa_bits );
@@ -426,15 +443,18 @@ int guest_init_from_shm(guest_t *g,
426443 break ;
427444 usleep (500000 );
428445 }
446+ startup_trace_step ("hv_vm_create_shm" , t0 );
429447 if (ret != HV_SUCCESS ) {
430448 log_error ("guest: hv_vm_create (shm) failed: %d" , (int ) ret );
431449 munmap (g -> host_base , size );
432450 g -> host_base = NULL ;
433451 return -1 ;
434452 }
435453
454+ t0 = startup_trace_now_ns ();
436455 ret = hv_vm_map (g -> host_base , GUEST_IPA_BASE , size ,
437456 HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC );
457+ startup_trace_step ("hv_vm_map_shm" , t0 );
438458 if (ret != HV_SUCCESS ) {
439459 log_error ("guest: hv_vm_map (shm) failed: %d" , (int ) ret );
440460 hv_vm_destroy ();
@@ -1106,6 +1126,16 @@ static int gva_translate_perm(const guest_t *g,
11061126 return -1 ;
11071127
11081128 int perms = desc_to_perms (l3 [l3_idx ]);
1129+ /* EL1-only pages (shim_data) are inaccessible to guest EL0 in the
1130+ * page tables; the host accessors that act on a guest-supplied GVA
1131+ * must refuse them too, otherwise a guest could pass a shim_data
1132+ * GVA as a syscall buffer and have the host write into the identity
1133+ * cache or entropy ring on its behalf. The host's own publishers
1134+ * use direct host_base+shim_data_base arithmetic and bypass this
1135+ * walker entirely.
1136+ */
1137+ if (perms & MEM_PERM_EL1_ONLY )
1138+ return -1 ;
11091139 if ((perms & required_perms ) != required_perms )
11101140 return -1 ;
11111141
@@ -1136,6 +1166,12 @@ static int gva_translate_perm(const guest_t *g,
11361166
11371167 /* L2 block descriptor: 2MiB granularity. */
11381168 int perms = desc_to_perms (l2 [l2_idx ]);
1169+ /* See the L3 page-descriptor branch above: EL1-only blocks are
1170+ * inaccessible to host-on-behalf-of-guest accesses for the same
1171+ * reason. shim_data is mapped as a 2MiB EL1-only block at boot.
1172+ */
1173+ if (perms & MEM_PERM_EL1_ONLY )
1174+ return -1 ;
11391175 if ((perms & required_perms ) != required_perms )
11401176 return -1 ;
11411177
@@ -2079,10 +2115,20 @@ static uint64_t make_block_desc(uint64_t gpa, int perms)
20792115 }
20802116
20812117 /* Write permissions via AP bits:
2118+ * AP[2:1]=00 -> RW for EL1 only (no EL0 access)
20822119 * AP[2:1]=01 -> RW for EL1 and EL0
20832120 * AP[2:1]=11 -> RO for EL1 and EL0
2121+ * MEM_PERM_EL1_ONLY drops EL0 access entirely; used for shim_data
2122+ * so the guest cannot directly read or store to the cache, ring,
2123+ * bitmap, or attention flag.
20842124 */
2085- if (perms & MEM_PERM_W ) {
2125+ if (perms & MEM_PERM_EL1_ONLY ) {
2126+ desc |= PT_AP_RW_EL1 ;
2127+ /* EL1-only data must never execute at either EL: force UXN|PXN
2128+ * even when MEM_PERM_X was requested (already set above otherwise).
2129+ */
2130+ desc |= PT_UXN | PT_PXN ;
2131+ } else if (perms & MEM_PERM_W ) {
20862132 desc |= PT_AP_RW_EL0 ;
20872133 } else {
20882134 desc |= PT_AP_RO ;
@@ -2513,22 +2559,35 @@ static uint64_t make_page_desc(uint64_t pa, int perms)
25132559 if (!(perms & MEM_PERM_X ))
25142560 desc |= PT_UXN | PT_PXN ;
25152561
2516- if (perms & MEM_PERM_W )
2562+ if (perms & MEM_PERM_EL1_ONLY ) {
2563+ desc |= PT_AP_RW_EL1 ;
2564+ desc |= PT_UXN | PT_PXN ; /* EL1-only data never executes */
2565+ } else if (perms & MEM_PERM_W ) {
25172566 desc |= PT_AP_RW_EL0 ;
2518- else
2567+ } else {
25192568 desc |= PT_AP_RO ;
2569+ }
25202570
25212571 return desc ;
25222572}
25232573
2524- /* Extract MEM_PERM_* flags from a page table descriptor (block or page). */
2574+ /* Extract MEM_PERM_* flags from a page table descriptor (block or page).
2575+ * The AP[2:1] field encodes the EL1/EL0 access matrix; map 00 to
2576+ * MEM_PERM_RW | MEM_PERM_EL1_ONLY so callers see the privileged-only
2577+ * shim_data slots correctly instead of treating them as read-only.
2578+ */
25252579static int desc_to_perms (uint64_t desc )
25262580{
25272581 int perms = MEM_PERM_R ;
25282582 if (!(desc & PT_UXN ))
25292583 perms |= MEM_PERM_X ;
2530- if ((desc & (3ULL << 6 )) == PT_AP_RW_EL0 )
2584+ uint64_t ap = desc & (3ULL << 6 );
2585+ if (ap == PT_AP_RW_EL0 ) {
25312586 perms |= MEM_PERM_W ;
2587+ } else if (ap == PT_AP_RW_EL1 ) {
2588+ perms |= MEM_PERM_W | MEM_PERM_EL1_ONLY ;
2589+ }
2590+ /* AP=10 and AP=11 (PT_AP_RO) both fall through as MEM_PERM_R only. */
25322591 return perms ;
25332592}
25342593
0 commit comments