@@ -266,17 +266,21 @@ static void split_regions_at_boundary(guest_t *g, uint64_t boundary)
266266static uint64_t find_free_gap_inner (const guest_t * g ,
267267 uint64_t length ,
268268 uint64_t min_addr ,
269- uint64_t max_addr )
269+ uint64_t max_addr ,
270+ uint64_t align )
270271{
271- /* Round the search start up to the next host-page boundary so an unaligned
272- * addr hint cannot return a result that lands inside a host page already
273- * covered by a preceding region's overlay tail (the overlay extends to
272+ /* Round the search start up to the requested alignment so an unaligned addr
273+ * hint cannot return a result that lands inside a host page already covered
274+ * by a preceding region's overlay tail (the overlay extends to
274275 * ALIGN_UP(r->end, hps)). Apple Silicon enforces 16 KiB host pages;
275276 * aligning to the guest 4 KiB page is not enough. Advance past each walked
276- * region to the same boundary for the same reason.
277+ * region to the same boundary for the same reason; this only holds while
278+ * align is a multiple of the host page size. MAP_SHARED file-backed
279+ * allocations may request 2 MiB alignment as a best-effort placement
280+ * preference so consecutive mappings usually avoid sharing an HVF stage-2
281+ * segment, reducing segment-table fragmentation for memfd-style patterns.
277282 */
278- size_t hps = host_page_size_cached ();
279- uint64_t gap_start = ALIGN_UP (min_addr , hps );
283+ uint64_t gap_start = ALIGN_UP (min_addr , align );
280284
281285 /* Skip the prefix of regions entirely below gap_start in O(log n). After a
282286 * successful allocation the gap hint advances near or past the existing
@@ -307,8 +311,10 @@ static uint64_t find_free_gap_inner(const guest_t *g,
307311 g -> regions [i ].start >= gap_start + length )
308312 return gap_start ;
309313
310- /* Region overlaps; advance past it and round to the next host page */
311- gap_start = ALIGN_UP (g -> regions [i ].end , hps );
314+ /* Region overlaps; advance past it and round to the next aligned
315+ * boundary so the caller's alignment promise holds across allocations.
316+ */
317+ gap_start = ALIGN_UP (g -> regions [i ].end , align );
312318 }
313319
314320 /* Check trailing space after all regions */
@@ -321,36 +327,45 @@ static uint64_t find_free_gap_inner(const guest_t *g,
321327 * The hint tracks the first address after the last successful mapping in each
322328 * region, which avoids rescanning the same prefix on sequential mmap activity.
323329 * A miss falls back to the region base so holes reopened by munmap are still
324- * reusable.
330+ * reusable. The align argument is the alignment that the returned start
331+ * address must satisfy for this call; sys_mmap first passes BLOCK_2MIB as a
332+ * best-effort placement preference for qualifying MAP_SHARED file-backed
333+ * allocations, then retries with host-page alignment when no 2 MiB-aligned
325334 */
326335static uint64_t find_free_gap (guest_t * g ,
327336 uint64_t length ,
328337 uint64_t min_addr ,
329- uint64_t max_addr )
338+ uint64_t max_addr ,
339+ uint64_t align )
330340{
331341 /* RX and RW mappings advance independently, so keep separate hints. */
332342 uint64_t * hint =
333343 (min_addr < MMAP_BASE ) ? & g -> mmap_rx_gap_hint : & g -> mmap_rw_gap_hint ;
334344
335345 /* Advance the hint to the next host-page boundary so the following
336346 * sequential allocation lands on an address that the kernel accepts for
337- * mmap MAP_FIXED (Apple Silicon enforces 16 KiB host pages). The tradeoff
338- * is up to host_page-1 bytes of address-space waste per small allocation;
339- * physical pages are still demand-paged, so RAM cost is unchanged.
347+ * mmap MAP_FIXED (Apple Silicon enforces 16 KiB host pages). Round to the
348+ * host page even when the current call requested a larger align (e.g.
349+ * BLOCK_2MIB for MAP_SHARED file-backed): a subsequent MAP_PRIVATE 4 KiB
350+ * allocation should still be able to occupy the trailing space inside the
351+ * 2 MiB block. find_free_gap_inner re-applies the caller's align on its
352+ * next entry, so a subsequent MAP_SHARED allocation skips past the small
353+ * tenant and lands on the next 2 MiB boundary anyway.
340354 */
341355 size_t hps = host_page_size_cached ();
342356
343357 /* Try cached hint first (only if within the valid range) */
344358 if (* hint >= min_addr && * hint < max_addr ) {
345- uint64_t result = find_free_gap_inner (g , length , * hint , max_addr );
359+ uint64_t result =
360+ find_free_gap_inner (g , length , * hint , max_addr , align );
346361 if (result != UINT64_MAX ) {
347362 * hint = ALIGN_UP (result + length , hps );
348363 return result ;
349364 }
350365 }
351366
352367 /* Full scan from base */
353- uint64_t result = find_free_gap_inner (g , length , min_addr , max_addr );
368+ uint64_t result = find_free_gap_inner (g , length , min_addr , max_addr , align );
354369 if (result != UINT64_MAX )
355370 * hint = ALIGN_UP (result + length , hps );
356371 return result ;
@@ -2184,21 +2199,55 @@ int64_t sys_mmap(guest_t *g,
21842199 if (high_hint >= 0 )
21852200 return high_hint ;
21862201 }
2202+ /* Open the backing fd before the gap-finder so the alignment heuristic
2203+ * can read the host fd's access mode through overlay_fd_writable.
2204+ * Closes on every failure path within the non-fixed branch.
2205+ */
2206+ if (!is_anon ) {
2207+ if (host_fd_ref_open (fd , & backing_ref ) < 0 )
2208+ return - LINUX_EBADF ;
2209+ host_backing_fd = backing_ref .fd ;
2210+ }
2211+ /* Prefer stage-2 2 MiB block boundaries for non-fixed MAP_SHARED
2212+ * file-backed allocations. Without this each shared file mmap whose
2213+ * result lands mid-block forces hvf_apply_file_overlay_quiesced to
2214+ * split the containing HVF segment at both ends; back-to-back memfd
2215+ * allocations burn segments at roughly two per mmap and run the table
2216+ * to GUEST_MAX_HVF_SEGMENTS quickly. This is a placement preference,
2217+ * not a Linux-visible constraint: if no 2 MiB-aligned gap exists, the
2218+ * allocation retries with host-page alignment. The condition mirrors
2219+ * the overlay fast-path's gate (host-page-aligned offset, writable
2220+ * backer) so read-only MAP_SHARED mappings that fall through to the
2221+ * pread snapshot do not pay the alignment cost without the
2222+ * segment-table benefit.
2223+ */
2224+ size_t hps = host_page_size_cached ();
2225+ uint64_t align = (uint64_t ) hps ;
2226+ if (!is_anon && fd >= 0 && (flags & LINUX_MAP_SHARED ) &&
2227+ ((uint64_t ) offset % hps == 0 ) &&
2228+ overlay_fd_writable (host_backing_fd ))
2229+ align = BLOCK_2MIB ;
2230+ uint64_t fallback_align = (uint64_t ) hps ;
21872231 if (needs_exec && !(prot & LINUX_PROT_WRITE )) {
21882232 /* PROT_EXEC without PROT_WRITE: allocate from the RX mmap region.
21892233 * Apple HVF enforces W^X on 2MiB block page table entries, so
21902234 * executable mappings must be in separate 2MiB blocks from writable
21912235 * ones. The RX region at MMAP_RX_BASE is pre-mapped with execute
21922236 * permission.
21932237 */
2194- result_off = find_free_gap (g , length , MMAP_RX_BASE , g -> mmap_limit );
2238+ result_off =
2239+ find_free_gap (g , length , MMAP_RX_BASE , g -> mmap_limit , align );
2240+ if (result_off == UINT64_MAX && align != fallback_align )
2241+ result_off = find_free_gap (g , length , MMAP_RX_BASE ,
2242+ g -> mmap_limit , fallback_align );
21952243 if (result_off == UINT64_MAX ) {
21962244 log_debug (
21972245 "mmap: RX address space exhausted "
21982246 "(len=0x%llx, limit=0x%llx, %u-bit IPA / %llu GiB)" ,
21992247 (unsigned long long ) length ,
22002248 (unsigned long long ) g -> mmap_limit , g -> ipa_bits ,
22012249 (unsigned long long ) (g -> guest_size >> 30 ));
2250+ host_fd_ref_close (& backing_ref );
22022251 return - LINUX_ENOMEM ;
22032252 }
22042253 /* High-water mark for fork IPC state transfer */
@@ -2232,31 +2281,41 @@ int64_t sys_mmap(guest_t *g,
22322281 */
22332282 uint64_t hint_max =
22342283 (hint_off < MMAP_BASE ) ? MMAP_BASE : g -> mmap_limit ;
2235- result_off =
2236- find_free_gap_inner (g , length , hint_off , hint_max );
2284+ if (align != fallback_align ) {
2285+ uint64_t exact_hint_max = hint_off + length ;
2286+ result_off =
2287+ find_free_gap_inner (g , length , hint_off ,
2288+ exact_hint_max , fallback_align );
2289+ }
2290+ if (result_off == UINT64_MAX )
2291+ result_off = find_free_gap_inner (g , length , hint_off ,
2292+ hint_max , align );
2293+ if (result_off == UINT64_MAX && align != fallback_align )
2294+ result_off = find_free_gap_inner (
2295+ g , length , hint_off , hint_max , fallback_align );
22372296 }
22382297 }
22392298 if (result_off == UINT64_MAX )
2240- result_off = find_free_gap (g , length , MMAP_BASE , g -> mmap_limit );
2299+ result_off =
2300+ find_free_gap (g , length , MMAP_BASE , g -> mmap_limit , align );
2301+ if (result_off == UINT64_MAX && align != fallback_align )
2302+ result_off = find_free_gap (g , length , MMAP_BASE , g -> mmap_limit ,
2303+ fallback_align );
22412304 if (result_off == UINT64_MAX ) {
22422305 log_debug (
22432306 "mmap: RW address space exhausted "
22442307 "(len=0x%llx, limit=0x%llx, %u-bit IPA / %llu GiB)" ,
22452308 (unsigned long long ) length ,
22462309 (unsigned long long ) g -> mmap_limit , g -> ipa_bits ,
22472310 (unsigned long long ) (g -> guest_size >> 30 ));
2311+ host_fd_ref_close (& backing_ref );
22482312 return - LINUX_ENOMEM ;
22492313 }
22502314 /* High-water mark for fork IPC state transfer */
22512315 uint64_t rw_hwm = result_off + length ;
22522316 if (rw_hwm > g -> mmap_next )
22532317 g -> mmap_next = rw_hwm ;
22542318 }
2255- if (!is_anon ) {
2256- if (host_fd_ref_open (fd , & backing_ref ) < 0 )
2257- return - LINUX_EBADF ;
2258- host_backing_fd = backing_ref .fd ;
2259- }
22602319 if (!region_has_capacity_after_removes (g , NULL , 0 , 1 )) {
22612320 host_fd_ref_close (& backing_ref );
22622321 return - LINUX_ENOMEM ;
@@ -2931,10 +2990,17 @@ int64_t sys_mremap(guest_t *g,
29312990 int needs_exec = (prot & LINUX_PROT_EXEC ) != 0 ;
29322991
29332992 uint64_t new_off ;
2993+ /* mremap moves the data via read_file_range_to_guest and does not
2994+ * reinstall a file overlay at the destination, so 2 MiB alignment
2995+ * would not narrow segment-table growth. Stay at host-page alignment.
2996+ */
2997+ size_t mremap_align = host_page_size_cached ();
29342998 if (needs_exec && !(prot & LINUX_PROT_WRITE ))
2935- new_off = find_free_gap (g , new_size , MMAP_RX_BASE , g -> mmap_limit );
2999+ new_off = find_free_gap (g , new_size , MMAP_RX_BASE , g -> mmap_limit ,
3000+ mremap_align );
29363001 else
2937- new_off = find_free_gap (g , new_size , MMAP_BASE , g -> mmap_limit );
3002+ new_off = find_free_gap (g , new_size , MMAP_BASE , g -> mmap_limit ,
3003+ mremap_align );
29383004
29393005 if (new_off == UINT64_MAX ) {
29403006 if (track_backing_fd >= 0 )
0 commit comments