Skip to content

Commit e45b71c

Browse files
authored
Merge pull request #98 from sysprog21/map-shared-2m
Align non-fixed MAP_SHARED file mmap to 2 MiB
2 parents 3dbea17 + d8b8952 commit e45b71c

3 files changed

Lines changed: 237 additions & 27 deletions

File tree

src/syscall/mem.c

Lines changed: 93 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -266,17 +266,21 @@ static void split_regions_at_boundary(guest_t *g, uint64_t boundary)
266266
static uint64_t find_free_gap_inner(const guest_t *g,
267267
uint64_t length,
268268
uint64_t min_addr,
269-
uint64_t max_addr)
269+
uint64_t max_addr,
270+
uint64_t align)
270271
{
271-
/* Round the search start up to the next host-page boundary so an unaligned
272-
* addr hint cannot return a result that lands inside a host page already
273-
* covered by a preceding region's overlay tail (the overlay extends to
272+
/* Round the search start up to the requested alignment so an unaligned addr
273+
* hint cannot return a result that lands inside a host page already covered
274+
* by a preceding region's overlay tail (the overlay extends to
274275
* ALIGN_UP(r->end, hps)). Apple Silicon enforces 16 KiB host pages;
275276
* aligning to the guest 4 KiB page is not enough. Advance past each walked
276-
* region to the same boundary for the same reason.
277+
* region to the same boundary for the same reason. MAP_SHARED file-backed
278+
* allocations may request 2 MiB alignment as a best-effort placement
279+
* preference so consecutive mappings usually avoid sharing an HVF stage-2
280+
* segment, which reduces segment-table fragmentation for memfd-style
281+
* allocation patterns.
277282
*/
278-
size_t hps = host_page_size_cached();
279-
uint64_t gap_start = ALIGN_UP(min_addr, hps);
283+
uint64_t gap_start = ALIGN_UP(min_addr, align);
280284

281285
/* Skip the prefix of regions entirely below gap_start in O(log n). After a
282286
* successful allocation the gap hint advances near or past the existing
@@ -307,8 +311,10 @@ static uint64_t find_free_gap_inner(const guest_t *g,
307311
g->regions[i].start >= gap_start + length)
308312
return gap_start;
309313

310-
/* Region overlaps; advance past it and round to the next host page */
311-
gap_start = ALIGN_UP(g->regions[i].end, hps);
314+
/* Region overlaps; advance past it and round to the next aligned
315+
* boundary so the caller's alignment promise holds across allocations.
316+
*/
317+
gap_start = ALIGN_UP(g->regions[i].end, align);
312318
}
313319

314320
/* Check trailing space after all regions */
@@ -321,36 +327,45 @@ static uint64_t find_free_gap_inner(const guest_t *g,
321327
* The hint tracks the first address after the last successful mapping in each
322328
* region, which avoids rescanning the same prefix on sequential mmap activity.
323329
* A miss falls back to the region base so holes reopened by munmap are still
324-
* reusable.
330+
* reusable. The align argument is the per-call start boundary the result must
331+
* satisfy. For non-fixed MAP_SHARED file-backed allocations, sys_mmap first
332+
* passes BLOCK_2MIB as a best-effort placement preference, then retries with
333+
* host-page alignment when no 2 MiB-aligned gap is available.
325334
*/
326335
static uint64_t find_free_gap(guest_t *g,
327336
uint64_t length,
328337
uint64_t min_addr,
329-
uint64_t max_addr)
338+
uint64_t max_addr,
339+
uint64_t align)
330340
{
331341
/* RX and RW mappings advance independently, so keep separate hints. */
332342
uint64_t *hint =
333343
(min_addr < MMAP_BASE) ? &g->mmap_rx_gap_hint : &g->mmap_rw_gap_hint;
334344

335345
/* Advance the hint to the next host-page boundary so the following
336346
* sequential allocation lands on an address that the kernel accepts for
337-
* mmap MAP_FIXED (Apple Silicon enforces 16 KiB host pages). The tradeoff
338-
* is up to host_page-1 bytes of address-space waste per small allocation;
339-
* physical pages are still demand-paged, so RAM cost is unchanged.
347+
* mmap MAP_FIXED (Apple Silicon enforces 16 KiB host pages). Round to the
348+
* host page even when the current call requested a larger align (e.g.
349+
* BLOCK_2MIB for MAP_SHARED file-backed): a subsequent MAP_PRIVATE 4 KiB
350+
* allocation should still be able to occupy the trailing space inside the
351+
* 2 MiB block. find_free_gap_inner re-applies the caller's align on its
352+
* next entry, so a subsequent MAP_SHARED allocation skips past the small
353+
* tenant and lands on the next 2 MiB boundary anyway.
340354
*/
341355
size_t hps = host_page_size_cached();
342356

343357
/* Try cached hint first (only if within the valid range) */
344358
if (*hint >= min_addr && *hint < max_addr) {
345-
uint64_t result = find_free_gap_inner(g, length, *hint, max_addr);
359+
uint64_t result =
360+
find_free_gap_inner(g, length, *hint, max_addr, align);
346361
if (result != UINT64_MAX) {
347362
*hint = ALIGN_UP(result + length, hps);
348363
return result;
349364
}
350365
}
351366

352367
/* Full scan from base */
353-
uint64_t result = find_free_gap_inner(g, length, min_addr, max_addr);
368+
uint64_t result = find_free_gap_inner(g, length, min_addr, max_addr, align);
354369
if (result != UINT64_MAX)
355370
*hint = ALIGN_UP(result + length, hps);
356371
return result;
@@ -2184,21 +2199,55 @@ int64_t sys_mmap(guest_t *g,
21842199
if (high_hint >= 0)
21852200
return high_hint;
21862201
}
2202+
/* Open the backing fd before the gap-finder so the alignment heuristic
2203+
* can read the host fd's access mode through overlay_fd_writable.
2204+
* The reference is closed on every failure path within the non-fixed branch.
2205+
*/
2206+
if (!is_anon) {
2207+
if (host_fd_ref_open(fd, &backing_ref) < 0)
2208+
return -LINUX_EBADF;
2209+
host_backing_fd = backing_ref.fd;
2210+
}
2211+
/* Prefer stage-2 2 MiB block boundaries for non-fixed MAP_SHARED
2212+
* file-backed allocations. Without this each shared file mmap whose
2213+
* result lands mid-block forces hvf_apply_file_overlay_quiesced to
2214+
* split the containing HVF segment at both ends; back-to-back memfd
2215+
* allocations burn segments at roughly two per mmap and run the table
2216+
* to GUEST_MAX_HVF_SEGMENTS quickly. This is a placement preference,
2217+
* not a Linux-visible constraint: if no 2 MiB-aligned gap exists, the
2218+
* allocation retries with host-page alignment. The condition mirrors
2219+
* the overlay fast-path's gate (host-page-aligned offset, writable
2220+
* backer) so read-only MAP_SHARED mappings that fall through to the
2221+
* pread snapshot do not pay the alignment cost without the
2222+
* segment-table benefit.
2223+
*/
2224+
size_t hps = host_page_size_cached();
2225+
uint64_t align = (uint64_t) hps;
2226+
if (!is_anon && fd >= 0 && (flags & LINUX_MAP_SHARED) &&
2227+
((uint64_t) offset % hps == 0) &&
2228+
overlay_fd_writable(host_backing_fd))
2229+
align = BLOCK_2MIB;
2230+
uint64_t fallback_align = (uint64_t) hps;
21872231
if (needs_exec && !(prot & LINUX_PROT_WRITE)) {
21882232
/* PROT_EXEC without PROT_WRITE: allocate from the RX mmap region.
21892233
* Apple HVF enforces W^X on 2MiB block page table entries, so
21902234
* executable mappings must be in separate 2MiB blocks from writable
21912235
* ones. The RX region at MMAP_RX_BASE is pre-mapped with execute
21922236
* permission.
21932237
*/
2194-
result_off = find_free_gap(g, length, MMAP_RX_BASE, g->mmap_limit);
2238+
result_off =
2239+
find_free_gap(g, length, MMAP_RX_BASE, g->mmap_limit, align);
2240+
if (result_off == UINT64_MAX && align != fallback_align)
2241+
result_off = find_free_gap(g, length, MMAP_RX_BASE,
2242+
g->mmap_limit, fallback_align);
21952243
if (result_off == UINT64_MAX) {
21962244
log_debug(
21972245
"mmap: RX address space exhausted "
21982246
"(len=0x%llx, limit=0x%llx, %u-bit IPA / %llu GiB)",
21992247
(unsigned long long) length,
22002248
(unsigned long long) g->mmap_limit, g->ipa_bits,
22012249
(unsigned long long) (g->guest_size >> 30));
2250+
host_fd_ref_close(&backing_ref);
22022251
return -LINUX_ENOMEM;
22032252
}
22042253
/* High-water mark for fork IPC state transfer */
@@ -2232,31 +2281,41 @@ int64_t sys_mmap(guest_t *g,
22322281
*/
22332282
uint64_t hint_max =
22342283
(hint_off < MMAP_BASE) ? MMAP_BASE : g->mmap_limit;
2235-
result_off =
2236-
find_free_gap_inner(g, length, hint_off, hint_max);
2284+
if (align != fallback_align) {
2285+
uint64_t exact_hint_max = hint_off + length;
2286+
result_off =
2287+
find_free_gap_inner(g, length, hint_off,
2288+
exact_hint_max, fallback_align);
2289+
}
2290+
if (result_off == UINT64_MAX)
2291+
result_off = find_free_gap_inner(g, length, hint_off,
2292+
hint_max, align);
2293+
if (result_off == UINT64_MAX && align != fallback_align)
2294+
result_off = find_free_gap_inner(
2295+
g, length, hint_off, hint_max, fallback_align);
22372296
}
22382297
}
22392298
if (result_off == UINT64_MAX)
2240-
result_off = find_free_gap(g, length, MMAP_BASE, g->mmap_limit);
2299+
result_off =
2300+
find_free_gap(g, length, MMAP_BASE, g->mmap_limit, align);
2301+
if (result_off == UINT64_MAX && align != fallback_align)
2302+
result_off = find_free_gap(g, length, MMAP_BASE, g->mmap_limit,
2303+
fallback_align);
22412304
if (result_off == UINT64_MAX) {
22422305
log_debug(
22432306
"mmap: RW address space exhausted "
22442307
"(len=0x%llx, limit=0x%llx, %u-bit IPA / %llu GiB)",
22452308
(unsigned long long) length,
22462309
(unsigned long long) g->mmap_limit, g->ipa_bits,
22472310
(unsigned long long) (g->guest_size >> 30));
2311+
host_fd_ref_close(&backing_ref);
22482312
return -LINUX_ENOMEM;
22492313
}
22502314
/* High-water mark for fork IPC state transfer */
22512315
uint64_t rw_hwm = result_off + length;
22522316
if (rw_hwm > g->mmap_next)
22532317
g->mmap_next = rw_hwm;
22542318
}
2255-
if (!is_anon) {
2256-
if (host_fd_ref_open(fd, &backing_ref) < 0)
2257-
return -LINUX_EBADF;
2258-
host_backing_fd = backing_ref.fd;
2259-
}
22602319
if (!region_has_capacity_after_removes(g, NULL, 0, 1)) {
22612320
host_fd_ref_close(&backing_ref);
22622321
return -LINUX_ENOMEM;
@@ -2931,10 +2990,17 @@ int64_t sys_mremap(guest_t *g,
29312990
int needs_exec = (prot & LINUX_PROT_EXEC) != 0;
29322991

29332992
uint64_t new_off;
2993+
/* mremap moves the data via read_file_range_to_guest and does not
2994+
* reinstall a file overlay at the destination, so 2 MiB alignment
2995+
* would not narrow segment-table growth. Stay at host-page alignment.
2996+
*/
2997+
size_t mremap_align = host_page_size_cached();
29342998
if (needs_exec && !(prot & LINUX_PROT_WRITE))
2935-
new_off = find_free_gap(g, new_size, MMAP_RX_BASE, g->mmap_limit);
2999+
new_off = find_free_gap(g, new_size, MMAP_RX_BASE, g->mmap_limit,
3000+
mremap_align);
29363001
else
2937-
new_off = find_free_gap(g, new_size, MMAP_BASE, g->mmap_limit);
3002+
new_off = find_free_gap(g, new_size, MMAP_BASE, g->mmap_limit,
3003+
mremap_align);
29383004

29393005
if (new_off == UINT64_MAX) {
29403006
if (track_backing_fd >= 0)

tests/test-mmap-hint.c

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010
*/
1111

1212
#include <errno.h>
13+
#include <fcntl.h>
1314
#include <stdint.h>
15+
#include <stdlib.h>
1416
#include <sys/mman.h>
1517
#include <unistd.h>
1618

@@ -69,9 +71,71 @@ static void test_low_hint_exact(void)
6971
munmap(p, len);
7072
}
7173

74+
static void test_shared_file_hint_falls_back_from_2m_alignment(void)
75+
{
76+
TEST("MAP_SHARED file hint falls back from 2MiB alignment");
77+
78+
const size_t page = (size_t) sysconf(_SC_PAGESIZE);
79+
const uintptr_t block_2m = 2ULL * 1024ULL * 1024ULL;
80+
81+
void *anchor =
82+
mmap(NULL, page, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
83+
if (anchor == MAP_FAILED) {
84+
FAIL("anchor mmap failed");
85+
return;
86+
}
87+
88+
uintptr_t anchor_addr = (uintptr_t) anchor;
89+
uintptr_t anchor_block = anchor_addr & ~(block_2m - 1);
90+
if (anchor_addr != anchor_block) {
91+
munmap(anchor, page);
92+
FAIL("anchor not 2MiB-aligned");
93+
return;
94+
}
95+
if (anchor_block < 0x00400000ULL + 0x10000ULL) {
96+
munmap(anchor, page);
97+
FAIL("anchor too low for regression hint");
98+
return;
99+
}
100+
uintptr_t hint_addr = anchor_block - 0x10000ULL;
101+
102+
char path[] = "/tmp/elfuse-mmap-hint-XXXXXX";
103+
int fd = mkstemp(path);
104+
if (fd < 0) {
105+
munmap(anchor, page);
106+
FAIL("mkstemp failed");
107+
return;
108+
}
109+
unlink(path);
110+
111+
if (ftruncate(fd, (off_t) page) < 0) {
112+
close(fd);
113+
munmap(anchor, page);
114+
FAIL("ftruncate failed");
115+
return;
116+
}
117+
118+
void *hint = (void *) hint_addr;
119+
void *p = mmap(hint, page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
120+
if (p == MAP_FAILED) {
121+
close(fd);
122+
munmap(anchor, page);
123+
FAIL("shared file mmap failed");
124+
return;
125+
}
126+
127+
EXPECT_TRUE((uintptr_t) p == hint_addr,
128+
"shared file mmap should honor host-page-aligned hint");
129+
130+
munmap(p, page);
131+
close(fd);
132+
munmap(anchor, page);
133+
}
134+
72135
int main(void)
73136
{
74137
test_low_hint_exact();
138+
test_shared_file_hint_falls_back_from_2m_alignment();
75139
SUMMARY("test-mmap-hint");
76140
return fails ? 1 : 0;
77141
}

0 commit comments

Comments
 (0)