Skip to content

Commit 60f08cd

Browse files
ValentaTomasbchalios
authored andcommitted
fix(memory): punch holes for shared discard ranges
Use fallocate(PUNCH_HOLE|KEEP_SIZE) for MAP_SHARED file-backed guest memory so memfd-backed balloon hinting/reporting clears the shared backing instead of only dropping PTEs. Signed-off-by: Babis Chalios <babis.chalios@e2b.dev>
1 parent 639196c commit 60f08cd

2 files changed

Lines changed: 150 additions & 11 deletions

File tree

src/vmm/src/test_utils/mod.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#![allow(missing_docs)]
55

6+
use std::fs::File;
67
use std::sync::{Arc, Mutex};
78

89
use vm_memory::{GuestAddress, GuestRegionCollection};
@@ -53,6 +54,22 @@ pub fn multi_region_mem(regions: &[(GuestAddress, usize)]) -> GuestMemoryMmap {
5354
.unwrap()
5455
}
5556

57+
/// Creates a [`GuestMemoryMmap`] with multiple regions and without dirty page tracking.
58+
pub fn multi_region_mem_memfd(
59+
regions: &[(GuestAddress, usize)],
60+
huge_page_cfg: HugePageConfig,
61+
) -> (GuestMemoryMmap, Arc<File>) {
62+
let (reg, file) = memory::memfd_backed(regions, false, huge_page_cfg).unwrap();
63+
let mem = GuestRegionCollection::from_regions(
64+
reg.into_iter()
65+
.map(|region| GuestRegionMmapExt::dram_from_mmap_region(region, 0))
66+
.collect(),
67+
)
68+
.unwrap();
69+
70+
(mem, file)
71+
}
72+
5673
pub fn multi_region_mem_raw(regions: &[(GuestAddress, usize)]) -> Vec<GuestRegionMmap> {
5774
memory::anonymous(regions.iter().copied(), false, HugePageConfig::None)
5875
.expect("Cannot initialize memory")

src/vmm/src/vstate/memory.rs

Lines changed: 133 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,10 @@ pub use vm_memory::{
2222
GuestUsize, MemoryRegionAddress, MmapRegion, address,
2323
};
2424
use vm_memory::{GuestMemoryError, GuestMemoryRegionBytes, VolatileSlice, WriteVolatile};
25-
use vmm_sys_util::errno;
25+
use vmm_sys_util::fallocate::FallocateMode;
26+
use vmm_sys_util::{errno, fallocate};
2627

27-
use crate::utils::{get_page_size, u64_to_usize};
28+
use crate::utils::{get_page_size, u64_to_usize, usize_to_u64};
2829
use crate::vmm_config::machine_config::HugePageConfig;
2930
use crate::vstate::vm::VmError;
3031
use crate::{DirtyBitmap, Vm};
@@ -391,8 +392,7 @@ impl GuestRegionMmapExt {
391392
let phys_address = self.get_host_address(caddr)?;
392393

393394
match (self.inner.file_offset(), self.inner.flags()) {
394-
// If and only if we are resuming from a snapshot file, we have a file and it's mapped
395-
// private
395+
// If we are resuming from a snapshot file, we have a file and it's mapped private
396396
(Some(_), flags) if flags & libc::MAP_PRIVATE != 0 => {
397397
// Mmap a new anonymous region over the present one in order to create a hole
398398
// with zero pages.
@@ -420,12 +420,48 @@ impl GuestRegionMmapExt {
420420
Ok(())
421421
}
422422
}
423-
// Match either the case of an anonymous mapping, or the case
424-
// of a shared file mapping.
425-
// TODO: madvise(MADV_DONTNEED) doesn't actually work with memfd
426-
// (or in general MAP_SHARED of a fd). In those cases we should use
427-
// fallocate64(FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE).
428-
// We keep falling to the madvise branch to keep the previous behaviour.
423+
// If we back memory over memfd we have a file mapped shared.
424+
(Some(file_offset), flags) if flags & libc::MAP_SHARED != 0 => {
425+
let Some(offset) = file_offset.start().checked_add(caddr.raw_value()) else {
426+
return Err(GuestMemoryError::InvalidGuestAddress(GuestAddress(
427+
caddr.raw_value(),
428+
)));
429+
};
430+
431+
// fallocate(PUNCH_HOLE) silently no-ops on memfd when [offset, offset+len)
432+
// doesn't cover any whole filesystem block, and returns EINVAL on hugetlbfs
433+
// for ranges not aligned to the huge-page size. Expand [offset, offset+len)
434+
// to the enclosing page boundaries so the punch actually covers every page
435+
// the caller asked to discard.
436+
let page_size = usize_to_u64(if flags & libc::MAP_HUGETLB != 0 {
437+
HugePageConfig::Hugetlbfs2M.page_size()
438+
} else {
439+
get_page_size().map_err(|err| GuestMemoryError::IOError(err.into()))?
440+
});
441+
let end = offset
442+
.checked_add(usize_to_u64(len))
443+
.ok_or(GuestMemoryError::GuestAddressOverflow)?;
444+
let aligned_offset = offset & !(page_size - 1);
445+
let aligned_end = end
446+
.checked_add(page_size - 1)
447+
.ok_or(GuestMemoryError::GuestAddressOverflow)?
448+
& !(page_size - 1);
449+
450+
fallocate::fallocate(
451+
file_offset.file(),
452+
FallocateMode::PunchHole,
453+
true,
454+
aligned_offset,
455+
aligned_end - aligned_offset,
456+
)
457+
.map_err(|err| {
458+
error!("discard_range: punching hole failed: {err:?}");
459+
GuestMemoryError::IOError(err.into())
460+
})?;
461+
462+
Ok(())
463+
}
464+
// Anonymous mapping.
429465
_ => {
430466
// Madvise the region in order to mark it as not used.
431467
// SAFETY: The address and length are known to be valid.
@@ -873,7 +909,7 @@ mod tests {
873909

874910
use super::*;
875911
use crate::snapshot::Snapshot;
876-
use crate::test_utils::single_region_mem;
912+
use crate::test_utils::{multi_region_mem_memfd, single_region_mem};
877913
use crate::utils::{get_page_size, mib_to_bytes};
878914
use crate::vstate::memory::test_utils::into_region_ext;
879915

@@ -1462,6 +1498,92 @@ mod tests {
14621498
);
14631499
}
14641500

1501+
fn check_pages(
1502+
mem: &GuestMemoryMmap,
1503+
page_size: usize,
1504+
first_page_expected: &[u8],
1505+
second_page_expected: &[u8],
1506+
third_page_expected: &[u8],
1507+
) {
1508+
let first_page = 0u64;
1509+
let second_page = usize_to_u64(page_size);
1510+
let third_page = usize_to_u64(2 * page_size);
1511+
1512+
let mut actual_page = vec![0u8; page_size];
1513+
mem.read(actual_page.as_mut_slice(), GuestAddress(first_page))
1514+
.unwrap();
1515+
assert_eq!(first_page_expected, actual_page);
1516+
mem.read(actual_page.as_mut_slice(), GuestAddress(second_page))
1517+
.unwrap();
1518+
assert_eq!(second_page_expected, actual_page);
1519+
mem.read(actual_page.as_mut_slice(), GuestAddress(third_page))
1520+
.unwrap();
1521+
assert_eq!(third_page_expected, actual_page);
1522+
}
1523+
1524+
// A page size-agnostic tests for discard_range() of memory that is
1525+
// mapped on top of memfd
1526+
fn test_discard_range_on_shared_memfd(huge_pages: HugePageConfig) {
1527+
let page_size = huge_pages.page_size();
1528+
let first_page = 0u64;
1529+
let second_page = usize_to_u64(page_size);
1530+
let third_page = usize_to_u64(2 * page_size);
1531+
1532+
let (mem, _file) = multi_region_mem_memfd(
1533+
&[
1534+
(GuestAddress(first_page), page_size),
1535+
(GuestAddress(second_page), page_size),
1536+
(GuestAddress(third_page), page_size),
1537+
],
1538+
huge_pages,
1539+
);
1540+
1541+
// Fill up all three pages with 1s
1542+
let ones = vec![1u8; 3 * page_size];
1543+
mem.write(&ones, GuestAddress(0)).unwrap();
1544+
1545+
// Cleanup the last page
1546+
mem.discard_range(GuestAddress(third_page), page_size)
1547+
.unwrap();
1548+
check_pages(
1549+
&mem,
1550+
page_size,
1551+
&vec![1u8; page_size],
1552+
&vec![1u8; page_size],
1553+
&vec![0u8; page_size],
1554+
);
1555+
1556+
// Cleanup the first page
1557+
mem.discard_range(GuestAddress(0), page_size).unwrap();
1558+
check_pages(
1559+
&mem,
1560+
page_size,
1561+
&vec![0u8; page_size],
1562+
&vec![1u8; page_size],
1563+
&vec![0u8; page_size],
1564+
);
1565+
1566+
// Cleanup starting from first page and landing in second
1567+
mem.discard_range(GuestAddress(1), page_size).unwrap();
1568+
check_pages(
1569+
&mem,
1570+
page_size,
1571+
&vec![0u8; page_size],
1572+
&vec![0u8; page_size],
1573+
&vec![0u8; page_size],
1574+
);
1575+
}
1576+
1577+
#[test]
1578+
fn test_discard_range_on_shared_memfd_4k() {
1579+
test_discard_range_on_shared_memfd(HugePageConfig::None)
1580+
}
1581+
1582+
#[test]
1583+
fn test_discard_range_on_shared_memfd_2m() {
1584+
test_discard_range_on_shared_memfd(HugePageConfig::Hugetlbfs2M)
1585+
}
1586+
14651587
/// Verifies that `slots_intersecting_range` returns the correct slots for
14661588
/// ranges at slot boundaries, interior to a slot, and spanning two slots.
14671589
#[test]

0 commit comments

Comments
 (0)