Skip to content

Commit 4cca12e

Browse files
ValentaTomasbchalios
authored andcommitted
fix(memory): punch holes for shared discard ranges
Use fallocate(PUNCH_HOLE|KEEP_SIZE) for MAP_SHARED file-backed guest memory so memfd-backed balloon hinting/reporting clears the shared backing instead of only dropping PTEs. Signed-off-by: Babis Chalios <babis.chalios@e2b.dev>
1 parent 639196c commit 4cca12e

2 files changed

Lines changed: 137 additions & 13 deletions

File tree

src/vmm/src/test_utils/mod.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#![allow(missing_docs)]
55

6+
use std::fs::File;
67
use std::sync::{Arc, Mutex};
78

89
use vm_memory::{GuestAddress, GuestRegionCollection};
@@ -53,6 +54,22 @@ pub fn multi_region_mem(regions: &[(GuestAddress, usize)]) -> GuestMemoryMmap {
5354
.unwrap()
5455
}
5556

57+
/// Creates a [`GuestMemoryMmap`] with multiple regions and without dirty page tracking.
58+
pub fn multi_region_mem_memfd(
59+
regions: &[(GuestAddress, usize)],
60+
huge_page_cfg: HugePageConfig,
61+
) -> (GuestMemoryMmap, Arc<File>) {
62+
let (reg, file) = memory::memfd_backed(regions, false, huge_page_cfg).unwrap();
63+
let mem = GuestRegionCollection::from_regions(
64+
reg.into_iter()
65+
.map(|region| GuestRegionMmapExt::dram_from_mmap_region(region, 0))
66+
.collect(),
67+
)
68+
.unwrap();
69+
70+
(mem, file)
71+
}
72+
5673
pub fn multi_region_mem_raw(regions: &[(GuestAddress, usize)]) -> Vec<GuestRegionMmap> {
5774
memory::anonymous(regions.iter().copied(), false, HugePageConfig::None)
5875
.expect("Cannot initialize memory")

src/vmm/src/vstate/memory.rs

Lines changed: 120 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
// found in the THIRD-PARTY file.
77

88
use std::fs::File;
9-
use std::io::SeekFrom;
9+
use std::io::{self, SeekFrom};
1010
use std::ops::Deref;
1111
use std::sync::{Arc, Mutex};
1212

@@ -22,9 +22,10 @@ pub use vm_memory::{
2222
GuestUsize, MemoryRegionAddress, MmapRegion, address,
2323
};
2424
use vm_memory::{GuestMemoryError, GuestMemoryRegionBytes, VolatileSlice, WriteVolatile};
25-
use vmm_sys_util::errno;
25+
use vmm_sys_util::fallocate::FallocateMode;
26+
use vmm_sys_util::{errno, fallocate};
2627

27-
use crate::utils::{get_page_size, u64_to_usize};
28+
use crate::utils::{get_page_size, u64_to_usize, usize_to_u64};
2829
use crate::vmm_config::machine_config::HugePageConfig;
2930
use crate::vstate::vm::VmError;
3031
use crate::{DirtyBitmap, Vm};
@@ -388,11 +389,29 @@ impl GuestRegionMmapExt {
388389
caddr: MemoryRegionAddress,
389390
len: usize,
390391
) -> Result<(), GuestMemoryError> {
391-
let phys_address = self.get_host_address(caddr)?;
392+
let region_page_size = match self.is_hugetlbfs() {
393+
Some(true) => HugePageConfig::Hugetlbfs2M.page_size(),
394+
_ => HugePageConfig::None.page_size(),
395+
};
392396

397+
let end = caddr
398+
.0
399+
.checked_add(usize_to_u64(len))
400+
.ok_or_else(|| GuestMemoryError::GuestAddressOverflow)?;
401+
402+
// Both the starting address and the length of the region we want to discard needs to be
403+
// page aligned, otherwise we won't be able to actually discard the memory.
404+
if !caddr.0.is_multiple_of(usize_to_u64(region_page_size))
405+
|| !end.is_multiple_of(usize_to_u64(region_page_size))
406+
{
407+
return Err(GuestMemoryError::IOError(
408+
io::ErrorKind::InvalidInput.into(),
409+
));
410+
}
411+
412+
let phys_address = self.get_host_address(caddr)?;
393413
match (self.inner.file_offset(), self.inner.flags()) {
394-
// If and only if we are resuming from a snapshot file, we have a file and it's mapped
395-
// private
414+
// If we are resuming from a snapshot file, we have a file and it's mapped private
396415
(Some(_), flags) if flags & libc::MAP_PRIVATE != 0 => {
397416
// Mmap a new anonymous region over the present one in order to create a hole
398417
// with zero pages.
@@ -420,12 +439,29 @@ impl GuestRegionMmapExt {
420439
Ok(())
421440
}
422441
}
423-
// Match either the case of an anonymous mapping, or the case
424-
// of a shared file mapping.
425-
// TODO: madvise(MADV_DONTNEED) doesn't actually work with memfd
426-
// (or in general MAP_SHARED of a fd). In those cases we should use
427-
// fallocate64(FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE).
428-
// We keep falling to the madvise branch to keep the previous behaviour.
442+
// If we back memory over memfd we have a file mapped shared.
443+
(Some(file_offset), flags) if flags & libc::MAP_SHARED != 0 => {
444+
let Some(offset) = file_offset.start().checked_add(caddr.raw_value()) else {
445+
return Err(GuestMemoryError::InvalidGuestAddress(GuestAddress(
446+
caddr.raw_value(),
447+
)));
448+
};
449+
450+
fallocate::fallocate(
451+
file_offset.file(),
452+
FallocateMode::PunchHole,
453+
true,
454+
offset,
455+
usize_to_u64(len),
456+
)
457+
.map_err(|err| {
458+
error!("discard_range: punching hole failed: {err:?}");
459+
GuestMemoryError::IOError(err.into())
460+
})?;
461+
462+
Ok(())
463+
}
464+
// Anonymous mapping.
429465
_ => {
430466
// Madvise the region in order to mark it as not used.
431467
// SAFETY: The address and length are known to be valid.
@@ -873,7 +909,7 @@ mod tests {
873909

874910
use super::*;
875911
use crate::snapshot::Snapshot;
876-
use crate::test_utils::single_region_mem;
912+
use crate::test_utils::{multi_region_mem_memfd, single_region_mem};
877913
use crate::utils::{get_page_size, mib_to_bytes};
878914
use crate::vstate::memory::test_utils::into_region_ext;
879915

@@ -1462,6 +1498,77 @@ mod tests {
14621498
);
14631499
}
14641500

1501+
fn check_mem_contents(mem: &GuestMemoryMmap, offset: usize, expected: &[u8]) {
1502+
let addr = GuestAddress(usize_to_u64(offset));
1503+
let mut actual_page = vec![0u8; expected.len()];
1504+
mem.read(actual_page.as_mut_slice(), addr).unwrap();
1505+
assert_eq!(actual_page, expected);
1506+
}
1507+
1508+
fn test_discard_range_on_memfd(huge_pages: HugePageConfig) {
1509+
// 8MiB of memory in total (multiples of both possible page sizes)
1510+
const REGION_SIZE: usize = 4 * 1024 * 1024;
1511+
1512+
let (mem, _file) = multi_region_mem_memfd(
1513+
&[
1514+
(GuestAddress(0), REGION_SIZE),
1515+
(GuestAddress(usize_to_u64(REGION_SIZE)), REGION_SIZE),
1516+
],
1517+
huge_pages,
1518+
);
1519+
1520+
let page_size = huge_pages.page_size();
1521+
1522+
// Fill up memory with 1s
1523+
let ones = vec![1u8; 2 * REGION_SIZE];
1524+
mem.write(&ones, GuestAddress(0)).unwrap();
1525+
1526+
check_mem_contents(&mem, 0, &vec![1u8; 2 * REGION_SIZE]);
1527+
1528+
// Discard the entire first region
1529+
mem.discard_range(GuestAddress(0), REGION_SIZE).unwrap();
1530+
check_mem_contents(&mem, 0, &vec![0u8; REGION_SIZE]);
1531+
check_mem_contents(&mem, REGION_SIZE, &vec![1u8; REGION_SIZE]);
1532+
1533+
// discard_range() works on page granularity. Discard the first page of the second region.
1534+
mem.discard_range(GuestAddress(usize_to_u64(REGION_SIZE)), page_size)
1535+
.unwrap();
1536+
check_mem_contents(&mem, REGION_SIZE, &vec![0u8; page_size]);
1537+
check_mem_contents(
1538+
&mem,
1539+
REGION_SIZE + page_size,
1540+
&vec![1u8; REGION_SIZE - page_size],
1541+
);
1542+
1543+
// discard_range() won't actually work with unaligned regions
1544+
1545+
// Try to discard less than a page
1546+
mem.discard_range(GuestAddress(usize_to_u64(REGION_SIZE + page_size)), 1024)
1547+
.unwrap_err();
1548+
mem.discard_range(
1549+
GuestAddress(usize_to_u64(REGION_SIZE + page_size)),
1550+
page_size + 1024,
1551+
)
1552+
.unwrap_err();
1553+
1554+
// Try to discard unaligned address
1555+
mem.discard_range(
1556+
GuestAddress(usize_to_u64(REGION_SIZE + page_size + 1024)),
1557+
page_size,
1558+
)
1559+
.unwrap_err();
1560+
}
1561+
1562+
#[test]
1563+
fn test_discard_range_on_memfd_4k() {
1564+
test_discard_range_on_memfd(HugePageConfig::None)
1565+
}
1566+
1567+
#[test]
1568+
fn test_discard_range_on_memfd_2m() {
1569+
test_discard_range_on_memfd(HugePageConfig::Hugetlbfs2M)
1570+
}
1571+
14651572
/// Verifies that `slots_intersecting_range` returns the correct slots for
14661573
/// ranges at slot boundaries, interior to a slot, and spanning two slots.
14671574
#[test]

0 commit comments

Comments
 (0)