Skip to content

Commit 09f3dd7

Browse files
danbugs and andreiltd
authored and committed
feat: use multi-space walker in snapshot relocation
Replaces the old filtered_mappings / per-root dedup-by-phys_base compaction loop in Snapshot::new with a single walk_va_spaces pass that emits SpaceAwareMapping entries.

For ThisSpace leaves, the code still compacts into the dense snapshot blob via the phys_seen dedup map. For AnotherSpace entries (produced when a later root reuses an earlier root's intermediate table), it calls space_aware_map to link the rebuilt table in place.

This fixes Nanvix-on-Hyperlight with multiple process PDs: the kernel-half PTs that process PDs share by pointing at the same PT page would previously each get cloned into independent copies, so a post-boot write into kernel memory via one PD would not be visible via another. The linked-table rebuild preserves the shared-PT invariant, and Hello 5/5 now runs cleanly under NANVIX_REPEAT=4.

Signed-off-by: danbugs <danilochiarlone@gmail.com>
Signed-off-by: Tomasz Andrzejak <andreiltd@gmail.com>
1 parent b4f8388 commit 09f3dd7

1 file changed

Lines changed: 120 additions & 79 deletions

File tree

src/hyperlight_host/src/sandbox/snapshot.rs

Lines changed: 120 additions & 79 deletions
Original file line number | Diff line number | Diff line change
@@ -14,13 +14,14 @@ See the License for the specific language governing permissions and
1414
limitations under the License.
1515
*/
1616

17-
use std::collections::HashMap;
17+
use std::collections::{BTreeMap, HashMap};
1818
use std::sync::atomic::{AtomicU64, Ordering};
1919

2020
use hyperlight_common::layout::{scratch_base_gpa, scratch_base_gva};
2121
use hyperlight_common::vmem;
2222
use hyperlight_common::vmem::{
23-
BasicMapping, CowMapping, Mapping, MappingKind, PAGE_SIZE, TableOps,
23+
BasicMapping, CowMapping, Mapping, MappingKind, PAGE_SIZE, SpaceAwareMapping, SpaceId,
24+
TableOps,
2425
};
2526
use tracing::{Span, instrument};
2627

@@ -242,43 +243,23 @@ impl<'a> core::convert::AsRef<SharedMemoryPageTableBuffer<'a>> for SharedMemoryP
242243
self
243244
}
244245
}
245-
fn filtered_mappings<'a>(
246-
snap: &'a [u8],
247-
scratch: &'a [u8],
248-
regions: &[MemoryRegion],
249-
layout: SandboxMemoryLayout,
250-
root_pts: &[u64],
251-
) -> Vec<(usize, Mapping, &'a [u8])> {
252-
let mut result = Vec::new();
253-
let scratch_gva = scratch_base_gva(layout.get_scratch_size());
254-
255-
for (root_idx, &root_pt) in root_pts.iter().enumerate() {
256-
let op = SharedMemoryPageTableBuffer::new(snap, scratch, layout, root_pt);
257-
258-
let iter = unsafe { vmem::virt_to_phys(&op, 0, hyperlight_common::layout::MAX_GVA as u64) };
259-
260-
for m in iter {
261-
// the scratch map doesn't count
262-
if m.virt_base >= scratch_gva {
263-
continue;
264-
}
265-
// neither does the mapping of the snapshot's own page tables
266-
#[cfg(not(feature = "i686-guest"))]
267-
if m.virt_base >= hyperlight_common::layout::SNAPSHOT_PT_GVA_MIN as u64
268-
&& m.virt_base <= hyperlight_common::layout::SNAPSHOT_PT_GVA_MAX as u64
269-
{
270-
continue;
271-
}
272-
273-
if let Some(contents) =
274-
unsafe { guest_page(snap, scratch, regions, layout, m.phys_base) }
275-
{
276-
result.push((root_idx, m, contents));
277-
}
278-
}
279-
}
280-
281-
result
246+
/// Return true if `virt_base` is a VA we must not preserve into the
247+
/// rebuilt snapshot page tables: it is either part of the scratch
248+
/// region (re-mapped freshly by `map_specials`) or, on amd64, part of
249+
/// the self-map of the snapshot's own page tables.
250+
fn skip_virt(virt_base: u64, scratch_gva: u64) -> bool {
251+
if virt_base >= scratch_gva {
252+
return true;
253+
}
254+
#[cfg(not(feature = "i686-guest"))]
255+
if virt_base >= hyperlight_common::layout::SNAPSHOT_PT_GVA_MIN as u64
256+
&& virt_base <= hyperlight_common::layout::SNAPSHOT_PT_GVA_MAX as u64
257+
{
258+
return true;
259+
}
260+
#[cfg(feature = "i686-guest")]
261+
let _ = virt_base;
262+
false
282263
}
283264

284265
/// Find the contents of the page which starts at gpa in guest physical
@@ -456,55 +437,115 @@ impl Snapshot {
456437
entrypoint: NextAction,
457438
) -> Result<Self> {
458439
let mut phys_seen = HashMap::<u64, usize>::new();
440+
let scratch_gva = scratch_base_gva(layout.get_scratch_size());
459441
let memory = shared_mem.with_contents(|snap_c| {
460442
scratch_mem.with_contents(|scratch_c| {
461-
// Phase 1: walk every PT root and collect live pages,
462-
// tagged with which root they belong to (for per-process
463-
// PD isolation on i686).
464-
let live_pages =
465-
filtered_mappings(snap_c, scratch_c, &regions, layout, root_pt_gpas);
466-
467-
// Phase 2: compact live pages into a dense snapshot blob
468-
// and build new page tables with compacted GPAs.
443+
// Phase 1: walk every PT root together. This detects
444+
// aliased intermediate tables (e.g. Nanvix's kernel-
445+
// half PTs, which multiple process PDs share by
446+
// pointing at the same PT page). The walker emits
447+
// `ThisSpace(leaf)` for private leaves and
448+
// `AnotherSpace(ref)` for sub-trees that were already
449+
// seen via an earlier root. Results are returned in
450+
// `root_pt_gpas` order — which is also the topological
451+
// order of the `AnotherSpace` references — so
452+
// processing in iteration order is safe.
453+
let op = SharedMemoryPageTableBuffer::new(
454+
snap_c,
455+
scratch_c,
456+
layout,
457+
root_pt_gpas.first().copied().unwrap_or(0),
458+
);
459+
let walk = unsafe {
460+
vmem::walk_va_spaces(
461+
&op,
462+
root_pt_gpas,
463+
0,
464+
hyperlight_common::layout::MAX_GVA as u64,
465+
)
466+
};
467+
468+
// Phase 2: rebuild each space's page tables, compacting
469+
// `ThisSpace` leaves into a dense snapshot blob and
470+
// linking `AnotherSpace` entries to already-built
471+
// spaces' tables.
469472
// TODO: Look for opportunities to hugepage map
470473
let mut snapshot_memory: Vec<u8> = Vec::new();
471474
let pt_buf = GuestPageTableBuffer::new(layout.get_pt_base_gpa() as usize);
472475
for _ in 1..root_pt_gpas.len() {
473476
unsafe { pt_buf.alloc_table() };
474477
}
475478

476-
for (_root_idx, mapping, contents) in live_pages {
477-
// Convert a snapshot mapping kind for compaction: writable
478-
// pages become CoW, read-only pages stay read-only.
479-
let kind = match mapping.kind {
480-
MappingKind::Cow(cm) => MappingKind::Cow(cm),
481-
MappingKind::Basic(bm) if bm.writable => MappingKind::Cow(CowMapping {
482-
readable: bm.readable,
483-
executable: bm.executable,
484-
}),
485-
MappingKind::Basic(bm) => MappingKind::Basic(BasicMapping {
486-
readable: bm.readable,
487-
writable: false,
488-
executable: bm.executable,
489-
}),
490-
MappingKind::Unmapped => continue,
491-
};
492-
let new_gpa = phys_seen.entry(mapping.phys_base).or_insert_with(|| {
493-
let new_offset = snapshot_memory.len();
494-
snapshot_memory.extend(contents);
495-
new_offset + SandboxMemoryLayout::BASE_ADDRESS
496-
});
497-
498-
pt_buf.set_root_offset(_root_idx * PAGE_SIZE);
499-
500-
let mapping = Mapping {
501-
phys_base: *new_gpa as u64,
502-
virt_base: mapping.virt_base,
503-
len: PAGE_SIZE as u64,
504-
kind,
505-
user_accessible: mapping.user_accessible,
506-
};
507-
unsafe { vmem::map(&pt_buf, mapping) };
479+
let mut built_roots: BTreeMap<SpaceId, u64> = BTreeMap::new();
480+
for (root_idx, (space_id, mappings)) in walk.into_iter().enumerate() {
481+
pt_buf.set_root_offset(root_idx * PAGE_SIZE);
482+
built_roots.insert(
483+
space_id,
484+
(layout.get_pt_base_gpa() as usize + root_idx * PAGE_SIZE) as u64,
485+
);
486+
487+
for sam in mappings {
488+
match sam {
489+
SpaceAwareMapping::ThisSpace(mapping) => {
490+
// Drop the scratch region and (on
491+
// amd64) the snapshot's own PT
492+
// self-map; both are re-mapped
493+
// freshly by `map_specials`.
494+
if skip_virt(mapping.virt_base, scratch_gva) {
495+
continue;
496+
}
497+
let Some(contents) = (unsafe {
498+
guest_page(snap_c, scratch_c, &regions, layout, mapping.phys_base)
499+
}) else {
500+
continue;
501+
};
502+
503+
// Writable pages become CoW in the
504+
// rebuilt snapshot; read-only pages
505+
// stay read-only.
506+
let kind = match mapping.kind {
507+
MappingKind::Cow(cm) => MappingKind::Cow(cm),
508+
MappingKind::Basic(bm) if bm.writable => {
509+
MappingKind::Cow(CowMapping {
510+
readable: bm.readable,
511+
executable: bm.executable,
512+
})
513+
}
514+
MappingKind::Basic(bm) => MappingKind::Basic(BasicMapping {
515+
readable: bm.readable,
516+
writable: false,
517+
executable: bm.executable,
518+
}),
519+
MappingKind::Unmapped => continue,
520+
};
521+
let new_gpa =
522+
phys_seen.entry(mapping.phys_base).or_insert_with(|| {
523+
let new_offset = snapshot_memory.len();
524+
snapshot_memory.extend(contents);
525+
new_offset + SandboxMemoryLayout::BASE_ADDRESS
526+
});
527+
528+
let compacted = Mapping {
529+
phys_base: *new_gpa as u64,
530+
virt_base: mapping.virt_base,
531+
len: PAGE_SIZE as u64,
532+
kind,
533+
user_accessible: mapping.user_accessible,
534+
};
535+
unsafe { vmem::map(&pt_buf, compacted) };
536+
}
537+
SpaceAwareMapping::AnotherSpace(ref_map) => {
538+
// Link to the owning space's already-
539+
// rebuilt intermediate table — this
540+
// is what preserves Nanvix's
541+
// kernel-half-shared invariant across
542+
// process PDs after relocation.
543+
unsafe {
544+
vmem::space_aware_map(&pt_buf, ref_map, &built_roots);
545+
}
546+
}
547+
}
548+
}
508549
}
509550

510551
// Phase 3: Map the scratch region into each root.

0 commit comments

Comments
 (0)