Skip to content

Commit 0afe5fc

Browse files
authored
Handle more failures deallocating pooled memroy (#12888)
* Handle more failures deallocating pooled memroy This commit replaces a few panics in the pooling allocator with error-handling of what happens at runtime. This is a defense-in-depth measure to ensure that the pooling allocator doesn't panic at runtime and instead handles errors where possible. The first path fixed is in `deallocate_memory` where resetting a slot could result in an error being returned on non-Linux platforms, and if this happened it would cause a panic. The error is instead gracefully handled by continuing slot deallocation but avoiding putting the image itself back into memory. This leaves the slot in an `Unknown` state which is already handled by resetting the state upon reuse. The main consequence here is that future statistics about resident bytes won't be accurate, but these are already inaccurate on non-Linux platforms anyway, so there's no loss. The second path fixes is in flushing a `DecommitQueue` where `decommit_pages` was asserted to succeed. Instead now the error is handled by dropping all images and leaving slots in an `Unknown` state, similar to `deallocate_memory`. * Review comments
1 parent 4b661a2 commit 0afe5fc

File tree

3 files changed

+79
-39
lines changed

3 files changed

+79
-39
lines changed

crates/wasmtime/src/runtime/vm/instance/allocator/pooling.rs

Lines changed: 42 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -721,33 +721,52 @@ unsafe impl InstanceAllocator for PoolingInstanceAllocator {
721721
let prev = self.live_memories.fetch_sub(1, Ordering::Relaxed);
722722
debug_assert!(prev > 0);
723723

724-
// Reset the image slot. If there is any error clearing the
725-
// image, just drop it here, and let the drop handler for the
726-
// slot unmap in a way that retains the address space
727-
// reservation.
724+
// Reset the image slot. Depending on whether this is successful or not
725+
// the `image` is preserved for future use. On success it's queued up to
726+
// get deallocated later, and on failure the slot is deallocated
727+
// immediately without preserving the image.
728728
let mut image = memory.unwrap_static_image();
729729
let mut queue = DecommitQueue::default();
730-
let bytes_resident = image
731-
.clear_and_remain_ready(
732-
self.pagemap.as_ref(),
733-
self.memories.keep_resident,
734-
|ptr, len| {
735-
// SAFETY: the memory in `image` won't be used until this
736-
// decommit queue is flushed, and by definition the memory is
737-
// not in use when calling this function.
738-
unsafe {
739-
queue.push_raw(ptr, len);
740-
}
741-
},
742-
)
743-
.expect("failed to reset memory image");
730+
let bytes_resident = image.clear_and_remain_ready(
731+
self.pagemap.as_ref(),
732+
self.memories.keep_resident,
733+
|ptr, len| {
734+
// SAFETY: the memory in `image` won't be used until this
735+
// decommit queue is flushed, and by definition the memory is
736+
// not in use when calling this function.
737+
unsafe {
738+
queue.push_raw(ptr, len);
739+
}
740+
},
741+
);
744742

745-
// SAFETY: this image is not in use and its memory regions were enqueued
746-
// with `push_raw` above.
747-
unsafe {
748-
queue.push_memory(allocation_index, image, bytes_resident);
743+
match bytes_resident {
744+
Ok(bytes_resident) => {
745+
// SAFETY: this image is not in use and its memory regions were enqueued
746+
// with `push_raw` above.
747+
unsafe {
748+
queue.push_memory(allocation_index, image, bytes_resident);
749+
}
750+
self.merge_or_flush(queue);
751+
}
752+
Err(e) => {
753+
log::warn!("ignoring clear_and_remain_ready error {e}");
754+
// SAFETY: `allocation_index` comes from this pool, as an unsafe
755+
// contract of this function itself, and it's guaranteed to be no
756+
// longer in use so safe to deallocate. The slot couldn't be
757+
// preserved so it's dropped here.
758+
//
759+
// Note that at this point it's not clear how many bytes are
760+
// resident in memory, so it's inevitably going to leave statistics
761+
// a little off. Also note though that non-Linux platforms don't
762+
// keep track of resident bytes anyway, and this path is only
763+
// reachable on non-Linux platforms because Linux can't return an
764+
// error.
765+
unsafe {
766+
self.memories.deallocate(allocation_index, None, 0);
767+
}
768+
}
749769
}
750-
self.merge_or_flush(queue);
751770
}
752771

753772
fn allocate_table<'a, 'b: 'a, 'c: 'a>(

crates/wasmtime/src/runtime/vm/instance/allocator/pooling/decommit_queue.rs

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
use super::PoolingInstanceAllocator;
1212
use crate::vm::{MemoryAllocationIndex, MemoryImageSlot, Table, TableAllocationIndex};
1313
use smallvec::SmallVec;
14+
use std::io;
1415

1516
#[cfg(feature = "async")]
1617
use wasmtime_fiber::FiberStack;
@@ -155,18 +156,14 @@ impl DecommitQueue {
155156
self.stacks.push((SendSyncStack(stack), bytes_resident));
156157
}
157158

158-
fn decommit_all_raw(&mut self) {
159+
/// Returns if any decommit call failed.
160+
fn decommit_all_raw(&mut self) -> io::Result<()> {
159161
for iovec in self.raw.drain(..) {
160162
unsafe {
161-
crate::vm::sys::vm::decommit_pages(iovec.0.iov_base.cast(), iovec.0.iov_len)
162-
.unwrap_or_else(|e| {
163-
panic!(
164-
"failed to decommit ptr={:#p}, len={:#x}: {e}",
165-
iovec.0.iov_base, iovec.0.iov_len
166-
)
167-
});
163+
crate::vm::sys::vm::decommit_pages(iovec.0.iov_base.cast(), iovec.0.iov_len)?;
168164
}
169165
}
166+
Ok(())
170167
}
171168

172169
/// Flush this queue, decommitting all enqueued regions in batch.
@@ -175,14 +172,26 @@ impl DecommitQueue {
175172
/// the associated free lists; `false` if the queue was empty.
176173
pub fn flush(mut self, pool: &PoolingInstanceAllocator) -> bool {
177174
// First, do the raw decommit syscall(s).
178-
self.decommit_all_raw();
175+
let decommit_succeeded = self.decommit_all_raw().is_ok();
179176

180177
// Second, restore the various entities to their associated pools' free
181178
// lists. This is safe, and they are ready for reuse, now that their
182179
// memory regions have been decommitted.
180+
//
181+
// Note that for memory images the images are all dropped here and
182+
// ignored if any decommits failed. This signifies how the state of the
183+
// slot is unknown and needs to be paved over in the future. Also note
184+
// that `bytes_resident` is probably too low, but there's no other
185+
// precise way to know, so it's left here as-is and it'll get reset when
186+
// the slot is reused.
183187
let mut deallocated_any = false;
184188
for (allocation_index, image, bytes_resident) in self.memories {
185189
deallocated_any = true;
190+
let image = if decommit_succeeded {
191+
Some(image)
192+
} else {
193+
None
194+
};
186195
unsafe {
187196
pool.memories
188197
.deallocate(allocation_index, image, bytes_resident);

crates/wasmtime/src/runtime/vm/instance/allocator/pooling/memory_pool.rs

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -456,18 +456,23 @@ impl MemoryPool {
456456

457457
/// Deallocate a previously-allocated memory.
458458
///
459+
/// If `image` is `None` then the state of this memory's slot is left
460+
/// unknown. Otherwise `image` is used to retain information about the state
461+
/// of this slot.
462+
///
459463
/// # Safety
460464
///
461465
/// The memory must have been previously allocated from this pool and
462466
/// assigned the given index, must currently be in an allocated state, and
463467
/// must never be used again.
464468
///
465469
/// The caller must have already called `clear_and_remain_ready` on the
466-
/// memory's image and flushed any enqueued decommits for this memory.
470+
/// memory's image and flushed any enqueued decommits for this memory. Note
471+
/// that if `image` is `None` then this is not required.
467472
pub unsafe fn deallocate(
468473
&self,
469474
allocation_index: MemoryAllocationIndex,
470-
image: MemoryImageSlot,
475+
image: Option<MemoryImageSlot>,
471476
bytes_resident: usize,
472477
) {
473478
self.return_memory_image_slot(allocation_index, image);
@@ -520,7 +525,7 @@ impl MemoryPool {
520525
let index = MemoryAllocationIndex(id.0);
521526
if let Ok(mut slot) = self.take_memory_image_slot(index) {
522527
if slot.remove_image().is_ok() {
523-
self.return_memory_image_slot(index, slot);
528+
self.return_memory_image_slot(index, Some(slot));
524529
}
525530
}
526531

@@ -587,16 +592,23 @@ impl MemoryPool {
587592
}
588593

589594
/// Return ownership of the given image slot.
595+
///
596+
/// If `slot` is not provided then it's reset with `Unknown` meaning a
597+
/// future allocation will need to pave over it to use it.
590598
fn return_memory_image_slot(
591599
&self,
592600
allocation_index: MemoryAllocationIndex,
593-
slot: MemoryImageSlot,
601+
slot: Option<MemoryImageSlot>,
594602
) {
595-
assert!(!slot.is_dirty());
596-
597603
let prev = mem::replace(
598604
&mut *self.image_slots[allocation_index.index()].lock().unwrap(),
599-
ImageSlot::PreviouslyUsed(slot),
605+
match slot {
606+
Some(slot) => {
607+
assert!(!slot.is_dirty());
608+
ImageSlot::PreviouslyUsed(slot)
609+
}
610+
None => ImageSlot::Unknown,
611+
},
600612
);
601613
assert!(matches!(prev, ImageSlot::Unknown));
602614
}

0 commit comments

Comments
 (0)