Skip to content

Commit e1dde69

Browse files
committed
fix(apf): resolve non exitless errors for apf
There were a couple of error on the non exitless path for APF. We saw these with the memory hotplug which flodded the APF buffer. Signed-off-by: Jack Thomson <jackabt@amazon.com>
1 parent 5d152eb commit e1dde69

7 files changed

Lines changed: 151 additions & 67 deletions

File tree

resources/seccomp/x86_64-unknown-linux-musl.json

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,18 @@
508508
{
509509
"syscall": "mprotect",
510510
"comment": "Used by memory hotplug to protect access to underlying host memory"
511+
},
512+
{
513+
"syscall": "ioctl",
514+
"args": [
515+
{
516+
"index": 1,
517+
"type": "dword",
518+
"op": "eq",
519+
"val": 1075883734,
520+
"comment": "KVM_ASYNC_PF (accept/ready/sync_complete)"
521+
}
522+
]
511523
}
512524
]
513525
},
@@ -580,8 +592,8 @@
580592
"comment": "sigaltstack is used by Rust stdlib to remove alternative signal stack during thread teardown."
581593
},
582594
{
583-
"syscall": "getrandom",
584-
"comment": "getrandom is used by `HttpServer` to reinialize `HashMap` after moving to the API thread"
595+
"syscall": "getrandom",
596+
"comment": "getrandom is used by `HttpServer` to reinialize `HashMap` after moving to the API thread"
585597
},
586598
{
587599
"syscall": "accept4",
@@ -1242,6 +1254,22 @@
12421254
"comment": "KVM_IRQFD"
12431255
}
12441256
]
1257+
},
1258+
{
1259+
"syscall": "sendto",
1260+
"comment": "APF fallback: vCPU sends fault request to handler via socket"
1261+
},
1262+
{
1263+
"syscall": "ioctl",
1264+
"args": [
1265+
{
1266+
"index": 1,
1267+
"type": "dword",
1268+
"op": "eq",
1269+
"val": 1075883734,
1270+
"comment": "KVM_ASYNC_PF (accept/ready/sync_complete)"
1271+
}
1272+
]
12451273
}
12461274
]
12471275
}

src/firecracker/examples/uffd/on_demand_handler.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ fn main() {
3232
.accept()
3333
.expect("Cannot listen on APF UDS socket");
3434
apf_stream
35+
.set_nonblocking(true)
36+
.expect("Cannot set APF stream non-blocking");
37+
apf_stream
3538
} else {
3639
let (apf_stream, _) = UnixStream::pair().expect("Cannot create APF socket pair");
3740
apf_stream

src/firecracker/examples/uffd/uffd_utils.rs

Lines changed: 51 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -231,8 +231,8 @@ pub struct FaultRequest {
231231
pub offset: u64,
232232
/// Flags
233233
pub flags: u64,
234-
/// Async PF token
235-
pub token: Option<u32>,
234+
/// Async PF GPA (set for APF fallback faults, None for sync faults)
235+
pub gpa: Option<u64>,
236236
}
237237

238238
impl FaultRequest {
@@ -242,7 +242,7 @@ impl FaultRequest {
242242
offset: self.offset,
243243
len,
244244
flags: self.flags,
245-
token: self.token,
245+
gpa: self.gpa,
246246
zero: false,
247247
}
248248
}
@@ -259,8 +259,8 @@ pub struct FaultReply {
259259
pub len: u64,
260260
/// Flags, must be copied from `FaultRequest`, otherwise 0
261261
pub flags: u64,
262-
/// Async PF token, must be copied from `FaultRequest`, otherwise None
263-
pub token: Option<u32>,
262+
/// Async PF GPA, must be copied from `FaultRequest`, otherwise None
263+
pub gpa: Option<u64>,
264264
/// Whether the populated pages are zero pages
265265
pub zero: bool,
266266
}
@@ -423,6 +423,17 @@ impl UffdHandler {
423423
}
424424
}
425425

426+
/// Convert a guest physical address to an offset in the backing memory file.
427+
#[inline]
428+
pub fn gpa_to_offset(&self, gpa: u64) -> Option<usize> {
429+
for region in &self.mem_regions {
430+
if region.gpa_start <= gpa && gpa < region.gpa_start + region.size as u64 {
431+
return Some((gpa - region.gpa_start + region.offset) as usize);
432+
}
433+
}
434+
None
435+
}
436+
426437
pub fn read_event(&mut self) -> Result<Option<Event>, Error> {
427438
self.uffd.read_event()
428439
}
@@ -733,6 +744,13 @@ impl Runtime {
733744
self.stream.write_all(&encoded).unwrap();
734745
}
735746

747+
pub fn send_apf_fault_reply(&mut self, fault_reply: FaultReply) {
748+
let encoded = bitcode::encode(&fault_reply);
749+
let size = (encoded.len() as u32).to_le_bytes();
750+
self.apf_stream.write_all(&size).unwrap();
751+
self.apf_stream.write_all(&encoded).unwrap();
752+
}
753+
736754
pub fn construct_handler(
737755
stream: &UnixStream,
738756
backing_memory: *mut u8,
@@ -857,6 +875,11 @@ impl Runtime {
857875

858876
let mut uffd_msg_iter =
859877
UffdMsgIterBitcode::new(self.stream.try_clone().expect("Failed to clone stream"));
878+
let mut apf_msg_iter = UffdMsgIterBitcode::new(
879+
self.apf_stream
880+
.try_clone()
881+
.expect("Failed to clone APF stream"),
882+
);
860883

861884
loop {
862885
let pollfd_ptr = pollfds.as_mut_ptr();
@@ -879,52 +902,40 @@ impl Runtime {
879902
if fd.fd == stream_fd {
880903
for fault_request in uffd_msg_iter.by_ref() {
881904
let page_size = self.handler.page_size;
882-
905+
let offset = fault_request
906+
.gpa
907+
.and_then(|gpa| self.handler.gpa_to_offset(gpa))
908+
.unwrap_or(fault_request.offset as usize);
883909
assert!(
884-
(fault_request.offset as usize) < self.handler.size(),
910+
offset < self.handler.size(),
885911
"received bogus offset from firecracker"
886912
);
887-
888-
pf_vcpu_event_dispatch(
889-
&mut self.handler,
890-
fault_request.offset as usize,
891-
);
892-
913+
pf_vcpu_event_dispatch(&mut self.handler, offset);
893914
self.send_fault_reply(fault_request.into_reply(page_size as u64));
894915
}
895916
} else if fd.fd == apf_stream_fd {
896-
// APF fallback path: fault requests over socket
897-
for fault_request in uffd_msg_iter.by_ref() {
917+
// APF fallback path: read from APF socket, reply on APF socket
918+
for fault_request in apf_msg_iter.by_ref() {
898919
let page_size = self.handler.page_size;
899-
920+
let offset = fault_request
921+
.gpa
922+
.and_then(|gpa| self.handler.gpa_to_offset(gpa))
923+
.unwrap_or(fault_request.offset as usize);
900924
assert!(
901-
(fault_request.offset as usize) < self.handler.size(),
902-
"received bogus offset from firecracker"
903-
);
904-
905-
pf_vcpu_event_dispatch(
906-
&mut self.handler,
907-
fault_request.offset as usize,
925+
offset < self.handler.size(),
926+
"received bogus offset from APF handler"
908927
);
909-
910-
self.send_fault_reply(fault_request.into_reply(page_size as u64));
928+
pf_vcpu_event_dispatch(&mut self.handler, offset);
929+
self.send_apf_fault_reply(fault_request.into_reply(page_size as u64));
911930
}
912931
} else if let Some(ctx) = self.exitless_vcpus.get_mut(&fd.fd) {
913932
// Exitless APF: drain notify ring and resolve pages
914933
ctx.drain_eventfd();
915934
while let Some(entry) = ctx.notify_ring().pop() {
916-
let gpa = entry.gpa;
917-
// Find the region and offset for this GPA
918-
for region in &self.handler.mem_regions {
919-
if region.gpa_start <= gpa
920-
&& gpa < region.gpa_start + region.size as u64
921-
{
922-
let offset = (gpa - region.gpa_start + region.offset) as usize;
923-
pf_vcpu_event_dispatch(&mut self.handler, offset);
924-
break;
925-
}
935+
if let Some(offset) = self.handler.gpa_to_offset(entry.gpa) {
936+
pf_vcpu_event_dispatch(&mut self.handler, offset);
926937
}
927-
ctx.signal_ready(gpa);
938+
ctx.signal_ready(entry.gpa);
928939
}
929940
} else {
930941
// Handle one of uffd page faults
@@ -977,6 +988,9 @@ mod tests {
977988
let (apf_stream, _) = apf_listener
978989
.accept()
979990
.expect("Cannot listen on APF UDS socket");
991+
apf_stream
992+
.set_nonblocking(true)
993+
.expect("Cannot set APF stream non-blocking");
980994
// Update runtime with actual runtime
981995
let runtime = uninit_runtime.write(Runtime::new(stream, file, apf_stream));
982996
runtime.run(|_: &mut UffdHandler| {}, |_: &mut UffdHandler, _: usize| {});
@@ -1024,7 +1038,7 @@ mod tests {
10241038
vcpu: 0,
10251039
offset: 0xDEAD_0000, // way beyond 0x1000 handler size
10261040
flags: 0,
1027-
token: None,
1041+
gpa: None,
10281042
};
10291043
let encoded = bitcode::encode(&bogus_request);
10301044
let size = (encoded.len() as u32).to_le_bytes();

src/vmm/src/builder.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,7 @@ pub fn build_microvm_for_boot(
430430
_guest_memfd: None,
431431
_eventfd: None,
432432
apf_stream: None,
433+
apf_stream_fd: None,
433434
apf_supported: false,
434435
exitless_apf: Vec::new(),
435436
exitless_apf_setup_done: false,
@@ -769,7 +770,9 @@ pub fn build_microvm_from_snapshot(
769770
use vmm_sys_util::sock_ctrl_msg::ScmSocket;
770771
if let Some(uff_socket) = uffd_socket {
771772
let broker = UffdMessageBroker::new(uff_socket);
772-
let apf_exitless = apf_supported && std::env::var("FC_APF_NO_EXITLESS").is_err();
773+
let apf_exitless = apf_supported
774+
&& std::env::var("FC_APF_NO_EXITLESS").is_err()
775+
&& !std::path::Path::new("/apf_no_exitless").exists();
773776
let (contexts, done) = if apf_exitless {
774777
let mut contexts = Vec::new();
775778
let mut handler_fds = Vec::new();
@@ -885,6 +888,12 @@ pub fn build_microvm_from_snapshot(
885888
device_manager,
886889
_guest_memfd: None,
887890
_eventfd: None,
891+
apf_stream_fd: apf_stream.as_ref().map(|s| {
892+
s.lock()
893+
.expect("APF stream lock poisoned")
894+
.stream()
895+
.as_raw_fd()
896+
}),
888897
apf_stream,
889898
apf_supported,
890899
exitless_apf: exitless_apf_contexts,
@@ -1246,6 +1255,7 @@ pub(crate) mod tests {
12461255
_guest_memfd: None,
12471256
_eventfd: None,
12481257
apf_stream: None,
1258+
apf_stream_fd: None,
12491259
apf_supported: false,
12501260
exitless_apf: Vec::new(),
12511261
exitless_apf_setup_done: false,

src/vmm/src/lib.rs

Lines changed: 12 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,6 @@ use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEvent
135135
use seccomp::BpfProgram;
136136
use snapshot::Persist;
137137
use userfaultfd::Uffd;
138-
use vm_memory::GuestAddress;
139138
use vmm_sys_util::epoll::EventSet;
140139
use vmm_sys_util::eventfd::EventFd;
141140
use vmm_sys_util::terminal::Terminal;
@@ -170,9 +169,7 @@ use crate::vmm_config::memory_hotplug::MemoryHotplugConfig;
170169
use crate::vmm_config::mmds::MmdsConfig;
171170
use crate::vmm_config::net::NetworkInterfaceConfig;
172171
use crate::vmm_config::vsock::VsockDeviceConfig;
173-
use crate::vstate::memory::{
174-
GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion,
175-
};
172+
use crate::vstate::memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion};
176173
use crate::vstate::memory::{KVM_APF_OP_READY, KVM_ASYNC_PF, KvmAPFReq};
177174
use crate::vstate::vcpu::VcpuState;
178175
pub use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuEvent, VcpuHandle, VcpuResponse};
@@ -355,6 +352,8 @@ pub struct Vmm {
355352
pub(crate) _eventfd: Option<i32>,
356353
/// APF stream shared with vCPUs
357354
apf_stream: Option<vstate::vcpu::SharedApfStream>,
355+
/// Cached raw fd for APF stream (avoids mutex lock per event loop iteration)
356+
apf_stream_fd: Option<RawFd>,
358357
/// Whether the kernel supports KVM_CAP_ASYNC_PF_USERFAULT
359358
apf_supported: bool,
360359
/// Exitless APF contexts (one per vCPU)
@@ -873,18 +872,13 @@ impl Vmm {
873872
}
874873

875874
fn process_vcpu_userfault(&mut self, vcpu: u32, userfault_data: UserfaultData) {
876-
// APF requests are now sent directly by vCPUs, so we only handle sync faults here
877-
let offset = self
878-
.vm
879-
.guest_memory()
880-
.gpa_to_offset(GuestAddress(userfault_data.gpa))
881-
.expect("Failed to convert GPA to offset");
882-
875+
// Sync path: vCPU blocked on condvar, send fault to handler for resolution.
876+
// Always send GPA — the handler converts GPA→offset via its mem_regions.
883877
let fault_request = FaultRequest {
884878
vcpu,
885-
offset,
879+
offset: 0,
886880
flags: userfault_data.flags,
887-
gpa: None,
881+
gpa: Some(userfault_data.gpa),
888882
};
889883

890884
self.uffd_socket
@@ -1051,15 +1045,11 @@ impl MutEventSubscriber for Vmm {
10511045
self.process_uffd_socket();
10521046
}
10531047

1054-
if let Some(apf_stream) = &self.apf_stream {
1055-
let fd = apf_stream
1056-
.lock()
1057-
.expect("APF stream lock poisoned")
1058-
.stream()
1059-
.as_raw_fd();
1060-
if source == fd && event_set == EventSet::IN {
1061-
self.process_apf_socket();
1062-
}
1048+
if let Some(fd) = self.apf_stream_fd
1049+
&& source == fd
1050+
&& event_set == EventSet::IN
1051+
{
1052+
self.process_apf_socket();
10631053
}
10641054
}
10651055

0 commit comments

Comments
 (0)