Skip to content

Commit d2dc25a

Browse files
simongdaviesCopilot
andcommitted
feat: add ERR_REENTRANT deadlock detection for host callbacks
When a host function or print callback attempts to call sandbox methods (callHandler, snapshot, restore, unload) while guest code is executing, return ERR_REENTRANT immediately instead of deadlocking. Implementation: - Add executing_flag (AtomicBool) to LoadedJSSandboxWrapper - Set flag true during spawn_blocking, false after completion - with_inner/take_inner_with use try_lock when flag is set - New ErrorCode::Reentrant variant with ERR_REENTRANT code This primarily prevents guaranteed deadlocks in host function callbacks (where the Rust side block_on's waiting for the result while holding the lock). For print callbacks (fire-and-forget) it's a safety net. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent c48ac5c commit d2dc25a

1 file changed

Lines changed: 66 additions & 14 deletions

File tree

src/js-host-api/src/lib.rs

Lines changed: 66 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,8 @@ enum ErrorCode {
123123
Consumed,
124124
/// Internal / unexpected failure (lock poison, task join error, etc.).
125125
Internal,
126+
/// Reentrant call detected — calling sandbox methods from a host callback.
127+
Reentrant,
126128
}
127129

128130
impl ErrorCode {
@@ -135,6 +137,7 @@ impl ErrorCode {
135137
Self::InvalidArg => "ERR_INVALID_ARG",
136138
Self::Consumed => "ERR_CONSUMED",
137139
Self::Internal => "ERR_INTERNAL",
140+
Self::Reentrant => "ERR_REENTRANT",
138141
}
139142
}
140143
}
@@ -230,6 +233,15 @@ fn join_error(err: tokio::task::JoinError) -> napi::Error {
230233
)
231234
}
232235

236+
/// Creates an error for reentrant calls detected at runtime.
237+
fn reentrant_error() -> napi::Error {
238+
hl_error(
239+
ErrorCode::Reentrant,
240+
"Cannot call sandbox methods from a host callback — the sandbox lock is held during \
241+
guest execution. Perform this operation after callHandler() resolves instead",
242+
)
243+
}
244+
233245
// ── Snapshot ─────────────────────────────────────────────────────────
234246

235247
/// A captured point-in-time state of a sandbox.
@@ -435,9 +447,10 @@ impl SandboxBuilderWrapper {
435447
//
436448
// **Reentrancy note:** The print callback runs while the sandbox
437449
// Mutex is held (inside `call_handler`'s `spawn_blocking`).
438-
// Calling Hyperlight APIs that acquire the same lock from within
439-
// the callback (e.g. `snapshot()`, `restore()`, `unload()`) will
440-
// deadlock. Keep print callbacks simple — logging only.
450+
// If user code in the callback attempts to call methods that
451+
// acquire the same lock (e.g. `snapshot()`, `restore()`,
452+
// `unload()`, `callHandler()`), the `executing_flag` deadlock
453+
// detection will return `ERR_REENTRANT` instead of hanging.
441454
let print_fn = move |msg: String| -> i32 {
442455
let status = callback.call(msg, ThreadsafeFunctionCallMode::Blocking);
443456
if status == Status::Ok {
@@ -853,6 +866,7 @@ impl JSSandboxWrapper {
853866
poisoned_flag,
854867
last_call_stats: Arc::new(ArcSwapOption::empty()),
855868
disposed_flag: Arc::new(AtomicBool::new(false)),
869+
executing_flag: Arc::new(AtomicBool::new(false)),
856870
})
857871
}
858872

@@ -935,17 +949,37 @@ pub struct LoadedJSSandboxWrapper {
935949
/// `unload()`), for lock-free checks in sync getters that don't
936950
/// consult the inner Mutex.
937951
disposed_flag: Arc<AtomicBool>,
952+
953+
/// Tracks whether guest code is currently executing inside `call_handler`.
954+
///
955+
/// Used for deadlock detection: if a host callback (print fn, host function)
956+
/// tries to call a method that needs the inner Mutex while this is `true`,
957+
/// we return `ERR_REENTRANT` immediately instead of deadlocking.
958+
executing_flag: Arc<AtomicBool>,
938959
}
939960

940961
type LoadedJSSandboxGuard = OwnedMappedMutexGuard<Option<LoadedJSSandbox>, LoadedJSSandbox>;
941962

942963
impl LoadedJSSandboxWrapper {
943964
/// Borrow the inner value mutably via Mutex, or error if consumed.
965+
///
966+
/// Performs deadlock detection: if `executing_flag` is set (meaning we're
967+
/// inside a `call_handler` on a background thread), `try_lock` is attempted
968+
/// first. If it fails, we know this is a reentrant call from a host callback
969+
/// and return `ERR_REENTRANT` immediately instead of deadlocking.
944970
async fn with_inner<R>(
945971
&self,
946972
f: impl AsyncFnOnce(LoadedJSSandboxGuard) -> napi::Result<R>,
947973
) -> napi::Result<R> {
948-
let sandbox = self.inner.clone().lock_owned().await;
974+
let sandbox = if self.executing_flag.load(Ordering::Acquire) {
975+
// We're inside a host callback — try_lock to detect reentrancy.
976+
match self.inner.clone().try_lock_owned() {
977+
Ok(guard) => guard,
978+
Err(_) => return Err(reentrant_error()),
979+
}
980+
} else {
981+
self.inner.clone().lock_owned().await
982+
};
949983
let sandbox = OwnedMutexGuard::try_map(sandbox, Option::as_mut)
950984
.map_err(|_| consumed_error("LoadedJSSandbox"))?;
951985
f(sandbox).await
@@ -954,30 +988,42 @@ impl LoadedJSSandboxWrapper {
954988
/// Borrow the inner value mutably via Mutex, or error if consumed.
955989
/// The closure `f` will run using spawn_blocking, so it can perform long-running operations without
956990
/// blocking the Node.js event loop. This is the main way to interact with the inner `LoadedJSSandbox`.
991+
///
992+
/// Sets `executing_flag` for the duration of the blocking closure so that
993+
/// reentrant calls from host callbacks are detected instead of deadlocking.
957994
async fn with_blocking_inner<R: Send + 'static>(
958995
&self,
959996
f: impl FnOnce(LoadedJSSandboxGuard) -> napi::Result<R> + Send + 'static,
960997
) -> napi::Result<R> {
998+
let executing_flag = self.executing_flag.clone();
961999
self.with_inner(async move |sandbox| {
962-
tokio::task::spawn_blocking(move || f(sandbox))
1000+
executing_flag.store(true, Ordering::Release);
1001+
let result = tokio::task::spawn_blocking(move || f(sandbox))
9631002
.await
964-
.map_err(join_error)?
1003+
.map_err(join_error)?;
1004+
executing_flag.store(false, Ordering::Release);
1005+
result
9651006
})
9661007
.await
9671008
}
9681009

9691010
/// Take ownership of the inner value, returning a consumed-state error if
9701011
/// this instance has already been used.
1012+
///
1013+
/// Performs the same deadlock detection as `with_inner`.
9711014
async fn take_inner_with<R>(
9721015
&self,
9731016
f: impl AsyncFnOnce(LoadedJSSandbox) -> napi::Result<R>,
9741017
) -> napi::Result<R> {
975-
let sandbox = self
976-
.inner
977-
.lock()
978-
.await
979-
.take()
980-
.ok_or_else(|| consumed_error("LoadedJSSandbox"))?;
1018+
let sandbox = if self.executing_flag.load(Ordering::Acquire) {
1019+
match self.inner.try_lock() {
1020+
Ok(mut guard) => guard.take(),
1021+
Err(_) => return Err(reentrant_error()),
1022+
}
1023+
} else {
1024+
self.inner.lock().await.take()
1025+
}
1026+
.ok_or_else(|| consumed_error("LoadedJSSandbox"))?;
9811027
self.disposed_flag.store(true, Ordering::Release);
9821028
f(sandbox).await
9831029
}
@@ -986,14 +1032,20 @@ impl LoadedJSSandboxWrapper {
9861032
/// this instance has already been used.
9871033
/// The closure `f` will run using spawn_blocking, so it can perform long-running operations without
9881034
/// blocking the Node.js event loop. This is the main way to interact with the inner `LoadedJSSandbox`.
1035+
///
1036+
/// Sets `executing_flag` for the duration of the blocking closure.
9891037
async fn take_blocking_inner_with<R: Send + 'static>(
9901038
&self,
9911039
f: impl FnOnce(LoadedJSSandbox) -> napi::Result<R> + Send + 'static,
9921040
) -> napi::Result<R> {
1041+
let executing_flag = self.executing_flag.clone();
9931042
self.take_inner_with(async move |sandbox| {
994-
tokio::task::spawn_blocking(move || f(sandbox))
1043+
executing_flag.store(true, Ordering::Release);
1044+
let result = tokio::task::spawn_blocking(move || f(sandbox))
9951045
.await
996-
.map_err(join_error)?
1046+
.map_err(join_error)?;
1047+
executing_flag.store(false, Ordering::Release);
1048+
result
9971049
})
9981050
.await
9991051
}

0 commit comments

Comments
 (0)