fix(platform): improve handling of bursts in running scripts

azasypkin · azasypkin · commit 62ad6d163868 · 2026-05-30T18:43:44.000+02:00
diff --git a/src/js_runtime.rs b/src/js_runtime.rs
@@ -18,7 +18,7 @@ use deno_core::{JsRuntimeForSnapshot, PollEventLoopOptions, RuntimeOptions, scop
 use serde::Deserialize;
 use std::{
     sync::{
-        Arc, OnceLock,
+        Arc, Once, OnceLock,
         atomic::{AtomicBool, AtomicUsize, Ordering},
     },
     time::{Duration, Instant},
@@ -73,14 +73,34 @@ globalThis.secutils = {
 /// which is exactly what we want.
 static STARTUP_SNAPSHOT: OnceLock<&'static [u8]> = OnceLock::new();
 
+/// Guards a single `deno_core::JsRuntime::init_platform` call for the whole process.
+static PLATFORM_INIT: Once = Once::new();
+
+/// Initialises the V8 platform exactly once, no matter which entry point gets there first.
+/// `JsRuntime::init_platform` (the server boot path) and the lazy snapshot build (the path tests
+/// and benches hit when they call `execute_script` without an explicit platform init) both funnel
+/// through here. The V8 platform must be installed *before* the first isolate is created, otherwise
+/// concurrent isolate creation across worker threads races on V8's implicit default-platform setup
+/// and trips a fatal `v8::HandleScope::CreateHandle` ("Cannot create a handle without a
+/// HandleScope") during GC.
+fn ensure_platform() {
+    PLATFORM_INIT.call_once(|| {
+        deno_core::JsRuntime::init_platform(None);
+    });
+}
+
 fn build_startup_snapshot() -> &'static [u8] {
-    // The snapshot intentionally does not bake in `secutils_ext` or any JS
-    // modules: baking ops into a snapshot requires the snapshotting runtime
-    // to register the exact same op layout at runtime (a minefield of subtle
-    // version/feature mismatches). What we capture here is the expensive
-    // part - the V8 context setup and builtin JS globals. `secutils_ext` is
-    // still registered at `JsRuntime::new` time per invocation, but on top
-    // of a warm, pre-initialised context.
+    // The V8 platform has to be live before we create the snapshotting isolate (the first isolate
+    // in the process on the lazy path). Doing it here means every consumer - server, benches, and
+    // the test binary - gets a properly initialised platform regardless of whether
+    // `JsRuntime::init_platform` was called explicitly.
+    ensure_platform();
+
+    // The snapshot intentionally does not bake in `secutils_ext` or any JS modules: baking ops into
+    // a snapshot requires the snapshotting runtime to register the exact same op layout at runtime
+    // (a minefield of subtle version/feature mismatches). What we capture here is the expensive
+    // part - the V8 context setup and builtin JS globals. `secutils_ext` is still registered at
+    // `JsRuntime::new` time per invocation, but on top of a warm, pre-initialised context.
     let runtime = JsRuntimeForSnapshot::new(RuntimeOptions::default());
     let snapshot = runtime.snapshot();
     Box::leak(snapshot) as &[u8]
@@ -198,7 +218,7 @@ impl JsRuntime {
     /// snapshot, and eagerly spins up the worker pool. Should be called exactly
     /// once, from the main thread, during server startup.
     pub fn init_platform() {
-        deno_core::JsRuntime::init_platform(None);
+        ensure_platform();
         // Build the snapshot on the main thread before any worker boots so the
         // first script execution on each worker does not pay for it. V8 requires
         // the snapshotting isolate to run on a single thread, which is why we
diff --git a/src/js_runtime/worker_pool.rs b/src/js_runtime/worker_pool.rs
@@ -1,41 +1,51 @@
-//! A pool of long-lived worker threads, each owning a persistent
-//! `CurrentThread` tokio runtime + `LocalSet`. Script executions are
-//! dispatched round-robin to workers over an unbounded mpsc channel and run
-//! *concurrently* within each worker (each task is `spawn_local`-ed), so a
-//! script that parks on an async op - e.g. a `secutils.kv.watch` long-poll
-//! that can idle for tens of seconds - never blocks the other scripts sharing
-//! its worker thread. CPU-bound work is still cooperative: a script doing
-//! synchronous V8 work holds the thread until its next await point.
+//! An elastic pool of worker threads, each owning a persistent `CurrentThread` tokio runtime. A
+//! submitted script runs on exactly one worker thread and is driven to completion there
+//! (`block_on`) before that worker accepts the next task - i.e. **at most one V8 isolate is ever
+//! live per worker thread at a time**.
 //!
-//! This replaces the previous per-call `spawn_blocking` + `new_current_thread`
-//! pattern, which paid the full cost of building a fresh tokio runtime (and
-//! its I/O driver, which consumes a kqueue/epoll fd) on every invocation.
+//! ## Why one isolate per thread
 //!
-//! Each task still creates a fresh V8 isolate for strong isolation between
-//! scripts; reusing the worker thread and its tokio runtime is what yields
-//! the steady-state win. Future work (V8 startup snapshot, isolate pooling)
-//! can further reduce per-task cost on top of this foundation.
+//! A V8 isolate is pinned to the thread that created it, and V8 tracks the "current isolate" and
+//! the active `HandleScope` in *thread-local* state. Interleaving the event loops of two isolates
+//! on a single OS thread (e.g. by `spawn_local`-ing several script futures onto one `LocalSet`)
+//! corrupts that thread-local state: when one isolate parks on an `await` and another isolate
+//! resumes on the same thread, V8 can try to create a handle while the thread's current
+//! `HandleScope` belongs to the *other* isolate, aborting the process with a fatal
+//! `v8::HandleScope::CreateHandle` ("Cannot create a handle without a HandleScope"). Debug V8
+//! asserts this eagerly; release builds elide the check, but the underlying state corruption is
+//! still UB.
 //!
-//! Note: we deliberately keep an unbounded channel here. Back-pressure for
-//! user-visible workloads is already applied upstream (e.g., the
-//! `max_concurrent_responder_requests` semaphore in the responder handler),
-//! and queueing cheap `ScriptTask`s is preferable to blocking the producing
-//! future on `mpsc::Sender::send`.
+//! So concurrency is achieved by running each concurrent script on its own thread, never by sharing
+//! a thread between two isolates.
+//!
+//! ## Elasticity
+//!
+//! A baseline of `min_workers` threads is pre-spawned and kept warm so the common case pays no
+//! thread-creation cost. When every worker is busy (for instance because several scripts are parked
+//! on a `secutils.kv.watch` long-poll that can idle for tens of seconds), additional workers are
+//! spawned on demand up to `max_workers`. Overflow workers above the baseline exit after
+//! `IDLE_TIMEOUT` of inactivity so a burst of long-polls does not leave threads lingering forever.
+//!
+//! Back-pressure for user-visible workloads is already applied upstream (e.g. the
+//! `max_concurrent_responder_requests` semaphore in the responder handler), so the task queue is
+//! unbounded: queueing a cheap `ScriptTask` is preferable to blocking the producing future.
 
 use std::{
+    collections::VecDeque,
     future::Future,
     pin::Pin,
-    sync::{
-        OnceLock,
-        atomic::{AtomicUsize, Ordering},
-    },
+    sync::{Condvar, Mutex, OnceLock},
+    time::Duration,
 };
-use tokio::{runtime::Builder, sync::mpsc, task::LocalSet};
+use tokio::runtime::Builder;
 
-/// A boxed, thread-movable closure that, when invoked on a worker thread,
-/// yields a (!Send) future performing the actual script work. The future is
-/// `!Send` because V8 isolates are tied to the thread that created them;
-/// running it inside a `LocalSet` is sufficient.
+/// How long an overflow worker (one spawned above `min_workers`) waits for new work before exiting
+/// and releasing its thread + tokio runtime.
+const IDLE_TIMEOUT: Duration = Duration::from_secs(30);
+
+/// A boxed, thread-movable closure that, when invoked on a worker thread, yields a (!Send) future
+/// performing the actual script work. The future is `!Send` because V8 isolates are tied to the
+/// thread that created them; it is block_on-ed to completion on the worker that picked up the task.
 type TaskBuilder = Box<dyn FnOnce() -> Pin<Box<dyn Future<Output = ()> + 'static>> + Send>;
 
 /// A unit of work dispatched to a worker. Owns everything it needs, including
@@ -56,82 +66,146 @@ impl ScriptTask {
     }
 }
 
-/// A round-robin pool of worker threads. Each worker has its own tokio
-/// `CurrentThread` runtime and `LocalSet`; tasks submitted to a worker run
-/// sequentially in FIFO order, so the pool provides up to `workers.len()`-way
-/// parallelism across workers.
+/// Mutable pool bookkeeping, guarded by [`Shared::lock`].
+struct State {
+    /// FIFO queue of tasks waiting for a free worker.
+    tasks: VecDeque<ScriptTask>,
+    /// Total number of live worker threads (busy + idle).
+    total: usize,
+    /// Number of worker threads currently parked waiting for a task.
+    idle: usize,
+}
+
+struct Shared {
+    lock: Mutex<State>,
+    /// Signalled when a task is enqueued (wakes one parked worker).
+    available: Condvar,
+}
+
+/// An elastic pool with a single shared queue instead of round-robin dispatch: any idle worker
+/// pops the next queued task, so work is naturally balanced across whatever workers are free.
 pub struct WorkerPool {
-    workers: Vec<mpsc::UnboundedSender<ScriptTask>>,
-    next: AtomicUsize,
+    shared: &'static Shared,
+    min_workers: usize,
+    max_workers: usize,
 }
 
 impl WorkerPool {
-    fn new(num_workers: usize) -> Self {
-        let num_workers = num_workers.max(1);
-        let mut workers = Vec::with_capacity(num_workers);
-        for idx in 0..num_workers {
-            workers.push(spawn_worker(idx));
-        }
-        Self {
-            workers,
-            next: AtomicUsize::new(0),
+    fn new(min_workers: usize, max_workers: usize) -> Self {
+        let min_workers = min_workers.max(1);
+        let max_workers = max_workers.max(min_workers);
+        let shared: &'static Shared = Box::leak(Box::new(Shared {
+            lock: Mutex::new(State {
+                tasks: VecDeque::new(),
+                total: 0,
+                idle: 0,
+            }),
+            available: Condvar::new(),
+        }));
+
+        let pool = Self {
+            shared,
+            min_workers,
+            max_workers,
+        };
+
+        // Pre-spawn the warm baseline so the common path pays no thread-creation latency.
+        {
+            let mut state = shared.lock.lock().expect("worker pool mutex poisoned");
+            for _ in 0..min_workers {
+                state.total += 1;
+                spawn_worker(shared, min_workers);
+            }
         }
+
+        pool
     }
 
-    /// Submit a task to the next worker in round-robin order. Fails only if
-    /// every worker thread has panicked and its receiver has been dropped.
+    /// Submit a task. The task is enqueued and either handed to a parked worker or, if the queue
+    /// has outgrown the parked workers and the pool is below its ceiling, picked up by a freshly
+    /// spawned worker. Always returns `Ok(())` today; the `Result` return type only preserves the
+    /// previous fallible signature for callers.
     pub fn submit(&self, task: ScriptTask) -> Result<(), ScriptTask> {
-        let len = self.workers.len();
-        let start = self.next.fetch_add(1, Ordering::Relaxed) % len;
-        // Try workers starting at the round-robin index; fall through to the
-        // next one only if a worker has crashed (sender closed).
-        let mut task = Some(task);
-        for offset in 0..len {
-            let idx = (start + offset) % len;
-            let t = task.take().expect("task slot must be populated");
-            match self.workers[idx].send(t) {
-                Ok(()) => return Ok(()),
-                Err(err) => task = Some(err.0),
-            }
+        let mut state = self.shared.lock.lock().expect("worker pool mutex poisoned");
+        state.tasks.push_back(task);
+
+        // Grow the pool when the backlog exceeds the parked workers (`idle` also counts workers
+        // already woken for an earlier task) and there is headroom. Otherwise wake a parked worker.
+        if state.tasks.len() > state.idle && state.total < self.max_workers {
+            state.total += 1;
+            spawn_worker(self.shared, self.min_workers);
+        } else {
+            self.shared.available.notify_one();
         }
-        Err(task.expect("task slot must be populated on failure path"))
+
+        Ok(())
     }
 }
 
-fn spawn_worker(index: usize) -> mpsc::UnboundedSender<ScriptTask> {
-    let (tx, mut rx) = mpsc::unbounded_channel::<ScriptTask>();
-    let name = format!("js-runtime-worker-{index}");
+fn spawn_worker(shared: &'static Shared, min_workers: usize) {
     std::thread::Builder::new()
-        .name(name)
-        .spawn(move || {
-            let rt = Builder::new_current_thread()
-                .enable_all()
-                .build()
-                .expect("Failed to build JS runtime worker tokio runtime");
-            let local = LocalSet::new();
-            local.spawn_local(async move {
-                while let Some(task) = rx.recv().await {
-                    // Spawn each task onto the LocalSet so independent scripts
-                    // make progress concurrently on this single thread. A task
-                    // that awaits (DB round-trip, `kv.watch` long-poll, timer)
-                    // yields the thread to its peers instead of blocking them.
-                    let future = (task.build)();
-                    tokio::task::spawn_local(future);
-                }
-            });
-            rt.block_on(local);
-        })
+        .name("js-runtime-worker".to_string())
+        .spawn(move || worker_loop(shared, min_workers))
         .expect("Failed to spawn JS runtime worker thread");
-    tx
+}
+
+fn worker_loop(shared: &'static Shared, min_workers: usize) {
+    // A panicking script unwinds through `block_on` and kills this thread. Give the slot back
+    // when that happens, otherwise `total` counts the dead worker forever and the pool shrinks.
+    struct Unwound(&'static Shared);
+    impl Drop for Unwound {
+        fn drop(&mut self) {
+            if std::thread::panicking()
+                && let Ok(mut state) = self.0.lock.lock()
+            {
+                state.total -= 1;
+            }
+        }
+    }
+    let _unwound = Unwound(shared);
+
+    let rt = Builder::new_current_thread()
+        .enable_all()
+        .build()
+        .expect("Failed to build JS runtime worker tokio runtime");
+
+    loop {
+        // Take the next task, parking while the queue is empty.
+        let task = {
+            let mut state = shared.lock.lock().expect("worker pool mutex poisoned");
+            loop {
+                if let Some(task) = state.tasks.pop_front() {
+                    break Some(task);
+                }
+                state.idle += 1;
+                let (guard, wait) = shared
+                    .available
+                    .wait_timeout(state, IDLE_TIMEOUT)
+                    .expect("worker pool mutex poisoned");
+                state = guard;
+                state.idle -= 1;
+
+                // Overflow workers retire after an idle timeout; the warm baseline never does.
+                if wait.timed_out() && state.tasks.is_empty() && state.total > min_workers {
+                    state.total -= 1;
+                    break None;
+                }
+            }
+        };
+
+        // `block_on` runs the script to completion here; no other isolate shares this thread.
+        let Some(task) = task else { return };
+        rt.block_on((task.build)());
+    }
 }
 
 /// Process-wide worker pool shared by every `JsRuntime::execute_script` call.
 static POOL: OnceLock<WorkerPool> = OnceLock::new();
 
-/// Worker count for the global pool. Overridable via `SECUTILS_JS_WORKERS` for
-/// local experimentation and CI; defaults to the parallelism reported by the
-/// OS, with a floor of 2 so even tiny CI boxes keep some concurrency.
-fn default_worker_count() -> usize {
+/// Warm baseline worker count. Overridable via `SECUTILS_JS_WORKERS` for local experimentation and
+/// CI; defaults to the parallelism reported by the OS, with a floor of 2 so even tiny CI boxes keep
+/// some concurrency.
+fn min_worker_count() -> usize {
     if let Ok(raw) = std::env::var("SECUTILS_JS_WORKERS")
         && let Ok(parsed) = raw.parse::<usize>()
         && parsed > 0
@@ -145,14 +219,34 @@ fn default_worker_count() -> usize {
         .max(2)
 }
 
+/// Hard ceiling on worker threads. The pool grows past the warm baseline to absorb bursts of
+/// long-parked scripts (`kv.watch` long-polls), but never beyond this. Overridable via the
+/// `SECUTILS_JS_MAX_WORKERS` environment variable; values below the baseline are raised to it.
+fn max_worker_count(min: usize) -> usize {
+    if let Ok(raw) = std::env::var("SECUTILS_JS_MAX_WORKERS")
+        && let Ok(parsed) = raw.parse::<usize>()
+        && parsed > 0
+    {
+        return parsed.max(min);
+    }
+
+    // Generous headroom for concurrent long-polls without risking unbounded thread growth; upstream
+    // semaphores bound real concurrency well below this.
+    min.max(512)
+}
+
+fn build_pool() -> WorkerPool {
+    let min = min_worker_count();
+    WorkerPool::new(min, max_worker_count(min))
+}
+
 /// Eagerly initialise the pool (called once from `JsRuntime::init_platform`).
 /// Safe to call multiple times: subsequent calls are no-ops.
 pub fn init() {
-    POOL.get_or_init(|| WorkerPool::new(default_worker_count()));
+    POOL.get_or_init(build_pool);
 }
 
-/// Return the shared pool, lazily initialising it with `default_worker_count`
-/// workers if `init` has not been called yet.
+/// Return the shared pool, lazily initialising it if `init` has not been called.
 pub fn global() -> &'static WorkerPool {
-    POOL.get_or_init(|| WorkerPool::new(default_worker_count()))
+    POOL.get_or_init(build_pool)
 }