perf: don't spawn the scheduling on a separate thread for small reads (#6637)

westonpace · web-flow · commit 6e40d78cc448 · 2026-04-29T09:17:26.000-07:00
For small reads, the overhead of spawning a separate scheduling task
outweighs the benefit of the extra parallelism. The spawn also causes
synchronization bottlenecks in systems with high request throughput.
diff --git a/rust/lance-encoding/src/decoder.rs b/rust/lance-encoding/src/decoder.rs
@@ -263,13 +263,28 @@ const BATCH_SIZE_BYTES_WARNING: u64 = 10 * 1024 * 1024;
 const ENV_LANCE_STRUCTURAL_BATCH_DECODE_SPAWN_MODE: &str =
     "LANCE_STRUCTURAL_BATCH_DECODE_SPAWN_MODE";
 const ENV_LANCE_READ_CACHE_REPETITION_INDEX: &str = "LANCE_READ_CACHE_REPETITION_INDEX";
+const ENV_LANCE_INLINE_SCHEDULING_THRESHOLD: &str = "LANCE_INLINE_SCHEDULING_THRESHOLD";
+
+// If a request is for at most this many rows we skip the scheduler-task spawn
+// and run scheduling inline as part of the stream's first poll.
+const DEFAULT_INLINE_SCHEDULING_THRESHOLD: u64 = 16 * 1024;
 
 fn default_cache_repetition_index() -> bool {
     static DEFAULT_CACHE_REPETITION_INDEX: OnceLock<bool> = OnceLock::new();
     *DEFAULT_CACHE_REPETITION_INDEX
         .get_or_init(|| parse_env_as_bool(ENV_LANCE_READ_CACHE_REPETITION_INDEX, true))
 }
 
+fn inline_scheduling_threshold() -> u64 {
+    static THRESHOLD: OnceLock<u64> = OnceLock::new(); // resolved on first call, then cached
+    *THRESHOLD.get_or_init(|| {
+        std::env::var(ENV_LANCE_INLINE_SCHEDULING_THRESHOLD)
+            .ok() // unset or non-Unicode value -> use the default
+            .and_then(|v| v.trim().parse::<u64>().ok()) // not a valid u64 -> use the default
+            .unwrap_or(DEFAULT_INLINE_SCHEDULING_THRESHOLD)
+    })
+}
+
 /// Top-level encoding message for a page.  Wraps both the
 /// legacy pb::ArrayEncoding and the newer pb::PageLayout
 ///
@@ -1956,13 +1971,24 @@ pub struct DecoderConfig {
     pub cache_repetition_index: bool,
     /// Whether to validate decoded data
     pub validate_on_decode: bool,
+    /// Override the strategy used to dispatch the scheduling work in
+    /// [`schedule_and_decode`].
+    ///
+    /// * `None` - default behavior: scheduling runs inline on the stream's
+    ///   first poll when the request is small (controlled by the
+    ///   `LANCE_INLINE_SCHEDULING_THRESHOLD` env var) and on a spawned task
+    ///   otherwise.
+    /// * `Some(true)` - always run scheduling inline.
+    /// * `Some(false)` - always spawn a task for scheduling.
+    pub inline_scheduling: Option<bool>,
 }
 
 impl Default for DecoderConfig {
     fn default() -> Self {
         Self {
             cache_repetition_index: default_cache_repetition_index(),
             validate_on_decode: false,
+            inline_scheduling: None,
         }
     }
 }
@@ -2120,7 +2146,17 @@ fn create_scheduler_decoder(
         config.batch_size_bytes,
     )?;
 
-    let scheduler_handle = tokio::task::spawn(async move {
+    // For small requests the scheduling cost is dwarfed by the overhead of
+    // spawning a task, so run scheduling inline as part of the stream's first
+    // poll instead.  The threshold is configurable via
+    // `LANCE_INLINE_SCHEDULING_THRESHOLD`, and callers can force either
+    // strategy via `DecoderConfig::inline_scheduling`.
+    let inline_scheduling = config
+        .decoder_config
+        .inline_scheduling
+        .unwrap_or_else(|| num_rows <= inline_scheduling_threshold());
+
+    let scheduling = async move {
         let mut decode_scheduler = match DecodeBatchScheduler::try_new(
             target_schema.as_ref(),
             &column_indices,
@@ -2150,9 +2186,19 @@ fn create_scheduler_decoder(
                 decode_scheduler.schedule_take(&indices, &filter, tx, config.io)
             }
         }
-    });
+    };
 
-    Ok(check_scheduler_on_drop(decode_stream, scheduler_handle))
+    if inline_scheduling {
+        Ok(async move {
+            scheduling.await;
+            decode_stream
+        }
+        .flatten_stream()
+        .boxed())
+    } else {
+        let scheduler_handle = tokio::task::spawn(scheduling);
+        Ok(check_scheduler_on_drop(decode_stream, scheduler_handle))
+    }
 }
 
 /// Launches a scheduler on a dedicated (spawned) task and creates a decoder to