adapter: bound COPY FROM STDIN blocking-pool usage (#37037)

def- · claude · web-flow · commit 8645d658bde1 · 2026-06-15T19:26:07.000+02:00
setup_copy_from_stdin spawned available_parallelism() workers on the
shared tokio blocking pool eagerly at COPY start -- before any data --
and each worker held its blocking thread for the whole COPY by driving
an idle batch_rx.recv().await via block_on. With no cap on worker count
or concurrent COPYs, a low-privilege client could open ~512/cores idle,
zero-byte COPY ... FROM STDIN connections and pin the entire 512-thread
pool, stalling every other blocking-pool user (e.g. the mandatory
"optimize peek" stage of any SELECT) process-wide until the client
disconnected.

Run each worker as a regular async task instead, so it holds no thread
while parked waiting for the next chunk; offload only the CPU-bound
per-chunk work (decode, column transform, constraint checks, and the
columnar persist encode) to the blocking pool, for the duration of that
chunk.
Also cap workers per COPY at COPY_FROM_STDIN_MAX_WORKERS (8) so one COPY
cannot reserve an unbounded share of the pool even while actively
decoding.

Closes SQL-372.

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/src/adapter/src/coord/sequencer/inner/copy_from.rs b/src/adapter/src/coord/sequencer/inner/copy_from.rs
@@ -43,6 +43,12 @@ use crate::{AdapterError, ExecuteContext, ExecuteResponse};
 /// unbounded in-memory growth in a single giant batch.
 const COPY_FROM_STDIN_MAX_BATCH_BYTES: usize = 32 * 1024 * 1024;
 
+/// Cap on the number of parallel decode workers spawned per COPY FROM STDIN.
+/// A single network-bound stream sees marginal gains past a handful of
+/// decoders, and capping bounds how much of the blocking pool any one COPY can
+/// occupy while actively decoding.
+const COPY_FROM_STDIN_MAX_WORKERS: usize = 8;
+
 impl Coordinator {
     pub(crate) async fn sequence_copy_from(
         &mut self,
@@ -415,10 +421,14 @@ impl Coordinator {
             .collect::<Vec<_>>()
             .into();
 
-        // Determine number of parallel workers.
-        let num_workers = std::thread::available_parallelism()
-            .map(|n| n.get())
-            .unwrap_or(1);
+        // Determine number of parallel workers, capped so that a single COPY
+        // cannot reserve an unbounded share of the shared blocking pool.
+        let num_workers = std::cmp::min(
+            std::thread::available_parallelism()
+                .map(|n| n.get())
+                .unwrap_or(1),
+            COPY_FROM_STDIN_MAX_WORKERS,
+        );
         tracing::info!(
             %target_id, num_workers,
             "starting parallel COPY FROM STDIN batch builders"
@@ -430,11 +440,12 @@ impl Coordinator {
         let collection_desc = Arc::new(collection_desc);
         let persist_client = self.persist_client.clone();
 
-        // Create per-worker channels and spawn workers on blocking threads.
-        // Each worker does CPU-intensive TSV decoding + columnar encoding,
-        // so they need dedicated OS threads (not tokio async tasks) for
-        // true parallelism.
-        let rt_handle = tokio::runtime::Handle::current();
+        // Create per-worker channels and spawn one async task per worker. Each
+        // worker offloads the CPU-intensive processing of a chunk (decode plus
+        // the per-row transform/constraint-check/columnar encode) to the
+        // blocking pool for the duration of that chunk (see
+        // `copy_from_stdin_batch_builder`), so workers run in parallel while
+        // doing CPU work but hold no thread while idle between chunks.
         let mut batch_txs = Vec::with_capacity(num_workers);
         let mut worker_handles = Vec::with_capacity(num_workers);
 
@@ -464,24 +475,21 @@ impl Coordinator {
             // Only worker 0 receives the first chunk (round-robin), so only
             // it needs to skip the CSV header on its first chunk.
             let skip_header_on_first_chunk = worker_id == 0 && first_chunk_has_header;
-            let rt = rt_handle.clone();
 
-            let handle = mz_ore::task::spawn_blocking(
+            let handle = mz_ore::task::spawn(
                 || format!("copy_from_stdin_worker:{target_id}:{worker_id}"),
-                move || {
-                    rt.block_on(Self::copy_from_stdin_batch_builder(
-                        persist_client,
-                        shard_id,
-                        collection_id,
-                        collection_desc,
-                        target_desc,
-                        column_transform,
-                        column_types,
-                        params,
-                        skip_header_on_first_chunk,
-                        batch_rx,
-                    ))
-                },
+                Self::copy_from_stdin_batch_builder(
+                    persist_client,
+                    shard_id,
+                    collection_id,
+                    collection_desc,
+                    target_desc,
+                    column_transform,
+                    column_types,
+                    params,
+                    skip_header_on_first_chunk,
+                    batch_rx,
+                ),
             );
             worker_handles.push(handle);
         }
@@ -555,10 +563,11 @@ impl Coordinator {
         let mut batch_bytes: usize = 0;
         let mut proto_batches = Vec::new();
 
+        let rt = tokio::runtime::Handle::current();
         let mut is_first_chunk = true;
         while let Some(raw_bytes) = batch_rx.recv().await {
-            // Decode raw bytes into rows. For the first chunk of worker 0,
-            // re-enable header skipping so the real CSV header line is skipped.
+            // For the first chunk of worker 0, re-enable header skipping so the
+            // real CSV header line is skipped.
             let chunk_params = if is_first_chunk && skip_header_on_first_chunk {
                 let mut p = params.clone();
                 if let CopyFormatParams::Csv(ref mut csv) = p {
@@ -569,34 +578,73 @@ impl Coordinator {
                 params.clone()
             };
             is_first_chunk = false;
-            let rows = mz_pgcopy::decode_copy_format(&raw_bytes, &column_types, chunk_params)
-                .map_err(|e| AdapterError::CopyFormatError(e.to_string()))?;
-
-            for row in rows {
-                // Apply column transform if needed (add defaults, reorder).
-                let full_row = if let Some(ref transform) = *column_transform {
-                    transform.apply(&row)
-                } else {
-                    row
-                };
-
-                // Check constraints.
-                for (i, datum) in full_row.iter().enumerate() {
-                    target_desc.constraints_met(i, &datum).map_err(|e| {
-                        AdapterError::Unstructured(anyhow::anyhow!("constraint violation: {e}"))
-                    })?;
-                }
-
-                let data = SourceData(Ok(full_row));
-                batch_builder
-                    .add(&data, &(), &lower, &1)
-                    .await
-                    .map_err(|e| AdapterError::Unstructured(anyhow::anyhow!("persist add: {e}")))?;
-                row_count += 1;
-                row_count_in_batch += 1;
-            }
+            let raw_len = raw_bytes.len();
+
+            // Offload the entire CPU-bound per-chunk pipeline -- decode, column
+            // transform, constraint checks, and the columnar persist encode
+            // (`BatchBuilder::add` -> `PartBuilder::push`) -- to the blocking
+            // pool. There is no yield point in the row loop until a batch fills
+            // (`add` only awaits `flush_part`, and only once an *encoded* part
+            // reaches `blob_target_size`, far beyond the 32 MiB *raw* batch
+            // boundary), so left on the async runtime each chunk's rows would
+            // run as one uninterrupted burst on a shared runtime worker thread,
+            // starving other connections. The blocking thread is held only
+            // while a chunk is in flight and released back to the pool between
+            // chunks (during `recv().await`), so idle workers still hold no
+            // thread. `block_on` is invoked once per chunk -- not per row -- to
+            // drive the row loop and the rare `flush_part` it may await.
+            let chunk_column_types = Arc::clone(&column_types);
+            let chunk_transform = Arc::clone(&column_transform);
+            let chunk_target_desc = Arc::clone(&target_desc);
+            let chunk_rt = rt.clone();
+            let (returned_builder, added_rows) = mz_ore::task::spawn_blocking(
+                || "copy_from_stdin_process_chunk",
+                move || {
+                    let rows = mz_pgcopy::decode_copy_format(
+                        &raw_bytes,
+                        &chunk_column_types,
+                        chunk_params,
+                    )
+                    .map_err(|e| AdapterError::CopyFormatError(e.to_string()))?;
+
+                    chunk_rt.block_on(async move {
+                        let mut added: u64 = 0;
+                        for row in rows {
+                            // Apply column transform if needed (add defaults, reorder).
+                            let full_row = if let Some(ref transform) = *chunk_transform {
+                                transform.apply(&row)
+                            } else {
+                                row
+                            };
+
+                            // Check constraints.
+                            for (i, datum) in full_row.iter().enumerate() {
+                                chunk_target_desc.constraints_met(i, &datum).map_err(|e| {
+                                    AdapterError::Unstructured(anyhow::anyhow!(
+                                        "constraint violation: {e}"
+                                    ))
+                                })?;
+                            }
+
+                            let data = SourceData(Ok(full_row));
+                            batch_builder
+                                .add(&data, &(), &lower, &1)
+                                .await
+                                .map_err(|e| {
+                                    AdapterError::Unstructured(anyhow::anyhow!("persist add: {e}"))
+                                })?;
+                            added += 1;
+                        }
+                        Ok::<_, AdapterError>((batch_builder, added))
+                    })
+                },
+            )
+            .await?;
+            batch_builder = returned_builder;
+            row_count += added_rows;
+            row_count_in_batch += added_rows;
 
-            batch_bytes = batch_bytes.saturating_add(raw_bytes.len());
+            batch_bytes = batch_bytes.saturating_add(raw_len);
             if batch_bytes >= COPY_FROM_STDIN_MAX_BATCH_BYTES {
                 let batch = batch_builder.finish(upper.clone()).await.map_err(|e| {
                     AdapterError::Unstructured(anyhow::anyhow!("persist finish: {e}"))
diff --git a/test/copy/mzcompose.py b/test/copy/mzcompose.py
@@ -17,6 +17,7 @@
 import json
 import random
 import string
+import threading
 import time
 from io import BytesIO, StringIO
 from textwrap import dedent
@@ -201,12 +202,24 @@ def workflow_nightly(c: Composition, parser: WorkflowArgumentParser) -> None:
 
 def workflow_ci(c: Composition, _parser: WorkflowArgumentParser) -> None:
     """
-    Workflows to run during CI
+    Run all workflows during CI.
+
+    Every workflow is run except for the exceptions below, so that a newly
+    added regression test gets CI coverage automatically instead of silently
+    needing to be added to a hand-maintained allowlist:
+      - "default": meta-workflow that runs everything (would recurse).
+      - "ci": this workflow itself (would recurse).
+      - "nightly": heavy TPC-H suite run separately via the `nightly` pipeline
+        step (`run: nightly`), not here.
     """
-    for name in ["auth", "http", "copy-from-csv-header", "copy-from-ssrf-redirect"]:
+    excluded = {"default", "ci", "nightly"}
+
+    def process(name: str) -> None:
         with c.test_case(name):
             c.workflow(name)
 
+    c.test_parts([name for name in c.workflows.keys() if name not in excluded], process)
+
 
 def workflow_auth(c: Composition) -> None:
     c.up(Service("mc", idle=True), "materialized", "minio")
@@ -373,18 +386,18 @@ def workflow_test_column_dedup(c: Composition):
         c.testdrive(dedent("""
                 $ postgres-execute connection=postgres://mz_system:materialize@${testdrive.materialize-internal-sql-addr}
 
-                > CREATE SECRET aws_secret AS '${arg.aws-secret-access-key}'
-                > CREATE CONNECTION aws_conn
+                > CREATE SECRET aws_secret_column_dedup AS '${arg.aws-secret-access-key}'
+                > CREATE CONNECTION aws_conn_column_dedup
                   TO AWS (
                     ACCESS KEY ID = '${arg.aws-access-key-id}',
-                    SECRET ACCESS KEY = SECRET aws_secret,
+                    SECRET ACCESS KEY = SECRET aws_secret_column_dedup,
                     ENDPOINT = '${arg.aws-endpoint}',
                     REGION = 'us-east-1'
                   )
 
                 > COPY (SELECT 1::int4 AS a, 2::int4 AS a, 3::int4 AS a2, 4::int4 AS a)
                   TO 's3://copytos3/test/column_dedup/'
-                  WITH (AWS CONNECTION = aws_conn, FORMAT = 'parquet');
+                  WITH (AWS CONNECTION = aws_conn_column_dedup, FORMAT = 'parquet');
 
                 $ s3-verify-data bucket=copytos3 key=test/column_dedup
                 1 2 3 4
@@ -405,17 +418,17 @@ def workflow_test_github_9627(c: Composition):
                 > CREATE TABLE t (a int)
                 > INSERT INTO t VALUES (1)
 
-                > CREATE SECRET aws_secret AS '${arg.aws-secret-access-key}'
-                > CREATE CONNECTION aws_conn
+                > CREATE SECRET aws_secret_github_9627 AS '${arg.aws-secret-access-key}'
+                > CREATE CONNECTION aws_conn_github_9627
                   TO AWS (
                     ACCESS KEY ID = '${arg.aws-access-key-id}',
-                    SECRET ACCESS KEY = SECRET aws_secret,
+                    SECRET ACCESS KEY = SECRET aws_secret_github_9627,
                     ENDPOINT = '${arg.aws-endpoint}',
                     REGION = 'us-east-1'
                   )
 
                 > COPY (SELECT * FROM t) TO 's3://copytos3/test/github_9627/'
-                  WITH (AWS CONNECTION = aws_conn, FORMAT = 'csv');
+                  WITH (AWS CONNECTION = aws_conn_github_9627, FORMAT = 'csv');
                 """))
 
         # Check that the table's read frontier still advances.
@@ -533,7 +546,7 @@ def workflow_copy_from_csv_quoted_null(c: Composition) -> None:
         with cur.copy("COPY csv_null_default FROM STDIN WITH (FORMAT CSV)") as copy:
             copy.write('a,\nb,""\n"",c\n')
 
-        cur.execute("SELECT a, b FROM csv_null_default ORDER BY a NULLS LAST")
+        cur.execute("SELECT a, b FROM csv_null_default ORDER BY a IS NULL, a = '', a")
         rows = cur.fetchall()
         assert rows == [
             ("a", None),
@@ -549,7 +562,7 @@ def workflow_copy_from_csv_quoted_null(c: Composition) -> None:
         ) as copy:
             copy.write('a,NULL\nb,"NULL"\nNULL,c\n')
 
-        cur.execute("SELECT a, b FROM csv_null_custom ORDER BY a NULLS LAST")
+        cur.execute("SELECT a, b FROM csv_null_custom ORDER BY a IS NULL, a = '', a")
         rows = cur.fetchall()
         assert rows == [
             ("a", None),
@@ -626,7 +639,8 @@ def workflow_copy_from_csv_crlf(c: Composition) -> None:
             ) as copy:
                 copy.write(f'a,{eol}b,""{eol}"",c{eol}')
             cur.execute(
-                f"SELECT a, b FROM csv_{label}_null ORDER BY a NULLS LAST".encode()
+                f"SELECT a, b FROM csv_{label}_null "
+                "ORDER BY a IS NULL, a = '', a".encode()
             )
             rows = cur.fetchall()
             assert rows == [
@@ -700,3 +714,81 @@ def workflow_copy_from_csv_crlf_large_end_marker(c: Composition) -> None:
             f"expected count={rows_each_side}, max_id={rows_each_side - 1} "
             "(rows after the bare \\. leaked through parallel workers)"
         )
+
+
+# Needs _NUM_IDLE_SESSIONS * effective_cores >= 512 (blocking-pool cap) so that a
+# regression re-starves SELECT 1; 256 works down to 2 cores (the agent has 4).
+_NUM_IDLE_SESSIONS = 256
+_SELECT_TIMEOUT_S = 30.0
+
+
+def _select_1_responsive(c: Composition, timeout_s: float) -> bool:
+    box: dict[str, object] = {}
+
+    def run() -> None:
+        try:
+            conn = c.sql_connection()
+            try:
+                with conn.cursor() as cur:
+                    cur.execute("SELECT 1")
+                    cur.fetchall()
+                box["ok"] = True
+            finally:
+                conn.close()
+        except Exception as e:
+            box["err"] = e
+
+    t = threading.Thread(target=run, daemon=True)
+    t.start()
+    t.join(timeout_s)
+    if t.is_alive():
+        return False
+    if box.get("ok"):
+        return True
+    raise AssertionError(f"SELECT 1 probe errored unexpectedly: {box.get('err')!r}")
+
+
+def _open_idle_copy(c: Composition) -> tuple:
+    conn = c.sql_connection()
+    cur = conn.cursor()
+    cm = cur.copy("COPY copy_idle_target FROM STDIN")
+    cm.__enter__()
+    return (conn, cur, cm)
+
+
+def _close_idle_copies(held: list) -> None:
+    for conn, _cur, cm in held:
+        try:
+            conn.close()
+        except Exception:
+            pass
+        gen = getattr(cm, "gen", None)
+        if gen is not None:
+            try:
+                gen.close()
+            except Exception:
+                pass
+    held.clear()
+
+
+def workflow_copy_from_stdin_many_idle_sessions(c: Composition) -> None:
+    """Many idle COPY FROM STDIN sessions must not prevent other queries from
+    running."""
+    c.up("materialized")
+
+    setup_conn = c.sql_connection()
+    with setup_conn.cursor() as cur:
+        cur.execute("DROP TABLE IF EXISTS copy_idle_target")
+        cur.execute("CREATE TABLE copy_idle_target (a int4)")
+    setup_conn.close()
+
+    held: list[tuple] = []
+    try:
+        for _ in range(_NUM_IDLE_SESSIONS):
+            held.append(_open_idle_copy(c))
+        assert _select_1_responsive(c, _SELECT_TIMEOUT_S), (
+            f"SELECT 1 did not return within {_SELECT_TIMEOUT_S}s while "
+            f"{len(held)} idle COPY FROM STDIN sessions were open"
+        )
+    finally:
+        _close_idle_copies(held)