threads

robert3005 · robert3005 · commit e9d0b00d5b66 · 2026-05-13T18:29:57.000+01:00
Signed-off-by: Robert Kruszewski <github@robertk.io>
diff --git a/vortex-python/python/vortex/dataset.py b/vortex-python/python/vortex/dataset.py
@@ -5,6 +5,7 @@
 
 import warnings
 from collections.abc import Iterator
+from contextlib import contextmanager
 from functools import reduce
 from typing import final
 
@@ -17,13 +18,34 @@
 from .arrays import array
 from .arrow.expression import ensure_vortex_expression
 from .expr import Expr, and_
+from .runtime import set_worker_threads as _set_worker_threads
+from .runtime import set_worker_threads_to_available_parallelism as _set_worker_threads_to_available_parallelism
+from .runtime import worker_count as _worker_count
 
 
-def _warn_use_threads() -> None:
-    warnings.warn(
-        "Vortex threading is configured through vortex.runtime. Ignoring use_threads=True.",
-        stacklevel=2,
-    )
+@contextmanager
+def _temporary_worker_threads(use_threads: bool | None) -> Iterator[None]:  # scoped ``use_threads`` override
+    if use_threads is None:  # no explicit preference: leave the worker configuration untouched
+        yield
+        return
+    # Capture the current count first so the ``finally`` below can put it back afterwards.
+    previous_workers = _worker_count()
+    if use_threads:
+        _set_worker_threads_to_available_parallelism()
+    else:
+        _set_worker_threads(0)  # use_threads=False maps to a worker count of zero
+    # NOTE(review): the setters look runtime-wide; overlapping callers presumably race - confirm.
+    try:
+        yield
+    finally:
+        _set_worker_threads(previous_workers)  # restore even when the wrapped body raises
+
+
+def _read_batches_with_temporary_worker_threads(
+    reader: pyarrow.RecordBatchReader, use_threads: bool | None
+) -> Iterator[pyarrow.RecordBatch]:
+    with _temporary_worker_threads(use_threads):  # generator body is lazy: entered on the first batch request
+        yield from reader  # NOTE(review): override stays active while suspended; undone on exhaustion/close()
 
 
 @final
@@ -72,14 +94,13 @@ def count_rows(
             raise ValueError("fragment_readahead not supported")
         if fragment_scan_options is not None:
             raise ValueError("fragment_scan_options not supported")
-        if use_threads:
-            _warn_use_threads()
         if cache_metadata is not None:
             warnings.warn("Vortex does not support cache_metadata. Ignoring cache_metadata setting.")
         del memory_pool
-        return self._dataset.count_rows(
-            row_filter=self._filter_expression(filter), split_by=batch_size, row_range=_row_range
-        )
+        with _temporary_worker_threads(use_threads):
+            return self._dataset.count_rows(
+                row_filter=self._filter_expression(filter), split_by=batch_size, row_range=_row_range
+            )
 
     def _filter_expression(self, expression: pyarrow.dataset.Expression | Expr | None) -> Expr | None:
         if expression is None:
@@ -140,7 +161,8 @@ def head(
         fragment_scan_options : :class:`.pyarrow.dataset.FragmentScanOptions`
             Not implemented.
         use_threads : bool
-            Not implemented.
+            If ``True``, temporarily use available parallelism. If ``False``,
+            temporarily disable Vortex background workers.
         memory_pool : :class:`.pyarrow.MemoryPool` | None
             Not implemented.
 
@@ -157,23 +179,22 @@ def head(
             raise ValueError("fragment_readahead not supported")
         if fragment_scan_options is not None:
             raise ValueError("fragment_scan_options not supported")
-        if use_threads:
-            _warn_use_threads()
         if columns is not None and len(columns) == 0:
             raise ValueError("empty projections are not currently supported")
         if cache_metadata is not None:
             warnings.warn("Vortex does not support cache_metadata. Ignoring cache_metadata setting.")
         del memory_pool
 
-        return (
-            self._dataset.to_array(
-                columns=columns,
-                row_filter=self._filter_expression(filter),
-                row_range=_row_range,
+        with _temporary_worker_threads(use_threads):
+            return (
+                self._dataset.to_array(
+                    columns=columns,
+                    row_filter=self._filter_expression(filter),
+                    row_range=_row_range,
+                )
+                .slice(0, num_rows)
+                .to_arrow_table()
             )
-            .slice(0, num_rows)
-            .to_arrow_table()
-        )
 
     @override
     def join(
@@ -240,7 +261,8 @@ def scanner(
         fragment_scan_options : :class:`.pyarrow.dataset.FragmentScanOptions`
             Not implemented.
         use_threads : bool
-            Not implemented.
+            If ``True``, temporarily use available parallelism. If ``False``,
+            temporarily disable Vortex background workers.
         memory_pool : :class:`.pyarrow.MemoryPool` | None
             Not implemented.
 
@@ -312,7 +334,8 @@ def take(  # pyright: ignore[reportIncompatibleMethodOverride]
         fragment_scan_options : :class:`.pyarrow.dataset.FragmentScanOptions`
             Not implemented.
         use_threads : bool
-            Not implemented.
+            If ``True``, temporarily use available parallelism. If ``False``,
+            temporarily disable Vortex background workers.
         cache_metadata : bool
             Not implemented.
         memory_pool : :class:`.pyarrow.MemoryPool` | None
@@ -323,12 +346,13 @@ def take(  # pyright: ignore[reportIncompatibleMethodOverride]
         table : :class:`.pyarrow.Table`
 
         """
-        return self._dataset.to_array(
-            columns=columns,
-            row_filter=self._filter_expression(filter),
-            indices=array(indices.cast(pa.uint64())),
-            row_range=_row_range,
-        ).to_arrow_table()
+        with _temporary_worker_threads(use_threads):
+            return self._dataset.to_array(
+                columns=columns,
+                row_filter=self._filter_expression(filter),
+                indices=array(indices.cast(pa.uint64())),
+                row_range=_row_range,
+            ).to_arrow_table()
 
     def to_record_batch_reader(
         self,
@@ -361,7 +385,8 @@ def to_record_batch_reader(
         fragment_scan_options : :class:`.pyarrow.dataset.FragmentScanOptions`
             Not implemented.
         use_threads : bool
-            Not implemented.
+            If ``True``, temporarily use available parallelism. If ``False``,
+            temporarily disable Vortex background workers.
         memory_pool : :class:`.pyarrow.MemoryPool` | None
             Not implemented.
 
@@ -376,15 +401,19 @@ def to_record_batch_reader(
             raise ValueError("fragment_readahead not supported")
         if fragment_scan_options is not None:
             raise ValueError("fragment_scan_options not supported")
-        if use_threads:
-            _warn_use_threads()
         if cache_metadata is not None:
             warnings.warn("Vortex does not support cache_metadata. Ignoring cache_metadata setting.")
         if columns is not None and len(columns) == 0:
             raise ValueError("empty projections are not currently supported")
         del memory_pool
-        return self._dataset.to_record_batch_reader(
-            columns=columns, row_filter=self._filter_expression(filter), split_by=batch_size, row_range=_row_range
+        with _temporary_worker_threads(use_threads):
+            reader = self._dataset.to_record_batch_reader(
+                columns=columns, row_filter=self._filter_expression(filter), split_by=batch_size, row_range=_row_range
+            )
+        if use_threads is None:
+            return reader
+        return pyarrow.RecordBatchReader.from_batches(
+            reader.schema, _read_batches_with_temporary_worker_threads(reader, use_threads)
         )
 
     @override
@@ -419,7 +448,8 @@ def to_batches(
         fragment_scan_options : :class:`.pyarrow.dataset.FragmentScanOptions`
             Not implemented.
         use_threads : bool
-            Not implemented.
+            If ``True``, temporarily use available parallelism. If ``False``,
+            temporarily disable Vortex background workers.
         cache_metadata : bool
             Not implemented.
         memory_pool : :class:`.pyarrow.MemoryPool` | None
@@ -442,11 +472,7 @@ def to_batches(
             memory_pool,
             _row_range,
         )
-        while True:
-            try:
-                yield record_batch_reader.read_next_batch()
-            except StopIteration:
-                return
+        yield from record_batch_reader
 
     @override
     def to_table(
@@ -480,7 +506,8 @@ def to_table(
         fragment_scan_options : :class:`.pyarrow.dataset.FragmentScanOptions`
             Not implemented.
         use_threads : bool
-            Not implemented.
+            If ``True``, temporarily use available parallelism. If ``False``,
+            temporarily disable Vortex background workers.
         memory_pool : :class:`.pyarrow.MemoryPool` | None
             Not implemented.
 
@@ -497,8 +524,6 @@ def to_table(
             raise ValueError("fragment_readahead not supported")
         if fragment_scan_options is not None:
             raise ValueError("fragment_scan_options not supported")
-        if use_threads:
-            _warn_use_threads()
         if cache_metadata is not None:
             warnings.warn("Vortex does not support cache_metadata. Ignoring cache_metadata setting.")
         if columns is not None and len(columns) == 0:
@@ -510,9 +535,10 @@ def to_table(
                 "VortexDataset does not currently support a dict of expressions as the 'column' parameter."
             )
 
-        return self._dataset.to_array(
-            columns=columns, row_filter=self._filter_expression(filter), row_range=_row_range
-        ).to_arrow_table()
+        with _temporary_worker_threads(use_threads):
+            return self._dataset.to_array(
+                columns=columns, row_filter=self._filter_expression(filter), row_range=_row_range
+            ).to_arrow_table()
 
 
 def from_url(url: str) -> VortexDataset:
@@ -758,7 +784,8 @@ class VortexScanner(pyarrow.dataset.Scanner):
     fragment_scan_options : :class:`.pyarrow.dataset.FragmentScanOptions`
         Not implemented.
     use_threads : bool
-        Not implemented.
+        If ``True``, temporarily use available parallelism. If ``False``,
+        temporarily disable Vortex background workers.
     memory_pool : :class:`.pyarrow.MemoryPool` | None
         Not implemented.
 
diff --git a/vortex-python/test/test_dataset.py b/vortex-python/test/test_dataset.py
@@ -11,6 +11,7 @@
 import pyarrow.compute as pc
 import pyarrow.dataset as pd
 import pytest
+import vortex.dataset as vx_dataset
 
 import vortex as vx
 
@@ -71,6 +72,58 @@ def test_to_batches(ds: pd.Dataset):
     )
 
 
+def test_use_threads_configures_worker_pool(monkeypatch: pytest.MonkeyPatch):
+    current_workers = 3  # simulated worker count that the fakes below read and mutate
+    calls: list[tuple[str, int]] = []  # ordered log of (operation, value) for each fake setter call
+
+    def fake_worker_count() -> int:
+        return current_workers
+
+    def fake_set_worker_threads(count: int) -> None:
+        nonlocal current_workers
+        calls.append(("set", count))
+        current_workers = count
+
+    def fake_set_worker_threads_to_available_parallelism() -> None:
+        nonlocal current_workers
+        calls.append(("available", current_workers))  # logs the count seen *before* the change
+        current_workers = 11  # arbitrary stand-in for "available parallelism"
+    # Patch the aliases bound in vortex.dataset so _temporary_worker_threads calls the fakes.
+    monkeypatch.setattr(vx_dataset, "_worker_count", fake_worker_count)
+    monkeypatch.setattr(vx_dataset, "_set_worker_threads", fake_set_worker_threads)
+    monkeypatch.setattr(
+        vx_dataset,
+        "_set_worker_threads_to_available_parallelism",
+        fake_set_worker_threads_to_available_parallelism,
+    )
+    # Each override must be visible inside the block and rolled back to 3 on exit.
+    with vx_dataset._temporary_worker_threads(True):  # pyright: ignore[reportPrivateUsage]
+        assert current_workers == 11
+
+    assert current_workers == 3
+
+    with vx_dataset._temporary_worker_threads(False):  # pyright: ignore[reportPrivateUsage]
+        assert current_workers == 0
+
+    assert current_workers == 3
+    assert calls == [("available", 3), ("set", 3), ("set", 0), ("set", 3)]
+    # The batch-reading generator should apply the override once and restore it when exhausted.
+    calls.clear()
+    reader = pa.RecordBatchReader.from_batches(
+        pa.schema([("x", pa.int64())]),
+        [
+            pa.record_batch([pa.array([1])], names=["x"]),
+            pa.record_batch([pa.array([2])], names=["x"]),
+        ],
+    )
+
+    batches = list(vx_dataset._read_batches_with_temporary_worker_threads(reader, True))  # pyright: ignore[reportPrivateUsage]
+
+    assert [batch.to_pylist() for batch in batches] == [[{"x": 1}], [{"x": 2}]]
+    assert current_workers == 3
+    assert calls == [("available", 3), ("set", 3)]
+
+
 @pytest.mark.parametrize("batch_size", [1234, 8192, 1 << 31])
 def test_to_batch_size(ds: pd.Dataset, batch_size: int):
     batch_sizes = [len(x) for x in ds.to_batches(batch_size=batch_size)]