[Autotuner] Add long-lived benchmark worker pool

choijon5 · choijon5 · commit 2bcd8e4fda06 · 2026-05-07T19:50:55.000-07:00
stack-info: PR: #2289, branch: choijon5/stack/46
diff --git a/helion/autotuner/benchmark_worker.py b/helion/autotuner/benchmark_worker.py
@@ -7,10 +7,14 @@
 import ctypes.util
 import multiprocessing as mp
 import os
+import queue
 import signal
 import sys
+import threading
+import time
 from typing import TYPE_CHECKING
 from typing import Callable
+from typing import NamedTuple
 from typing import TypeVar
 
 from .logger import _UNRECOVERABLE_RUNTIME_ERROR_RE
@@ -21,6 +25,12 @@
 _T = TypeVar("_T")
 
 
+class WorkerPoolResult(NamedTuple):
+    worker_index: int  # index into ``BenchmarkWorkerPool.workers`` that ran the job
+    elapsed: float  # ``time.perf_counter()`` seconds spent running the job
+    result: object  # job's return value, or the exception captured while running it
+
+
 def _set_pdeathsig() -> None:
     """SIGTERM the child if the parent dies (Linux only, best-effort)."""
     if sys.platform != "linux":
@@ -146,3 +156,97 @@ def _kill(self) -> None:
                 connection.close()
         self._process = None
         self._parent_connection = None
+
+
+class BenchmarkWorkerPool:
+    """Pool of long-lived ``BenchmarkWorker`` processes."""
+
+    def __init__(self, num_workers: int) -> None:
+        """Create ``num_workers`` workers; raise ``ValueError`` if fewer than one."""
+        if num_workers < 1:
+            raise ValueError(f"num_workers must be >= 1, got {num_workers}")
+        self.workers = [BenchmarkWorker(device=None) for _ in range(num_workers)]
+
+    @property
+    def num_workers(self) -> int:
+        """Number of workers in the pool."""
+        return len(self.workers)
+
+    def run_job_on_worker(
+        self, worker_index: int, job: Callable[[], _T], timeout: float
+    ) -> _T:
+        """Run ``job`` on the worker at ``worker_index`` modulo the pool size."""
+        return self.workers[worker_index % self.num_workers].run(job, timeout=timeout)
+
+    def run_jobs(
+        self, jobs: list[Callable[[], object]], timeout: float
+    ) -> list[WorkerPoolResult]:
+        """Run jobs across the worker pool while preserving input order.
+
+        Each worker thread owns one worker and pulls job indices from a shared
+        queue, so slow jobs do not block unrelated workers. Worker exceptions
+        are captured in ``WorkerPoolResult.result`` for the corresponding job.
+        """
+        if not jobs:
+            return []
+        active_workers = min(self.num_workers, len(jobs))
+        # Pre-start on this thread so worker lifetime is not tied to the short-lived
+        # dispatch threads; best-effort, a failure is left for ``worker.run`` to report.
+        with contextlib.suppress(Exception):
+            self.start_all(active_workers)
+        result_slots: list[WorkerPoolResult | None] = [None] * len(jobs)
+        work_queue: queue.Queue[int] = queue.Queue()
+        for job_index in range(len(jobs)):
+            work_queue.put(job_index)
+
+        def process_queue(worker_idx: int) -> None:
+            worker = self.workers[worker_idx]
+            while True:
+                try:
+                    i = work_queue.get_nowait()
+                except queue.Empty:
+                    return
+                start = time.perf_counter()
+                job_result = _run_job_capture_error(worker, jobs[i], timeout)
+                elapsed = time.perf_counter() - start
+                result_slots[i] = WorkerPoolResult(worker_idx, elapsed, job_result)
+
+        _run_worker_threads(process_queue, active_workers)
+        assert all(slot is not None for slot in result_slots)
+        return [slot for slot in result_slots if slot is not None]
+
+    def start_all(self, limit: int | None = None) -> None:
+        """Start the first ``limit`` workers (all when ``None``) that are not alive,
+        before threaded dispatch, so their lifetime is not tied to dispatch threads."""
+        for worker in self.workers[:limit]:  # ``[:None]`` slices the whole list
+            if not worker.alive():
+                worker._start()
+
+    def shutdown(self) -> None:
+        """Shut down every worker, suppressing any ``Exception`` a worker raises."""
+        for w in self.workers:
+            with contextlib.suppress(Exception):
+                w.shutdown()
+
+
+def _run_job_capture_error(
+    worker: BenchmarkWorker, job: Callable[[], object], timeout: float
+) -> object:
+    """Run ``job`` on ``worker``; return its value or the exception it raised."""
+    try:
+        return worker.run(job, timeout=timeout)
+    except BaseException as error:  # any exception is returned, not raised
+        return error.with_traceback(None)  # handed back without its traceback
+
+
+def _run_worker_threads(target: Callable[[int], None], n: int) -> None:
+    """Call ``target(i)`` for each ``i`` in ``range(n)`` and wait for every call."""
+    if n == 1:
+        # One call needs no thread: run it inline on the calling thread.
+        target(0)
+        return
+    pool = [threading.Thread(target=target, args=(i,), daemon=True) for i in range(n)]
+    for thread in pool:
+        thread.start()
+    for thread in pool:
+        thread.join()
diff --git a/test/test_benchmark_worker.py b/test/test_benchmark_worker.py
@@ -24,6 +24,7 @@
 from helion.autotuner.benchmark_worker import BenchmarkTimeout
 from helion.autotuner.benchmark_worker import BenchmarkWorker
 from helion.autotuner.benchmark_worker import BenchmarkWorkerDied
+from helion.autotuner.benchmark_worker import BenchmarkWorkerPool
 from helion.autotuner.random_search import RandomSearch
 
 if TYPE_CHECKING:
@@ -69,6 +70,7 @@ def __call__(self) -> object:
 
 class TestBenchmarkWorkerFailureModes(unittest.TestCase):
     def test_timeout_kills_worker(self) -> None:
+        # A timed-out job should kill the worker and the next job should respawn it.
         worker = BenchmarkWorker()
         try:
             t0 = time.time()
@@ -82,8 +84,7 @@ def test_timeout_kills_worker(self) -> None:
             worker.shutdown()
 
     def test_sticky_error_kills_worker(self) -> None:
-        # Errors matching _UNRECOVERABLE_RUNTIME_ERROR_RE force the worker
-        # to be killed so the next call spawns a fresh CUDA context.
+        # Sticky CUDA-style errors should kill the worker before the next job.
         worker = BenchmarkWorker()
         try:
             with self.assertRaises(RuntimeError) as ctx:
@@ -95,6 +96,7 @@ def test_sticky_error_kills_worker(self) -> None:
             worker.shutdown()
 
     def test_worker_crash_raises_died(self) -> None:
+        # A worker process crash should surface as BenchmarkWorkerDied.
         worker = BenchmarkWorker()
         try:
             with self.assertRaises(BenchmarkWorkerDied):
@@ -103,6 +105,21 @@ def test_worker_crash_raises_died(self) -> None:
         finally:
             worker.shutdown()
 
+    def test_pool_run_jobs_reports_worker_and_elapsed(self) -> None:
+        # Results come back in job order, each tagged with its worker and timing.
+        jobs = [_ReturnValue("a"), _ReturnValue("b")]
+        pool = BenchmarkWorkerPool(2)
+        try:
+            results = pool.run_jobs(jobs, timeout=30.0)
+        finally:
+            pool.shutdown()
+
+        self.assertEqual([entry.result for entry in results], ["a", "b"])
+        for entry in results:
+            # Each result must name one of the two workers and a non-negative time.
+            self.assertIn(entry.worker_index, range(2))
+            self.assertGreaterEqual(entry.elapsed, 0)
+
 
 # Subprocess benchmarking depends on Backend.supports_precompile(); only the
 # Triton backend supports it (Pallas/CuTe return False).