[Autotuner] Add pool precompile manager

choijon5 · choijon5 · commit cf6342dd3edc · 2026-05-03T23:02:29.000-07:00
diff --git a/helion/autotuner/benchmark_pool.py b/helion/autotuner/benchmark_pool.py
@@ -0,0 +1,146 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from typing import Callable
+from typing import Literal
+from typing import NamedTuple
+from typing import TypeVar
+from typing import cast
+
+import torch
+from torch.utils._pytree import tree_map_only
+
+from .benchmark_job import PrecompileJob
+from .benchmark_worker import BenchmarkSubprocessError
+from .benchmark_worker import BenchmarkWorkerPool
+
+if TYPE_CHECKING:
+    from ..runtime.config import Config
+    from ..runtime.kernel import CompiledConfig
+    from .logger import AutotuningLogger
+    from .metrics import AutotuneMetrics
+    from .precompile_future import SerializedCompiledFunction
+
+_T = TypeVar("_T")
+
+
+class PoolPrecompileResult(NamedTuple):  # parallel lists, one entry per config
+    is_workings: list[bool]  # True only when the worker job returned exactly True
+    statuses: list[Literal["ok", "error", "timeout"]]  # outcome label per config
+    compile_times: list[float | None]  # worker-reported elapsed; None if never run
+
+
+def estimate_tree_bytes(obj: object) -> int:
+    """Estimate pytree tensor storage, counting shared storage once."""
+    total = 0  # running byte count, updated by _accumulate via nonlocal
+    seen_ptrs: set[int] = set()  # data_ptr() of every storage already counted
+
+    def _accumulate(tensor: torch.Tensor) -> torch.Tensor:
+        nonlocal total
+        size = tensor.element_size() * tensor.numel()  # fallback: this view only
+        try:
+            storage = tensor.untyped_storage()
+        except RuntimeError:  # storage not accessible; keep the fallback size
+            pass
+        else:
+            ptr = storage.data_ptr()
+            if ptr in seen_ptrs:
+                return tensor  # this storage was already added to the total
+            seen_ptrs.add(ptr)
+            size = storage.nbytes()  # whole backing storage, not just this view
+        total += size
+        return tensor  # returned unchanged so the mapped tree mirrors the input
+
+    tree_map_only(torch.Tensor, _accumulate, obj)  # traversal only; result unused
+    return total
+
+
+class PoolBenchmarkManager:
+    """Owns the long-lived worker pool for one autotune call."""
+
+    def __init__(
+        self,
+        *,
+        num_workers: int,  # forwarded to BenchmarkWorkerPool
+        log: AutotuningLogger,  # receives debug lines and the summary message
+        autotune_metrics: AutotuneMetrics,  # num_compile_failures is bumped
+    ) -> None:
+        self._pool = BenchmarkWorkerPool(num_workers)
+        self._log = log
+        self._autotune_metrics = autotune_metrics
+        self._precompile_worker_by_fn: dict[int, int] = {}  # id(fn) -> worker index
+
+    def shutdown(self) -> None:  # shut the pool down and forget fn->worker records
+        self._pool.shutdown()
+        self._precompile_worker_by_fn.clear()
+
+    def worker_index_for_fn(self, fn: Callable[..., object]) -> int:  # keyed by id(fn)
+        return self._precompile_worker_by_fn.get(id(fn), 0)  # 0 if fn is unrecorded
+
+    def run_on(self, worker_index: int, job: Callable[[], _T], timeout: float) -> _T:
+        return self._pool.run_on(worker_index, job, timeout=timeout)  # thin delegate
+
+    def precompile(
+        self,
+        configs: list[Config],  # used for log messages and to size the result lists
+        fns: list[CompiledConfig],  # index-aligned with configs; each is serialized
+        *,
+        args_path: str,  # passed through to every PrecompileJob
+        timeout: float,  # forwarded to the pool's map_jobs call
+        desc: str | None,  # if truthy, a one-line summary is logged at the end
+        serialize_fn: Callable[[CompiledConfig], SerializedCompiledFunction | None],
+    ) -> PoolPrecompileResult:
+        """Compile each config in the worker pool."""
+        jobs: list[PrecompileJob | None] = []  # None = fn could not be serialized
+        for fn in fns:
+            fn_spec = serialize_fn(fn)
+            jobs.append(
+                PrecompileJob(fn_spec=fn_spec, args_path=args_path)
+                if fn_spec is not None
+                else None
+            )
+        # Dispatch only the serializable jobs; live_idxs maps results back to configs.
+        live_idxs = [i for i, job in enumerate(jobs) if job is not None]
+        live_jobs = cast("list[Callable[[], object]]", [jobs[i] for i in live_idxs])
+        self._pool.start_all(limit=len(live_jobs))  # presumably a worker cap; confirm
+        live_results = self._pool.map_jobs(live_jobs, timeout=timeout)
+        # Pessimistic defaults: every config starts as a failed "error" entry.
+        is_workings = [False] * len(configs)
+        statuses: list[Literal["ok", "error", "timeout"]] = ["error"] * len(configs)
+        compile_times: list[float | None] = [None] * len(configs)
+        for idx, job in enumerate(jobs):  # fns that never became jobs
+            if job is None:
+                self._log.debug(
+                    f"Precompile worker could not serialize {configs[idx]!r}"
+                )
+                self._autotune_metrics.num_compile_failures += 1
+        # Classify each dispatched job: returned exception, exact True, or other.
+        for idx, result in zip(live_idxs, live_results, strict=True):
+            compile_times[idx] = result.elapsed  # recorded for failures too
+            job_result = result.result
+            if isinstance(job_result, BaseException):  # failure returned as a value
+                statuses[idx] = (
+                    "timeout"
+                    if isinstance(job_result, BenchmarkSubprocessError)
+                    and "timeout" in str(job_result).lower()  # message-text heuristic
+                    else "error"
+                )
+                self._log.debug(
+                    f"Precompile worker failed for {configs[idx]!r}: "
+                    f"{type(job_result).__name__}: {job_result}"
+                )
+                self._autotune_metrics.num_compile_failures += 1
+            elif job_result is True:  # strict: other truthy values count as failure
+                is_workings[idx] = True
+                statuses[idx] = "ok"  # next line remembers the compiling worker
+                self._precompile_worker_by_fn[id(fns[idx])] = result.worker_index
+            else:
+                self._log.debug(
+                    f"Precompile worker returned failure for {configs[idx]!r}: "
+                    f"{job_result!r}"
+                )
+                self._autotune_metrics.num_compile_failures += 1
+        # The summary counts only jobs that were actually dispatched to the pool.
+        if desc:  # None or "" skips the summary
+            self._log(f"{desc} 100% via worker pool ({len(live_idxs)} configs)")
+        return PoolPrecompileResult(is_workings, statuses, compile_times)
diff --git a/test/test_benchmark_worker.py b/test/test_benchmark_worker.py
@@ -10,7 +10,10 @@
 import signal
 import tempfile
 import time
+from types import SimpleNamespace
 from typing import TYPE_CHECKING
+from typing import Any
+from typing import cast
 import unittest
 from unittest.mock import patch
 
@@ -22,15 +25,18 @@
 from helion._testing import onlyBackends
 from helion._testing import skipIfXPU
 from helion.autotuner.benchmark_job import _load_args
+from helion.autotuner.benchmark_pool import PoolBenchmarkManager
 from helion.autotuner.benchmark_provider import LocalBenchmarkProvider
 from helion.autotuner.benchmark_worker import BenchmarkTimeout
 from helion.autotuner.benchmark_worker import BenchmarkWorker
 from helion.autotuner.benchmark_worker import BenchmarkWorkerDied
 from helion.autotuner.benchmark_worker import BenchmarkWorkerPool
+from helion.autotuner.benchmark_worker import WorkerPoolResult
+from helion.autotuner.precompile_future import SerializedCompiledFunction
 from helion.autotuner.random_search import RandomSearch
+from helion.runtime.config import Config
 
 if TYPE_CHECKING:
-    from helion.runtime.config import Config
     from helion.runtime.kernel import CompiledConfig
 
 
@@ -141,6 +147,59 @@ def test_worker_arg_loading_allows_callable_kernel_args(self) -> None:
 
         self.assertIs(loaded[0], _callable_kernel_arg)
 
+    def test_false_precompile_result_is_failure(self) -> None:
+        # A worker precompile returning False should count as a real compile failure.
+        class FakePool:  # stand-in pool: every submitted job reports False
+            def start_all(self, limit: int | None = None) -> None:
+                self.limit = limit  # captured but not asserted on
+
+            def map_jobs(
+                self,
+                jobs: list[object],
+                timeout: float,
+            ) -> list[WorkerPoolResult]:
+                return [
+                    WorkerPoolResult(worker_index=0, elapsed=0.25, result=False)
+                    for _ in jobs
+                ]
+
+        class FakeLog:  # discards debug messages
+            def debug(self, *_args: object, **_kwargs: object) -> None:
+                pass
+
+        def fake_fn() -> None:
+            pass
+
+        metrics = SimpleNamespace(num_compile_failures=0)  # the one counter used
+        manager = cast("Any", PoolBenchmarkManager.__new__(PoolBenchmarkManager))
+        manager._pool = FakePool()  # __init__ was bypassed, so set attributes by hand
+        manager._log = FakeLog()
+        manager._autotune_metrics = metrics
+        manager._precompile_worker_by_fn = {}
+
+        def serialize_fn(_fn: object) -> SerializedCompiledFunction:
+            return SerializedCompiledFunction(
+                function_name="fake_fn",
+                source_code="def fake_fn(): pass",
+                filename=None,
+                module_name=None,
+            )
+
+        result = manager.precompile(
+            [Config()],
+            [fake_fn],
+            args_path="args.pt",
+            timeout=1,
+            desc=None,  # falsy, so precompile never calls the log object directly
+            serialize_fn=serialize_fn,
+        )
+        # Elapsed is still reported; nothing is marked working or remembered.
+        self.assertEqual(result.is_workings, [False])
+        self.assertEqual(result.statuses, ["error"])
+        self.assertEqual(result.compile_times, [0.25])
+        self.assertEqual(metrics.num_compile_failures, 1)
+        self.assertEqual(manager._precompile_worker_by_fn, {})
+
 
 # Subprocess benchmarking depends on Backend.supports_precompile(); only the
 # Triton backend supports it (Pallas/CuTe return False).