pytorch
diff --git a/‎helion/autotuner/benchmark_job.py‎
Lines changed: 66 additions & 1 deletion b/‎helion/autotuner/benchmark_job.py‎
Lines changed: 66 additions & 1 deletion
diff --git a/‎helion/autotuner/benchmark_provider.py‎
Lines changed: 138 additions & 7 deletions b/‎helion/autotuner/benchmark_provider.py‎
Lines changed: 138 additions & 7 deletions
@@ -1,7 +1,8 @@
-"""Picklable benchmark job executed inside a ``BenchmarkWorker``."""
+"""Picklable benchmark jobs executed inside a ``BenchmarkWorker``."""
 
 from __future__ import annotations
 
+import contextlib
 import dataclasses
 import functools
 from typing import TYPE_CHECKING
@@ -24,6 +25,70 @@ def _load_args(path: str) -> Sequence[object]:
     return cast("Sequence[object]", torch.load(path))
 
 
+@dataclasses.dataclass
+class WarmupJob:
+    """Picklable warmup job run inside a worker.
+
+    Calling the job loads the saved args from ``args_path`` and, when CUDA
+    is available, initializes CUDA, so the first ``PrecompileJob`` does not
+    pay that cold-start cost.
+    """
+
+    # Path of the saved args file handed to ``_load_args``.
+    args_path: str
+
+    def __call__(self) -> bool:
+        """Run the warmup steps; always returns ``True``."""
+        # The loaded value is discarded: the call is made only for its effect.
+        _load_args(self.args_path)
+        cuda_usable = torch.cuda.is_available()
+        if cuda_usable:
+            torch.cuda.init()
+        return True
+
+
+@dataclasses.dataclass
+class PrecompileJob:
+    """Compile-only precompile job run inside a worker.
+
+    The host-side helion code is executed with a launcher hook that records
+    the first launch request and then raises, so no kernel is launched from
+    here. The recorded kernel and launch arguments are then handed to
+    ``make_precompiler`` to trigger Triton's compile (CPU + ptxas, no kernel
+    execution); the resulting binary lands in Triton's on-disk cache for the
+    benchmark phase to reuse.
+
+    Mirrors fork-mode children, but inside a long-lived spawn worker so the
+    parent never touches CUDA during prep.
+    """
+
+    # Serialized compiled function, rebuilt in the worker.
+    fn_spec: SerializedCompiledFunction
+    # Path of the saved args file handed to ``_load_args``.
+    args_path: str
+
+    def __call__(self) -> bool:
+        """Precompile the kernel and report whether the compile succeeded.
+
+        Returns ``True`` when the host code never reaches a kernel launch
+        or the kernel is already compiled, ``False`` when
+        ``make_precompiler`` reports a previously failed compile, and
+        otherwise whatever the precompiler call returns.
+        """
+        from ..runtime.precompile_shim import already_compiled
+        from ..runtime.precompile_shim import already_compiled_fail
+        from ..runtime.precompile_shim import make_precompiler
+        from .precompile_future import _ExtractedLaunchArgs
+
+        fn = _load_compiled_fn(self.fn_spec)
+        args = _load_args(self.args_path)
+
+        # Filled by the launcher hook; holds at most one entry because the
+        # hook raises right after recording.
+        intercepted: list[tuple[object, tuple[object, ...], dict[str, object]]] = []
+
+        def extract_launcher(
+            triton_kernel: object,
+            grid: tuple[int, ...],
+            *launch_args: object,
+            **launch_kwargs: object,
+        ) -> object:
+            intercepted.append((triton_kernel, launch_args, launch_kwargs))
+            raise _ExtractedLaunchArgs(triton_kernel, grid, launch_args, launch_kwargs)
+
+        with contextlib.suppress(_ExtractedLaunchArgs):
+            fn(*args, _launcher=extract_launcher)  # pyrefly: ignore[bad-argument-type]
+
+        if intercepted:
+            kernel, positional, keyword = intercepted[0]
+            build = make_precompiler(kernel, None, None)  # pyrefly: ignore[bad-argument-type]
+            precompiler = build(*positional, **keyword)
+            # Sentinel results short-circuit; anything else is a callable
+            # that performs the actual compile in this process.
+            if precompiler is already_compiled or precompiler is already_compiled_fail:
+                return precompiler is already_compiled
+            return precompiler(in_child_process=False)
+
+        # No kernel launch in host code -> nothing to compile.
+        return True
+
+
 @dataclasses.dataclass
 class BenchmarkJob:
     fn_spec: SerializedCompiledFunction
 
@@ -29,8 +29,10 @@
 from ..runtime.precompile_shim import already_compiled_fail
 from ..runtime.precompile_shim import make_precompiler
 from .benchmark_job import BenchmarkJob
+from .benchmark_job import PrecompileJob
 from .benchmark_worker import BenchmarkSubprocessError
 from .benchmark_worker import BenchmarkWorker
+from .benchmark_worker import BenchmarkWorkerPool
 from .benchmarking import do_bench
 from .benchmarking import synchronize_device
 from .logger import SUPPRESSED_TRITON_CODE_MSG
@@ -339,6 +341,7 @@ def __init__(
         self._precompile_args_path: str | None = None
         self._precompile_result_counter: count[int] = count()
         self._benchmark_worker: BenchmarkWorker | None = None
+        self._worker_pool: BenchmarkWorkerPool | None = None
 
         # TODO(hinriksnaer): baseline computation is expensive (compiles and runs
         # the kernel). Currently safe because the provider is only constructed
@@ -541,7 +544,12 @@ def _precompile_context(self) -> PrecompileContext:
         )
 
     def setup(self) -> None:
-        """Prepare precompile tmpdir and args for spawn mode."""
+        """Prepare precompile tmpdir and args. Eagerly warms the worker pool
+        when worker-pool precompile is enabled so spawn + ``torch.load`` cost
+        runs concurrently with the parent's other setup work, not in the
+        critical path of the first ``map_jobs``."""
+        from .benchmark_job import WarmupJob
+
         if self._precompile_tmpdir is None:
             self._precompile_tmpdir = tempfile.TemporaryDirectory()
         if (
@@ -552,6 +560,14 @@ def setup(self) -> None:
             torch.save(self.args, args_path)
             self._precompile_args_path = args_path
 
+        if self._worker_precompile_enabled():
+            assert self._precompile_args_path is not None
+            args_path = self._precompile_args_path
+            self._ensure_worker_pool().warmup(
+                lambda: WarmupJob(args_path=args_path),
+                timeout=float(self.settings.autotune_compile_timeout),
+            )
+
     def _next_precompile_result_path(self) -> str:
         """Return a fresh path for a precompile result file."""
         if self._precompile_tmpdir is None:
@@ -566,6 +582,9 @@ def cleanup(self) -> None:
         if self._benchmark_worker is not None:
             self._benchmark_worker.shutdown()
             self._benchmark_worker = None
+        if self._worker_pool is not None:
+            self._worker_pool.shutdown()
+            self._worker_pool = None
         if self._precompile_tmpdir is not None:
             self._precompile_tmpdir.cleanup()
             self._precompile_tmpdir = None
@@ -585,6 +604,43 @@ def _subprocess_benchmark_enabled(self) -> bool:
         _backend = getattr(self.config_spec, "backend", None)
         return not (_backend is not None and _backend.get_do_bench() is not None)
 
+    def _worker_precompile_enabled(self) -> bool:
+        """Report whether precompilation should go through the worker pool.
+
+        Worker-pool precompile is the default path when subprocess benchmark
+        is enabled and the kernel has args saved to disk. The checks run in
+        this order and stop at the first failure: the
+        ``autotune_precompile_workers`` setting is non-negative, subprocess
+        benchmarking is enabled, ``_precompile_args_path`` is set, and the
+        resolved pool size is at least one. Users can override the pool size
+        via ``HELION_AUTOTUNE_PRECOMPILE_WORKERS=<n>`` (or set ``< 0`` to
+        disable).
+        """
+        if self.settings.autotune_precompile_workers < 0:
+            return False
+        if not self._subprocess_benchmark_enabled():
+            return False
+        if self._precompile_args_path is None:
+            return False
+        # Checked last: resolving the pool size may query the device.
+        return self._pool_size() >= 1
+
+    def _pool_size(self) -> int:
+        """Resolve the effective number of pool workers.
+
+        A positive ``autotune_precompile_workers`` setting is returned
+        unchanged. Otherwise the result is capped by ``os.cpu_count()``
+        (falling back to 1) and, on CUDA devices, by how many per-worker
+        footprints fit into 90% of the currently free device memory. The
+        footprint estimate is twice ``args + max(args, 1 GiB)``: the args, a
+        brief output-allocation peak plus CUDA driver overhead, and a 2x
+        safety factor. The memory cap never drops below one worker.
+        """
+        requested = self.settings.autotune_precompile_workers
+        if requested > 0:
+            return requested
+
+        host_limit = os.cpu_count() or 1
+        target = self.kernel.env.device
+        if target.type != "cuda":
+            return host_limit
+
+        payload_bytes = _estimate_tree_bytes(self.args)
+        one_gib = 1024**3
+        footprint = 2 * (payload_bytes + max(payload_bytes, one_gib))
+        if footprint <= 0:
+            return host_limit
+
+        free_bytes, _total = torch.cuda.mem_get_info(target)
+        workers_that_fit = int(free_bytes * 0.9) // footprint
+        return min(host_limit, max(1, workers_that_fit))
+
+    def _ensure_worker_pool(self) -> BenchmarkWorkerPool:
+        """Return the worker pool, creating it on first use.
+
+        The pool is built with ``num_workers=self._pool_size()`` and cached
+        on ``self._worker_pool`` so later calls reuse the same instance.
+        """
+        pool = self._worker_pool
+        if pool is None:
+            pool = BenchmarkWorkerPool(num_workers=self._pool_size())
+            self._worker_pool = pool
+        return pool
+
     def _validate_against_baseline(
         self, config: Config, output: object, args: Sequence[object]
     ) -> bool:
@@ -676,7 +732,17 @@ def benchmark(
         configs = [all_configs[i] for i in valid_indices]
 
         # Precompile phase
-        if self.settings.autotune_precompile:
+        precompile_status: list[Literal["ok", "error", "timeout"]] = []
+        compile_times: list[float | None] = [None] * len(configs)
+        if self._worker_precompile_enabled() and self.settings.autotune_precompile:
+            precompile_desc = (
+                f"{desc} precompiling" if self.settings.autotune_progress_bar else None
+            )
+            is_workings, precompile_status, compile_times = (
+                self._worker_pool_precompile(configs, fns, precompile_desc)
+            )
+            futures = None
+        elif self.settings.autotune_precompile:
             futures = list(
                 starmap(
                     self._create_precompile_future,
@@ -687,7 +753,6 @@ def benchmark(
                 f"{desc} precompiling" if self.settings.autotune_progress_bar else None
             )
             is_workings = PrecompileFuture.wait_for_all(futures, desc=precompile_desc)
-            precompile_status: list[Literal["ok", "error", "timeout"]] = []
             for future, ok in zip(futures, is_workings, strict=True):
                 reason = future.failure_reason
                 if ok:
@@ -697,6 +762,7 @@ def benchmark(
                 else:
                     precompile_status.append("error")
         else:
+            futures = None
             is_workings = [True] * len(configs)
             precompile_status = ["ok"] * len(configs)
 
@@ -725,7 +791,7 @@ def benchmark(
                     else None
                 )
             else:
-                compile_time = None
+                compile_time = compile_times[index]
             status: Literal[
                 "ok", "error", "timeout", "peer_compilation_fail", "filtered"
             ]
@@ -954,6 +1020,65 @@ def _benchmark_function(self, config: Config, fn: CompiledConfig) -> float:
             self._autotune_metrics.num_compile_failures += 1
             return inf
 
+    def _worker_pool_precompile(
+        self,
+        configs: list[Config],
+        fns: list[CompiledConfig],
+        desc: str | None,
+    ) -> tuple[
+        list[bool],
+        list[Literal["ok", "error", "timeout"]],
+        list[float | None],
+    ]:
+        """Compile each config in the long-lived worker pool.
+
+        Args:
+            configs: Configs being precompiled; they size the result lists
+                and appear in failure log messages.
+            fns: Compiled functions, index-aligned with ``configs``; each is
+                serialized into a ``PrecompileJob``.
+            desc: Progress label. When truthy, one completion line is logged
+                after the batch finishes.
+
+        Returns:
+            ``(is_workings, statuses, compile_times)``, each aligned with
+            ``configs``. A config is ``"ok"`` only when its job ran without
+            raising and did not report a failed compile. Configs whose
+            function could not be serialized keep ``"error"`` and a
+            ``None`` compile time.
+        """
+        assert self._precompile_args_path is not None
+        args_path = self._precompile_args_path
+        timeout = float(self.settings.autotune_compile_timeout)
+
+        # Build PrecompileJobs; serialization failures count as compile failures.
+        jobs: list[PrecompileJob | None] = []
+        for fn in fns:
+            try:
+                jobs.append(
+                    PrecompileJob(
+                        fn_spec=_serialize_compiled_fn(fn), args_path=args_path
+                    )
+                )
+            except RuntimeError:
+                jobs.append(None)
+
+        live_idxs = [i for i, j in enumerate(jobs) if j is not None]
+        live_jobs = cast("list[Callable[[], object]]", [jobs[i] for i in live_idxs])
+        t0 = time.perf_counter()
+        live_results = self._ensure_worker_pool().map_jobs(live_jobs, timeout=timeout)
+        elapsed = time.perf_counter() - t0
+
+        # Defaults describe a failure; only proven successes are upgraded.
+        is_workings = [False] * len(configs)
+        statuses: list[Literal["ok", "error", "timeout"]] = ["error"] * len(configs)
+        compile_times: list[float | None] = [None] * len(configs)
+        for idx, result in zip(live_idxs, live_results, strict=True):
+            # NOTE: this is the wall time of the whole batch, not a
+            # per-config measurement; every submitted config gets the same value.
+            compile_times[idx] = elapsed
+            if isinstance(result, BaseException):
+                statuses[idx] = (
+                    "timeout"
+                    if isinstance(result, BenchmarkSubprocessError)
+                    and "timeout" in str(result).lower()
+                    else "error"
+                )
+                self.log.debug(
+                    f"Precompile worker failed for {configs[idx]!r}: "
+                    f"{type(result).__name__}: {result}"
+                )
+                self._autotune_metrics.num_compile_failures += 1
+            elif result is False:
+                # PrecompileJob returns False when the compile itself failed;
+                # that must not be reported as a working config. The status
+                # stays "error" and is_workings stays False.
+                self.log.debug(
+                    f"Precompile worker reported a failed compile for "
+                    f"{configs[idx]!r}"
+                )
+                self._autotune_metrics.num_compile_failures += 1
+            else:
+                is_workings[idx] = True
+                statuses[idx] = "ok"
+
+        if desc:
+            self.log(f"{desc} 100% via worker pool ({len(live_idxs)} configs)")
+        return is_workings, statuses, compile_times
+
     def _benchmark_function_subprocess(
         self, config: Config, fn: CompiledConfig
     ) -> float | None:
@@ -969,8 +1094,14 @@ def _benchmark_function_subprocess(
         except RuntimeError:
             return None
 
-        if self._benchmark_worker is None:
-            self._benchmark_worker = BenchmarkWorker(device=None)
+        # Prefer the pool's first worker if a pool is active so the same CUDA
+        # context that compiled also benchmarks (Triton cache hit, no recompile).
+        if self._worker_pool is not None:
+            run_in_worker = lambda j, t: self._worker_pool.run_one(j, timeout=t)  # noqa: E731
+        else:
+            if self._benchmark_worker is None:
+                self._benchmark_worker = BenchmarkWorker(device=None)
+            run_in_worker = lambda j, t: self._benchmark_worker.run(j, timeout=t)  # noqa: E731
 
         job = BenchmarkJob(
             fn_spec=fn_spec,
@@ -981,7 +1112,7 @@ def _benchmark_function_subprocess(
         timeout = float(self.settings.autotune_benchmark_timeout)
 
         try:
-            latency = self._benchmark_worker.run(job, timeout=timeout)
+            latency = run_in_worker(job, timeout)
         except BenchmarkSubprocessError as e:
             # Timeout or unexpected worker exit; skip config and continue.
             self.log.warning(f"Benchmark subprocess failed for {config!r}: {e}")