[Autotuner] Add pool benchmark subprocess mode

choijon5 · choijon5 · commit d308a732087d · 2026-05-07T19:53:43.000-07:00
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -39,10 +39,13 @@ jobs:
 
     env:
       HELION_AUTOTUNE_LOG_LEVEL: INFO
-      # Run perf CI through the long-lived worker pool so CUDA sticky errors
-      # stay isolated from the autotune parent process.
-      HELION_AUTOTUNE_PRECOMPILE: "pool"
-      HELION_AUTOTUNE_POOL_REUSE_PROCESS: "1"
+      # Keep fork precompile for B200 compile time, but route benchmark timing
+      # through an isolated pool worker so CUDA sticky errors stay out of the
+      # autotune parent process.
+      HELION_AUTOTUNE_PRECOMPILE: "fork"
+      HELION_AUTOTUNE_BENCHMARK_SUBPROCESS: "pool"
+      HELION_AUTOTUNE_POOL_REUSE_PROCESS: "0"
+      HELION_AUTOTUNE_POOL_REBENCHMARK_MODE: "grouped"
 
     container:
       image: ${{ inputs.image }}
diff --git a/helion/autotuner/benchmark_provider.py b/helion/autotuner/benchmark_provider.py
@@ -569,6 +569,12 @@ def setup(self) -> None:
             args_path = os.path.join(self._precompile_tmpdir.name, "args.pt")
             torch.save(self.args, args_path)
             self._precompile_args_path = args_path
+        if (
+            self._benchmark_via_pool()
+            and self.settings.autotune_precompile != "pool"
+            and self._pool_mode_unavailable_reason() is None
+        ):
+            self._ensure_pool_manager()
 
     def _next_precompile_result_path(self) -> str:
         """Return a fresh path for a precompile result file."""
@@ -606,10 +612,23 @@ def _needs_worker_args_file(self) -> bool:
 
     def _subprocess_benchmark_requested(self) -> bool:
         return (
-            self.settings.autotune_benchmark_subprocess
+            self._benchmark_subprocess_mode() != "off"
             or self.settings.autotune_precompile == "pool"
         )
 
+    def _benchmark_subprocess_mode(self) -> Literal["off", "single", "pool"]:
+        """Return the benchmark subprocess setting normalized to a string mode."""
+        configured = self.settings.autotune_benchmark_subprocess
+        # Legacy bool values: True means "single", False means "off".
+        if isinstance(configured, bool):
+            return "single" if configured else "off"
+        return configured
+
+    def _benchmark_via_pool(self) -> bool:
+        """Return True when benchmark timing should go through the worker pool."""
+        pool_precompile = self.settings.autotune_precompile == "pool"
+        return pool_precompile or self._benchmark_subprocess_mode() == "pool"
+
     def _subprocess_benchmark_unsupported_reason(self) -> str | None:
         if dist.is_initialized():
             return "distributed autotune"
diff --git a/helion/runtime/settings.py b/helion/runtime/settings.py
@@ -45,6 +45,7 @@ def __call__(
 DotPrecision = Literal["tf32", "tf32x3", "ieee"]
 PrecompileMode = Literal["spawn", "fork", "pool"] | None
 PoolRebenchmarkMode = Literal["grouped", "owner_isolated"]
+BenchmarkSubprocessMode = Literal["off", "single", "pool"] | bool
 _TRUE_LITERALS = frozenset({"1", "true", "yes", "on"})
 _FALSE_LITERALS = frozenset({"0", "false", "no", "off"})
 
@@ -381,9 +382,25 @@ class _Settings:
             _env_get_int, "HELION_AUTOTUNE_COMPILE_TIMEOUT", 60
         )
     )
-    autotune_benchmark_subprocess: bool = dataclasses.field(
+    autotune_benchmark_subprocess: BenchmarkSubprocessMode = dataclasses.field(
         default_factory=functools.partial(
-            _env_get_bool, "HELION_AUTOTUNE_BENCHMARK_SUBPROCESS", False
+            _env_get_literal,
+            "HELION_AUTOTUNE_BENCHMARK_SUBPROCESS",
+            cast("BenchmarkSubprocessMode", "off"),
+            mapping={
+                # Back-compat: bool-style env values keep their old meanings.
+                "0": "off",
+                "false": "off",
+                "no": "off",
+                "": "off",
+                "off": "off",
+                "1": "single",
+                "true": "single",
+                "yes": "single",
+                "on": "single",
+                "single": "single",
+                "pool": "pool",
+            },
         )
     )
     autotune_benchmark_timeout: int = dataclasses.field(
@@ -609,7 +626,7 @@ class Settings(_Settings):
             "/tmp/run.csv and /tmp/run.log with per-config metrics and debug logs."
         ),
         "autotune_compile_timeout": "Timeout for Triton compilation in seconds used for autotuning. Default is 60 seconds.",
-        "autotune_benchmark_subprocess": "Run the autotune benchmark phase in a long-lived spawn subprocess so a hung/slow kernel can be killed without losing autotune progress. Opt-in via HELION_AUTOTUNE_BENCHMARK_SUBPROCESS=1, or enabled implicitly by HELION_AUTOTUNE_PRECOMPILE=pool. Default disabled.",
+        "autotune_benchmark_subprocess": "How the autotune benchmark phase is dispatched. 'off' (default) runs in-process; 'single' (or HELION_AUTOTUNE_BENCHMARK_SUBPROCESS=1) launches a long-lived single benchmark subprocess; 'pool' routes timing through the worker pool. Implicitly 'pool' when HELION_AUTOTUNE_PRECOMPILE=pool.",
         "autotune_benchmark_timeout": "Per-config wall-clock timeout in seconds for the subprocess benchmark phase. Applies when autotune_benchmark_subprocess is enabled or autotune_precompile='pool'. Default 30 seconds.",
         "autotune_precompile": "Autotuner precompile mode: 'fork', 'spawn', 'pool', or falsy/None to disable. 'pool' uses long-lived spawn workers and implies subprocess benchmarking. Defaults to 'fork' on non-Windows platforms.",
         "autotune_precompile_jobs": "Maximum concurrent Triton precompile processes, default to cpu count.",
diff --git a/test/test_benchmark_worker.py b/test/test_benchmark_worker.py
@@ -220,6 +220,15 @@ def test_pool_mode_env_value_is_supported(self) -> None:
         with patch.dict(os.environ, {"HELION_AUTOTUNE_PRECOMPILE": "pool"}):
             self.assertEqual(Settings().autotune_precompile, "pool")
 
+    def test_benchmark_subprocess_pool_env_value_is_supported(self) -> None:
+        # Explicit "pool" and legacy bool-style env values map to string modes.
+        env_name = "HELION_AUTOTUNE_BENCHMARK_SUBPROCESS"
+        expected_modes = {"pool": "pool", "1": "single", "0": "off"}
+        for raw_value, expected_mode in expected_modes.items():
+            with patch.dict(os.environ, {env_name: raw_value}):
+                parsed = Settings().autotune_benchmark_subprocess
+                self.assertEqual(parsed, expected_mode)
+
     def test_pool_owner_rebenchmark_env_value_is_supported(self) -> None:
         # Owner-isolated pool rebenchmarking should stay opt-in behind its env flag.
         with patch.dict(
@@ -327,6 +336,28 @@ def test_pool_mode_implies_subprocess_benchmark(self) -> None:
 
         self.assertTrue(provider._subprocess_benchmark_enabled())
 
+    def test_fork_precompile_can_request_pool_benchmarking(self) -> None:
+        # Fork precompile plus pool benchmarking must enable both provider checks.
+        hybrid_settings = Settings(
+            autotune_benchmark_subprocess="pool",
+            autotune_precompile="fork",
+        )
+        provider = cast("Any", LocalBenchmarkProvider.__new__(LocalBenchmarkProvider))
+        provider.mutated_arg_indices = []
+        provider.config_spec = SimpleNamespace(backend=None)
+        provider.settings = hybrid_settings
+        self.assertTrue(provider._subprocess_benchmark_enabled())
+        self.assertTrue(provider._benchmark_via_pool())
+
+    def test_bool_benchmark_subprocess_values_keep_old_meaning(self) -> None:
+        provider = cast("Any", LocalBenchmarkProvider.__new__(LocalBenchmarkProvider))
+        for legacy, mode in ((False, "off"), (True, "single")):  # legacy bool -> mode
+            provider.settings = Settings(
+                autotune_precompile="fork", autotune_benchmark_subprocess=legacy
+            )
+            self.assertEqual(provider._benchmark_subprocess_mode(), mode)
+            self.assertIs(provider._subprocess_benchmark_requested(), legacy)
+
     def test_pool_mode_reports_disabled_worker_reason(self) -> None:
         # Explicitly disabled pool workers should fail with an actionable reason.
         provider = cast("Any", LocalBenchmarkProvider.__new__(LocalBenchmarkProvider))