Skip to content

Commit d308a73

Browse files
committed
[Autotuner] Add pool benchmark subprocess mode
1 parent 4929c17 commit d308a73

4 files changed

Lines changed: 78 additions & 8 deletions

File tree

.github/workflows/benchmark.yml

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,13 @@ jobs:
3939

4040
env:
4141
HELION_AUTOTUNE_LOG_LEVEL: INFO
42-
# Run perf CI through the long-lived worker pool so CUDA sticky errors
43-
# stay isolated from the autotune parent process.
44-
HELION_AUTOTUNE_PRECOMPILE: "pool"
45-
HELION_AUTOTUNE_POOL_REUSE_PROCESS: "1"
42+
# Keep fork precompile for B200 compile time, but route benchmark timing
43+
# through an isolated pool worker so CUDA sticky errors stay out of the
44+
# autotune parent process.
45+
HELION_AUTOTUNE_PRECOMPILE: "fork"
46+
HELION_AUTOTUNE_BENCHMARK_SUBPROCESS: "pool"
47+
HELION_AUTOTUNE_POOL_REUSE_PROCESS: "0"
48+
HELION_AUTOTUNE_POOL_REBENCHMARK_MODE: "grouped"
4649

4750
container:
4851
image: ${{ inputs.image }}

helion/autotuner/benchmark_provider.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -569,6 +569,12 @@ def setup(self) -> None:
569569
args_path = os.path.join(self._precompile_tmpdir.name, "args.pt")
570570
torch.save(self.args, args_path)
571571
self._precompile_args_path = args_path
572+
if (
573+
self._benchmark_via_pool()
574+
and self.settings.autotune_precompile != "pool"
575+
and self._pool_mode_unavailable_reason() is None
576+
):
577+
self._ensure_pool_manager()
572578

573579
def _next_precompile_result_path(self) -> str:
574580
"""Return a fresh path for a precompile result file."""
@@ -606,10 +612,23 @@ def _needs_worker_args_file(self) -> bool:
606612

607613
def _subprocess_benchmark_requested(self) -> bool:
608614
return (
609-
self.settings.autotune_benchmark_subprocess
615+
self._benchmark_subprocess_mode() != "off"
610616
or self.settings.autotune_precompile == "pool"
611617
)
612618

619+
def _benchmark_subprocess_mode(self) -> Literal["off", "single", "pool"]:
    """Return the benchmark-subprocess setting as a mode string.

    Reads ``self.settings.autotune_benchmark_subprocess``, which may hold
    either a mode string or a legacy bool-style value, and normalizes it:

    * a string is returned unchanged (expected to be "off", "single" or
      "pool");
    * any other value is mapped by truthiness -- truthy becomes "single",
      falsy becomes "off" -- so ``True``/``False`` keep their old on/off
      meaning.

    Returns:
        The normalized mode.
    """
    mode = self.settings.autotune_benchmark_subprocess
    if isinstance(mode, str):
        # NOTE(review): strings are passed through unvalidated; presumably
        # the settings layer only ever produces the three documented modes.
        return mode
    # Falsy non-bool values such as None or 0 must not leak through: callers
    # compare the result against "off", so they would read as "enabled".
    return "single" if mode else "off"
626+
627+
def _benchmark_via_pool(self) -> bool:
    """Report whether benchmark timing should be routed through the pool.

    Returns:
        True when ``self.settings.autotune_precompile`` is "pool", or when
        the benchmark subprocess mode is "pool"; False otherwise.
    """
    # Pool precompile is checked first, so the benchmark subprocess mode is
    # only consulted when precompile is not "pool".
    return (
        self.settings.autotune_precompile == "pool"
        or self._benchmark_subprocess_mode() == "pool"
    )
631+
613632
def _subprocess_benchmark_unsupported_reason(self) -> str | None:
614633
if dist.is_initialized():
615634
return "distributed autotune"

helion/runtime/settings.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ def __call__(
4545
DotPrecision = Literal["tf32", "tf32x3", "ieee"]
4646
PrecompileMode = Literal["spawn", "fork", "pool"] | None
4747
PoolRebenchmarkMode = Literal["grouped", "owner_isolated"]
48+
BenchmarkSubprocessMode = Literal["off", "single", "pool"] | bool
4849
_TRUE_LITERALS = frozenset({"1", "true", "yes", "on"})
4950
_FALSE_LITERALS = frozenset({"0", "false", "no", "off"})
5051

@@ -381,9 +382,25 @@ class _Settings:
381382
_env_get_int, "HELION_AUTOTUNE_COMPILE_TIMEOUT", 60
382383
)
383384
)
384-
autotune_benchmark_subprocess: bool = dataclasses.field(
385+
autotune_benchmark_subprocess: BenchmarkSubprocessMode = dataclasses.field(
385386
default_factory=functools.partial(
386-
_env_get_bool, "HELION_AUTOTUNE_BENCHMARK_SUBPROCESS", False
387+
_env_get_literal,
388+
"HELION_AUTOTUNE_BENCHMARK_SUBPROCESS",
389+
cast("BenchmarkSubprocessMode", "off"),
390+
mapping={
391+
# Back-compat: bool-style env values keep their old meanings.
392+
"0": "off",
393+
"false": "off",
394+
"no": "off",
395+
"": "off",
396+
"off": "off",
397+
"1": "single",
398+
"true": "single",
399+
"yes": "single",
400+
"on": "single",
401+
"single": "single",
402+
"pool": "pool",
403+
},
387404
)
388405
)
389406
autotune_benchmark_timeout: int = dataclasses.field(
@@ -609,7 +626,7 @@ class Settings(_Settings):
609626
"/tmp/run.csv and /tmp/run.log with per-config metrics and debug logs."
610627
),
611628
"autotune_compile_timeout": "Timeout for Triton compilation in seconds used for autotuning. Default is 60 seconds.",
612-
"autotune_benchmark_subprocess": "Run the autotune benchmark phase in a long-lived spawn subprocess so a hung/slow kernel can be killed without losing autotune progress. Opt-in via HELION_AUTOTUNE_BENCHMARK_SUBPROCESS=1, or enabled implicitly by HELION_AUTOTUNE_PRECOMPILE=pool. Default disabled.",
629+
"autotune_benchmark_subprocess": "How the autotune benchmark phase is dispatched. 'off' (default) runs in-process; 'single' (or HELION_AUTOTUNE_BENCHMARK_SUBPROCESS=1) launches a long-lived single benchmark subprocess; 'pool' routes timing through the worker pool. Implicitly 'pool' when HELION_AUTOTUNE_PRECOMPILE=pool.",
613630
"autotune_benchmark_timeout": "Per-config wall-clock timeout in seconds for the subprocess benchmark phase. Applies when autotune_benchmark_subprocess is enabled or autotune_precompile='pool'. Default 30 seconds.",
614631
"autotune_precompile": "Autotuner precompile mode: 'fork', 'spawn', 'pool', or falsy/None to disable. 'pool' uses long-lived spawn workers and implies subprocess benchmarking. Defaults to 'fork' on non-Windows platforms.",
615632
"autotune_precompile_jobs": "Maximum concurrent Triton precompile processes, default to cpu count.",

test/test_benchmark_worker.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,15 @@ def test_pool_mode_env_value_is_supported(self) -> None:
220220
with patch.dict(os.environ, {"HELION_AUTOTUNE_PRECOMPILE": "pool"}):
221221
self.assertEqual(Settings().autotune_precompile, "pool")
222222

223+
def test_benchmark_subprocess_pool_env_value_is_supported(self) -> None:
    """HELION_AUTOTUNE_BENCHMARK_SUBPROCESS accepts mode names and bool-style values."""
    # Env value -> expected setting. The bool-style spellings must keep
    # resolving to their old on/off meanings alongside the explicit modes.
    expected_by_env_value = {
        "pool": "pool",
        "single": "single",
        "1": "single",
        "off": "off",
        "0": "off",
    }
    for env_value, expected in expected_by_env_value.items():
        with self.subTest(env_value=env_value):
            with patch.dict(
                os.environ, {"HELION_AUTOTUNE_BENCHMARK_SUBPROCESS": env_value}
            ):
                self.assertEqual(Settings().autotune_benchmark_subprocess, expected)
231+
223232
def test_pool_owner_rebenchmark_env_value_is_supported(self) -> None:
224233
# Owner-isolated pool rebenchmarking should stay opt-in behind its env flag.
225234
with patch.dict(
@@ -327,6 +336,28 @@ def test_pool_mode_implies_subprocess_benchmark(self) -> None:
327336

328337
self.assertTrue(provider._subprocess_benchmark_enabled())
329338

339+
def test_fork_precompile_can_request_pool_benchmarking(self) -> None:
    """Hybrid mode: fork precompile combined with pool-routed benchmarking."""
    # Build the provider without running __init__ and set only the
    # attributes assigned by the original test.
    provider = cast("Any", LocalBenchmarkProvider.__new__(LocalBenchmarkProvider))
    provider.settings = Settings(
        autotune_precompile="fork",
        autotune_benchmark_subprocess="pool",
    )
    provider.config_spec = SimpleNamespace(backend=None)
    provider.mutated_arg_indices = []

    self.assertEqual(provider._benchmark_subprocess_mode(), "pool")
    self.assertTrue(provider._subprocess_benchmark_enabled())
    self.assertTrue(provider._benchmark_via_pool())
351+
352+
def test_bool_benchmark_subprocess_values_keep_old_meaning(self) -> None:
    """Legacy bool settings still mean off (False) and single subprocess (True)."""
    # Cover both bool values: False must not request a subprocess, and True
    # must resolve to the "single" mode and request one.
    for flag, expected_mode in ((False, "off"), (True, "single")):
        with self.subTest(autotune_benchmark_subprocess=flag):
            provider = cast(
                "Any", LocalBenchmarkProvider.__new__(LocalBenchmarkProvider)
            )
            provider.settings = Settings(
                autotune_precompile="fork",
                autotune_benchmark_subprocess=flag,
            )

            self.assertEqual(provider._benchmark_subprocess_mode(), expected_mode)
            self.assertEqual(bool(provider._subprocess_benchmark_requested()), flag)
360+
330361
def test_pool_mode_reports_disabled_worker_reason(self) -> None:
331362
# Explicitly disabled pool workers should fail with an actionable reason.
332363
provider = cast("Any", LocalBenchmarkProvider.__new__(LocalBenchmarkProvider))

0 commit comments

Comments
 (0)