Skip to content

Commit ad2f500

Browse files
rascani and claude authored
Right-size operator registry from selective build metadata (#19118)
### Summary

The runtime kernel registry is a fixed-size static array whose default capacity (MAX_KERNEL_NUM=2000) permanently occupies ~24 KiB of BSS on 32-bit machines even when only a handful of kernels are actually registered. When selective build is active the exact set of kernels needed by the model is already known at build time, so the registry can be sized to fit.

A new codegen tool (gen_max_kernel_num.py) counts the (op, kernel_key) tuples in selected_operators.yaml, adds the prim ops registered by register_prim_ops.cpp, and writes the total into a generated header. operator_registry.cpp picks the header up via __has_include. A user-supplied -DMAX_KERNEL_NUM still takes precedence, and builds that don't use selective build keep the 2000 default.

The example below uses examples/selective_build/basic (two selected ops) on a 64-bit machine. The registry's BSS footprint drops from 48000 B (2000 slots) to 840 B (35 slots).

Fixes #18618

Example generated header: executorch/runtime/kernel/selected_max_kernel_num.h

```
// @generated by executorch/codegen/tools/gen_max_kernel_num.py. Do not edit.
#pragma once
#define EXECUTORCH_SELECTED_MAX_KERNEL_NUM 27
```

### Test plan

```
# 1. Export a tiny model
python -m examples.portable.scripts.export --model_name="add_mul"

# 2. Auto-sized (no MAX_KERNEL_NUM flag)
cmake -DCMAKE_BUILD_TYPE=Release \
  -DEXECUTORCH_SELECT_OPS_MODEL="./add_mul.pte" \
  -B/tmp/sb_auto examples/selective_build/basic
cmake --build /tmp/sb_auto -j$(nproc)

# 3. User override
cmake -DCMAKE_BUILD_TYPE=Release \
  -DEXECUTORCH_SELECT_OPS_MODEL="./add_mul.pte" \
  -DMAX_KERNEL_NUM=100 \
  -B/tmp/sb_pin examples/selective_build/basic
cmake --build /tmp/sb_pin -j$(nproc)

# 4. Baseline (no selective build)
cmake -DCMAKE_BUILD_TYPE=Release -B/tmp/sb_default .
cmake --build /tmp/sb_default --target executorch_core -j$(nproc)
```

```
echo "=== auto-sized ==="
cat /tmp/sb_auto/executorch/executorch_selected_kernels/executorch/runtime/kernel/selected_max_kernel_num.h
nm -S /tmp/sb_auto/executorch/CMakeFiles/executorch_core.dir/runtime/kernel/operator_registry.cpp.o \
  | grep registered_kernels_data
/tmp/sb_auto/selective_build_test --model_path=./add_mul.pte 2>&1 | tail -1

echo "=== user override (MAX_KERNEL_NUM=100) ==="
find /tmp/sb_pin -name selected_max_kernel_num.h 2>/dev/null || echo "(no header generated — expected)"
nm -S /tmp/sb_pin/executorch/CMakeFiles/executorch_core.dir/runtime/kernel/operator_registry.cpp.o \
  | grep registered_kernels_data

echo "=== baseline (no selective build) ==="
nm -S /tmp/sb_default/CMakeFiles/executorch_core.dir/runtime/kernel/operator_registry.cpp.o \
  | grep registered_kernels_data
```

Expected output (the BSS size is the hex column after the address):

```
=== auto-sized ===
#define EXECUTORCH_SELECTED_MAX_KERNEL_NUM 27
0... 0000000000000288 b ...registered_kernels_data...   # 648 B = 27 × 24
OutputX 0: tensor(sizes=[2, 2], [3., 3., 3., 3.])
=== user override (MAX_KERNEL_NUM=100) ===
(no header generated — expected)
0... 0000000000000960 b ...registered_kernels_data...   # 2400 B = 100 × 24
=== baseline (no selective build) ===
0... 000000000000bb80 b ...registered_kernels_data...   # 48000 B = 2000 × 24
```

Co-authored-by: Claude <noreply@anthropic.com>
1 parent 922adad commit ad2f500

7 files changed

Lines changed: 552 additions & 2 deletions

File tree

CMakeLists.txt

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1228,6 +1228,32 @@ if(NOT EXECUTORCH_SELECT_OPS_YAML STREQUAL ""
12281228
)
12291229
list(APPEND _executorch_kernels executorch_selected_kernels)
12301230

1231+
# Auto-right-size the kernel registry unless the user has pinned
1232+
# MAX_KERNEL_NUM.
1233+
if(NOT DEFINED CACHE{MAX_KERNEL_NUM} AND NOT DEFINED MAX_KERNEL_NUM)
1234+
gen_selected_max_kernel_num(
1235+
LIB_NAME "executorch_selected_kernels" OPLIST_YAMLS
1236+
${gen_selected_ops_output_yaml}
1237+
)
1238+
target_include_directories(
1239+
executorch_core
1240+
PRIVATE ${executorch_selected_kernels_max_kernel_num_include_dir}
1241+
)
1242+
add_dependencies(
1243+
executorch_core executorch_selected_kernels_max_kernel_num_header
1244+
)
1245+
if(TARGET executorch_core_shared)
1246+
target_include_directories(
1247+
executorch_core_shared
1248+
PRIVATE ${executorch_selected_kernels_max_kernel_num_include_dir}
1249+
)
1250+
add_dependencies(
1251+
executorch_core_shared
1252+
executorch_selected_kernels_max_kernel_num_header
1253+
)
1254+
endif()
1255+
endif()
1256+
12311257
install(
12321258
TARGETS executorch_selected_kernels
12331259
EXPORT ExecuTorchTargets
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""
8+
Compute a right-sized MAX_KERNEL_NUM for the ExecuTorch operator registry from
9+
one or more selected_operators.yaml files (produced by gen_oplist.py) and emit
10+
it as a C header.
11+
12+
Total = sum of (op, kernel_key) variants across all input YAMLs
13+
+ prim ops always registered by kernels/prim_ops/register_prim_ops.cpp.
14+
15+
See runtime/kernel/operator_registry.cpp for how the emitted header is
16+
consumed and the full precedence order. Users that register kernels outside
17+
the selective-build YAML should pin the registry explicitly with
18+
-DMAX_KERNEL_NUM=N.
19+
"""
20+
21+
import argparse
22+
import re
23+
import sys
24+
from pathlib import Path
25+
from typing import Any, Dict, List, Optional
26+
27+
import yaml
28+
29+
30+
# Header written when a concrete kernel count is available. The `{count}`
# placeholder is filled in with str.format().
HEADER_TEMPLATE = (
    "// @generated by executorch/codegen/tools/gen_max_kernel_num.py. Do not edit.\n"
    "#pragma once\n"
    "#define EXECUTORCH_SELECTED_MAX_KERNEL_NUM {count}\n"
)

# Header written when a YAML opts into include_all_operators. CMake's
# add_custom_command still expects *some* output file to exist, so we emit one,
# but it deliberately does not define EXECUTORCH_SELECTED_MAX_KERNEL_NUM; that
# lets operator_registry.cpp fall through to its compile-time default.
OPT_OUT_HEADER = (
    "// @generated by executorch/codegen/tools/gen_max_kernel_num.py. Do not edit.\n"
    "#pragma once\n"
    "// Selective build opted into all operators; registry uses compile-time default.\n"
)
46+
# Locates the `static Kernel prim_ops[] = { ... };` array literal. The count
47+
# lives in the array itself (`kernel_span` uses `sizeof(prim_ops)/sizeof(Kernel)`
48+
# at compile time), so we just bracket-match the array body and count Kernel(
49+
# entries inside it, ignoring the rest of the file.
50+
PRIM_OPS_ARRAY_RE = re.compile(
51+
r"static\s+Kernel\s+prim_ops\s*\[\s*\]\s*=\s*\{(.*?)^\};",
52+
re.DOTALL | re.MULTILINE,
53+
)
54+
PRIM_OPS_KERNEL_RE = re.compile(r"\bKernel\s*\(")
55+
56+
57+
def _count_prim_ops(prim_ops_source: Path) -> int:
58+
source = prim_ops_source.read_text()
59+
match = PRIM_OPS_ARRAY_RE.search(source)
60+
if match is None:
61+
raise RuntimeError(
62+
f"Failed to locate `static Kernel prim_ops[] = {{ ... }};` in "
63+
f"{prim_ops_source}. The array may have been renamed; update "
64+
"PRIM_OPS_ARRAY_RE in gen_max_kernel_num.py."
65+
)
66+
count = len(PRIM_OPS_KERNEL_RE.findall(match.group(1)))
67+
if count == 0:
68+
raise RuntimeError(
69+
f"Found `prim_ops[]` in {prim_ops_source} but it contains zero "
70+
"Kernel(...) entries. The array layout may have changed."
71+
)
72+
return count
73+
74+
75+
def _count_yaml_kernels(yaml_path: Path) -> Optional[int]:
    """Return the number of registry slots implied by one selected-ops YAML.

    Args:
        yaml_path: Path to a selected_operators.yaml file.

    Returns:
        The kernel count for this YAML, or ``None`` if the YAML sets
        ``include_all_operators`` or any operator sets
        ``include_all_overloads`` (callers should skip the auto-size header
        in that case).

    Raises:
        RuntimeError: If the document's top level is not a mapping.
    """
    # Explicit encoding: do not depend on the build machine's locale.
    with open(yaml_path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}
    if not isinstance(data, dict):
        raise RuntimeError(
            f"Expected a top-level mapping in {yaml_path}, got "
            f"{type(data).__name__}."
        )

    if data.get("include_all_operators"):
        return None

    operators: Dict[str, Dict[str, Any]] = data.get("operators") or {}
    if any(
        isinstance(op_info, dict) and op_info.get("include_all_overloads")
        for op_info in operators.values()
    ):
        return None

    et_kernel_metadata: Dict[str, List[str]] = data.get("et_kernel_metadata") or {}

    # One slot per listed variant; an entry that is empty or not a list is
    # counted as a single slot.
    count = sum(
        len(variants) if isinstance(variants, list) and variants else 1
        for variants in et_kernel_metadata.values()
    )

    # Operators listed but missing from et_kernel_metadata still register one
    # default kernel each.
    count += sum(1 for op_name in operators if op_name not in et_kernel_metadata)

    return count
108+
109+
110+
def _write_if_different(path: Path, content: str) -> None:
111+
if path.exists() and path.read_text() == content:
112+
return
113+
path.parent.mkdir(parents=True, exist_ok=True)
114+
path.write_text(content)
115+
116+
117+
def gen_max_kernel_num(
    oplist_yamls: List[Path],
    prim_ops_source: Path,
    output_path: Path,
) -> Optional[int]:
    """Emit the registry-size header for a set of selected-ops YAML files.

    The YAMLs are processed in order. As soon as one of them opts into all
    operators, the opt-out header is written and processing stops. Otherwise
    the header defines the sum of all YAML kernel counts plus the prim op
    count.

    Args:
        oplist_yamls: selected_operators.yaml files to count.
        prim_ops_source: C++ source containing the ``prim_ops[]`` array.
        output_path: Header file to write (only touched if its content
            would change).

    Returns:
        The total written into the header, or ``None`` when the opt-out
        header was emitted instead.
    """
    per_yaml_counts: List[int] = []
    for yaml_path in oplist_yamls:
        kernels_in_yaml = _count_yaml_kernels(yaml_path)
        if kernels_in_yaml is not None:
            per_yaml_counts.append(kernels_in_yaml)
            continue

        # This YAML selects everything, so no right-sized count exists.
        print(
            f"gen_max_kernel_num: {yaml_path} opts into all operators; "
            "emitting opt-out header (registry will use default size).",
            file=sys.stderr,
        )
        _write_if_different(output_path, OPT_OUT_HEADER)
        return None

    registry_size = sum(per_yaml_counts) + _count_prim_ops(prim_ops_source)
    header_text = HEADER_TEMPLATE.format(count=registry_size)
    _write_if_different(output_path, header_text)
    return registry_size
139+
140+
141+
def main(argv: List[str]) -> None:
    """Parse command-line arguments and generate the header.

    Args:
        argv: Command-line arguments, excluding the program name.
    """
    # (flag spellings, add_argument keyword options), in help-display order.
    # Each option accepts both a dashed and an underscored spelling.
    option_specs = (
        (
            ("--oplist-yaml", "--oplist_yaml"),
            {
                "action": "append",
                "required": True,
                "help": "Path to a selected_operators.yaml. May be repeated.",
            },
        ),
        (
            ("--prim-ops-source", "--prim_ops_source"),
            {
                "required": True,
                "help": "Path to kernels/prim_ops/register_prim_ops.cpp.",
            },
        ),
        (
            ("--output-path", "--output_path"),
            {
                "required": True,
                "help": "Path to the header file to emit.",
            },
        ),
    )

    parser = argparse.ArgumentParser(description=__doc__)
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)
    args = parser.parse_args(argv)

    yaml_paths = [Path(p) for p in args.oplist_yaml]
    count = gen_max_kernel_num(
        oplist_yamls=yaml_paths,
        prim_ops_source=Path(args.prim_ops_source),
        output_path=Path(args.output_path),
    )
    if count is None:
        # The opt-out case has already been reported by gen_max_kernel_num.
        return
    print(f"gen_max_kernel_num: wrote {args.output_path} (count={count})")
171+
172+
173+
# Script entry point: pass the command-line arguments (without the program
# name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])

0 commit comments

Comments
 (0)