Skip to content

Commit 3ce8881

Browse files
committed
Specialize AOT strides and sizes as int32 with int64 fallback
The AOT path previously declared all size and stride parameters as `int64` in the Triton signature, causing the compiled kernel to use 64-bit integer arithmetic throughout the address computation chain. For typical tensor dimensions (< 2^31 elements), `int32` suffices and matches what Triton's JIT path auto-selects, yielding significantly fewer PTX 64-bit instructions (1246 → 9 in conv2d, ~1.46× speedup). This change introduces an `index_dtype` axis to variant enumeration: all divisibility × contiguity combinations use `int32` by default, and a single `int64` fallback variant with no hints is appended. The C++ dispatcher checks whether any shape or stride value exceeds `int32` range before dispatching to an `int32` variant; if overflow is detected, it falls back to the `int64` kernel.
1 parent d1c79e9 commit 3ce8881

1 file changed

Lines changed: 66 additions & 13 deletions

File tree

src/ninetoothed/aot.py

Lines changed: 66 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -81,10 +81,18 @@ def _find_tensor_by_source_name(tensors, name):
8181
variant_specs = _enumerate_variant_specs(
8282
launch_arg_names, tensors, _find_tensor_by_source_name
8383
)
84+
_, tensor_ndims, _ = _per_tensor_dim_options(
85+
launch_arg_names, tensors, _find_tensor_by_source_name
86+
)
8487

8588
output_contents = {}
8689

87-
for variant_suffix, divisibility_spec, contiguity_spec in variant_specs:
90+
for (
91+
variant_suffix,
92+
divisibility_spec,
93+
contiguity_spec,
94+
index_dtype,
95+
) in variant_specs:
8896
variant_outputs = _build_variant(
8997
source_file,
9098
kernel_func,
@@ -99,11 +107,12 @@ def _find_tensor_by_source_name(tensors, name):
99107
num_stages=num_stages,
100108
divisibility_spec=divisibility_spec,
101109
contiguity_spec=contiguity_spec,
110+
index_dtype=index_dtype,
102111
)
103112
output_contents.update(variant_outputs)
104113

105114
dispatcher_source, dispatcher_header = _generate_dispatcher(
106-
kernel_name, launch_arg_names, variant_specs
115+
kernel_name, launch_arg_names, variant_specs, tensor_ndims
107116
)
108117

109118
output_contents[f"{kernel_name}.cpp"] = dispatcher_source
@@ -112,7 +121,7 @@ def _find_tensor_by_source_name(tensors, name):
112121
return output_contents
113122

114123

115-
def _generate_dispatcher(kernel_name, launch_arg_names, variant_specs):
124+
def _generate_dispatcher(kernel_name, launch_arg_names, variant_specs, tensor_ndims):
116125
tensor_params = ", ".join(f"NineToothedTensor {name}" for name in launch_arg_names)
117126
signature_params = (
118127
f"NineToothedStream stream, {tensor_params}"
@@ -138,14 +147,25 @@ def _generate_dispatcher(kernel_name, launch_arg_names, variant_specs):
138147
externs = []
139148
branches = []
140149

141-
for variant_suffix, divisibility_spec, contiguity_spec in variant_specs:
150+
fallback_call = None
151+
152+
for (
153+
variant_suffix,
154+
divisibility_spec,
155+
contiguity_spec,
156+
index_dtype,
157+
) in variant_specs:
142158
variant_name = f"launch_{kernel_name}_{variant_suffix}"
143159
externs.append(
144160
f'extern "C" NineToothedResult {variant_name}({signature_params});'
145161
)
146162

147163
call = f"return {variant_name}({call_args});"
148164

165+
if index_dtype == ninetoothed.dtype.int64:
166+
fallback_call = call
167+
continue
168+
149169
checks = tuple(
150170
f"{name}.shape[{dim}] % 16 == 0" for name, dim in divisibility_spec
151171
) + tuple(f"{name}.strides[{dim}] == 1" for name, dim in contiguity_spec)
@@ -155,11 +175,26 @@ def _generate_dispatcher(kernel_name, launch_arg_names, variant_specs):
155175
else:
156176
branches.append(f"{_INDENTATION}{call}")
157177

178+
prelude_lines = []
179+
if fallback_call is not None and launch_arg_names:
180+
overflow_terms = []
181+
for name, ndim in zip(launch_arg_names, tensor_ndims):
182+
for d in range(ndim):
183+
overflow_terms.append(f"{name}.shape[{d}] > 2147483647ULL")
184+
overflow_terms.append(f"{name}.strides[{d}] > 2147483647LL")
185+
overflow_terms.append(f"{name}.strides[{d}] < -2147483648LL")
186+
if overflow_terms:
187+
prelude_lines.append(
188+
f"{_INDENTATION}if ({' || '.join(overflow_terms)}) {fallback_call}"
189+
)
190+
191+
body_lines = prelude_lines + branches
192+
158193
source = (
159194
f'#include "{_HEADER_PATH}"\n\n'
160195
+ "\n".join(externs)
161196
+ f'\n\nextern "C" {signature} {{\n'
162-
+ "\n".join(branches)
197+
+ "\n".join(body_lines)
163198
+ "\n}\n"
164199
)
165200

@@ -181,6 +216,7 @@ def _build_variant(
181216
num_stages,
182217
divisibility_spec,
183218
contiguity_spec,
219+
index_dtype=ninetoothed.dtype.int32,
184220
):
185221
divisibility_set = {
186222
(naming.remove_prefixes(name), dim) for name, dim in divisibility_spec
@@ -211,9 +247,9 @@ def _build_variant(
211247
bare_source_name = naming.remove_prefixes(source_name)
212248

213249
if (bare_source_name, dim_index) in divisibility_set:
214-
param_types.append(f"{ninetoothed.dtype.int64}:16")
250+
param_types.append(f"{index_dtype}:16")
215251
else:
216-
param_types.append(ninetoothed.dtype.int64)
252+
param_types.append(index_dtype)
217253
elif match := Tensor.stride_pattern().fullmatch(param):
218254
source_name = match.group(1)
219255
dim_index = int(match.group(3))
@@ -224,7 +260,7 @@ def _build_variant(
224260
constexpr_param_indices.append(len(param_types) - 1)
225261
constexpr_strides.append((source_name, dim_index))
226262
else:
227-
param_types.append(f"{ninetoothed.dtype.int64}:16")
263+
param_types.append(f"{index_dtype}:16")
228264
else:
229265
source_name = param
230266
tensor = find_tensor(tensors, source_name)
@@ -331,15 +367,21 @@ def _spec_from_combo(combo):
331367
for divisibility_spec in dim_specs:
332368
for contiguity_spec in dim_specs:
333369
suffix = _variant_suffix(
334-
divisibility_spec, contiguity_spec, launch_arg_names, tensor_ndims
370+
divisibility_spec,
371+
contiguity_spec,
372+
launch_arg_names,
373+
tensor_ndims,
374+
index_dtype=ninetoothed.dtype.int32,
375+
)
376+
specs.append(
377+
(suffix, divisibility_spec, contiguity_spec, ninetoothed.dtype.int32)
335378
)
336-
specs.append((suffix, divisibility_spec, contiguity_spec))
337379

338380
def _num_innermost(spec):
339381
return sum(1 for name, dim in spec if innermost_dims.get(name) == dim)
340382

341383
def _specificity(entry):
342-
_, divisibility_spec, contiguity_spec = entry
384+
_, divisibility_spec, contiguity_spec, _ = entry
343385

344386
return (
345387
-len(divisibility_spec),
@@ -350,6 +392,11 @@ def _specificity(entry):
350392

351393
specs.sort(key=_specificity)
352394

395+
fallback_suffix = _variant_suffix(
396+
(), (), launch_arg_names, tensor_ndims, index_dtype=ninetoothed.dtype.int64
397+
)
398+
specs.append((fallback_suffix, (), (), ninetoothed.dtype.int64))
399+
353400
return specs
354401

355402

@@ -381,15 +428,21 @@ def _per_tensor_dim_options(launch_arg_names, tensors, find_tensor):
381428
return per_tensor_dims, tensor_ndims, innermost_dims
382429

383430

384-
def _variant_suffix(divisibility_spec, contiguity_spec, launch_arg_names, tensor_ndims):
431+
def _variant_suffix(
432+
divisibility_spec,
433+
contiguity_spec,
434+
launch_arg_names,
435+
tensor_ndims,
436+
index_dtype=ninetoothed.dtype.int32,
437+
):
385438
divisibility_part = _divisibility_suffix(
386439
divisibility_spec, launch_arg_names, tensor_ndims
387440
)
388441
contiguity_part = _contiguity_suffix(
389442
contiguity_spec, launch_arg_names, tensor_ndims
390443
)
391444

392-
return f"{divisibility_part}_{contiguity_part}"
445+
return f"{divisibility_part}_{contiguity_part}_index_{index_dtype}"
393446

394447

395448
def _divisibility_suffix(divisibility_spec, launch_arg_names, tensor_ndims):

0 commit comments

Comments (0)