InfiniTensor
diff --git a/ops/ninetoothed/kernels/addmm.py b/ops/ninetoothed/kernels/addmm.py
Lines changed: 32 additions & 50 deletions
diff --git a/ops/ninetoothed/kernels/bmm.py b/ops/ninetoothed/kernels/bmm.py
Lines changed: 33 additions & 70 deletions
diff --git a/ops/ninetoothed/kernels/conv2d.py b/ops/ninetoothed/kernels/conv2d.py
Lines changed: 36 additions & 60 deletions
@@ -40,57 +40,39 @@ def premake(m, n, k, dtype, block_size_m, block_size_n, block_size_k):
     return arrangement_, application, tensors
 
 
-# Compile the square shapes used by the benchmark; other shapes use the generic
-# make-based fallback below.
-configs = tuple(
-    (
-        (),
-        {
-            "m": s,
-            "n": s,
-            "k": s,
-            "dtype": ninetoothed.float16,
-            "block_size_m": bm,
-            "block_size_n": bn,
-            "block_size_k": bk,
-        },
-        {"num_warps": nw, "num_stages": 3},
+def _configs(m, n, k, dtype):
+    """Return the candidate build configs for one (m, n, k, dtype) problem.
+
+    The result is a tuple of 16 entries: every combination of
+    block_size_m in (64, 128), block_size_n in (64, 128),
+    block_size_k in (32, 64) and num_warps in (4, 8); num_stages is
+    always 3.
+
+    Each entry is a 3-tuple ``((), kwargs, options)`` where ``kwargs`` holds
+    the fixed shape/dtype plus one block-size choice and ``options`` holds
+    ``num_warps``/``num_stages``. The keys of ``kwargs`` match the parameter
+    names of ``premake``; presumably ``build`` forwards them to it — confirm
+    against ``_common.build``.
+    """
+    return tuple(
+        (
+            (),
+            {
+                "m": m,
+                "n": n,
+                "k": k,
+                "dtype": dtype,
+                "block_size_m": bm,
+                "block_size_n": bn,
+                "block_size_k": bk,
+            },
+            {"num_warps": nw, "num_stages": 3},
+        )
+        for bm in (64, 128)
+        for bn in (64, 128)
+        for bk in (32, 64)
+        for nw in (4, 8)
     )
-    for s in (128 * i for i in range(2, 33))
-    for bm in (64, 128)
-    for bn in (64, 128)
-    for bk in (32, 64)
-    for nw in (4, 8)
-)
-
-_build_kernel = build(
-    premake,
-    configs,
-    meta_parameters=("block_size_m", "block_size_n", "block_size_k"),
-    kernel_name="addmm",
-)
-
-_BUILD_CONFIGS = frozenset(
-    (kwargs["m"], kwargs["n"], kwargs["k"], kwargs["dtype"])
-    for _, kwargs, _ in configs
-)
-
-_fallback_kernel = ninetoothed.make(
-    arrangement,
-    application,
-    (
-        Tensor(2),
-        Tensor(2),
-        Tensor(2),
-        Tensor(0),
-        Tensor(0),
-        Tensor(2),
-    ),
-)
 
 
-def kernel(input, mat1, mat2, beta, alpha, output, m, n, k, dtype):
-    if (m, n, k, dtype) in _BUILD_CONFIGS:
-        return _build_kernel(input, mat1, mat2, beta, alpha, output, m, n, k, dtype)
+# The cache is unbounded: one entry is kept per distinct (m, n, k, dtype)
+# for the lifetime of the process, so build() runs once per key.
+@functools.cache
+def _kernel(m, n, k, dtype):
+    """Build the addmm kernel for one (m, n, k, dtype) problem.
+
+    Calls ``build`` with ``premake``, the configs from ``_configs`` and the
+    three block sizes declared as meta-parameters, and returns its result.
+    """
+    # NOTE(review): dtype is part of the cache key but is not encoded in
+    # kernel_name, so two dtypes with the same (m, n, k) share a name —
+    # confirm that build() tolerates this.
+    return build(
+        premake,
+        _configs(m, n, k, dtype),
+        meta_parameters=("block_size_m", "block_size_n", "block_size_k"),
+        kernel_name=f"addmm_{m}_{n}_{k}",
+    )
+
 
-    return _fallback_kernel(input, mat1, mat2, beta, alpha, output)
+def kernel(input, mat1, mat2, beta, alpha, output, m, n, k, dtype):
+    """Run addmm through the kernel built for this (m, n, k, dtype).
+
+    Fetches the kernel from ``_kernel`` (built on first use for a given key)
+    and calls it with the six operands followed by the shape and dtype
+    arguments, returning whatever that call returns.
+    """
+    return _kernel(m, n, k, dtype)(
+        input, mat1, mat2, beta, alpha, output, m, n, k, dtype
+    )
@@ -1,9 +1,8 @@
 import functools
 
-import ninetoothed
-from ninetoothed import Tensor, block_size
+from ninetoothed import Tensor
 
-from ops.ninetoothed.kernels._common import DTYPES, build
+from ops.ninetoothed.kernels._common import build
 from ops.ninetoothed.kernels.mm import application
 
 
@@ -33,87 +32,51 @@ def arrangement(
     return input_arranged, other_arranged, output_arranged
 
 
-def premake(k, n, dtype, block_size_m, block_size_n, block_size_k):
+def premake(batch, m, k, n, dtype, block_size_m, block_size_n, block_size_k):
+    """Return ``(arrangement, application, tensors)`` for a bmm kernel.
+
+    ``arrangement`` is pre-bound to the given block sizes, ``application`` is
+    the one imported from the mm kernel module, and ``tensors`` describes the
+    three operands with fully fixed shapes ``(batch, m, k)``,
+    ``(batch, k, n)`` and ``(batch, m, n)`` of the given dtype.
+    """
     arrangement_ = functools.partial(
         arrangement,
         block_size_m=block_size_m,
         block_size_n=block_size_n,
         block_size_k=block_size_k,
     )
-    shape_options = ({"upper_bound": 4}, None, None)
     tensors = (
-        Tensor(shape=(None, None, k), shape_options=shape_options, dtype=dtype),
-        Tensor(shape=(None, k, n), shape_options=shape_options, dtype=dtype),
-        Tensor(shape=(None, None, n), shape_options=shape_options, dtype=dtype),
+        Tensor(shape=(batch, m, k), dtype=dtype),
+        Tensor(shape=(batch, k, n), dtype=dtype),
+        Tensor(shape=(batch, m, n), dtype=dtype),
     )
 
     return arrangement_, application, tensors
 
 
-_SHAPES = (
-    (4096, 4096),
-    (4096, 1024),
-    (4096, 14336),
-    (14336, 4096),
-    (4096, 128256),
-)
-
-configs = tuple(
-    (
-        (),
-        {
-            "k": k,
-            "n": n,
-            "dtype": dtype,
-            "block_size_m": bm,
-            "block_size_n": bn,
-            "block_size_k": bk,
-        },
-        {"num_warps": nw, "num_stages": ns},
+def _configs(batch, m, k, n, dtype):
+    """Return the single build config for one (batch, m, k, n, dtype) problem.
+
+    The result is a one-element tuple whose entry is
+    ``((), kwargs, options)``: the fixed shape/dtype with hard-coded block
+    sizes 16/64/32, and ``num_warps=4``/``num_stages=3``. No alternatives are
+    enumerated here.
+    """
+    return (
+        (
+            (),
+            {
+                "batch": batch,
+                "m": m,
+                "k": k,
+                "n": n,
+                "dtype": dtype,
+                "block_size_m": 16,
+                "block_size_n": 64,
+                "block_size_k": 32,
+            },
+            {"num_warps": 4, "num_stages": 3},
+        ),
     )
-    for k, n in _SHAPES
-    for dtype in DTYPES
-    for bm in (16, 64)
-    for bn in (64, 128)
-    for bk in (32, 64)
-    for nw in (4, 8)
-    for ns in (3, 4)
-)
 
-_build_kernel = build(
-    premake,
-    configs,
-    meta_parameters=("block_size_m", "block_size_n", "block_size_k"),
-    kernel_name="bmm",
-)
-
-
-_BUILD_KN = frozenset(_SHAPES)
-
-
-_BLOCK_SIZE_M = block_size()
-_BLOCK_SIZE_N = block_size()
-_BLOCK_SIZE_K = block_size()
-
-
-def _fallback_arrangement(
-    input,
-    other,
-    output,
-    BLOCK_SIZE_M=_BLOCK_SIZE_M,
-    BLOCK_SIZE_N=_BLOCK_SIZE_N,
-    BLOCK_SIZE_K=_BLOCK_SIZE_K,
-):
-    return arrangement(input, other, output, BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K)
-
-
-_fallback_kernel = ninetoothed.make(
-    _fallback_arrangement, application, (Tensor(3), Tensor(3), Tensor(3))
-)
 
+# The cache is unbounded: one entry is kept per distinct
+# (batch, m, k, n, dtype) for the lifetime of the process.
+@functools.cache
+def _kernel(batch, m, k, n, dtype):
+    """Build the bmm kernel for one (batch, m, k, n, dtype) problem.
+
+    Calls ``build`` with ``premake`` and the single config from ``_configs``
+    and returns its result.
+    """
+    # NOTE(review): no meta_parameters are passed here; presumably that is
+    # why kernel() must supply the block sizes itself — confirm against
+    # _common.build.
+    # NOTE(review): dtype is part of the cache key but not of kernel_name —
+    # confirm that build() tolerates two dtypes sharing a name.
+    return build(
+        premake,
+        _configs(batch, m, k, n, dtype),
+        kernel_name=f"bmm_{batch}_{m}_{k}_{n}",
+    )
 
-def kernel(lhs, rhs, output, k, n, dtype):
-    if (k, n) in _BUILD_KN:
-        return _build_kernel(lhs, rhs, output, k, n, dtype)
 
-    return _fallback_kernel(lhs, rhs, output)
+def kernel(lhs, rhs, output, batch, m, k, n, dtype):
+    """Run bmm through the kernel built for this (batch, m, k, n, dtype).
+
+    Fetches the kernel from ``_kernel`` (built on first use for a given key)
+    and calls it with the three operands, the shape and dtype arguments, and
+    five trailing constants, returning whatever that call returns.
+    """
+    # NOTE(review): 16, 64, 32, 4, 3 repeat the block sizes and
+    # num_warps/num_stages hard-coded in _configs; the two places must be
+    # kept in sync — confirm the expected argument order against build().
+    return _kernel(batch, m, k, n, dtype)(
+        lhs, rhs, output, batch, m, k, n, dtype, 16, 64, 32, 4, 3
+    )
@@ -1,6 +1,5 @@
 import functools
 
-import ninetoothed
 from ninetoothed import Tensor
 
 import ops.ninetoothed.kernels.mm as mm
@@ -31,10 +30,8 @@ def premake(n, c, h, w, k, r, s, dtype, block_size_m, block_size_n, block_size_k
         block_size_n=block_size_n,
         block_size_k=block_size_k,
     )
-
     p = h - r + 1
     q = w - s + 1
-
     tensors = (
         Tensor(shape=(n, c, h, w), dtype=dtype),
         Tensor(shape=(k, c, r, s), dtype=dtype),
@@ -44,65 +41,44 @@ def premake(n, c, h, w, k, r, s, dtype, block_size_m, block_size_n, block_size_k
     return arrangement_, mm.application, tensors
 
 
-# Block sweep approximating the JIT auto-tuner default range. Conv2d's im2col
-# arrangement produces a tall-skinny matmul, so wider bn (up to 256) and
-# longer-pipelined num_stages help match the JIT-tuned kernel.
-configs = tuple(
-    (
-        (),
-        {
-            "n": n,
-            "c": 512,
-            "h": 14,
-            "w": 14,
-            "k": 512,
-            "r": 3,
-            "s": 3,
-            "dtype": ninetoothed.float16,
-            "block_size_m": bm,
-            "block_size_n": bn,
-            "block_size_k": bk,
-        },
-        {"num_warps": 8, "num_stages": ns},
+def _configs(n, c, h, w, k, r, s, dtype):
+    """Return the candidate build configs for one conv2d problem.
+
+    Enumerates block_size_m in (64, 128), block_size_n in (128, 256),
+    block_size_k in (32, 64) and num_stages in (3, 5), with num_warps fixed
+    at 8, keeping only combinations whose pairwise block-size products are
+    at most 32768. With the current candidate values the largest product is
+    128 * 256 = 32768, so all 16 combinations are kept.
+
+    Each entry is a 3-tuple ``((), kwargs, options)``; the keys of ``kwargs``
+    match the parameter names of ``premake`` — presumably ``build`` forwards
+    them to it; confirm against ``_common.build``.
+    """
+    return tuple(
+        (
+            (),
+            {
+                "n": n,
+                "c": c,
+                "h": h,
+                "w": w,
+                "k": k,
+                "r": r,
+                "s": s,
+                "dtype": dtype,
+                "block_size_m": bm,
+                "block_size_n": bn,
+                "block_size_k": bk,
+            },
+            {"num_warps": 8, "num_stages": ns},
+        )
+        for bm in (64, 128)
+        for bn in (128, 256)
+        for bk in (32, 64)
+        for ns in (3, 5)
+        if bm * bn <= 32768 and bm * bk <= 32768 and bn * bk <= 32768
     )
-    for n in (2, 4, 8, 16, 32, 64, 128, 256, 512, 1024)
-    for bm in (64, 128)
-    for bn in (128, 256)
-    for bk in (32, 64)
-    for ns in (3, 5)
-    if bm * bn <= 32768 and bm * bk <= 32768 and bn * bk <= 32768
-)
-
-_build_kernel = build(
-    premake,
-    configs,
-    meta_parameters=("block_size_m", "block_size_n", "block_size_k"),
-    kernel_name="conv2d",
-)
-
-_BUILD_CONFIGS = frozenset(
-    (
-        kwargs["n"],
-        kwargs["c"],
-        kwargs["h"],
-        kwargs["w"],
-        kwargs["k"],
-        kwargs["r"],
-        kwargs["s"],
-        kwargs["dtype"],
-    )
-    for _, kwargs, _ in configs
-)
 
-_fallback_kernel = ninetoothed.make(
-    arrangement,
-    mm.application,
-    tuple(Tensor(4, shape_options={"constexpr": True}) for _ in range(3)),
-)
 
+# The cache is unbounded: one entry is kept per distinct
+# (n, c, h, w, k, r, s, dtype) for the lifetime of the process.
+@functools.cache
+def _kernel(n, c, h, w, k, r, s, dtype):
+    """Build the conv2d kernel for one (n, c, h, w, k, r, s, dtype) problem.
+
+    Calls ``build`` with ``premake``, the configs from ``_configs`` and the
+    three block sizes declared as meta-parameters, and returns its result.
+    """
+    # NOTE(review): dtype is part of the cache key but is not encoded in
+    # kernel_name — confirm that build() tolerates two dtypes sharing a name.
+    return build(
+        premake,
+        _configs(n, c, h, w, k, r, s, dtype),
+        meta_parameters=("block_size_m", "block_size_n", "block_size_k"),
+        kernel_name=f"conv2d_{n}_{c}_{h}_{w}_{k}_{r}_{s}",
+    )
 
-def kernel(input, filter, output, n, c, h, w, k, r, s, dtype):
-    if (n, c, h, w, k, r, s, dtype) in _BUILD_CONFIGS:
-        return _build_kernel(input, filter, output, n, c, h, w, k, r, s, dtype)
 
-    return _fallback_kernel(input, filter, output)
+def kernel(input, filter, output, n, c, h, w, k, r, s, dtype):
+    """Run conv2d through the kernel built for this shape and dtype.
+
+    Fetches the kernel from ``_kernel`` (built on first use for a given key)
+    and calls it with the three operands followed by the shape and dtype
+    arguments, returning whatever that call returns.
+    """
+    return _kernel(n, c, h, w, k, r, s, dtype)(
+        input, filter, output, n, c, h, w, k, r, s, dtype
+    )