Merge branch 'master' of github.com:InfiniTensor/ninetoothed-examples into end-to-end-model-inference

voltjia · voltjia · commit 19dbae536e34 · 2025-05-10T12:10:29.000+08:00
diff --git a/attention.py b/attention.py
@@ -63,7 +63,10 @@ def application(q, k, v, scale, o):
     o = acc  # noqa: F841
 
 
-q, k, v, o = (Tensor(4, constexpr_shape=True) for _ in range(4))
+q, k, v, o = (
+    Tensor(4, shape_options=(None, None, None, {"constexpr": True, "upper_bound": 128}))
+    for _ in range(4)
+)
 attention_kernel = ninetoothed.make(arrangement, application, (q, k, v, Tensor(0), o))
 
 
diff --git a/conv2d.py b/conv2d.py
@@ -23,11 +23,14 @@ def arrangement(input, filter, output):
     return matmul.arrangement(input_flattened, filter_permuted, output_flattened)
 
 
-conv2d_kernel = ninetoothed.make(
-    arrangement,
-    matmul.application,
-    (Tensor(4), Tensor(4, constexpr_shape=True), Tensor(4)),
+filter_shape_options = (
+    None,
+    None,
+    {"constexpr": True, "upper_bound": 16},
+    {"constexpr": True, "upper_bound": 16},
 )
+tensors = (Tensor(4), Tensor(4, shape_options=filter_shape_options), Tensor(4))
+conv2d_kernel = ninetoothed.make(arrangement, matmul.application, tensors)
 
 
 def conv2d(input, filter):
diff --git a/max_pool2d.py b/max_pool2d.py
@@ -11,8 +11,8 @@
 def arrangement(input, output):
     BLOCK_SIZE = Symbol("BLOCK_SIZE", meta=True)
 
-    WINDOW_HEIGHT = Symbol("WINDOW_HEIGHT", constexpr=True)
-    WINDOW_WIDTH = Symbol("WINDOW_WIDTH", constexpr=True)
+    WINDOW_HEIGHT = Symbol("WINDOW_HEIGHT", constexpr=True, upper_bound=16)
+    WINDOW_WIDTH = Symbol("WINDOW_WIDTH", constexpr=True, upper_bound=16)
 
     input_arranged = input.tile((1, 1, WINDOW_HEIGHT, WINDOW_WIDTH))
     input_arranged = input_arranged.ravel()
diff --git a/swiglu.py b/swiglu.py
@@ -6,89 +6,61 @@
 import triton.language as tl
 from ninetoothed import Symbol, Tensor
 
-BLOCK_SIZE_M = Symbol("BLOCK_SIZE_M", meta=True)
-BLOCK_SIZE_N = Symbol("BLOCK_SIZE_N", meta=True)
+BLOCK_SIZE = Symbol("BLOCK_SIZE", meta=True)
 
 
 @ninetoothed.jit
 def swiglu_kernel(
-    a: Tensor(2).tile((BLOCK_SIZE_M, BLOCK_SIZE_N)),
-    b: Tensor(2).tile((BLOCK_SIZE_M, BLOCK_SIZE_N)),
-    c: Tensor(2).tile((BLOCK_SIZE_M, BLOCK_SIZE_N)),
+    a: Tensor(1).tile((BLOCK_SIZE,)),
+    b: Tensor(1).tile((BLOCK_SIZE,)),
+    c: Tensor(1).tile((BLOCK_SIZE,)),
 ):
     b_loaded = b
     gate = b_loaded * ntl.sigmoid(ntl.cast(b_loaded, ntl.float32))
     c = a * gate  # noqa: F841
 
 
-def ninetoothed_swiglu(a, b):
-    c = torch.empty_like(a)
+def swiglu(a, b):
+    a_1d = a.flatten()
+    b_1d = b.flatten()
 
-    swiglu_kernel(a, b, c)
+    c = torch.empty_like(a_1d)
 
-    return c
+    swiglu_kernel(a_1d, b_1d, c)
+
+    return c.view_as(a)
 
 
 @triton.jit
 def triton_swiglu_kernel(
-    a_ptr,
-    b_ptr,
-    c_ptr,
-    m,
-    n,
-    a_stride_m,
-    a_stride_n,
-    b_stride_m,
-    b_stride_n,
-    c_stride_m,
-    c_stride_n,
-    BLOCK_SIZE: tl.constexpr,
+    a_ptr, b_ptr, c_ptr, num_elements: tl.constexpr, BLOCK_SIZE: tl.constexpr
 ):
     pid = tl.program_id(0)
-    block_start = pid * BLOCK_SIZE
-    offsets = block_start + tl.arange(0, BLOCK_SIZE)
-
-    rows = offsets // n
-    cols = offsets % n
-
-    mask = (rows < m) & (cols < n)
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < num_elements
 
-    a_offsets = rows * a_stride_m + cols * a_stride_n
-    b_offsets = rows * b_stride_m + cols * b_stride_n
-    c_offsets = rows * c_stride_m + cols * c_stride_n
-
-    a = tl.load(a_ptr + a_offsets, mask=mask, other=0.0)
-    b = tl.load(b_ptr + b_offsets, mask=mask, other=0.0)
+    a = tl.load(a_ptr + offsets, mask=mask, other=0.0)
+    b = tl.load(b_ptr + offsets, mask=mask, other=0.0)
 
     silu_b = b * tl.sigmoid(tl.cast(b, tl.float32))
     c = a * silu_b
 
-    tl.store(c_ptr + c_offsets, c, mask=mask)
+    tl.store(c_ptr + offsets, c, mask=mask)
 
 
 def triton_swiglu(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
-    m, n = a.shape
-    c = torch.empty_like(a)
+    # Flatten the inputs so that the kernel always works on 1D tensors
+    a_flat = a.flatten()
+    b_flat = b.flatten()
+    c_flat = torch.empty_like(a_flat)
+    num_elements = a_flat.numel()
 
     def grid(meta):
-        return (triton.cdiv(m * n, meta["BLOCK_SIZE"]),)
-
-    triton_swiglu_kernel[grid](
-        a,
-        b,
-        c,
-        m,
-        n,
-        a.stride(0),
-        a.stride(1),
-        b.stride(0),
-        b.stride(1),
-        c.stride(0),
-        c.stride(1),
-        BLOCK_SIZE=1024,
-    )
+        return (triton.cdiv(num_elements, meta["BLOCK_SIZE"]),)
+
+    triton_swiglu_kernel[grid](a_flat, b_flat, c_flat, num_elements, BLOCK_SIZE=1024)
 
-    return c
+    return c_flat.view_as(a)
 
 
 def torch_swiglu(
@@ -108,7 +80,7 @@ def torch_swiglu(
     b = torch.rand(shape, dtype=dtype, device=device)
     c = torch.rand(shape, dtype=dtype, device=device)
 
-    ninetoothed_output = ninetoothed_swiglu(a, b)
+    ninetoothed_output = swiglu(a, b)
     torch_output = torch_swiglu(a, b)
     triton_output = triton_swiglu(a, b)
     print(ninetoothed_output)
@@ -149,7 +121,7 @@ def benchmark(m, n, provider):
 
         if provider == "ninetoothed":
             ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: ninetoothed_swiglu(a, b), quantiles=quantiles
+                lambda: swiglu(a, b), quantiles=quantiles
             )
         elif provider == "torch":
             ms, min_ms, max_ms = triton.testing.do_bench(