Skip to content

Commit 4f2640d

Browse files
committed
Add max_pool2d operator
1 parent 10c2cac commit 4f2640d

File tree

5 files changed

+247
-0
lines changed

5 files changed

+247
-0
lines changed

src/ntops/kernels/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
layer_norm,
2121
le,
2222
lt,
23+
max_pool2d,
2324
mm,
2425
mul,
2526
ne,
@@ -60,6 +61,7 @@
6061
"layer_norm",
6162
"le",
6263
"lt",
64+
"max_pool2d",
6365
"mm",
6466
"mul",
6567
"ne",

src/ntops/kernels/max_pool2d.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
import functools
2+
3+
import ninetoothed
4+
import ninetoothed.language as ntl
5+
from ninetoothed import Symbol, Tensor
6+
7+
BLOCK_SIZE = ninetoothed.block_size()
8+
9+
KERNEL_SIZE_H = Symbol("kernel_size_h", constexpr=True, upper_bound=16)
10+
KERNEL_SIZE_W = Symbol("kernel_size_w", constexpr=True, upper_bound=16)
11+
STRIDE_H = Symbol("stride_h", constexpr=True)
12+
STRIDE_W = Symbol("stride_w", constexpr=True)
13+
PADDING_H = Symbol("padding_h", constexpr=True)
14+
PADDING_W = Symbol("padding_w", constexpr=True)
15+
DILATION_H = Symbol("dilation_h", constexpr=True)
16+
DILATION_W = Symbol("dilation_w", constexpr=True)
17+
18+
19+
def arrangement(
    input,
    output,
    kernel_size_h=None,
    kernel_size_w=None,
    stride_h=None,
    stride_w=None,
    padding_h=None,
    padding_w=None,
    dilation_h=None,
    dilation_w=None,
    ceil_mode=None,
    block_size=None,
):
    """Arrange input/output tensors for the max_pool2d kernel.

    Any pooling parameter left as ``None`` falls back to the module-level
    compile-time symbol of the same name; ``ceil_mode`` defaults to False
    and ``block_size`` to the auto-tuned ``BLOCK_SIZE``.
    """
    kernel_size_h = KERNEL_SIZE_H if kernel_size_h is None else kernel_size_h
    kernel_size_w = KERNEL_SIZE_W if kernel_size_w is None else kernel_size_w
    stride_h = STRIDE_H if stride_h is None else stride_h
    stride_w = STRIDE_W if stride_w is None else stride_w
    padding_h = PADDING_H if padding_h is None else padding_h
    padding_w = PADDING_W if padding_w is None else padding_w
    dilation_h = DILATION_H if dilation_h is None else dilation_h
    dilation_w = DILATION_W if dilation_w is None else dilation_w
    ceil_mode = False if ceil_mode is None else ceil_mode
    block_size = BLOCK_SIZE if block_size is None else block_size

    # Pad H and W, then tile the padded input into pooling windows.
    # NOTE(review): strides of -1 on the N/C axes and floor_mode=not ceil_mode
    # follow the ninetoothed tiling API — confirm against its docs.
    windows = input.pad(
        ((0, 0), (0, 0), (padding_h, padding_h), (padding_w, padding_w))
    )
    windows = windows.tile(
        (1, 1, kernel_size_h, kernel_size_w),
        strides=(-1, -1, stride_h, stride_w),
        dilation=(1, 1, dilation_h, dilation_w),
        floor_mode=not ceil_mode,
    )
    windows = windows.ravel()
    windows = windows.flatten(end_dim=4).flatten(start_dim=1)
    windows = windows.tile((block_size, -1))

    # One output element per pooling window, grouped the same way so the
    # kernel sees matching (block, window) layouts on both sides.
    results = output.tile((1, 1, 1, 1))
    results = results.ravel()
    results = results.flatten(end_dim=4).flatten(start_dim=1)
    results = results.tile((block_size, -1))
    results.dtype = results.dtype.squeeze(1)

    return windows, results
83+
84+
85+
def application(input, output):
    """Reduce each flattened pooling window to its maximum value.

    NOTE(review): the assignment to the `output` parameter looks dead
    (hence the noqa F841), but ninetoothed presumably compiles this
    function from its source and treats writes to `output` as stores
    into the arranged output tensor — confirm against the ninetoothed
    application-function convention before restructuring.
    """
    output = ntl.max(input, axis=1)  # noqa: F841
87+
88+
89+
def premake(
    kernel_size_h=None,
    kernel_size_w=None,
    stride_h=None,
    stride_w=None,
    padding_h=None,
    padding_w=None,
    dilation_h=None,
    dilation_w=None,
    ceil_mode=None,
    dtype=None,
    block_size=None,
):
    """Build the (arrangement, application, tensors) triple for kernel creation.

    All pooling parameters are forwarded to :func:`arrangement`, where
    ``None`` values fall back to compile-time symbols.
    """
    pooling_params = dict(
        kernel_size_h=kernel_size_h,
        kernel_size_w=kernel_size_w,
        stride_h=stride_h,
        stride_w=stride_w,
        padding_h=padding_h,
        padding_w=padding_w,
        dilation_h=dilation_h,
        dilation_w=dilation_w,
        ceil_mode=ceil_mode,
        block_size=block_size,
    )
    bound_arrangement = functools.partial(arrangement, **pooling_params)

    # Out-of-bounds reads of the (padded) input yield -inf so they can
    # never win the max reduction.
    input_tensor = Tensor(4, dtype=dtype, other=float("-inf"))
    output_tensor = Tensor(4, dtype=dtype)

    return bound_arrangement, application, (input_tensor, output_tensor)

src/ntops/torch/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from ntops.torch.le import le
2121
from ntops.torch.lt import lt
2222
from ntops.torch.matmul import matmul
23+
from ntops.torch.max_pool2d import max_pool2d
2324
from ntops.torch.mm import mm
2425
from ntops.torch.mul import mul
2526
from ntops.torch.ne import ne
@@ -60,6 +61,7 @@
6061
"le",
6162
"lt",
6263
"matmul",
64+
"max_pool2d",
6365
"mm",
6466
"mul",
6567
"ne",

src/ntops/torch/max_pool2d.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import math
2+
3+
import torch
4+
5+
import ntops
6+
from ntops.torch.utils import _cached_make
7+
8+
9+
def max_pool2d(
    input,
    kernel_size,
    stride=None,
    padding=0,
    dilation=1,
    ceil_mode=False,
    return_indices=False,
):
    """Apply 2D max pooling over an NCHW `input`.

    Mirrors `torch.nn.functional.max_pool2d`: `kernel_size`, `stride`,
    `padding`, and `dilation` each accept an int or an (h, w) pair, and
    `stride` defaults to `kernel_size`.

    Args:
        input: 4D tensor of shape (n, c, h, w).
        kernel_size: Pooling window size, int or (h, w) pair.
        stride: Window step, int or (h, w) pair; defaults to `kernel_size`.
        padding: Implicit negative-infinity padding, int or (h, w) pair.
        dilation: Spacing between window elements, int or (h, w) pair.
        ceil_mode: Use ceil instead of floor for the output size.
        return_indices: Not supported yet; must be False.

    Returns:
        A new tensor of shape (n, c, h_out, w_out) with the pooled maxima.
    """
    # Accept an int for any (h, w) parameter, like the PyTorch API does.
    # kernel_size must be normalized too: it is indexed below, and
    # `stride = kernel_size` must yield a pair.
    if isinstance(kernel_size, int):
        kernel_size = (kernel_size, kernel_size)

    # PyTorch semantics: stride defaults to the kernel size.
    if stride is None:
        stride = kernel_size

    if isinstance(stride, int):
        stride = (stride, stride)

    if isinstance(padding, int):
        padding = (padding, padding)

    if isinstance(dilation, int):
        dilation = (dilation, dilation)

    assert not return_indices, "`return_indices == True` is not supported yet."

    n, c, h, w = input.shape

    def _calculate_output_size(
        input_size, kernel_size, stride, padding, dilation, ceil_mode
    ):
        # Standard pooling shape formula; ceil_mode rounds up instead of down.
        int_ = math.ceil if ceil_mode else math.floor

        result = int_(
            (input_size + 2 * padding - dilation * (kernel_size - 1) - 1) / stride + 1
        )

        # With ceil_mode, drop the last window if it would start entirely
        # inside the trailing padding (matches PyTorch's pooling shape rule).
        if ceil_mode and (result - 1) * stride >= input_size + padding:
            result -= 1

        return result

    h_ = _calculate_output_size(
        h, kernel_size[0], stride[0], padding[0], dilation[0], ceil_mode
    )
    w_ = _calculate_output_size(
        w, kernel_size[1], stride[1], padding[1], dilation[1], ceil_mode
    )

    output = torch.empty((n, c, h_, w_), dtype=input.dtype, device=input.device)

    # ceil_mode is baked into the cached kernel; the remaining pooling
    # parameters are passed at launch time.
    kernel = _cached_make(ntops.kernels.max_pool2d.premake, ceil_mode=ceil_mode)

    kernel(
        input,
        output,
        kernel_size_h=kernel_size[0],
        kernel_size_w=kernel_size[1],
        stride_h=stride[0],
        stride_w=stride[1],
        padding_h=padding[0],
        padding_w=padding[1],
        dilation_h=dilation[0],
        dilation_w=dilation[1],
    )

    return output

tests/test_max_pool2d.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import pytest
2+
import torch
3+
import torch.nn.functional as F
4+
5+
import ntops
6+
from tests.skippers import skip_if_cuda_not_available
7+
8+
9+
@skip_if_cuda_not_available
@pytest.mark.parametrize("device", ("cuda",))
@pytest.mark.parametrize("dtype", (torch.float32, torch.float16))
@pytest.mark.parametrize("ceil_mode", (False, True))
@pytest.mark.parametrize("dilation", (1, 2, (2, 3)))
@pytest.mark.parametrize("padding", (0, 1, (2, 3)))
@pytest.mark.parametrize("stride", (None, 1, (2, 3)))
@pytest.mark.parametrize("kernel_size", ((1, 1), (3, 3)))
@pytest.mark.parametrize("n, c, h, w", ((2, 3, 112, 112),))
def test_max_pool2d(
    n, c, h, w, kernel_size, stride, padding, dilation, ceil_mode, dtype, device
):
    """Compare ntops.torch.max_pool2d against torch.nn.functional.max_pool2d."""
    pad_h, pad_w = (padding, padding) if isinstance(padding, int) else padding

    # PyTorch rejects padding larger than half the kernel size; skip those
    # parameter combinations instead of failing.
    if pad_h > kernel_size[0] / 2 or pad_w > kernel_size[1] / 2:
        pytest.skip(reason="Invalid padding.")

    input = torch.randn((n, c, h, w), dtype=dtype, device=device)

    pool_kwargs = dict(
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
        ceil_mode=ceil_mode,
    )

    ninetoothed_output = ntops.torch.max_pool2d(input, **pool_kwargs)
    reference_output = F.max_pool2d(input, **pool_kwargs)

    assert torch.allclose(ninetoothed_output, reference_output)

0 commit comments

Comments
 (0)