[lang] Support unary float math operations

ashermancinelli · ashermancinelli · commit 14e93532bf0a · 2026-06-09T08:03:19.000-07:00
This MR does not expose all the math operations from cuda.tile, just an
initial subset of unary floating-point math operations. They are also
not imported by default, because I am not yet sure how we want to
expose the math operations.

Signed-off-by: Asher Mancinelli <amancinelli@nvidia.com>
diff --git a/experimental/cuda-lang/src/cuda/lang/_datatype.py b/experimental/cuda-lang/src/cuda/lang/_datatype.py
@@ -31,6 +31,8 @@
     is_boolean,
     is_integral,
     is_signed,
+    unsigned_integral_dtypes,
+    signed_integral_dtypes,
     get_signedness,
     default_int_type,
     integer_dtype,
@@ -98,6 +100,8 @@ def to_torch_dtype(dtype: DType, /):
     "opaque_pointer_dtype",
     "get_signedness",
     "integer_dtype",
+    "unsigned_integral_dtypes",
+    "signed_integral_dtypes",
     "bool_",
     "uint8",
     "uint16",
diff --git a/experimental/cuda-lang/src/cuda/lang/_ir/ops.py b/experimental/cuda-lang/src/cuda/lang/_ir/ops.py
@@ -84,9 +84,15 @@
 )
 from cuda.tile._exception import TileValueError
 import cuda.lang._mlir as mlir
-from .type_checking_helpers import require_array_indices, require_scalar_type, \
-    require_pointer_type, require_signed_int_scalar_or_tuple, \
-    require_clusterlaunchcontrol_token_type
+from .type_checking_helpers import (
+    require_array_indices,
+    require_scalar_or_vector_float_type,
+    require_scalar_or_vector_type,
+    require_scalar_type,
+    require_pointer_type,
+    require_signed_int_scalar_or_tuple,
+    require_clusterlaunchcontrol_token_type,
+)
 
 from .type import (
     LocalArrayContextManagerTy, ContextManagerState, TensorMapTy,
@@ -111,6 +117,7 @@
     format_var,
     LocalArrayContextManagerValue,
 )
+from .._stub import math as cl_math
 from .._stub.cluster_launch_control import clusterlaunchcontrol_try_cancel, \
     clusterlaunchcontrol_is_canceled, clusterlaunchcontrol_get_first_block_idx
 from .._stub.tensor_map import TensorMapSwizzle
@@ -1163,6 +1170,58 @@ class RawMLIROperation(Operation, opcode="mlir.operation",
     mlir_attributes: tuple[tuple[str, mlir.Attribute], ...] = attribute(default=())
 
 
+def _get_dtype(ty: ScalarTy | VectorTy):
+    """Return the element dtype carried by a scalar or vector type.
+
+    For a ``ScalarTy`` this is its ``dtype``; for a ``VectorTy`` it is its
+    ``element_dtype``.
+
+    Raises:
+        AssertionError: if ``ty`` is neither a ``ScalarTy`` nor a ``VectorTy``.
+    """
+    match ty:
+        case ScalarTy() as st:
+            return st.dtype
+        case VectorTy() as vt:
+            return vt.element_dtype
+        case _:
+            # Raise explicitly rather than ``assert False``: assert statements
+            # are stripped under ``python -O``, which would make this branch
+            # fall through and silently return None.
+            raise AssertionError("Match should have been exhaustive")
+
+
+@impl(cl_math.ceil, fixed_args=["math.ceil"])
+@impl(cl_math.sin, fixed_args=["math.sin"])
+@impl(cl_math.cos, fixed_args=["math.cos"])
+@impl(cl_math.tan, fixed_args=["math.tan"])
+@impl(cl_math.sinh, fixed_args=["math.sinh"])
+@impl(cl_math.cosh, fixed_args=["math.cosh"])
+@impl(cl_math.tanh, fixed_args=["math.tanh"])
+@impl(cl_math.sqrt, fixed_args=["math.sqrt"])
+@impl(cl_math.floor, fixed_args=["math.floor"])
+@impl(cl_math.log, fixed_args=["math.log"])
+@impl(cl_math.log2, fixed_args=["math.log2"])
+def math_float_unary_impl(op_name: str, x: Var):
+    """Emit a raw MLIR operation for a unary floating-point math stub.
+
+    ``x`` must be a scalar or vector of a float dtype; the emitted operation
+    is named ``op_name`` and its result type is the operand's type.
+
+    NOTE(review): each ``@impl`` registration passes an MLIR op name through
+    ``fixed_args``; presumably that value is bound to ``op_name`` -- confirm
+    against ``impl``.
+    """
+    # Type validation doubles as result-type computation: the result has the
+    # same type as the operand.
+    operand_ty = require_scalar_or_vector_float_type(x)
+    return add_operation(RawMLIROperation, operand_ty, op_name=op_name, operands_=(x,))
+
+
+@impl(cl_math.abs)
+def abs_impl(x: Var) -> Var:
+    """Emit the absolute value of a scalar or vector operand.
+
+    Float dtypes produce a ``math.absf`` operation and signed integer dtypes
+    a ``math.absi`` operation, each with the operand's type as result type.
+    Unsigned integer operands are returned unchanged and no operation is
+    added.
+
+    Raises:
+        TileTypeError: if the element dtype is neither float nor integral.
+            Non scalar/vector operands are rejected earlier by
+            ``require_scalar_or_vector_type``.
+    """
+    x_ty = require_scalar_or_vector_type(x)
+    x_dtype = _get_dtype(x_ty)
+    if datatype.is_float(x_dtype):
+        op_name = "math.absf"
+    elif datatype.is_integral(x_dtype):
+        # If it's unsigned, then the absolute value is the identity
+        if not datatype.is_signed(x_dtype):
+            return x
+        op_name = "math.absi"
+    else:
+        # Vectors are accepted as well as scalars, so the message names both.
+        raise TileTypeError(f"abs() expects an arithmetic scalar or vector, got {x_ty}")
+    return add_operation(
+        RawMLIROperation,
+        x_ty,
+        op_name=op_name,
+        operands_=(x,),
+    )
+
+
 def _is_none(var: Var):
     return var.is_constant() and var.get_constant() is None
 
diff --git a/experimental/cuda-lang/src/cuda/lang/_ir/type_checking_helpers.py b/experimental/cuda-lang/src/cuda/lang/_ir/type_checking_helpers.py
@@ -9,7 +9,7 @@
 from cuda.tile._ir.ops import implicit_cast
 from cuda.tile._ir.type import TupleTy, TupleValue
 from cuda.tile._datatype import is_integral, is_signed
-from cuda.lang._datatype import clusterlaunchcontrol_token
+from cuda.lang._datatype import clusterlaunchcontrol_token, is_float
 
 
 def require_array_indices(array: Var, indices: Var) -> tuple[Var, ...]:
@@ -85,6 +85,35 @@ def require_vector_type(var: Var, length: int | None = None) -> VectorTy:
     return ty
 
 
+def require_scalar_or_vector_float_type(var: Var) -> VectorTy | ScalarTy:
+    """Return the type of ``var`` after checking that it is float-valued.
+
+    Accepts a ``ScalarTy`` whose ``dtype`` is a float dtype or a ``VectorTy``
+    whose ``element_dtype`` is a float dtype. Anything else is reported
+    through ``make_type_checking_error``.
+    """
+    var_ty = var.get_type()
+
+    def _mismatch():
+        # Built lazily so the message is only formatted on the failure path.
+        return make_type_checking_error(
+            f"Expected a scalar or vector float type, but got {var_ty}", var
+        )
+
+    if isinstance(var_ty, ScalarTy):
+        element_dtype = var_ty.dtype
+    elif isinstance(var_ty, VectorTy):
+        element_dtype = var_ty.element_dtype
+    else:
+        raise _mismatch()
+
+    if is_float(element_dtype):
+        return var_ty
+    raise _mismatch()
+
+
+def require_scalar_or_vector_type(var: Var) -> VectorTy | ScalarTy:
+    """Return the type of ``var``, which must be a ``ScalarTy`` or ``VectorTy``.
+
+    Any other type is reported through ``make_type_checking_error``.
+    """
+    var_ty = var.get_type()
+    if isinstance(var_ty, (ScalarTy, VectorTy)):
+        return var_ty
+    raise make_type_checking_error(f"Expected scalar or vector type but got {var_ty}", var)
+
+
 def make_type_checking_error(message: str, culprit: Var | None = None):
     # TODO: recover the context similarly to _make_type_error in cutile
     raise TileTypeError(message)
diff --git a/experimental/cuda-lang/src/cuda/lang/_stub/math.py b/experimental/cuda-lang/src/cuda/lang/_stub/math.py
@@ -0,0 +1,46 @@
+# SPDX-FileCopyrightText: Copyright (c) <2026> NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+from cuda.lang._execution import stub
+# Math function stubs. The bodies are intentionally empty (`...`).
+# NOTE(review): the implementations appear to be attached separately through
+# `@impl(...)` registrations in `_ir/ops.py` -- confirm against `stub`/`impl`.
+#
+# Unary floating-point operations.
+@stub
+def ceil(x, /): ...
+@stub
+def sin(x, /): ...
+@stub
+def cos(x, /): ...
+@stub
+def tan(x, /): ...
+@stub
+def sinh(x, /): ...
+@stub
+def cosh(x, /): ...
+@stub
+def tanh(x, /): ...
+@stub
+def sqrt(x, /): ...
+@stub
+def floor(x, /): ...
+@stub
+def log(x, /): ...
+@stub
+def log2(x, /): ...
+# Absolute value. Note that this name shadows the builtin `abs` inside this
+# module.
+@stub
+def abs(x, /): ...
+
+
+# Public names of this module (what `from ... import *` exports).
+__all__ = (
+    "ceil",
+    "sin",
+    "cos",
+    "tan",
+    "sinh",
+    "cosh",
+    "tanh",
+    "sqrt",
+    "floor",
+    "log",
+    "log2",
+    "abs",
+)
diff --git a/experimental/cuda-lang/test/test_math.py b/experimental/cuda-lang/test/test_math.py
@@ -0,0 +1,110 @@
+# SPDX-FileCopyrightText: Copyright (c) <2026> NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import cuda.lang as cl
+import cuda.lang._datatype as datatype
+import builtins
+import math as host_math
+import torch
+import pytest
+from cuda.lang import compile_simt
+from cuda.lang._stub import math as device_math
+from cuda.lang.compilation import KernelSignature
+from cuda.lang._exception import TileTypeError
+
+
+# Float dtypes every unary float operation is exercised with.
+FLOAT_TYPES = (
+    cl.float16,
+    cl.float32,
+    cl.float64,
+)
+# Integer dtype collections re-exported by cuda.lang._datatype.
+SIGNED_INT_TYPES = datatype.signed_integral_dtypes
+UNSIGNED_INT_TYPES = datatype.unsigned_integral_dtypes
+
+# (device stub, host reference) pairs: the device result is compared against
+# the host function evaluated on the same input.
+UNARY_FLOAT_OPS = (
+    (device_math.ceil, host_math.ceil),
+    (device_math.sin, host_math.sin),
+    (device_math.cos, host_math.cos),
+    (device_math.tan, host_math.tan),
+    (device_math.sinh, host_math.sinh),
+    (device_math.cosh, host_math.cosh),
+    (device_math.tanh, host_math.tanh),
+    (device_math.sqrt, host_math.sqrt),
+    (device_math.floor, host_math.floor),
+    (device_math.log, host_math.log),
+    (device_math.log2, host_math.log2),
+    # abs has no counterpart in the host `math` module; use the builtin.
+    (device_math.abs, builtins.abs),
+)
+
+
+@pytest.mark.parametrize("dtype", FLOAT_TYPES)
+@pytest.mark.parametrize("device_op, host_op", UNARY_FLOAT_OPS)
+def test_math_unary_float(dtype, device_op, host_op):
+    """Check each unary float op on the device against its host reference.
+
+    A single seeded random input in [0.5, 1.5) is written to a one-element
+    CUDA tensor of ``dtype``; the kernel applies ``device_op`` to it and the
+    result is compared with ``host_op`` within a 1e-3 tolerance.
+    """
+    rng = torch.Generator().manual_seed(0)
+
+    @cl.kernel
+    def kernel(inp, out):
+        out[0] = device_op(inp[0])
+
+    torch_dt = datatype.to_torch_dtype(dtype)
+    # Offset by 0.5 so the input is strictly positive (sqrt/log are defined).
+    host_inp = torch.rand((), generator=rng).item() + 0.5
+    inp = torch.tensor([host_inp], dtype=torch_dt, device="cuda")
+    out = torch.tensor([0.0], dtype=torch_dt, device="cuda")
+    # Evaluate the reference on the value the kernel actually reads, i.e.
+    # after rounding to ``dtype``. Using the unrounded float64 input would
+    # fold the input's rounding error (significant for float16) into the
+    # comparison.
+    expected = host_op(inp[0].item())
+    cl.launch(torch.cuda.current_stream(), (1,), (1,), kernel, (inp, out))
+    assert out[0].item() == pytest.approx(expected, rel=1e-3, abs=1e-3)
+
+
+@pytest.mark.parametrize("dtype", SIGNED_INT_TYPES)
+@pytest.mark.parametrize("host_inp", (-5, 0, 5))
+def test_math_abs_signed_int(dtype, host_inp):
+    """abs() on signed integers must agree with the builtin ``abs`` on the host."""
+
+    @cl.kernel
+    def kernel(inp, out):
+        out[0] = device_math.abs(dtype(inp[0]))
+
+    want = builtins.abs(host_inp)
+    tensor_dt = datatype.to_torch_dtype(dtype)
+    # One-element input and output buffers of the dtype under test.
+    inp = torch.tensor([host_inp], dtype=tensor_dt, device="cuda")
+    out = torch.tensor([0], dtype=tensor_dt, device="cuda")
+    cl.launch(torch.cuda.current_stream(), (1,), (1,), kernel, (inp, out))
+    got = out[0].item()
+    assert got == want
+
+
+def test_math_abs_unsigned_int():
+    """abs() of an unsigned value is the identity, so no math.abs* op is emitted."""
+
+    @cl.kernel
+    def kernel():
+        device_math.abs(cl.uint32(5.0))
+
+    # Compile only (no launch) and inspect the generated MLIR text.
+    compiled = compile_simt(kernel, [KernelSignature([])])
+    assert "math.abs" not in compiled.mlir
+
+
+def test_vector():
+    """floor() applied to a 4-element float32 vector rounds each lane down.
+
+    The kernel fills a local array with 0.5, 1.5, 2.5, 3.5, loads it as one
+    vector, applies ``device_math.floor`` and stores the result to ``out``.
+    """
+
+    @cl.kernel
+    def kernel(out):
+        with cl.local_array(4, cl.float32) as arr:
+            arr[0] = 0.5
+            arr[1] = 1.5
+            arr[2] = 2.5
+            arr[3] = 3.5
+            v = arr.get_base_pointer().load(count=4)
+            v = device_math.floor(v)
+            out.get_base_pointer().store(v)
+
+    out = torch.zeros(4, dtype=torch.float32, device="cuda")
+    cl.launch(torch.cuda.current_stream(), (1,), (1,), kernel, (out,))
+    # Compare tensors directly (dtype is checked too) instead of printing and
+    # comparing Python lists.
+    expected = torch.tensor([0.0, 1.0, 2.0, 3.0], dtype=torch.float32)
+    torch.testing.assert_close(out.cpu(), expected)
+
+
+def test_type_error():
+    """A float-only op applied to an integer operand must raise TileTypeError."""
+
+    @cl.kernel
+    def kernel():
+        device_math.sin(cl.int32(5.0))
+
+    # `match` is applied as a regex search against the exception message.
+    expected_message = "Expected a scalar or vector float type, but got int32"
+    with pytest.raises(TileTypeError, match=expected_message):
+        cl.launch(torch.cuda.current_stream(), (1,), (1,), kernel, ())