Properly catch restricted dtype in simple scan and reduce op

haijieg · haijieg · commit f20e5cd5a4e5 · 2026-03-24T10:01:53.000-07:00
Signed-off-by: Jay Gu <jagu@nvidia.com>
diff --git a/changelog.d/restricted-dtype.md b/changelog.d/restricted-dtype.md
@@ -0,0 +1 @@
+- Fix a bug where restricted float dtype with simple reduce and scan did not raise proper TileTypeError
diff --git a/src/cuda/tile/_datatype.py b/src/cuda/tile/_datatype.py
@@ -278,6 +278,10 @@ def get_signedness(t: DType) -> bc.Signedness:
 
 
 def is_float(t: DType) -> bool:
+    return t in NumericDTypeCategories.Float or t in NumericDTypeCategories.RestrictedFloat
+
+
+def is_unrestricted_float(t: DType) -> bool:
     return t in NumericDTypeCategories.Float
 
 
diff --git a/src/cuda/tile/_ir/op_impl.py b/src/cuda/tile/_ir/op_impl.py
@@ -10,7 +10,7 @@
 from typing import Optional, NamedTuple, Tuple, Sequence, Any, Union, Callable
 
 from cuda.tile._datatype import (
-        is_integral, is_float, is_restricted_float,
+        is_integral, is_float,
         is_boolean, is_signed, DType)
 from cuda.tile._bytecode.version import BytecodeVersion
 from cuda.tile._exception import TileTypeError, TileUnsupportedFeatureError
@@ -430,7 +430,7 @@ class PrintfValidator:
     def infer_format(cls, dtype: DType) -> str:
         if is_boolean(dtype) or is_integral(dtype):
             return '%d'
-        elif is_float(dtype) or is_restricted_float(dtype):
+        elif is_float(dtype):
             return '%f'
         else:
             raise TileTypeError(f"print(): cannot infer format for dtype {dtype}")
@@ -439,7 +439,7 @@ def infer_format(cls, dtype: DType) -> str:
     def validate_dtype(cls, dtype: DType, specifier: str) -> bool:
         if is_boolean(dtype) or is_integral(dtype):
             return specifier in cls.int_specifiers
-        elif is_float(dtype) or is_restricted_float(dtype):
+        elif is_float(dtype):
             return specifier in cls.float_specifiers
         else:
             return False
diff --git a/src/cuda/tile/_ir/ops.py b/src/cuda/tile/_ir/ops.py
@@ -63,7 +63,7 @@
     StringFormat, FormattedPiece,
 )
 from cuda.tile._datatype import (
-    DType, is_integral, is_float, is_signed, is_boolean, is_restricted_float,
+    DType, is_integral, is_float, is_signed, is_boolean,
 )
 from cuda.tile._ir2bytecode import (
     BytecodeContext, typeid,
@@ -1026,16 +1026,20 @@ def binary_arithmetic(fn: str, x: Var, y: Var, rounding_mode: Optional[RoundingM
     x_ty = require_tile_maybe_loose_type(x)
     y_ty = require_tile_maybe_loose_type(y)
 
-    if get_dtype(x_ty) == get_dtype(y_ty) == datatype.bool_:
-        raise TileTypeError(f'Binary arithmetic op `{fn}` does not support bool, '
-                            f'please cast bool to int')
-
     if isinstance(x_ty, LooselyTypedScalar) and isinstance(y_ty, LooselyTypedScalar):
         return _binop_propagate_constant(fn, x_ty.value, y_ty.value, None)
 
     force_float = (fn == "truediv")
     common_ty = promote_types(x_ty, y_ty, force_float=force_float)
 
+    common_dtype = get_dtype(common_ty)
+    if common_dtype == datatype.bool_:
+        raise TileTypeError(f'Binary arithmetic op `{fn}` does not support bool, '
+                            f'please cast bool to int')
+    if datatype.is_restricted_float(common_dtype):
+        raise TileTypeError(
+            f'Binary arithmetic op `{fn}` does not support restricted float dtype {common_dtype}')
+
     x = _promote_and_broadcast_to(x, common_ty)
     y = _promote_and_broadcast_to(y, common_ty)
 
@@ -1386,7 +1390,7 @@ def generate_bytecode(self, ctx: BytecodeContext) -> bc.Value:
         flush_to_zero = self.flush_to_zero
         input_type = ctx.typeof(self.operand)
         input_dtype = get_dtype(input_type)
-        flt = is_float(input_dtype) or is_restricted_float(input_dtype)
+        flt = is_float(input_dtype)
         res_type_id = ctx.typeid_of(self.result_var)
 
         match self.fn, flt:
@@ -1467,7 +1471,7 @@ def unary(fn: str, behavior: _UnaryBehavior, x: Var,
         if behavior.int_handler is None:
             raise TileTypeError("Integer inputs are not supported")
         x = behavior.int_handler(x)
-    elif is_float(input_dtype) or is_restricted_float(input_dtype):
+    elif is_float(input_dtype):
         if behavior.float_handler is None:
             raise TileTypeError("Float inputs are not supported")
         x = behavior.float_handler(x)
@@ -1569,7 +1573,7 @@ def isnan_impl(x: Var) -> Var:
         return loosely_typed_const(res)
 
     ty = x.get_type()
-    if isinstance(x_type, TileTy) and (is_float(ty.dtype) or is_restricted_float(ty.dtype)):
+    if isinstance(x_type, TileTy) and is_float(ty.dtype):
         if x.is_constant():
             res = math.isnan(x.get_constant())
             return strictly_typed_const(res, make_tile_ty(datatype.bool_, ty.shape))
@@ -3240,8 +3244,8 @@ async def reduce_simple(fn: str, x: Var, axis: int | None | tuple[int, ...], kee
 
     async def body(lhs: tuple[Var], rhs: tuple[Var]) -> tuple[Var]:
         [lhs], [rhs] = lhs, rhs
-        ret = raw_binary_arithmetic(fn, lhs, rhs,
-                                    rounding_mode=rounding_mode, flush_to_zero=flush_to_zero)
+        ret = binary_arithmetic(fn, lhs, rhs,
+                                rounding_mode=rounding_mode, flush_to_zero=flush_to_zero)
         return (ret,)
 
     [ret] = await reduce((x,), (id_val,), axis, keepdims, body)
@@ -3453,8 +3457,8 @@ async def scan_simple(fn: str, x: Var, axis: int, reverse: bool,
 
     async def body(lhs: tuple[Var], rhs: tuple[Var]) -> tuple[Var]:
         [lhs], [rhs] = lhs, rhs
-        ret = raw_binary_arithmetic(fn, lhs, rhs,
-                                    rounding_mode=rounding_mode, flush_to_zero=flush_to_zero)
+        ret = binary_arithmetic(fn, lhs, rhs,
+                                rounding_mode=rounding_mode, flush_to_zero=flush_to_zero)
         return (ret,)
 
     [ret] = await raw_scan((x,), (id_val,), axis, reverse, body)
diff --git a/src/cuda/tile/_ir/ops_utils.py b/src/cuda/tile/_ir/ops_utils.py
@@ -161,9 +161,9 @@ def check_rd_and_ftz(fn: str, rounding_mode: Optional[RoundingMode], flush_to_ze
                     f'{fn} rounding_mode={rounding_mode.value} requires tileiras '
                     f'{min_version.major()}.{min_version.minor()} or later. '
                     f'Current version is {cur_version.major()}.{cur_version.minor()}.')
-        if not datatype.is_float(dtype):
+        if not datatype.is_unrestricted_float(dtype):
             raise TileTypeError(
-                f'Rounding mode can only be used for float types, '
+                f'Rounding mode can only be used for unrestricted float types, '
                 f'but got {dtype}')
         if rounding_mode in [RoundingMode.APPROX, RoundingMode.FULL]:
             if dtype != datatype.float32:
diff --git a/src/cuda/tile/_ir2bytecode.py b/src/cuda/tile/_ir2bytecode.py
@@ -81,7 +81,7 @@ def _constant_to_bytes(value: int | float, dtype: DType) -> bytes:
         return b"\xff" if value else b"\x00"
     elif datatype.is_integral(dtype):
         return int(value).to_bytes((dtype.bitwidth + 7) // 8, "little", signed=value < 0)
-    elif datatype.is_float(dtype) or datatype.is_restricted_float(dtype):
+    elif datatype.is_float(dtype):
         # Note that TF32 is stored as 3 bytes despite the "32" in its name.
         # Its float_bit_size() is 19 bits, which is rounded up to 24 bits.
         bits = bc.float_to_bits(value, dtype._bytecode_type)
@@ -94,7 +94,7 @@ def _constant_to_bytes(value: int | float, dtype: DType) -> bytes:
 def _get_type_conversion_encoder(from_dtype: Type, to_dtype: Type):
 
     def kind(t):
-        if datatype.is_float(t) or datatype.is_restricted_float(t):
+        if datatype.is_float(t):
             return 'f'
         if datatype.is_integral(t) or datatype.is_boolean(t):
             return 'si' if datatype.is_signed(t) else 'ui'
diff --git a/src/cuda/tile/_passes/rewrite_patterns.py b/src/cuda/tile/_passes/rewrite_patterns.py
@@ -84,8 +84,8 @@ def match_float_mul(op: RawBinaryArithmeticOperation,
                     ctx: MatchContext) -> RawBinaryArithmeticOperation:
     if op.fn != "mul":
         raise NoMatch("not a mul binop")
-    if not datatype.is_float(get_dtype(ctx.typeof(op.result_var))):
-        raise NoMatch("not a float mul")
+    if not datatype.is_unrestricted_float(get_dtype(ctx.typeof(op.result_var))):
+        raise NoMatch("not an unrestricted float mul")
     return op
 
 
diff --git a/test/test_binary_elementwise.py b/test/test_binary_elementwise.py
@@ -208,7 +208,7 @@ def test_array_core_arithmetic_rounding_mode(
             launch_binary(kernel, x, y, z, tile)
     elif should_raise_dtype:
         with pytest.raises(TileTypeError,
-                           match=r"Rounding mode can only be used for float types"):
+                           match=r"Rounding mode can only be used for unrestricted float types"):
             launch_binary(kernel, x, y, z, tile)
     else:
         bytecode = get_bytecode(kernel, (x, y, z, tile))
diff --git a/test/test_ir_types.py b/test/test_ir_types.py
@@ -17,7 +17,7 @@
     int64, int32, int16, int8,
     uint64, uint32, uint16, uint8, bfloat16,
     tfloat32, float8_e4m3fn, float8_e5m2,
-    is_boolean, is_integral, is_float, is_restricted_float, is_signed,
+    is_boolean, is_integral, is_float, is_unrestricted_float, is_restricted_float, is_signed,
 )
 from cuda.tile._ir.ops_utils import promote_dtypes, check_implicit_cast
 from cuda.tile._ir.typing_support import to_dtype, typeof_pyval
@@ -51,6 +51,9 @@ def test_builtin_types():
     assert is_boolean(bool_)
     assert is_float(bfloat16)
     assert not is_float(uint32)
+    assert is_unrestricted_float(bfloat16)
+    assert not is_unrestricted_float(tfloat32)
+    assert is_float(tfloat32)
     assert is_restricted_float(tfloat32)
     assert is_restricted_float(float8_e4m3fn)
     assert is_restricted_float(float8_e5m2)
diff --git a/test/test_reduction.py b/test/test_reduction.py
@@ -190,6 +190,17 @@ def kernel(x):
         ct.launch(torch.cuda.current_stream(), (1,), kernel, (x,))
 
 
+def test_reduce_sum_restricted_dtype_error():
+    @ct.kernel
+    def kernel(x):
+        tx = ct.load(x, (0,), (16,))
+        ct.sum(tx, axis=0)
+
+    x = torch.rand((16,), dtype=torch.float32, device="cuda").to(torch.float8_e4m3fn)
+    with pytest.raises(TileTypeError, match="does not support restricted float dtype"):
+        ct.launch(torch.cuda.current_stream(), (1,), kernel, (x,))
+
+
 sumprod_cases = [
     pytest.param(ct.sum, torch.sum, id="sum"),
     pytest.param(ct.prod, torch.prod, id="prod"),
diff --git a/test/test_scan.py b/test/test_scan.py
@@ -64,6 +64,17 @@ def test_cumsumb(shape, reverse):
     torch.testing.assert_close(y, ref_result)
 
 
+def test_cumsum_restricted_dtype_error():
+    @ct.kernel
+    def kernel(x):
+        tx = ct.load(x, (0,), (16,))
+        ct.cumsum(tx, axis=0)
+
+    x = torch.rand((16,), dtype=torch.float32, device="cuda").to(torch.float8_e4m3fn)
+    with pytest.raises(TileTypeError, match="does not support restricted float dtype"):
+        ct.launch(torch.cuda.current_stream(), (1,), kernel, (x,))
+
+
 @ct.kernel
 def cumprod_axis0(input, output, reverse: ct.Constant[bool],
                   T: ct.Constant[int], N: ct.Constant[int]):

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+- Fix a bug where restricted float dtype with simple reduce and scan did not raise proper TileTypeError`