Extend floordiv to support float operands

gunnersdeng · gunnersdeng · commit cbda4840416f · 2026-03-16T17:43:30.000-07:00
Signed-off-by: Ziheng Deng <zihengd@nvidia.com>
diff --git a/changelog.d/future/floordiv-float.md b/changelog.d/future/floordiv-float.md
@@ -0,0 +1,4 @@
+<!--- SPDX-FileCopyrightText: Copyright (c) <2026> NVIDIA CORPORATION & AFFILIATES. All rights reserved. -->
+<!--- SPDX-License-Identifier: Apache-2.0 -->
+
+- `ct.floordiv()` and the `//` operator now support floating-point operands.
diff --git a/src/cuda/tile/_ir/ops.py b/src/cuda/tile/_ir/ops.py
@@ -944,6 +944,11 @@ def generate_bytecode(self, ctx: BytecodeContext) -> bc.Value:
                 return bc.encode_DivIOp(ctx.builder, res_typeid, lhs, rhs,
                                         signedness=datatype.get_signedness(dtype),
                                         rounding=bc.RoundingMode.NEGATIVE_INF)
+            case "floordiv", "float":
+                quotient = bc.encode_DivFOp(ctx.builder, res_typeid, lhs, rhs,
+                                            rounding_mode=rounding_mode,
+                                            flush_to_zero=self.flush_to_zero)
+                return bc.encode_FloorOp(ctx.builder, res_typeid, quotient)
             case "cdiv", "int":
                 return bc.encode_DivIOp(ctx.builder, res_typeid, lhs, rhs,
                                         signedness=datatype.get_signedness(dtype),
diff --git a/src/cuda/tile/_stub.py b/src/cuda/tile/_stub.py
@@ -1616,9 +1616,48 @@ def truediv(x, y, /, *, rounding_mode: Optional[RoundingMode] = None,
     pass
 
 
-@_doc_binary_op('//')
 @function
 def floordiv(x, y, /) -> TileOrScalar:
+    """Elementwise floordiv on two tiles.
+
+    Can also use builtin operation ``x // y``.
+
+    Supports both integer and floating-point operands. For float inputs,
+    the result is ``floor(x / y)`` as a float (e.g. ``5.5 // 2.2 == 2.0``).
+
+    Args:
+        x (Tile): LHS tile.
+        y (Tile): RHS tile.
+
+    The ``shape`` of ``x`` and ``y`` will be broadcasted and
+    ``dtype`` promoted to common dtype.
+
+    Returns:
+        Tile:
+
+    Examples:
+
+        >>> # integer tile and tile
+        >>> tx = ct.full((2, 4), 7, dtype=ct.int32)
+        >>> ty = ct.full((2, 4), 3, dtype=ct.int32)
+        >>> tz = ct.floordiv(tx, ty)
+
+        >>> # Can also use the builtin op
+        >>> tz = tx // ty
+
+        >>> # float tile and tile
+        >>> tx = ct.full((2, 4), 5.5, dtype=ct.float32)
+        >>> ty = ct.full((2, 4), 2.2, dtype=ct.float32)
+        >>> tz = tx // ty  # result is ct.float32 with value 2.0
+
+        >>> # tile and scalar
+        >>> tx = ct.full((2, 4), 7, dtype=ct.int32)
+        >>> y = 2
+        >>> tz = tx // y
+
+        >>> # scalar and scalar
+        >>> z = 7 // 2
+    """
     pass
 
 
diff --git a/test/test_binary_elementwise.py b/test/test_binary_elementwise.py
@@ -498,14 +498,16 @@ def test_array_scalar_div(shape, tile, int_dtype, tmp_path, op_symbol, ref_impl,
 @pytest.mark.parametrize("op_symbol, ref_impl", [
     ("/", lambda x, y: x / y),
     ("ct.truediv", lambda x, y: x / y),
+    ("//", lambda x, y: x // y),
+    ("ct.floordiv", lambda x, y: x // y),
     ])
-def test_array_scalar_truediv_float(shape, tile, float_dtype, tmp_path, op_symbol, ref_impl):
+def test_array_scalar_div_float(shape, tile, float_dtype, tmp_path, op_symbol, ref_impl):
     x = make_tensor(shape, dtype=float_dtype, device='cuda')
     y = 23.0
     res_dtype = torch.promote_types(x.dtype, torch.float32)
     ref = ref_impl(x.to(res_dtype), y)
     z = torch.zeros_like(ref)
-    kernel = array_scalar_kernel('truediv',
+    kernel = array_scalar_kernel('div_float',
                                  f'tz = {op_symbol}(tx, y)' if op_symbol.startswith("ct.") else
                                  f'tz = tx {op_symbol} y',
                                  tmp_path)
@@ -540,17 +542,19 @@ def test_array_div(shape, tile, x_dtype, y_dtype, tmp_path, op_symbol, ref_impl,
 @pytest.mark.parametrize("op_symbol, ref_impl", [
     ("/", lambda x, y: x / y),
     ("ct.truediv", lambda x, y: x / y),
+    ("//", lambda x, y: torch.floor(x / y)),
+    ("ct.floordiv", lambda x, y: torch.floor(x / y)),
     ])
 @pytest.mark.parametrize("x_dtype", float_dtypes, ids=dtype_id)
 @pytest.mark.parametrize("y_dtype", float_dtypes, ids=dtype_id)
-def test_array_truediv_float(shape, tile, x_dtype, y_dtype, tmp_path, op_symbol, ref_impl):
+def test_array_div_float(shape, tile, x_dtype, y_dtype, tmp_path, op_symbol, ref_impl):
     should_raise = {x_dtype, y_dtype} == {torch.float16, torch.bfloat16}
     x = (torch.rand(*shape, device="cuda") * 100).to(dtype=x_dtype)
     y = (torch.rand(*shape, device="cuda") * 100 + 1).to(dtype=y_dtype)
     result_type = torch.promote_types(x.dtype, y.dtype)
     z = torch.zeros_like(x).to(result_type)
     ref = ref_impl(x, y).to(result_type)
-    kernel = array_kernel('truediv',
+    kernel = array_kernel('div_float',
                           f"tz = {op_symbol}(tx, ty)" if op_symbol.startswith("ct.") else
                           f"tz = tx {op_symbol} ty",
                           tmp_path)