Skip to content

Commit fb2bdd0

Browse files
committed
pack_to_bytes and unpack_from_bytes
Signed-off-by: Boyan Li <boyanl@nvidia.com>
1 parent 3f342ff commit fb2bdd0

File tree

7 files changed

+341
-4
lines changed

7 files changed

+341
-4
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
<!--- SPDX-FileCopyrightText: Copyright (c) <2026> NVIDIA CORPORATION & AFFILIATES. All rights reserved. -->
2+
<!--- SPDX-License-Identifier: Apache-2.0 -->
3+
4+
- New `ct.pack_to_bytes()` operation that flattens a tile and reinterprets its
5+
raw bytes as a 1D uint8 tile.
6+
- New `ct.unpack_from_bytes()` operation that reinterprets a 1D uint8 tile as a
7+
1D tile of the target dtype. Inverse of `ct.pack_to_bytes()`.

docs/source/operations.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ Shape & DType
5353
permute
5454
transpose
5555
astype
56+
bitcast
5657

5758

5859
Reduction

src/cuda/tile/_ir/ops.py

Lines changed: 93 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3642,9 +3642,17 @@ def generate_bytecode(self, ctx: BytecodeContext) -> bc.Value:
36423642
def bitcast(x: Var, dtype: DType) -> Var:
    """Reinterpret the bits of tile ``x`` as ``dtype`` without value conversion.

    Args:
        x: a tile-typed IR variable.
        dtype: target element type; must have the same bit width as x's dtype.

    Returns:
        A Var with the same shape and element type ``dtype``; ``x`` itself
        when the dtype is already ``dtype`` (no op is emitted in that case).

    Raises:
        TileTypeError: if either dtype is bool, or the bit widths differ.
    """
    tile_ty = require_tile_type(x)
    x_dtype = tile_ty.dtype
    # bool is rejected in both directions (checked before the bitwidth
    # comparison so bool->bool also raises, as the tests expect).
    if x_dtype == datatype.bool_ or dtype == datatype.bool_:
        raise TileTypeError(f"Cannot bitcast from {x_dtype} to {dtype}: "
                            f"bitcast to or from bool is not supported")

    if x_dtype.bitwidth != dtype.bitwidth:
        raise TileTypeError(f"Cannot bitcast from {x_dtype} to {dtype}: "
                            f"bit width is different ({x_dtype.bitwidth} vs. {dtype.bitwidth})")

    # Identity cast: skip emitting a TileBitCast operation entirely.
    if x_dtype == dtype:
        return x

    res_ty = make_tile_ty(dtype, tile_ty.shape_value)
    return add_operation(TileBitCast, res_ty, x=x)
36503658

@@ -3655,6 +3663,90 @@ def bitcast_impl(x: Var, dtype: Var) -> Var:
36553663
return bitcast(x, dtype_val)
36563664

36573665

3666+
@dataclass(eq=False)
class TilePack(Operation, opcode="tile_pack"):
    """IR operation reinterpreting a 1D tile as its raw bytes (uint8 tile)."""

    # Input tile; shape/dtype preconditions are enforced by pack() before
    # this operation is constructed.
    x: Var = operand()

    @override
    def generate_bytecode(self, ctx: BytecodeContext) -> bc.Value:
        # Encode as a PackOp carrying the result type id and the input value.
        res_type_id = ctx.typeid_of(self.result_var)
        x_value = ctx.get_value(self.x)
        return bc.encode_PackOp(ctx.builder, res_type_id, x_value)
3675+
3676+
3677+
def pack(x: Var) -> Var:
    """Lower a 1D non-byte-sized tile to a 1D uint8 tile of its raw bytes.

    Raises:
        TileTypeError: if the tile's total bit count is not byte-aligned.
    """
    tile_ty = require_tile_type(x)
    # Internal invariants guaranteed by pack_to_bytes_impl: already
    # flattened, and 8-bit dtypes are handled via bitcast instead.
    assert tile_ty.ndim == 1
    assert tile_ty.dtype.bitwidth != 8

    n_elems = tile_ty.shape_value[0]
    n_bytes, leftover_bits = divmod(n_elems * tile_ty.dtype.bitwidth, 8)
    if leftover_bits != 0:
        raise TileTypeError(f"Cannot pack tile {tile_ty}: "
                            f"total bits ({n_elems} * {tile_ty.dtype.bitwidth}) "
                            f"not divisible by 8")
    return add_operation(TilePack, make_tile_ty(datatype.uint8, (n_bytes,)), x=x)
3689+
3690+
3691+
@impl(ct.pack_to_bytes, min_version=BytecodeVersion.V_13_3)
def pack_to_bytes_impl(x: Var):
    """Implementation of ct.pack_to_bytes: flatten, then reinterpret as uint8.

    Raises:
        TileTypeError: for bool input tiles (unsupported).
    """
    tile_ty = require_tile_type(x)
    x_dtype = tile_ty.dtype
    # Validate the dtype *before* emitting the flattening reshape so an
    # unsupported input does not leave a dead reshape op in the IR.
    if x_dtype == datatype.bool_:
        raise TileTypeError(f"pack_to_bytes from a {x_dtype} tile is not supported")

    x = reshape(x, (-1,))
    # Byte-sized element types need only a bitcast (a no-op for uint8 itself,
    # thanks to bitcast's identity shortcut).
    if x_dtype.bitwidth == 8:
        return bitcast(x, datatype.uint8)
    return pack(x)
3702+
3703+
3704+
@dataclass(eq=False)
class TileUnpack(Operation, opcode="tile_unpack"):
    """IR operation reinterpreting a 1D uint8 byte tile as another dtype."""

    # Input byte tile; preconditions are enforced by unpack() before this
    # operation is constructed.
    x: Var = operand()

    @override
    def generate_bytecode(self, ctx: BytecodeContext) -> bc.Value:
        # Encode as an UnpackOp carrying the result type id and the input value.
        res_type_id = ctx.typeid_of(self.result_var)
        x_value = ctx.get_value(self.x)
        return bc.encode_UnpackOp(ctx.builder, res_type_id, x_value)
3713+
3714+
3715+
def unpack(x: Var, dtype: DType) -> Var:
    """Lower a 1D uint8 byte tile to a 1D tile of non-byte-sized ``dtype``.

    Raises:
        TileTypeError: if the byte count does not divide into whole elements.
    """
    tile_ty = require_tile_type(x)
    # Internal invariants guaranteed by unpack_from_bytes_impl: 1D uint8
    # input, and 8-bit targets are handled via bitcast instead.
    assert tile_ty.ndim == 1
    assert tile_ty.dtype == datatype.uint8
    assert dtype.bitwidth != 8

    n_bytes = tile_ty.shape_value[0]
    n_elems, leftover_bits = divmod(n_bytes * 8, dtype.bitwidth)
    if leftover_bits != 0:
        raise TileTypeError(
            f"Cannot unpack tile {tile_ty} to {dtype}: "
            f"total bits ({n_bytes} * 8) not divisible by {dtype.bitwidth}")
    return add_operation(TileUnpack, make_tile_ty(dtype, (n_elems,)), x=x)
3728+
3729+
3730+
@impl(ct.unpack_from_bytes, min_version=BytecodeVersion.V_13_3)
def unpack_from_bytes_impl(x: Var, dtype: Var):
    """Implementation of ct.unpack_from_bytes: reinterpret uint8 bytes as dtype.

    Raises:
        TileTypeError: for non-1D input, non-uint8 input, or bool target.
    """
    tile_ty = require_tile_type(x)
    src_dtype = tile_ty.dtype
    dst_dtype = require_dtype_spec(dtype)

    # Validation order is part of the observable behavior: rank first, then
    # input dtype, then target dtype.
    if tile_ty.ndim != 1:
        raise TileTypeError(
            f"unpack_from_bytes requires a 1D tile, "
            f"got {tile_ty.ndim}D tile with shape {tile_ty.shape_value}")
    if src_dtype != datatype.uint8:
        raise TileTypeError(
            f"unpack_from_bytes requires uint8 tile, got {src_dtype} tile")
    if dst_dtype == datatype.bool_:
        raise TileTypeError(f"unpack_from_bytes to a {dst_dtype} tile is not supported")

    # Byte-sized targets need only a bitcast (identity for uint8 itself).
    if dst_dtype.bitwidth == 8:
        return bitcast(x, dst_dtype)
    return unpack(x, dst_dtype)
3748+
3749+
36583750
@dataclass(eq=False)
36593751
class TileArange(Operation, opcode="tile_arange"):
36603752
@override

src/cuda/tile/_stub.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1232,6 +1232,55 @@ def bitcast(x, /, dtype) -> Tile:
12321232
"""
12331233

12341234

1235+
@function
def pack_to_bytes(x, /) -> Tile:
    """Flattens a tile and reinterprets its raw bytes as uint8 elements.

    The total number of bits of the input tile must be divisible by 8.

    Args:
        x (Tile): input tile.

    Returns:
        Tile: a 1D uint8 tile with ``total_elements * element_bit_width // 8``
        elements.

    Examples:

        >>> tx = ct.full((2, 4), 0, dtype=ct.int32)
        >>> ty = ct.pack_to_bytes(tx)
        >>> ty.dtype
        uint8
        >>> ty.shape
        (32,)
    """
1256+
1257+
1258+
@function
def unpack_from_bytes(x, /, dtype) -> Tile:
    """Reinterprets a 1D uint8 byte tile as a 1D tile of the target data type.

    The inverse of :py:func:`pack_to_bytes`. The input must be a 1D tile of
    dtype uint8, and its total number of bits must be divisible by the
    target data type's bit width.

    Args:
        x (Tile): a 1D tile of dtype uint8.
        dtype (DType): target data type.

    Returns:
        Tile: a 1D tile of ``dtype`` with ``num_bytes * 8 // element_bit_width``
        elements.

    Examples:

        >>> tx = ct.full((16,), 0, dtype=ct.uint8)
        >>> ty = ct.unpack_from_bytes(tx, ct.float32)
        >>> ty.dtype
        float32
        >>> ty.shape
        (4,)
    """
1282+
1283+
12351284
def _math_op_extra_block(f, indent):
12361285
base = inspect.unwrap(f)
12371286
sig = inspect.signature(base)

test/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def shape_size_id(shape):
# Dtype families shared across the test suite for parametrization.
float_dtypes = [torch.float16, torch.bfloat16, torch.float32]
int_dtypes = [torch.int32, torch.int64, torch.int16, torch.int8]
bool_dtypes = [torch.bool]
# uint8 included so byte-level ops (pack/unpack, bitcast) are covered.
uint_dtypes = [torch.uint8, torch.uint32, torch.uint64]
arithmetic_dtypes = int_dtypes + uint_dtypes + float_dtypes + bool_dtypes
111111

112112

test/test_cast.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,17 +95,24 @@ def test_cast_tf32(dtype):
9595
(torch.int32, torch.int64),
9696
(torch.int64, torch.float32),
9797
(torch.float16, torch.int32),
98+
# failing pairs with bool
99+
(torch.bool, torch.int8),
100+
(torch.uint8, torch.bool),
101+
(torch.bool, torch.bool),
98102
])
99103
def test_array_bitcast(shape, tile, dtype_x, dtype_y):
100104
# avoid inputs that could produce nans of infs to not break assert
101-
if dtype_x in (torch.int32, torch.int64):
105+
if dtype_x == torch.bool:
106+
x = torch.randint(0, 2, shape, dtype=dtype_x, device='cuda')
107+
elif dtype_x in (torch.int32, torch.int64, torch.int8, torch.uint8):
102108
x = torch.randint(0, 100, shape, dtype=dtype_x, device='cuda')
103109
else:
104110
x = torch.randn(shape, dtype=dtype_x, device='cuda')
105111
ref = x.view(dtype=dtype_y)
106112
y = torch.zeros_like(ref)
107113
grid = (ceil(shape[0] / tile), 1, 1)
108-
if dtype_x.itemsize != dtype_y.itemsize:
114+
if (dtype_x == torch.bool or dtype_y == torch.bool
115+
or dtype_x.itemsize != dtype_y.itemsize):
109116
with pytest.raises(TileTypeError):
110117
ct.launch(torch.cuda.current_stream(), grid, array_bitcast, (x, y, tile))
111118

test/test_pack_unpack.py

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
# SPDX-FileCopyrightText: Copyright (c) <2025> NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import pytest
6+
import torch
7+
from torch.testing import make_tensor
8+
9+
import cuda.tile as ct
10+
from cuda.tile._bytecode.version import BytecodeVersion
11+
from util import assert_equal
12+
from cuda.tile._exception import TileTypeError
13+
from conftest import float_dtypes, int_dtypes, requires_tileiras, uint_dtypes, dtype_id
14+
15+
# TODO: remove when feature is out of development only
# Development-only: the ops are not yet exported on the ct namespace, so the
# stubs are attached here for the kernels below to reference.
from cuda.tile._stub import pack_to_bytes, unpack_from_bytes
ct.pack_to_bytes = pack_to_bytes
ct.unpack_from_bytes = unpack_from_bytes

# NOTE(review): presumably gates the whole module on bytecode >= 13.3
# support — confirm requires_tileiras semantics in conftest.
pytestmark = requires_tileiras(BytecodeVersion.V_13_3)

# All dtypes exercised here, plus float64 on top of the shared groups.
test_dtypes = float_dtypes + int_dtypes + uint_dtypes + [torch.float64]
23+
24+
25+
@ct.kernel
def pack_unpack_1d(x, y, TILE: ct.Constant[int]):
    # Round-trip kernel: load TILE elements from x, pack to raw bytes,
    # reinterpret those bytes as y's dtype, and store into y.
    tx = ct.load(x, index=(0,), shape=(TILE,))
    packed = ct.pack_to_bytes(tx)
    ty = ct.unpack_from_bytes(packed, y.dtype)
    ct.store(y, index=(0,), tile=ty)
31+
32+
33+
@pytest.mark.parametrize("dtype", test_dtypes, ids=dtype_id)
def test_pack_to_bytes(dtype):
    """pack_to_bytes must match torch's byte-level view of the same data."""
    @ct.kernel
    def kernel(x, y, TILE: ct.Constant[int]):
        tx = ct.load(x, index=(0,), shape=(TILE,))
        ty = ct.pack_to_bytes(tx)
        ct.store(y, index=(0,), tile=ty)

    tile = 128
    x = make_tensor((tile,), dtype=dtype, device='cuda')
    # Output holds one uint8 per byte of the input tile.
    nbytes = tile * x.element_size()
    y = torch.zeros(nbytes, dtype=torch.uint8, device='cuda')
    ct.launch(torch.cuda.current_stream(), (1,), kernel, (x, y, tile))
    ref = x.view(torch.uint8)
    assert_equal(y, ref)


@pytest.mark.parametrize("dtype", test_dtypes, ids=dtype_id)
def test_unpack_from_bytes(dtype):
    """unpack_from_bytes must invert torch's byte-level view of the data."""
    @ct.kernel
    def kernel(x, y, TILE: ct.Constant[int]):
        tx = ct.load(x, index=(0,), shape=(TILE,))
        ty = ct.unpack_from_bytes(tx, y.dtype)
        ct.store(y, index=(0,), tile=ty)

    # Build the byte input from a reference tensor so the expected result
    # is the reference itself.
    ref = make_tensor((32,), dtype=dtype, device='cuda')
    x = ref.view(torch.uint8)
    y = torch.zeros_like(ref)
    tile = x.shape[0]
    ct.launch(torch.cuda.current_stream(), (1,), kernel, (x, y, tile))
    assert_equal(y, ref)
64+
65+
66+
@pytest.mark.parametrize("dtype", test_dtypes, ids=dtype_id)
def test_pack_unpack_roundtrip(dtype):
    """pack_to_bytes followed by unpack_from_bytes reproduces the input."""
    n = 128
    src = make_tensor((n,), dtype=dtype, device='cuda')
    dst = torch.zeros_like(src)
    ct.launch(torch.cuda.current_stream(), (1,), pack_unpack_1d, (src, dst, n))
    assert_equal(dst, src)
73+
74+
75+
@pytest.mark.parametrize("dtype", test_dtypes, ids=dtype_id)
def test_pack_unpack_roundtrip_0d(dtype):
    """Round-trip of a single-element (0D) tile via gather/scatter."""
    @ct.kernel
    def kernel(x, y):
        tx = ct.gather(x, ())
        packed = ct.pack_to_bytes(tx)
        ty = ct.unpack_from_bytes(packed, x.dtype)
        # unpack always yields a 1D tile; restore the scalar shape to store.
        ty = ty.reshape(())
        ct.scatter(y, (), ty)

    x = make_tensor((), dtype=dtype, device='cuda')
    y = torch.zeros_like(x)
    ct.launch(torch.cuda.current_stream(), (1,), kernel, (x, y))
    assert_equal(y, x)


@pytest.mark.parametrize("dtype", test_dtypes, ids=dtype_id)
def test_pack_unpack_roundtrip_2d(dtype):
    """Round-trip of 2D tiles: pack flattens, so reshape back after unpack."""
    @ct.kernel
    def kernel(x, y, TILE_M: ct.Constant[int], TILE_N: ct.Constant[int]):
        bidm = ct.bid(0)
        bidn = ct.bid(1)
        tx = ct.load(x, index=(bidm, bidn), shape=(TILE_M, TILE_N))
        packed = ct.pack_to_bytes(tx)
        ty = ct.unpack_from_bytes(packed, x.dtype)
        ty = ct.reshape(ty, (TILE_M, TILE_N))
        ct.store(y, index=(bidm, bidn), tile=ty)

    shape = (64, 128)
    tiles = (32, 64)
    x = make_tensor(shape, dtype=dtype, device='cuda')
    y = torch.zeros_like(x)
    # One block per tile in each dimension.
    grid = (ct.cdiv(shape[0], tiles[0]), ct.cdiv(shape[1], tiles[1]))
    ct.launch(torch.cuda.current_stream(), grid,
              kernel, (x, y, tiles[0], tiles[1]))
    assert_equal(y, x)
111+
112+
113+
@pytest.mark.parametrize("dtype_x", test_dtypes, ids=dtype_id)
@pytest.mark.parametrize("dtype_y", test_dtypes, ids=dtype_id)
def test_cross_type_pack_unpack(dtype_x, dtype_y):
    """Pack from dtype_x, unpack into dtype_y; compare with torch's view."""
    tile = 128
    x = make_tensor((tile,), dtype=dtype_x, device='cuda')
    # 128 elements of any tested dtype give a byte count divisible by every
    # tested element size, so the view below is always valid.
    ref = x.view(torch.uint8).view(dtype_y)
    y = torch.zeros_like(ref)
    ct.launch(torch.cuda.current_stream(), (1,), pack_unpack_1d, (x, y, tile))
    assert_equal(y, ref)
122+
123+
124+
def test_unpack_from_bytes_not_divisible():
    """2 bytes (16 bits) cannot be unpacked into 32-bit elements."""
    @ct.kernel
    def kernel(x, y):
        tx = ct.load(x, index=(0,), shape=(2,))
        ct.unpack_from_bytes(tx, y.dtype)

    x = torch.ones(2, dtype=torch.uint8, device='cuda')
    y = torch.zeros(1, dtype=torch.int32, device='cuda')
    with pytest.raises(TileTypeError, match="not divisible by 32"):
        ct.launch(torch.cuda.current_stream(), (1,), kernel, (x, y))


def test_unpack_from_bytes_wrong_input_dtype():
    """The input tile must be uint8; int32 input is rejected."""
    @ct.kernel
    def kernel(x, y):
        tx = ct.load(x, index=(0,), shape=(4,))
        ct.unpack_from_bytes(tx, y.dtype)

    x = torch.ones(4, dtype=torch.int32, device='cuda')
    y = torch.zeros(4, dtype=torch.int32, device='cuda')
    with pytest.raises(TileTypeError, match="unpack_from_bytes requires uint8 tile"):
        ct.launch(torch.cuda.current_stream(), (1,), kernel, (x, y))


def test_unpack_from_bytes_not_1d():
    """The input tile must be 1D; a 2D tile is rejected."""
    @ct.kernel
    def kernel(x, y):
        tx = ct.load(x, index=(0, 0), shape=(4, 4))
        ct.unpack_from_bytes(tx, y.dtype)

    x = torch.ones((4, 4), dtype=torch.uint8, device='cuda')
    y = torch.zeros(4, dtype=torch.int32, device='cuda')
    with pytest.raises(TileTypeError, match="unpack_from_bytes requires a 1D tile"):
        ct.launch(torch.cuda.current_stream(), (1,), kernel, (x, y))
158+
159+
160+
def test_pack_to_bytes_bool():
    """Packing a bool tile is unsupported and must raise."""
    @ct.kernel
    def kernel(x, y, TILE: ct.Constant[int]):
        tx = ct.load(x, index=(0,), shape=(TILE,))
        ct.pack_to_bytes(tx)

    x = torch.ones(4, dtype=torch.bool, device='cuda')
    y = torch.zeros(4, dtype=torch.uint8, device='cuda')
    with pytest.raises(TileTypeError, match="pack_to_bytes from a bool_ tile"):
        ct.launch(torch.cuda.current_stream(), (1,), kernel, (x, y, 4))


def test_unpack_from_bytes_bool():
    """Unpacking into a bool tile is unsupported and must raise."""
    @ct.kernel
    def kernel(x, y):
        tx = ct.load(x, index=(0,), shape=(4,))
        ct.unpack_from_bytes(tx, y.dtype)

    x = torch.ones(4, dtype=torch.uint8, device='cuda')
    y = torch.zeros(4, dtype=torch.bool, device='cuda')
    with pytest.raises(TileTypeError, match="unpack_from_bytes to a bool_ tile"):
        ct.launch(torch.cuda.current_stream(), (1,), kernel, (x, y))

0 commit comments

Comments
 (0)