NVIDIA
diff --git a/‎cext/tile_kernel.cpp‎
Lines changed: 147 additions & 88 deletions b/‎cext/tile_kernel.cpp‎
Lines changed: 147 additions & 88 deletions
diff --git a/‎changelog.d/int64-annotations.md‎
Lines changed: 8 additions & 0 deletions b/‎changelog.d/int64-annotations.md‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎src/cuda/tile/__init__.py‎
Lines changed: 8 additions & 0 deletions b/‎src/cuda/tile/__init__.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎src/cuda/tile/_annotated_function.py‎
Lines changed: 33 additions & 2 deletions b/‎src/cuda/tile/_annotated_function.py‎
Lines changed: 33 additions & 2 deletions
diff --git a/‎src/cuda/tile/_compile.py‎
Lines changed: 2 additions & 5 deletions b/‎src/cuda/tile/_compile.py‎
Lines changed: 2 additions & 5 deletions
diff --git a/‎src/cuda/tile/_execution.py‎
Lines changed: 2 additions & 1 deletion b/‎src/cuda/tile/_execution.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/cuda/tile/_ir/load_store_impl.py‎
Lines changed: 0 additions & 126 deletions b/‎src/cuda/tile/_ir/load_store_impl.py‎
Lines changed: 0 additions & 126 deletions
diff --git a/‎src/cuda/tile/_ir/ops.py‎
Lines changed: 30 additions & 5 deletions b/‎src/cuda/tile/_ir/ops.py‎
Lines changed: 30 additions & 5 deletions
diff --git a/‎src/cuda/tile/_ir2bytecode.py‎
Lines changed: 4 additions & 1 deletion b/‎src/cuda/tile/_ir2bytecode.py‎
Lines changed: 4 additions & 1 deletion
@@ -0,0 +1,8 @@
+<!--- SPDX-FileCopyrightText: Copyright (c) <2026> NVIDIA CORPORATION & AFFILIATES. All rights reserved. -->
+<!--- SPDX-License-Identifier: Apache-2.0 -->
+
+- New `ct.IndexedWithInt64` annotation for array kernel parameters whose shape
+  or stride values exceed the range of a 32-bit integer. Arrays without the
+  annotation continue to use `int32` for shape and stride.
+- New `ct.ScalarInt64` annotation that forces a scalar integer kernel parameter
+  to be inferred as `int64` instead of the default `int32`.
@@ -55,9 +55,13 @@
 
 from cuda.tile._stub import (
     Array,
+    ArrayAnnotation,
     Constant,
     ConstantAnnotation,
+    IndexedWithInt64,
+    ListAnnotation,
     Scalar,
+    ScalarInt64,
     Tile,
     TiledView,
 
@@ -204,9 +208,13 @@
     "TileValueError",
 
     "Array",
+    "ArrayAnnotation",
     "Constant",
     "ConstantAnnotation",
+    "IndexedWithInt64",
+    "ListAnnotation",
     "Scalar",
+    "ScalarInt64",
     "Tile",
     "TiledView",
 
 
@@ -7,27 +7,58 @@
 from types import FunctionType
 from typing import (get_origin, get_args, Annotated, Any, Sequence)
 
-from cuda.tile._stub import ConstantAnnotation
+from cuda.tile._stub import ConstantAnnotation, ArrayAnnotation, ScalarAnnotation, ListAnnotation
+from cuda.tile._datatype import int64
 
 
 @dataclass
 class AnnotatedFunction:
     pyfunc: FunctionType
     pysig: inspect.Signature
     constant_parameter_mask: Sequence[bool]
+    # The two masks below are filled in by get_annotated_function with one boolean
+    # per parameter of the signature. Array index dtype and scalar integer dtype
+    # can only be int64 or int32 for now, so a flag per parameter is sufficient.
+    # True where the parameter annotation carries an ArrayAnnotation (directly, or
+    # as the element of a ListAnnotation) whose index_dtype is int64.
+    int64_index_parameter_mask: Sequence[bool]
+    # True where the parameter annotation carries a ScalarAnnotation whose dtype
+    # is int64.
+    int64_parameter_mask: Sequence[bool]
 
 
 def get_annotated_function(pyfunc: FunctionType) -> AnnotatedFunction:
+    """Inspect the signature of `pyfunc` and bundle it with per-parameter annotation masks.
+
+    Each mask is a tuple with one boolean per parameter of the signature, telling
+    whether that parameter's annotation carries a constant annotation, an int64
+    array-index annotation, or an int64 scalar annotation, respectively.
+    """
     sig = inspect.signature(pyfunc)
     constant_parameter_mask = tuple(_has_constant_annotation(param.annotation)
                                     for param in sig.parameters.values())
+    int64_index_parameter_mask = tuple(_has_int64_index_annotation(param.annotation)
+                                       for param in sig.parameters.values())
+    int64_parameter_mask = tuple(_has_int64_annotation(param.annotation)
+                                 for param in sig.parameters.values())
     return AnnotatedFunction(pyfunc=pyfunc,
                              pysig=sig,
-                             constant_parameter_mask=constant_parameter_mask)
+                             constant_parameter_mask=constant_parameter_mask,
+                             int64_index_parameter_mask=int64_index_parameter_mask,
+                             int64_parameter_mask=int64_parameter_mask)
 
 
 def _has_constant_annotation(annotation: Any) -> bool:
+    """Return True if `annotation` is an `Annotated[...]` with a `ConstantAnnotation` in its
+    metadata; any annotation that is not `Annotated[...]` yields False."""
     if get_origin(annotation) is Annotated:
         _, *metadata = get_args(annotation)
         return any(isinstance(m, ConstantAnnotation) for m in metadata)
     return False
+
+
+def _has_int64_index_annotation(annotation: Any) -> bool:
+    """Tell whether `annotation` asks for int64 array indexing.
+
+    Returns True when `annotation` is an `Annotated[...]` whose metadata contains
+    either an `ArrayAnnotation` with `index_dtype is int64`, or a `ListAnnotation`
+    whose `element` is such an `ArrayAnnotation`. Returns False otherwise,
+    including for annotations that are not `Annotated[...]`.
+    """
+    if get_origin(annotation) is not Annotated:
+        return False
+
+    def requests_int64(candidate: Any) -> bool:
+        return isinstance(candidate, ArrayAnnotation) and candidate.index_dtype is int64
+
+    _, *extras = get_args(annotation)
+    # Each metadata entry is tested directly first, then through its list element.
+    return any(
+        requests_int64(item)
+        or (isinstance(item, ListAnnotation) and requests_int64(item.element))
+        for item in extras
+    )
+
+
+def _has_int64_annotation(annotation: Any) -> bool:
+    """Tell whether `annotation` carries a `ScalarAnnotation` whose dtype is int64.
+
+    Only `Annotated[...]` annotations are inspected; anything else yields False.
+    """
+    if get_origin(annotation) is not Annotated:
+        return False
+    _, *extras = get_args(annotation)
+    for item in extras:
+        if isinstance(item, ScalarAnnotation) and item.dtype is int64:
+            return True
+    return False
@@ -23,7 +23,6 @@
 from typing import Optional, Sequence
 import zipfile
 
-from cuda.tile import int32
 from cuda.tile._annotated_function import AnnotatedFunction, get_annotated_function
 from cuda.tile._bytecode.version import BytecodeVersion
 from cuda.tile._cext import get_compute_capability, TileContext, default_tile_context
@@ -157,9 +156,6 @@ def _create_kernel_parameters(parameter_constraints: Sequence[ParameterConstrain
 
 
 def _get_array_ty(param: ArrayConstraint):
-    if param.index_dtype != int32:
-        raise NotImplementedError("Only int32 is currently supported as array index type")
-
     for static_stride, bound in zip(param.stride_constant, param.stride_lower_bound_incl,
                                     strict=True):
         if static_stride is not None:
@@ -170,7 +166,8 @@ def _get_array_ty(param: ArrayConstraint):
 
     return ArrayTy(make_tile_ty(param.dtype, ()),
                    shape=(None,) * param.ndim,
-                   strides=param.stride_constant)
+                   strides=param.stride_constant,
+                   index_dtype=param.index_dtype)
 
 
 def _log_mlir(bytecode_buf):
 
@@ -105,7 +105,8 @@ def __init__(self,
             occupancy=occupancy,
             opt_level=opt_level
         )
-        super().__init__(ann_func.constant_parameter_mask)
+        super().__init__(ann_func.constant_parameter_mask, ann_func.int64_index_parameter_mask,
+                         ann_func.int64_parameter_mask)
         self._annotated_function = ann_func
         self._compiler_options = compiler_options
 
 
@@ -1326,12 +1326,20 @@ def generate_bytecode(self, ctx: BytecodeContext):
         )
 
         # Cast each of the i64 words to appropriate types
+        if list_ty.item_type.index_dtype.bitwidth >= 64:
+            # Already i64, no truncation needed
+            shape_stride_results = list(extracted_words[1:])
+        else:
+            shape_stride_results = [
+                bc.encode_TruncIOp(ctx.builder, ty_id, w, bc.IntegerOverflow.NONE)
+                for ty_id, w in zip(item_typeid_tuple[1:], extracted_words[1:], strict=True)
+            ]
+
         return (
             # Cast the first word to data pointer
             bc.encode_IntToPtrOp(ctx.builder, item_typeid_tuple[0], extracted_words[0]),
-            # Cast the remaining words to i32 shape/strides
-            *(bc.encode_TruncIOp(ctx.builder, ty, w, bc.IntegerOverflow.NONE)
-              for ty, w in zip(item_typeid_tuple[1:], extracted_words[1:], strict=True))
+            # Cast the remaining words to shape/stride types (i32 or i64)
+            *shape_stride_results
         )
 
 
@@ -2260,6 +2268,7 @@ def maybe_const_int(v: Var):
         array_ty.element_type,
         shape=new_shape_ty,
         strides=array_ty.strides,
+        index_dtype=array_ty.index_dtype,
     )
 
     array_val = array.get_aggregate()
@@ -2334,12 +2343,15 @@ class TileLoad(Operation, opcode="tile_load", memory_effect=MemoryEffect.LOAD):
     @override
     def generate_bytecode(self, ctx: BytecodeContext) -> tuple[bc.Value, bc.Value]:
         tile_type: TileTy = self.result_vars[0].get_type()
+        view_ty = self.view.get_type()
+        keep_i64 = (isinstance(view_ty, PartitionViewTy)
+                    and view_ty.array_ty.index_dtype.bitwidth > 32)
         res, res_token = bc.encode_LoadViewTkoOp(
             ctx.builder,
             tile_type=typeid(ctx.type_table, tile_type),
             result_token_type=ctx.type_table.Token,
             view=ctx.get_value(self.view),
-            index=ctx.index_tuple(self.index),
+            index=ctx.index_tuple(self.index, keep_i64=keep_i64),
             token=None if self.token is None else ctx.get_value(self.token),
             memory_ordering_semantics=memory_order_to_bytecode[self.memory_order],
             memory_scope=memory_scope_to_bytecode[self.memory_scope],
@@ -2359,6 +2371,11 @@ def _tile_load_impl_inner(array: Var, index_items: tuple[Var, ...], shape: Seque
     allow_tma = require_optional_constant_bool(allow_tma)
     _check_load_store_hints(latency, allow_tma)
 
+    # Promote indices to i64 for big arrays so that blockId * tileSize
+    # doesn't overflow i32 in the backend's address computation.
+    if array_ty.index_dtype.bitwidth > 32:
+        index_items = tuple(astype(idx, array_ty.index_dtype) for idx in index_items)
+
     view = make_partition_view(array, broadcasted_shape, order, padding_mode)
     res_ty = make_tile_ty(array_ty.dtype, broadcasted_shape)
     result, _token = add_operation(TileLoad, (res_ty, TokenTy()),
@@ -2482,12 +2499,15 @@ class TileStore(Operation, opcode="tile_store", memory_effect=MemoryEffect.STORE
 
     @override
     def generate_bytecode(self, ctx: BytecodeContext) -> bc.Value:
+        view_ty = self.view.get_type()
+        keep_i64 = (isinstance(view_ty, PartitionViewTy)
+                    and view_ty.array_ty.index_dtype.bitwidth > 32)
         return bc.encode_StoreViewTkoOp(
             ctx.builder,
             result_token_type=ctx.type_table.Token,
             tile=ctx.get_value(self.tile),
             view=ctx.get_value(self.view),
-            index=ctx.index_tuple(self.index),
+            index=ctx.index_tuple(self.index, keep_i64=keep_i64),
             token=None if self.token is None else ctx.get_value(self.token),
             memory_ordering_semantics=memory_order_to_bytecode[self.memory_order],
             memory_scope=memory_scope_to_bytecode[self.memory_scope],
@@ -2517,6 +2537,11 @@ def _tile_store_impl_inner(array: Var, index_items: tuple[Var, ...], tile: Var,
     allow_tma = require_optional_constant_bool(allow_tma)
     _check_load_store_hints(latency, allow_tma)
 
+    # Promote indices to i64 for big arrays so that blockId * tileSize
+    # doesn't overflow i32 in the backend's address computation.
+    if array_ty.index_dtype.bitwidth > 32:
+        index_items = tuple(astype(idx, array_ty.index_dtype) for idx in index_items)
+
     tile = reshape(tile, broadcasted_shape)
     view = make_partition_view(array, broadcasted_shape, order, PaddingMode.UNDETERMINED)
     [_token] = add_operation(TileStore, (TokenTy(),), view=view, index=index_items, tile=tile,
 
@@ -362,10 +362,13 @@ def constant_tuple(self, value, ty: Type) -> Tuple[bc.Value, ...]:
                         for item_ty, item_val in zip(ty.value_types, value, strict=True)), ())
         return self.constant(value, ty),
 
-    def index_tuple(self, index: tuple[Var, ...]) -> Tuple[bc.Value, ...]:
+    def index_tuple(self,
+                    index: tuple[Var, ...], *, keep_i64: bool = False) -> Tuple[bc.Value, ...]:
         i32_tile_ty = self.type_table.tile(self.type_table.I32, ())
         item_types = tuple(x.get_type() for x in index)
         index_values = tuple(self.get_value(x) for x in index)
+        if keep_i64:
+            return index_values
         return tuple(
             bc.encode_TruncIOp(self.builder, i32_tile_ty, v, bc.IntegerOverflow.NONE)
             if (t.dtype if isinstance(t, TileTy) else t).bitwidth > 32 else v
Original file line number	Diff line number	Diff line change
`@@ -105,7 +105,8 @@ def __init__(self,`
`105`	`105`	`occupancy=occupancy,`
`106`	`106`	`opt_level=opt_level`
`107`	`107`	`)`
`108`		`- super().__init__(ann_func.constant_parameter_mask)`
	`108`	`+ super().__init__(ann_func.constant_parameter_mask, ann_func.int64_index_parameter_mask,`
	`109`	`+ ann_func.int64_parameter_mask)`
`109`	`110`	`self._annotated_function = ann_func`
`110`	`111`	`self._compiler_options = compiler_options`
`111`	`112`