NVIDIA
diff --git a/‎changelog.d/tileiras-13-2-ops.md‎
Lines changed: 8 additions & 0 deletions b/‎changelog.d/tileiras-13-2-ops.md‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎src/cuda/tile/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎src/cuda/tile/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/cuda/tile/_compile.py‎
Lines changed: 4 additions & 3 deletions b/‎src/cuda/tile/_compile.py‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎src/cuda/tile/_ir/ir.py‎
Lines changed: 4 additions & 3 deletions b/‎src/cuda/tile/_ir/ir.py‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎src/cuda/tile/_ir/op_impl.py‎
Lines changed: 16 additions & 3 deletions b/‎src/cuda/tile/_ir/op_impl.py‎
Lines changed: 16 additions & 3 deletions
diff --git a/‎src/cuda/tile/_ir/ops.py‎
Lines changed: 24 additions & 7 deletions b/‎src/cuda/tile/_ir/ops.py‎
Lines changed: 24 additions & 7 deletions
diff --git a/‎src/cuda/tile/_ir/ops_utils.py‎
Lines changed: 33 additions & 31 deletions b/‎src/cuda/tile/_ir/ops_utils.py‎
Lines changed: 33 additions & 31 deletions
diff --git a/‎src/cuda/tile/_ir2bytecode.py‎
Lines changed: 2 additions & 1 deletion b/‎src/cuda/tile/_ir2bytecode.py‎
Lines changed: 2 additions & 1 deletion
@@ -0,0 +1,8 @@
+<!--- SPDX-FileCopyrightText: Copyright (c) <2026> NVIDIA CORPORATION & AFFILIATES. All rights reserved. -->
+<!--- SPDX-License-Identifier: Apache-2.0 -->
+
+Add support for tileiras 13.2 features:
+- New `ct.atan2(y, x)` operation for computing the quadrant-aware arctangent of `y / x` (same semantics as Python's `math.atan2`)
+- Optional `rounding_mode` parameter for `ct.tanh()` (supports `RoundingMode.FULL`, which remains the default, and `RoundingMode.APPROX`)
+
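+`ct.atan2` and `ct.tanh(..., rounding_mode=RoundingMode.APPROX)` require tileiras 13.2 and raise a `TileUnsupportedFeatureError` with a clear message when used with older versions. `ct.tanh` with the default or `RoundingMode.FULL` rounding mode continues to work with older versions.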
@@ -65,6 +65,7 @@
     argmin,
     assert_,
     astype,
+    atan2,
     atomic_add,
     atomic_and,
     atomic_cas,
@@ -198,6 +199,7 @@
     "argmin",
     "assert_",
     "astype",
+    "atan2",
     "atomic_add",
     "atomic_and",
     "atomic_cas",
 
@@ -79,10 +79,11 @@ def wrapper(*args, **kwargs):
 
 def _get_final_ir(pyfunc,
                   args: Sequence[ir.KernelArgument],
-                  config: TileContextConfig) -> ir.Function:
+                  config: TileContextConfig,
+                  tileiras_version: BytecodeVersion = BytecodeVersion.V_13_1) -> ir.Function:
     func_hir: hir.Function = get_function_hir(pyfunc, entry_point=True)
 
-    ir_ctx = ir.IRContext(config)
+    ir_ctx = ir.IRContext(config, tileiras_version)
     func_body = hir2ir(func_hir, args, ir_ctx)
     eliminate_assign_ops(func_body)
     dead_code_elimination_pass(func_body)
@@ -188,7 +189,7 @@ def compile_tile(pyfunc,
 
     param_names = tuple(inspect.signature(pyfunc).parameters.keys())
     ir_args = _bind_kernel_arguments(param_names, args, get_constant_annotations(pyfunc))
-    func_ir = _get_final_ir(pyfunc, ir_args, context.config)
+    func_ir = _get_final_ir(pyfunc, ir_args, context.config, bytecode_version)
 
     if 'CUTILEIR' in context.config.log_keys:
         code = (f"==== CuTile IR for {func_ir.name}==== \n\n"
 
@@ -22,23 +22,24 @@
     TileTypeError, Loc, TileInternalError
 )
 from .. import TileSyntaxError
-from .._cext import TileContext
 from .._context import TileContextConfig
+from cuda.tile._bytecode.version import BytecodeVersion
 
 if TYPE_CHECKING:
     from cuda.tile._ir2bytecode import BytecodeContext
 
 
 class IRContext:
-    def __init__(self, config: TileContextConfig):
+    def __init__(self, config: TileContextConfig, tileiras_version: BytecodeVersion):
         self._all_vars: Dict[str, str] = {}
         self._counter_by_name: Dict[str, Iterator[int]] = defaultdict(itertools.count)
         self._temp_counter = itertools.count()
         self.typemap: Dict[str, Type] = dict()
         self.constants: Dict[str, Any] = dict()
         self._loose_typemap: Dict[str, Type] = dict()
-        self.config: TileContext = config
+        self.config: TileContextConfig = config
         self._aggregate_values: Dict[str, Any] = dict()
+        self.tileiras_version: BytecodeVersion = tileiras_version
 
     #  Make a Var with a unique name based on `name`.
     def make_var(self, name: str, loc: Loc, undefined: bool = False) -> Var:
 
@@ -12,11 +12,12 @@
 from cuda.tile._datatype import (
         is_integral, is_float, is_restricted_float,
         is_boolean, is_signed, DType)
-from cuda.tile._exception import TileTypeError
+from cuda.tile._bytecode.version import BytecodeVersion
+from cuda.tile._exception import TileTypeError, TileUnsupportedFeatureError
 from cuda.tile._ir.ops_utils import get_dtype
 
 from .typing_support import datatype, get_signature
-from .ir import Var, TupleValue
+from .ir import Var, TupleValue, Builder
 from .type import TupleTy, TileTy, DTypeSpec, EnumTy, StringTy, ArrayTy, SliceType, \
     ListTy, LooselyTypedScalar, RangeIterType, FunctionTy, ClosureTy, BoundMethodTy, \
     DTypeConstructor, Type
@@ -36,9 +37,19 @@ def _verify_params_match(stub_sig: inspect.Signature, func_sig: inspect.Signatur
 op_implementations = dict()
 
 
-def impl(stub, *, fixed_args: Sequence[Any] = ()):
+def impl(stub, *, fixed_args: Sequence[Any] = (),
+         min_version: Optional[BytecodeVersion] = None):
     stub_sig = get_signature(stub)
 
+    def _check_version():
+        cur_version = Builder.get_current().ir_ctx.tileiras_version
+        if min_version is not None and cur_version < min_version:
+            raise TileUnsupportedFeatureError(
+                f"{stub.__name__} requires tileiras "
+                f"{min_version.major()}.{min_version.minor()} or later. "
+                f"Current version is {cur_version.major()}.{cur_version.minor()}."
+            )
+
     def decorate(func):
         orig_func = func
         if len(fixed_args) > 0:
@@ -50,6 +61,7 @@ def decorate(func):
         if is_coroutine:
             @functools.wraps(func)
             async def wrapper(*args, **kwargs):
+                _check_version()
                 # Memorize the stub and the args so that we can automatically
                 # provide context for error messages.
                 old = _current_stub.stub_and_args
@@ -61,6 +73,7 @@ async def wrapper(*args, **kwargs):
         else:
             @functools.wraps(func)
             def wrapper(*args, **kwargs):
+                _check_version()
                 # Memorize the stub and the args so that we can automatically
                 # provide context for error messages.
                 old = _current_stub.stub_and_args
 
@@ -45,7 +45,8 @@
 from .ops_utils import (
     BINOP_REGISTRY, UNARYOP_REGISTRY,
     check_rd_and_ftz, PaddingMode,
-    rounding_mode_to_bytecode, get_dtype, change_dtype, memory_order_to_bytecode,
+    rounding_mode_to_bytecode, get_default_rounding_mode, get_dtype,
+    change_dtype, memory_order_to_bytecode,
     memory_scope_to_bytecode, broadcast_shapes2, is_shape_broadcastable_to, BroadcastError,
     promote_types, promote_dtypes, check_implicit_cast
 )
@@ -68,6 +69,7 @@
     get_list_partition_view_tile_size, tensor_view_typeid, tensor_view_typeid_for_list, dtype_typeid
 )
 import cuda.tile._bytecode as bc
+from cuda.tile._bytecode.version import BytecodeVersion
 from .._debug import CUDA_TILE_TESTING_DISABLE_DIV
 
 
@@ -689,10 +691,11 @@ def generate_bytecode(self, ctx: BytecodeContext) -> bc.Value:
         lhs = ctx.cast(ctx.get_value(self.lhs), ctx.typeof(self.lhs), result_type)
         rhs = ctx.cast(ctx.get_value(self.rhs), ctx.typeof(self.rhs), result_type)
         acc = ctx.cast(ctx.get_value(self.acc), ctx.typeof(self.acc), result_type)
+        rm = self.rounding_mode if self.rounding_mode is not None else get_default_rounding_mode()
         return bc.encode_FmaOp(ctx.builder,
                                ctx.typeid_of(self.result_var),
                                lhs, rhs, acc,
-                               rounding_mode_to_bytecode[self.rounding_mode],
+                               rounding_mode_to_bytecode[rm],
                                self.flush_to_zero)
 
 
@@ -966,7 +969,8 @@ def generate_bytecode(self, ctx: BytecodeContext) -> bc.Value:
         dtype = get_dtype(result_ty)
         kind = "float" if datatype.is_float(dtype) else "int"
         res_typeid = typeid(ctx.type_table, result_ty)
-        rounding_mode = rounding_mode_to_bytecode[self.rounding_mode]
+        rm = self.rounding_mode if self.rounding_mode is not None else get_default_rounding_mode()
+        rounding_mode = rounding_mode_to_bytecode[rm]
         lhs = ctx.get_value(self.lhs)
         rhs = ctx.get_value(self.rhs)
 
@@ -1006,6 +1010,8 @@ def generate_bytecode(self, ctx: BytecodeContext) -> bc.Value:
                                         flush_to_zero=self.flush_to_zero)
             case "pow", "float":
                 return bc.encode_PowOp(ctx.builder, res_typeid, lhs, rhs)
+            case "atan2", "float":
+                return bc.encode_Atan2Op(ctx.builder, res_typeid, lhs, rhs)
             case "min", "int":
                 return bc.encode_MinIOp(ctx.builder, res_typeid, lhs, rhs,
                                         signedness=datatype.get_signedness(dtype))
@@ -1087,6 +1093,11 @@ def binary_arithmetic_impl_with_ftz(fn: str, x: Var, y: Var, flush_to_zero: Var)
     return binary_arithmetic(fn, x, y, flush_to_zero=flush_to_zero)
 
 
+@impl(ct.atan2, min_version=BytecodeVersion.V_13_2)
+def atan2_impl(x1: Var, x2: Var) -> Var:
+    return binary_arithmetic("atan2", x1, x2)
+
+
 @impl(ct.add, fixed_args=["add"])
 @impl(ct.sub, fixed_args=["sub"])
 @impl(ct.mul, fixed_args=["mul"])
@@ -1347,7 +1358,9 @@ def __init__(self, fn: str, operand: Var,
     @override
     def generate_bytecode(self, ctx: BytecodeContext) -> bc.Value:
         x = ctx.get_value(self.operand)
-        rounding_mode = rounding_mode_to_bytecode[self.rounding_mode]
+        rm = (self.rounding_mode if self.rounding_mode is not None
+              else get_default_rounding_mode(self.fn))
+        rounding_mode = rounding_mode_to_bytecode[rm]
         flush_to_zero = self.flush_to_zero
         input_type = ctx.typeof(self.operand)
         input_dtype = get_dtype(input_type)
@@ -1368,9 +1381,8 @@ def generate_bytecode(self, ctx: BytecodeContext) -> bc.Value:
             case "sinh", True: return bc.encode_SinHOp(ctx.builder, res_type_id, x)
             case "cosh", True: return bc.encode_CosHOp(ctx.builder, res_type_id, x)
             case "tan", True: return bc.encode_TanOp(ctx.builder, res_type_id, x)
-            # TODO: rounding mode support depending on bytecode version
             case "tanh", True: return bc.encode_TanHOp(ctx.builder, res_type_id, x,
-                                                       rounding_mode=bc.RoundingMode.FULL)
+                                                       rounding_mode=rounding_mode)
             case "log", True: return bc.encode_LogOp(ctx.builder, res_type_id, x)
             case "log2", True: return bc.encode_Log2Op(ctx.builder, res_type_id, x)
             case "sqrt", True: return bc.encode_SqrtOp(ctx.builder, res_type_id, x,
@@ -1486,7 +1498,6 @@ def pos_impl(x: Var):
 @impl(ct.log, fixed_args=["log", _UNARY_FLOAT])
 @impl(ct.log2, fixed_args=["log2", _UNARY_FLOAT])
 @impl(ct.tan, fixed_args=["tan", _UNARY_FLOAT])
-@impl(ct.tanh, fixed_args=["tanh", _UNARY_FLOAT])
 @impl(ct.sin, fixed_args=["sin", _UNARY_FLOAT])
 @impl(ct.sinh, fixed_args=["sinh", _UNARY_FLOAT])
 @impl(ct.cos, fixed_args=["cos", _UNARY_FLOAT])
@@ -1519,6 +1530,12 @@ def unary_impl_with_rd_and_ftz(fn: str, behavior: _UnaryBehavior,
     return unary(fn, behavior, x, rounding_mode=rounding_mode, flush_to_zero=flush_to_zero)
 
 
+@impl(ct.tanh, fixed_args=["tanh", _UNARY_FLOAT])
+def unary_impl_with_rd(fn: str, behavior: _UnaryBehavior, x: Var, rounding_mode: Var) -> Var:
+    rounding_mode = require_optional_constant_enum(rounding_mode, RoundingMode)
+    return unary(fn, behavior, x, rounding_mode=rounding_mode)
+
+
 @impl(getattr)
 def getattr_impl(object: Var, name: Var) -> Var:
     ty = object.get_type()
 
@@ -4,18 +4,19 @@
 import itertools
 import math
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Optional, Tuple, Dict, Any, Sequence
 from enum import Enum
 
 from cuda.tile import _datatype as datatype
 
+from cuda.tile._bytecode.version import BytecodeVersion
 from cuda.tile._numeric_semantics import RoundingMode, PaddingMode
-from cuda.tile._exception import Loc, TileTypeError, TileValueError
+from cuda.tile._exception import Loc, TileTypeError, TileValueError, TileUnsupportedFeatureError
 from cuda.tile._memory_model import MemoryOrder, MemoryScope
 import cuda.tile._bytecode as bc
 
-from .ir import Operation
+from .ir import Operation, Builder
 from .type import TileTy, PointerTy, LooselyTypedScalar, make_tile_ty
 from .typing_support import typeof_pyval
 from .._datatype import DType, _DTypePromotionImpl, NumericDTypeCategory, NumericDTypeCategories, \
@@ -34,30 +35,29 @@ class ComparisonPredicates(Enum):
 @dataclass
 class MathOpDef:
     impl: callable    # Python scalar fallback
-    supported_rounding_modes: Tuple[RoundingMode, ...] = ()
+    supported_rounding_modes: Dict[RoundingMode, Optional[BytecodeVersion]] = field(
+        default_factory=dict)
     support_flush_to_zero: bool = False
 
 
+_RD_BASIC = {RoundingMode.RN: None, RoundingMode.RZ: None,
+             RoundingMode.RM: None, RoundingMode.RP: None}
+_RD_TRUEDIV = {**_RD_BASIC, RoundingMode.FULL: None, RoundingMode.APPROX: None}
+_RD_SQRT = {**_RD_BASIC, RoundingMode.APPROX: None}
+_RD_TANH = {RoundingMode.FULL: None, RoundingMode.APPROX: BytecodeVersion.V_13_2}
+
 BINOP_REGISTRY = {
-    "add": MathOpDef(lambda x, y: x + y,
-                     (RoundingMode.RN, RoundingMode.RZ, RoundingMode.RM, RoundingMode.RP),
-                     support_flush_to_zero=True),
-    "sub": MathOpDef(lambda x, y: x - y,
-                     (RoundingMode.RN, RoundingMode.RZ, RoundingMode.RM, RoundingMode.RP),
-                     support_flush_to_zero=True),
-    "mul": MathOpDef(lambda x, y: x * y,
-                     (RoundingMode.RN, RoundingMode.RZ, RoundingMode.RM, RoundingMode.RP),
-                     support_flush_to_zero=True),
+    "add": MathOpDef(lambda x, y: x + y, _RD_BASIC, support_flush_to_zero=True),
+    "sub": MathOpDef(lambda x, y: x - y, _RD_BASIC, support_flush_to_zero=True),
+    "mul": MathOpDef(lambda x, y: x * y, _RD_BASIC, support_flush_to_zero=True),
     "floordiv": MathOpDef(lambda x, y: x // y),
     "cdiv": MathOpDef(lambda x, y: (x + y - 1) // y),
-    "truediv": MathOpDef(lambda x, y: x / y,
-                         (RoundingMode.RN, RoundingMode.RZ, RoundingMode.RM, RoundingMode.RP,
-                          RoundingMode.FULL, RoundingMode.APPROX),
-                         support_flush_to_zero=True),
+    "truediv": MathOpDef(lambda x, y: x / y, _RD_TRUEDIV, support_flush_to_zero=True),
     "mod": MathOpDef(lambda x, y: x % y),
     "pow": MathOpDef(lambda x, y: x ** y),
-    "max": MathOpDef(max, (), support_flush_to_zero=True),
-    "min": MathOpDef(min, (), support_flush_to_zero=True),
+    "atan2": MathOpDef(math.atan2),
+    "max": MathOpDef(max, support_flush_to_zero=True),
+    "min": MathOpDef(min, support_flush_to_zero=True),
     "and_": MathOpDef(lambda x, y: x & y),
     "or_": MathOpDef(lambda x, y: x | y),
     "xor": MathOpDef(lambda x, y: x ^ y),
@@ -81,30 +81,26 @@ class MathOpDef:
     "abs": MathOpDef(abs),
     "neg": MathOpDef(lambda x: -x),
     "exp": MathOpDef(math.exp),
-    "exp2": MathOpDef(lambda x: 2 ** x, (), support_flush_to_zero=True),
+    "exp2": MathOpDef(lambda x: 2 ** x, support_flush_to_zero=True),
     "sin": MathOpDef(math.sin),
     "sinh": MathOpDef(math.sinh),
     "cos": MathOpDef(math.cos),
     "cosh": MathOpDef(math.cosh),
     "tan": MathOpDef(math.tan),
-    # TODO: RoundingMode support dependent on bytecode version
-    "tanh": MathOpDef(math.tanh),
+    "tanh": MathOpDef(math.tanh, _RD_TANH),
     "log": MathOpDef(math.log),
     "log2": MathOpDef(math.log2),
-    "sqrt": MathOpDef(math.sqrt,
-                      (RoundingMode.RN, RoundingMode.RZ, RoundingMode.RM, RoundingMode.RP,
-                       RoundingMode.APPROX),
-                      support_flush_to_zero=True),
-    "rsqrt": MathOpDef(lambda x: x ** -0.5, (), support_flush_to_zero=True),
+    "sqrt": MathOpDef(math.sqrt, _RD_SQRT, support_flush_to_zero=True),
+    "rsqrt": MathOpDef(lambda x: x ** -0.5, support_flush_to_zero=True),
     "invert": MathOpDef(lambda x: ~x),
     "not_": MathOpDef(lambda x: not x),
     "floor": MathOpDef(math.floor),
     "ceil": MathOpDef(math.ceil),
 }
 
 
-def get_default_rounding_mode():
-    return RoundingMode.RN
+def get_default_rounding_mode(opname: Optional[str] = None):
+    return RoundingMode.FULL if opname == 'tanh' else RoundingMode.RN
 
 
 rounding_mode_to_bytecode = {
@@ -117,8 +113,6 @@ def get_default_rounding_mode():
     RoundingMode.RZI: bc.RoundingMode.NEAREST_INT_TO_ZERO
 }
 
-rounding_mode_to_bytecode[None] = rounding_mode_to_bytecode[get_default_rounding_mode()]
-
 
 def get_rounding_mode(op: Operation, constants: Dict[str, Any]) -> Optional[RoundingMode]:
     return (
@@ -146,6 +140,14 @@ def check_rd_and_ftz(fn: str, rounding_mode: Optional[RoundingMode], flush_to_ze
         if rounding_mode not in math_op_def.supported_rounding_modes:
             raise TileTypeError(
                 f'Rounding mode {rounding_mode.value} is not supported for {fn}')
+        min_version = math_op_def.supported_rounding_modes[rounding_mode]
+        if min_version is not None:
+            cur_version = Builder.get_current().ir_ctx.tileiras_version
+            if cur_version < min_version:
+                raise TileUnsupportedFeatureError(
+                    f'{fn} rounding_mode={rounding_mode.value} requires tileiras '
+                    f'{min_version.major()}.{min_version.minor()} or later. '
+                    f'Current version is {cur_version.major()}.{cur_version.minor()}.')
         if not datatype.is_float(dtype):
             raise TileTypeError(
                 f'Rounding mode can only be used for float types, '
 
@@ -240,7 +240,8 @@ def lower_scan(ctx: "BytecodeContext", x: bc.Value, input_ty: Type,
 
     element_tile_typeid = tt.tile(element_type_id, ())
     with nested_builder.new_block((element_tile_typeid, element_tile_typeid)) as (a, b):
-        rounding_mode_bc = rounding_mode_to_bytecode[rounding_mode]
+        rm = rounding_mode if rounding_mode is not None else get_default_rounding_mode()
+        rounding_mode_bc = rounding_mode_to_bytecode[rm]
         match scan_fn, use_float:
             case "add", True:
                 res = bc.encode_AddFOp(ctx.builder, element_tile_typeid, a, b,