Remove opt_level=0 workaround for print functions

gunnersdeng · gunnersdeng · commit ac6b47cf3c2e · 2026-03-06T11:42:20.000-08:00
Signed-off-by: Ziheng Deng <zihengd@nvidia.com>
diff --git a/changelog.d/remove-opt-level-print-note.md b/changelog.d/remove-opt-level-print-note.md
@@ -0,0 +1,4 @@
+<!--- SPDX-FileCopyrightText: Copyright (c) <2026> NVIDIA CORPORATION & AFFILIATES. All rights reserved. -->
+<!--- SPDX-License-Identifier: Apache-2.0 -->
+
+- Removed the `opt_level=0` workaround note from the `ct.printf()` and `ct.print()` documentation; the workaround is no longer required as of tileiras 13.2.
diff --git a/src/cuda/tile/_stub.py b/src/cuda/tile/_stub.py
@@ -2023,15 +2023,6 @@ def printf(format, *args) -> None:
     Notes:
         This operation has significant overhead, and should only be used
         for debugging purpose.
-
-        When printing from multiple tile blocks, outputs will be interleaved.
-        One workaround is to set optimization level to 0:
-
-        .. code-block:: python
-
-            @ct.kernel(opt_level=0)
-            def my_print_kernel():
-                ct.printf("%d", 123)
     """
 
 
@@ -2062,8 +2053,6 @@ def print(*args, sep: str = ' ', end: str = '\n') -> None:
 
         F-string expressions must evaluate to tile values. Constant compile-time
         values are supported as string-formatted segments.
-
-        Use ``opt_level=0`` to prevent block-level output interleaving.
     """
 
 
diff --git a/test/test_print.py b/test/test_print.py
@@ -12,68 +12,72 @@
 
 from math import ceil
 import cuda.tile as ct
+from cuda.tile._bytecode.version import BytecodeVersion
+from cuda.tile._compiler_options import CompilerOptions
+from conftest import get_tileiras_version
 
-# FIXME: Default opt_level causes print to be out of order.
-# Remove when it is fixed in tile compiler.
+# opt_level=0 is required for correct print ordering with tileiras < 13.2; otherwise use the default.
+_DEFAULT_OPT_LEVEL = CompilerOptions.__dataclass_fields__['opt_level'].default
+_OPT_LEVEL = 0 if get_tileiras_version() < BytecodeVersion.V_13_2 else _DEFAULT_OPT_LEVEL
 
 
-@ct.kernel(opt_level=0)
+@ct.kernel(opt_level=_OPT_LEVEL)
 def kernel_printf_float(x, TILE: ct.Constant[int]):
     bid = ct.bid(0)
     tx = ct.load(x, index=(bid,), shape=(TILE,))
     ct.printf("tile[%d]:%.5f\n", bid, tx)
 
 
-@ct.kernel(opt_level=0)
+@ct.kernel(opt_level=_OPT_LEVEL)
 def kernel_printf_int(x, TILE: ct.Constant[int]):
     bid = ct.bid(0)
     tx = ct.load(x, index=(bid,), shape=(TILE,))
     ct.printf("tile[%d]:%d\n", bid, tx)
 
 
-@ct.kernel(opt_level=0)
+@ct.kernel(opt_level=_OPT_LEVEL)
 def kernel_print_int(x, TILE: ct.Constant[int]):
     bid = ct.bid(0)
     tx = ct.load(x, index=(bid,), shape=(TILE,))
     ct.print(f"tile[{bid}]:{tx}")
 
 
-@ct.kernel(opt_level=0)
+@ct.kernel(opt_level=_OPT_LEVEL)
 def kernel_print_float(x, TILE: ct.Constant[int]):
     bid = ct.bid(0)
     tx = ct.load(x, index=(bid,), shape=(TILE,))
     ct.print(f"tile[{bid}]:{tx:.5f}")
 
 
-@ct.kernel(opt_level=0)
+@ct.kernel(opt_level=_OPT_LEVEL)
 def kernel_print_sep(x, TILE: ct.Constant[int]):
     bid = ct.bid(0)
     tx = ct.load(x, index=(bid,), shape=(TILE,))
     ct.print("tile:", tx, sep='')
 
 
-@ct.kernel(opt_level=0)
+@ct.kernel(opt_level=_OPT_LEVEL)
 def kernel_print_two_vars_with_expr(x, TILE: ct.Constant[int]):
     bid = ct.bid(0)
     tx = ct.load(x, index=(bid,), shape=(TILE,))
     ct.print(f"tile[{bid}]: a={tx:.6f} b={tx + tx:.6f}")
 
 
-@ct.kernel(opt_level=0)
+@ct.kernel(opt_level=_OPT_LEVEL)
 def kernel_print_no_end(x, TILE: ct.Constant[int]):
     bid = ct.bid(0)
     tx = ct.load(x, index=(bid,), shape=(TILE,))
     ct.print(tx, end='')
 
 
-@ct.kernel(opt_level=0)
+@ct.kernel(opt_level=_OPT_LEVEL)
 def kernel_builtin_print_int(x, TILE: ct.Constant[int]):
     bid = ct.bid(0)
     tx = ct.load(x, index=(bid,), shape=(TILE,))
     print(f"tile[{bid}]:{tx}")
 
 
-@ct.kernel(opt_level=0)
+@ct.kernel(opt_level=_OPT_LEVEL)
 def kernel_builtin_print_float(x, TILE: ct.Constant[int]):
     bid = ct.bid(0)
     tx = ct.load(x, index=(bid,), shape=(TILE,))
@@ -213,7 +217,7 @@ def test_builtin_print(shape, tile, dtype_str):
 def test_ct_print_error_conversion():
     from cuda.tile._exception import TileSyntaxError
 
-    @ct.kernel(opt_level=0)
+    @ct.kernel(opt_level=_OPT_LEVEL)
     def bad_kernel(x, TILE: ct.Constant[int]):
         tx = ct.load(x, index=(0,), shape=(TILE,))
         ct.print(f"{tx!r}")
@@ -226,7 +230,7 @@ def bad_kernel(x, TILE: ct.Constant[int]):
 def test_ct_print_error_dynamic_format_spec():
     from cuda.tile._exception import TileSyntaxError
 
-    @ct.kernel(opt_level=0)
+    @ct.kernel(opt_level=_OPT_LEVEL)
     def bad_kernel(x, TILE: ct.Constant[int]):
         width = 5
         tx = ct.load(x, index=(0,), shape=(TILE,))