Skip to content

Commit 2df0fc2

Browse files
committed
Raise UnsupportedFeatureError for FP8 on sm80 family
- Relax tf32 mma test tolerance for the sm80 family - Fix rmsnorm kernel to add zero padding when loading out of bounds - Reduce tile size on the sm80 family for the persistent rmsnorm benchmark Signed-off-by: Jay Gu <jagu@nvidia.com>
1 parent 7bad878 commit 2df0fc2

16 files changed

Lines changed: 145 additions & 55 deletions
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
<!--- SPDX-FileCopyrightText: Copyright (c) <2026> NVIDIA CORPORATION & AFFILIATES. All rights reserved. -->
2+
<!--- SPDX-License-Identifier: Apache-2.0 -->
3+
4+
- Compiling an FP8 operation for the SM80 family will raise `TileUnsupportedFeatureError`
5+
- Add `TileUnsupportedFeatureError` to the public API

docs/source/debugging.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ Exception Types
1313
.. autoclass:: TileSyntaxError()
1414
.. autoclass:: TileTypeError()
1515
.. autoclass:: TileValueError()
16+
.. autoclass:: TileUnsupportedFeatureError()
1617
.. autoclass:: TileCompilerExecutionError()
1718
.. autoclass:: TileCompilerTimeoutError()
1819

samples/BatchMatMul.py

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -143,23 +143,26 @@ def torch_batch_matmul_fp8(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
143143

144144
# --- Test Case 2: Standard BMM (float8_e4m3fn) ---
145145
print("\n--- Test 2: Standard BMM (float8_e4m3fn) ---")
146-
A_fp8 = torch.randn(
147-
BATCH_DIM, M_DIM, K_DIM, dtype=torch.float32, device='cuda'
148-
).to(torch.float8_e4m3fn)
149-
B_fp8 = torch.randn(
150-
BATCH_DIM, K_DIM, N_DIM, dtype=torch.float32, device='cuda'
151-
).to(torch.float8_e4m3fn)
152-
print(f"Input A shape: {A_fp8.shape}, dtype: {A_fp8.dtype}")
153-
print(f"Input B shape: {B_fp8.shape}, dtype: {B_fp8.dtype}")
154-
155-
C_bmm_cutile_fp32 = bmm(A_fp8, B_fp8, torch.float32)
156-
print(f"""cuTile Standard BMM Output C
157-
shape:{C_bmm_cutile_fp32.shape},
158-
dtype: {C_bmm_cutile_fp32.dtype}""")
159-
if args.correctness_check:
160-
torch.testing.assert_close(C_bmm_cutile_fp32, torch_batch_matmul_fp8(A_fp8, B_fp8))
161-
print("Correctness check passed")
146+
if torch.cuda.get_device_capability()[0] == 8:
147+
print("skip: Ampere does not support float8")
162148
else:
163-
print("Correctness check disabled")
149+
A_fp8 = torch.randn(
150+
BATCH_DIM, M_DIM, K_DIM, dtype=torch.float32, device='cuda'
151+
).to(torch.float8_e4m3fn)
152+
B_fp8 = torch.randn(
153+
BATCH_DIM, K_DIM, N_DIM, dtype=torch.float32, device='cuda'
154+
).to(torch.float8_e4m3fn)
155+
print(f"Input A shape: {A_fp8.shape}, dtype: {A_fp8.dtype}")
156+
print(f"Input B shape: {B_fp8.shape}, dtype: {B_fp8.dtype}")
157+
158+
C_bmm_cutile_fp32 = bmm(A_fp8, B_fp8, torch.float32)
159+
print(f"""cuTile Standard BMM Output C
160+
shape:{C_bmm_cutile_fp32.shape},
161+
dtype: {C_bmm_cutile_fp32.dtype}""")
162+
if args.correctness_check:
163+
torch.testing.assert_close(C_bmm_cutile_fp32, torch_batch_matmul_fp8(A_fp8, B_fp8))
164+
print("Correctness check passed")
165+
else:
166+
print("Correctness check disabled")
164167

165168
print("\n--- cuTile Batched Matrix Multiplication (Standard Tiled) examples complete ---")

samples/MatMul.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,11 @@ def cutile_matmul(A: torch.Tensor, B: torch.Tensor, persistent: bool = False) ->
294294
print(f"Input A shape: {A_fp32.shape}, dtype: {A_fp32.dtype}")
295295
print(f"Input B shape: {B_fp32.shape}, dtype: {B_fp32.dtype}")
296296

297-
atol, rtol = 1e-4, 1e-3
297+
if torch.cuda.get_device_capability()[0] <= 8:
298+
# Ampere tfloat32 numerics is loose
299+
atol, rtol = 5e-3, 5e-3
300+
else:
301+
atol, rtol = 1e-4, 1e-3
298302

299303
# Perform matrix multiplication using the cuTile wrapper function.
300304
C_fp32_cutile = cutile_matmul(A_fp32, B_fp32)

samples/templates/BatchMatMul.py

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -104,23 +104,26 @@ def torch_batch_matmul_fp8(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
104104

105105
# --- Test Case 2: Standard BMM (float8_e4m3fn) ---
106106
print("\n--- Test 2: Standard BMM (float8_e4m3fn) ---")
107-
A_fp8 = torch.randn(
108-
BATCH_DIM, M_DIM, K_DIM, dtype=torch.float32, device='cuda'
109-
).to(torch.float8_e4m3fn)
110-
B_fp8 = torch.randn(
111-
BATCH_DIM, K_DIM, N_DIM, dtype=torch.float32, device='cuda'
112-
).to(torch.float8_e4m3fn)
113-
print(f"Input A shape: {A_fp8.shape}, dtype: {A_fp8.dtype}")
114-
print(f"Input B shape: {B_fp8.shape}, dtype: {B_fp8.dtype}")
115-
116-
C_bmm_cutile_fp32 = bmm(A_fp8, B_fp8, torch.float32)
117-
print(f"""cuTile Standard BMM Output C
118-
shape:{C_bmm_cutile_fp32.shape},
119-
dtype: {C_bmm_cutile_fp32.dtype}""")
120-
if args.correctness_check:
121-
torch.testing.assert_close(C_bmm_cutile_fp32, torch_batch_matmul_fp8(A_fp8, B_fp8))
122-
print("Correctness check passed")
107+
if torch.cuda.get_device_capability()[0] == 8:
108+
print("skip: Ampere does not support float8")
123109
else:
124-
print("Correctness check disabled")
110+
A_fp8 = torch.randn(
111+
BATCH_DIM, M_DIM, K_DIM, dtype=torch.float32, device='cuda'
112+
).to(torch.float8_e4m3fn)
113+
B_fp8 = torch.randn(
114+
BATCH_DIM, K_DIM, N_DIM, dtype=torch.float32, device='cuda'
115+
).to(torch.float8_e4m3fn)
116+
print(f"Input A shape: {A_fp8.shape}, dtype: {A_fp8.dtype}")
117+
print(f"Input B shape: {B_fp8.shape}, dtype: {B_fp8.dtype}")
118+
119+
C_bmm_cutile_fp32 = bmm(A_fp8, B_fp8, torch.float32)
120+
print(f"""cuTile Standard BMM Output C
121+
shape:{C_bmm_cutile_fp32.shape},
122+
dtype: {C_bmm_cutile_fp32.dtype}""")
123+
if args.correctness_check:
124+
torch.testing.assert_close(C_bmm_cutile_fp32, torch_batch_matmul_fp8(A_fp8, B_fp8))
125+
print("Correctness check passed")
126+
else:
127+
print("Correctness check disabled")
125128

126129
print("\n--- cuTile Batched Matrix Multiplication (Standard Tiled) examples complete ---")

samples/templates/MatMul.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,11 @@ def cutile_matmul(A: torch.Tensor, B: torch.Tensor, persistent: bool = False) ->
127127
print(f"Input A shape: {A_fp32.shape}, dtype: {A_fp32.dtype}")
128128
print(f"Input B shape: {B_fp32.shape}, dtype: {B_fp32.dtype}")
129129

130-
atol, rtol = 1e-4, 1e-3
130+
if torch.cuda.get_device_capability()[0] <= 8:
131+
# Ampere tfloat32 numerics is loose
132+
atol, rtol = 5e-3, 5e-3
133+
else:
134+
atol, rtol = 1e-4, 1e-3
131135

132136
# Perform matrix multiplication using the cuTile wrapper function.
133137
C_fp32_cutile = cutile_matmul(A_fp32, B_fp32)

src/cuda/tile/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
TileRecursionError,
4848
TileSyntaxError,
4949
TileTypeError,
50+
TileUnsupportedFeatureError,
5051
TileValueError,
5152
)
5253

@@ -181,6 +182,7 @@
181182
"TileRecursionError",
182183
"TileSyntaxError",
183184
"TileTypeError",
185+
"TileUnsupportedFeatureError",
184186
"TileValueError",
185187

186188
"Array",

src/cuda/tile/_compile.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
)
4747

4848
from cuda.tile._passes.alias_analysis import alias_analysis_pass
49+
from cuda.tile._passes.check_ampere_fp8 import check_ampere_fp8
4950
from cuda.tile._passes.dce import dead_code_elimination_pass
5051
from cuda.tile._passes.token_order import token_order_pass
5152
from cuda.tile._ir2bytecode import generate_bytecode_for_kernel
@@ -195,6 +196,7 @@ def compile_tile(pyfunc,
195196
print(f'\n{code}', file=sys.stderr)
196197

197198
sm_arch = get_sm_arch()
199+
check_ampere_fp8(func_ir.body, sm_arch)
198200

199201
bytecode_generator = functools.partial(generate_bytecode_for_kernel,
200202
func_ir, compiler_options, sm_arch)

src/cuda/tile/_exception.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,12 @@ class TileValueError(TileError):
145145
pass
146146

147147

148+
class TileUnsupportedFeatureError(TileError):
149+
"""Exception when a feature is not supported by the underlying compiler or
150+
the GPU architecture."""
151+
pass
152+
153+
148154
class TileInternalError(TileError):
149155
pass
150156

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# SPDX-FileCopyrightText: Copyright (c) <2025> NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from cuda.tile._ir.ir import Block
6+
from cuda.tile._ir.type import TileTy, ArrayTy
7+
from cuda.tile._datatype import float8_e4m3fn, float8_e5m2, DType
8+
from cuda.tile._exception import TileUnsupportedFeatureError
9+
10+
FLOAT8_DTYPES = (float8_e4m3fn, float8_e5m2)
11+
12+
13+
def check_ampere_fp8(root_block: Block, sm_arch: str) -> None:
14+
# Technically sm_89 (Ada Lovelace) supports FP8, but tileiras doesn't have support for it yet.
15+
if not sm_arch.startswith("sm_8"):
16+
return
17+
18+
for op in root_block.traverse():
19+
for var in op.all_inputs():
20+
ty = var.try_get_type()
21+
dtype = None
22+
if isinstance(ty, (TileTy, ArrayTy)):
23+
dtype = ty.dtype
24+
elif isinstance(ty, DType):
25+
dtype = ty
26+
if dtype in FLOAT8_DTYPES:
27+
raise TileUnsupportedFeatureError(
28+
"float8 dtype is not supported on Ampere or Ada Lovelace (sm_8*) architecture",
29+
loc=op.loc
30+
)

0 commit comments

Comments
 (0)