mma_scaled

blinxt · blinxt · commit 8ce0189e52b5 · 2026-03-06T14:44:53.000-08:00
Signed-off-by: Boyan Li <boyanl@nvidia.com>
diff --git a/changelog.d/future/mma-scaled.md b/changelog.d/future/mma-scaled.md
@@ -0,0 +1,6 @@
+<!--- SPDX-FileCopyrightText: Copyright (c) <2026> NVIDIA CORPORATION & AFFILIATES. All rights reserved. -->
+<!--- SPDX-License-Identifier: Apache-2.0 -->
+
+- New `ct.mma_scaled()` operation for block-scaled matrix multiply-accumulate.
+  Supported input dtypes: `float8_e4m3fn`, `float8_e5m2`, `float4_e2m1fn`.
+  Supported scale dtypes: `float8_e8m0fnu`, `float8_e4m3fn` (f4 inputs only).
diff --git a/src/cuda/tile/_datatype.py b/src/cuda/tile/_datatype.py
@@ -436,6 +436,53 @@ def _resolve_mma_supported_dtype(x_dtype: DType,
     return acc_dtype
 
 
+# Dtype combinations accepted by mma_scaled.
+# Read by _resolve_mma_scaled_supported_dtype (dtype validation) and by
+# _get_mma_scaled_scaling_block_sizes (allowed scaling block sizes).
+_mma_scaled_supported_dtypes = {
+    # operand dtype -> {scale dtype: (result dtype, scaling block sizes)}
+    float8_e4m3fn: {float8_e8m0fnu: (float32, (32,))},
+    float8_e5m2:   {float8_e8m0fnu: (float32, (32,))},
+    float4_e2m1fn: {float8_e8m0fnu: (float32, (16, 32)),
+                    float8_e4m3fn:  (float32, (16,))},
+}
+
+
+def _resolve_mma_scaled_supported_dtype(x_dtype: DType,
+                                        x_scale_dtype: DType,
+                                        y_dtype: DType,
+                                        y_scale_dtype: DType,
+                                        acc_dtype: DType):
+    """Validate the dtype combination given to ``mma_scaled``.
+
+    The checks run in this order: x/y dtypes agree, x_scale/y_scale dtypes
+    agree, the operand dtype is supported, the scale dtype is supported for
+    that operand dtype, and the accumulator dtype equals the result dtype
+    listed in ``_mma_scaled_supported_dtypes``.
+
+    Returns nothing; the function only validates.
+
+    Raises:
+        TileTypeError: On the first check that fails.
+    """
+    if x_dtype != y_dtype:
+        raise TileTypeError(
+            f"x and y must have the same dtype, got {x_dtype} and {y_dtype}")
+    if x_scale_dtype != y_scale_dtype:
+        raise TileTypeError(
+            f"x_scale and y_scale must have the same dtype, "
+            f"got {x_scale_dtype} and {y_scale_dtype}")
+
+    # Table values are always dicts, so None reliably means "unsupported".
+    by_scale = _mma_scaled_supported_dtypes.get(x_dtype)
+    if by_scale is None:
+        candidates = ", ".join(map(str, _mma_scaled_supported_dtypes))
+        raise TileTypeError(
+            f"Unsupported input dtype {x_dtype} for mma_scaled, "
+            f"supported input dtypes are {candidates}")
+
+    if x_scale_dtype not in by_scale:
+        candidate_names = ", ".join(map(str, by_scale))
+        raise TileTypeError(
+            f"Unsupported scale dtype {x_scale_dtype} for input dtype {x_dtype}, "
+            f"supported scale dtypes are {candidate_names}")
+
+    # Each entry is (result dtype, scaling block sizes); only the former matters here.
+    expected_acc = by_scale[x_scale_dtype][0]
+    if acc_dtype != expected_acc:
+        raise TileTypeError(
+            f"Unsupported acc dtype {acc_dtype} for mma_scaled, "
+            f"expected {expected_acc}")
+
+
+def _get_mma_scaled_scaling_block_sizes(data_dtype, scale_dtype) -> Tuple[int, ...]:
+    """Return the scaling block sizes allowed for a data/scale dtype pair.
+
+    The pair must be present in ``_mma_scaled_supported_dtypes``; this is
+    enforced with assertions rather than a ``TileTypeError``.
+    """
+    assert data_dtype in _mma_scaled_supported_dtypes
+    by_scale = _mma_scaled_supported_dtypes[data_dtype]
+    assert scale_dtype in by_scale
+    # Entries are (result dtype, scaling block sizes).
+    return by_scale[scale_dtype][1]
+
+
 # =============== Documentation Generator ================
 
 def _generate_rst_dtype_promotion_table() -> str:
diff --git a/src/cuda/tile/_ir/ops.py b/src/cuda/tile/_ir/ops.py
@@ -2786,7 +2786,7 @@ def generate_bytecode(self, ctx: BytecodeContext) -> bc.Value:
 
 
 @impl(ct.mma)
-def mma(x: Var, y: Var, acc: Var) -> Var:
+def mma_impl(x: Var, y: Var, acc: Var) -> Var:
     x_tile_type = require_tile_type(x)
     y_tile_type = require_tile_type(y)
     acc_tile_type = require_tile_type(acc)
@@ -2808,7 +2808,7 @@ def mma(x: Var, y: Var, acc: Var) -> Var:
 
 @impl(ct.matmul)
 @impl(operator.matmul)
-def matmul(x: Var, y: Var) -> Var:
+def matmul_impl(x: Var, y: Var) -> Var:
     x_tile_type = require_tile_type(x)
     y_tile_type = require_tile_type(y)
     x_shape_orig = x_tile_type.shape
@@ -2833,6 +2833,89 @@ def matmul(x: Var, y: Var) -> Var:
     return ret
 
 
+@dataclass(eq=False)
+class TileMmaScaled(Operation, opcode="tile_mma_scaled"):
+    """IR operation for a block-scaled matrix multiply-accumulate.
+
+    Holds the two operands, their scale tiles and the accumulator, and is
+    lowered to bytecode through ``bc.encode_MmaFScaledOp``.
+    """
+    x: Var = operand()
+    x_scale: Var = operand()
+    y: Var = operand()
+    y_scale: Var = operand()
+    acc: Var = operand()
+
+    @override
+    def generate_bytecode(self, ctx: BytecodeContext) -> bc.Value:
+        """Emit the scaled-MMA bytecode op and return its result value."""
+        x_value = ctx.get_value(self.x)
+        x_scale_value = ctx.get_value(self.x_scale)
+        y_value = ctx.get_value(self.y)
+        y_scale_value = ctx.get_value(self.y_scale)
+        acc_value = ctx.get_value(self.acc)
+        res_typeid = ctx.typeid_of(self.result_var)
+        # NOTE: the encoder is called with (x, y, acc, x_scale, y_scale), which
+        # differs from the field declaration order above.
+        return bc.encode_MmaFScaledOp(ctx.builder, res_typeid, x_value, y_value,
+                                      acc_value, x_scale_value, y_scale_value)
+
+
+def _verify_scaling_block_size(ty: TileTy, scale_ty: TileTy, k_axis: int,
+                               name: str, scale_name: str):
+    """Check that a scale tile's shape is compatible with its operand tile.
+
+    The scale tile must have the same rank as the operand and identical
+    extents on every axis except the K axis. Along the K axis the operand
+    extent must be an exact multiple of the scale extent, and the quotient
+    (the scaling block size) must be one of the sizes allowed for the
+    operand/scale dtype pair.
+
+    Args:
+        ty: Tile type of the operand.
+        scale_ty: Tile type of the matching scale tile.
+        k_axis: Index of the K axis in the operand; may be negative.
+        name: Operand name used in error messages.
+        scale_name: Scale operand name used in error messages.
+
+    Raises:
+        TileTypeError: If the ranks differ, a non-K dimension differs, or the
+            scaling block size is not allowed.
+    """
+    shape = ty.shape_value
+    dtype = ty.dtype
+    scale_shape = scale_ty.shape_value
+    scale_dtype = scale_ty.dtype
+    # Report a rank mismatch as a type error up front: otherwise the strict
+    # zip below raises a bare ValueError whenever the leading dimensions
+    # happen to agree (e.g. shape (16, 16, 64) with scale shape (16, 16)).
+    if len(shape) != len(scale_shape):
+        raise TileTypeError(
+            f"{scale_name} shape {scale_shape} is not compatible with {name} shape {shape}. "
+            f"{scale_name} must have the same number of dimensions as {name}")
+    k_axis = normalize_axis(k_axis, len(shape))
+    if any(x != y for i, (x, y) in enumerate(zip(shape, scale_shape, strict=True)) if i != k_axis):
+        raise TileTypeError(
+            f"{scale_name} shape {scale_shape} is not compatible with {name} shape {shape}. "
+            f"All dimensions except K axis {k_axis} must match")
+
+    allowed = datatype._get_mma_scaled_scaling_block_sizes(dtype, scale_dtype)
+    scaling_block_size, rem = divmod(shape[k_axis], scale_shape[k_axis])
+    if rem != 0 or scaling_block_size not in allowed:
+        raise TileTypeError(
+            f"For mma_scaled with dtype={dtype}, scale_dtype={scale_dtype}: "
+            f"{name}.shape[{k_axis}] must be an exact multiple of {scale_name}.shape[{k_axis}] "
+            f"with scaling block size B = K // K_s in {set(allowed)}, "
+            f"got {name}.shape[{k_axis}] = {shape[k_axis]} and "
+            f"{scale_name}.shape[{k_axis}] = {scale_shape[k_axis]}")
+
+
+@impl(ct.mma_scaled, min_version=BytecodeVersion.V_13_3)
+def mma_scaled_impl(x: Var, x_scale: Var, y: Var, y_scale: Var, acc: Var) -> Var:
+    """Type-check a ``ct.mma_scaled`` call and emit a ``TileMmaScaled`` op.
+
+    Validation happens in this order, and the first failure is reported:
+    every operand is 2D or 3D, the dtype combination is supported, each
+    scale tile matches its operand's shape and block size, and ``acc`` has
+    the broadcast output shape. The result has the tile type of ``acc``.
+
+    Raises:
+        TileTypeError: If any operand has an unsupported rank, or if ``acc``
+            does not have the expected output shape. The dtype and
+            scale-shape helpers raise it for their own checks.
+    """
+    x_ty = require_tile_type(x)
+    y_ty = require_tile_type(y)
+    acc_ty = require_tile_type(acc)
+    x_scale_ty = require_tile_type(x_scale)
+    y_scale_ty = require_tile_type(y_scale)
+
+    # Rank check for all five operands, reported by operand name.
+    for name, shape in [("x", x_ty.shape), ("y", y_ty.shape),
+                        ("acc", acc_ty.shape),
+                        ("x_scale", x_scale_ty.shape),
+                        ("y_scale", y_scale_ty.shape)]:
+        if len(shape) not in [2, 3]:
+            raise TileTypeError(
+                f'Expect shape of `{name}` to be 2D or 3D, got {shape}')
+
+    # Dtypes are validated before the block-size check, which looks the
+    # (operand dtype, scale dtype) pair up in the supported-dtype table.
+    datatype._resolve_mma_scaled_supported_dtype(
+        x_ty.dtype, x_scale_ty.dtype,
+        y_ty.dtype, y_scale_ty.dtype,
+        acc_ty.dtype)
+    # K is the last axis of x and the second-to-last axis of y.
+    _verify_scaling_block_size(x_ty, x_scale_ty, k_axis=-1, name="x", scale_name="x_scale")
+    _verify_scaling_block_size(y_ty, y_scale_ty, k_axis=-2, name="y", scale_name="y_scale")
+
+    x_shape, y_shape, _, output_shape = _matmul_broadcast_shape(x_ty.shape, y_ty.shape)
+    if acc_ty.shape != output_shape:
+        raise TileTypeError(f'Expect acc shape to be {output_shape}, got {acc_ty.shape}')
+
+    # Broadcast scale batch dims to match the broadcasted x/y batch dims
+    # (the trailing two dims of each scale tile are kept as they are).
+    batch = x_shape.value_types[:-2]
+    x_scale_shape = TupleTy(batch + x_scale_ty.shape.value_types[-2:])
+    y_scale_shape = TupleTy(batch + y_scale_ty.shape.value_types[-2:])
+
+    # Each tile keeps its own dtype; only the shape is changed here.
+    x = _promote_and_broadcast_to(x, TileTy(x_ty.dtype, x_shape))
+    y = _promote_and_broadcast_to(y, TileTy(y_ty.dtype, y_shape))
+    x_scale = _promote_and_broadcast_to(x_scale, TileTy(x_scale_ty.dtype, x_scale_shape))
+    y_scale = _promote_and_broadcast_to(y_scale, TileTy(y_scale_ty.dtype, y_scale_shape))
+    return add_operation(TileMmaScaled, acc_ty,
+                         x=x, x_scale=x_scale, y=y, y_scale=y_scale, acc=acc)
+
+
 @dataclass(eq=False)
 class TileReduce(Operation, opcode="tile_reduce"):
     identities: tuple[bool | int | float, ...] = attribute()
diff --git a/src/cuda/tile/_stub.py b/src/cuda/tile/_stub.py
@@ -1005,6 +1005,61 @@ def mma(x, y, /, acc) -> Tile:
     """
 
 
+@function
+def mma_scaled(x, x_scale, y, y_scale, /, acc) -> Tile:
+    """Block-scaled matrix multiply-accumulate.
+
+    Computes a matrix multiply-accumulate where inputs are scaled by block scales
+    along the K dimension before the mma::
+
+        result[i, j] = sum(x[i, k] * x_scale[i, k // B] * y[k, j] * y_scale[k // B, j]
+                           for k in range(K)) + acc[i, j]
+
+    The scaling block size is ``B = K // K_s``, where ``K_s`` is the K dimension of the scale tile.
+    ``K`` must be divisible by ``K_s``, and ``B`` must be one of the allowed values listed
+    in the table below.
+
+    Args:
+        x (Tile): LHS input, 2D or 3D ``[..., M, K]``.
+        x_scale (Tile): Scale factors for x, shape ``[..., M, K_s]``.
+            All dimensions except K_s must match x exactly.
+        y (Tile): RHS input, 2D or 3D ``[..., K, N]``.
+        y_scale (Tile): Scale factors for y, shape ``[..., K_s, N]``.
+            All dimensions except K_s must match y exactly.
+        acc (Tile): Accumulator ``[..., M, N]``.
+
+    Supported datatypes and scaling block sizes:
+
+    +----------------------------+------------+---------+--------+
+    | Input (x/y)                | Scale      | Acc/Out | B      |
+    +============================+============+=========+========+
+    | f8e4m3fn, f8e5m2           | f8e8m0fnu  | f32     | 32     |
+    +----------------------------+------------+---------+--------+
+    | f4e2m1fn                   | f8e8m0fnu  | f32     | 16, 32 |
+    +----------------------------+------------+---------+--------+
+    | f4e2m1fn                   | f8e4m3fn   | f32     | 16     |
+    +----------------------------+------------+---------+--------+
+
+    Batch dimensions of x and y are broadcast against each other (same as
+    :func:`mma`). x_scale's batch dimension must match x's batch exactly,
+    and y_scale's batch dimension must match y's batch exactly; both are
+    then broadcast to the output batch shape.
+
+    Returns:
+        Tile: The result tile; it has the same shape and dtype as ``acc``.
+
+    Example:
+
+        >>> # B = K // K_s = 64 // 2 = 32
+        >>> tx = ct.ones((16, 64), ct.float8_e4m3fn)
+        >>> sx = ct.ones((16, 2), ct.float8_e8m0fnu)
+        >>> ty = ct.ones((64, 16), ct.float8_e4m3fn)
+        >>> sy = ct.ones((2, 16), ct.float8_e8m0fnu)
+        >>> acc = ct.zeros((16, 16), ct.float32)
+        >>> tz = ct.mma_scaled(tx, sx, ty, sy, acc)
+    """
+
+
 @function
 def matmul(x, y, /) -> Tile:
     """Performs matrix multiply on the given tiles.
diff --git a/test/test_mma_scaled.py b/test/test_mma_scaled.py