Update encodings and supported bytecode version to 13.3

xiaoqiqi177 · xiaoqiqi177 · commit 84daa7da1ace · 2026-04-22T15:41:56.000-07:00
Signed-off-by: Qiqi Xiao <qiqix@nvidia.com>
diff --git a/samples/BatchMatMul.py b/samples/BatchMatMul.py
@@ -80,7 +80,7 @@ def bmm(a: torch.Tensor, b: torch.Tensor, out_dtype: torch.dtype) -> torch.Tenso
     output = torch.empty((Batch, M, N), device=a.device, dtype=out_dtype)
 
     # --- Determine Tile Shapes for Optimization (Fixed for float16 as per previous request) ---
-    tm_val, tn_val, tk_val = 128, 256, 64  # Larger tiles for Tensor Core benefits
+    tm_val, tn_val, tk_val = 128, 256, 128  # Larger tiles for Tensor Core benefits
 
     # --- Grid calculation for standard 3D tiled kernel ---
     grid = (Batch, ceil(M / tm_val), ceil(N / tn_val))
@@ -103,8 +103,7 @@ def torch_batch_matmul_fp8(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
         A_row = A[i].contiguous()
         B_col = B[i].transpose(-2, -1).contiguous().transpose(-2, -1)
         C[i] = torch._scaled_mm(
-            A_row, B_col, scale_a=inv_sa, scale_b=inv_sb, out_dtype=torch.float32,
-            use_fast_accum=True
+            A_row, B_col, scale_a=inv_sa, scale_b=inv_sb, out_dtype=torch.float32
         )
     return C
 
diff --git a/samples/templates/BatchMatMul.py b/samples/templates/BatchMatMul.py
@@ -41,7 +41,7 @@ def bmm(a: torch.Tensor, b: torch.Tensor, out_dtype: torch.dtype) -> torch.Tenso
     output = torch.empty((Batch, M, N), device=a.device, dtype=out_dtype)
 
     # --- Determine Tile Shapes for Optimization (Fixed for float16 as per previous request) ---
-    tm_val, tn_val, tk_val = 128, 256, 64  # Larger tiles for Tensor Core benefits
+    tm_val, tn_val, tk_val = 128, 256, 128  # Larger tiles for Tensor Core benefits
 
     # --- Grid calculation for standard 3D tiled kernel ---
     grid = (Batch, ceil(M / tm_val), ceil(N / tn_val))
@@ -64,8 +64,7 @@ def torch_batch_matmul_fp8(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
         A_row = A[i].contiguous()
         B_col = B[i].transpose(-2, -1).contiguous().transpose(-2, -1)
         C[i] = torch._scaled_mm(
-            A_row, B_col, scale_a=inv_sa, scale_b=inv_sb, out_dtype=torch.float32,
-            use_fast_accum=True
+            A_row, B_col, scale_a=inv_sa, scale_b=inv_sb, out_dtype=torch.float32
         )
     return C
 
diff --git a/src/cuda/tile/_bytecode/encodings.py b/src/cuda/tile/_bytecode/encodings.py
@@ -63,6 +63,18 @@ class MemoryScope(enum.Enum):
     SYS = b"\x02"
 
 
+class ProgramIDDim(enum.Enum):
+    X = b"\x00"
+    Y = b"\x01"
+    Z = b"\x02"
+
+
+class PtrAttr(enum.Enum):
+    NONE = b"\x00"
+    UNICAST = b"\x01"
+    MULTICAST = b"\x02"
+
+
 class RoundingMode(enum.Enum):
     NEAREST_EVEN = b"\x00"
     ZERO = b"\x01"
@@ -313,8 +325,8 @@ def encode_AtomicRedViewTkoOp(  # since 13.3
     code_builder: CodeBuilder,
     result_token_type: TypeId,  # since 13.3
     view: Value,  # since 13.3
+    index: Sequence[Value],  # since 13.3
     value: Value,  # since 13.3
-    mask: Optional[Value],  # since 13.3
     token: Optional[Value],  # since 13.3
     memory_ordering_semantics: MemoryOrderingSemantics,  # since 13.3
     memory_scope: MemoryScope,  # since 13.3
@@ -323,19 +335,18 @@ def encode_AtomicRedViewTkoOp(  # since 13.3
     _buf = code_builder.buf
     # Opcode
     encode_varint(117, _buf)
-    # Result types
-    encode_typeid(result_token_type, _buf)
+    # Variadic result types
+    encode_sized_typeid_seq((result_token_type,), _buf)
     # Flags
-    encode_varint((mask is not None)
-                  | ((token is not None) << 1), _buf)
+    encode_varint((token is not None), _buf)
     # Attributes
     code_builder.encode_opattr_enum(MemoryOrderingSemantics, memory_ordering_semantics)
     code_builder.encode_opattr_enum(MemoryScope, memory_scope)
     code_builder.encode_opattr_enum(AtomicRMWMode, mode)
     # Operands
     encode_operand(view, _buf)
+    encode_sized_variadic_operands(index, _buf)
     encode_operand(value, _buf)
-    encode_optional_operand(mask, _buf)
     encode_optional_operand(token, _buf)
     return code_builder.new_op()
 
@@ -1242,12 +1253,18 @@ def encode_MmaFOp(
     lhs: Value,
     rhs: Value,
     acc: Value,
+    fast_acc: bool,  # since 13.3
 ) -> Value:
     _buf = code_builder.buf
     # Opcode
     encode_varint(73, _buf)
     # Result types
     encode_typeid(result_type, _buf)
+    # Flags
+    _flag_bits = bool(fast_acc)
+    assert _flag_bits < 1 or code_builder.version >= BytecodeVersion.V_13_3
+    if code_builder.version >= BytecodeVersion.V_13_3:
+        encode_varint(_flag_bits, _buf)
     # Operands
     encode_operand(lhs, _buf)
     encode_operand(rhs, _buf)
@@ -2024,6 +2041,8 @@ def encode_YieldOp(
     'IntegerOverflow',
     'MemoryOrderingSemantics',
     'MemoryScope',
+    'ProgramIDDim',
+    'PtrAttr',
     'RoundingMode',
     'Signedness',
     'SymbolVisibility',
diff --git a/src/cuda/tile/_compile.py b/src/cuda/tile/_compile.py
@@ -571,6 +571,7 @@ def _find_compiler_bin() -> _CompilerBinary:
 _SUPPORTED_VERSIONS = [
     BytecodeVersion.V_13_1,
     BytecodeVersion.V_13_2,
+    BytecodeVersion.V_13_3,
 ]
 
 
diff --git a/src/cuda/tile/_ir/ops.py b/src/cuda/tile/_ir/ops.py
@@ -3217,8 +3217,9 @@ def generate_bytecode(self, ctx: BytecodeContext) -> bc.Value:
             return bc.encode_MmaIOp(ctx.builder, res_typeid, x_value, y_value,
                                     acc_value, signedness_lhs, signedness_rhs)
         else:
+            # TODO: consider exposing fast_acc to callers
             return bc.encode_MmaFOp(ctx.builder, res_typeid, x_value, y_value,
-                                    acc_value)
+                                    acc_value, fast_acc=False)
 
 
 @impl(ct.mma)
diff --git a/test/bench_matmul.py b/test/bench_matmul.py
@@ -181,8 +181,7 @@ def torch_batch_matmul(bs, A, B, C):
             A_row = A[i].contiguous()
             B_col = B[i].transpose(-2, -1).contiguous().transpose(-2, -1)
             C[i] = torch._scaled_mm(
-                A_row, B_col, scale_a=inv_sa, scale_b=inv_sb, out_dtype=torch.float32,
-                use_fast_accum=True
+                A_row, B_col, scale_a=inv_sa, scale_b=inv_sb, out_dtype=torch.float32
             )
 
 
diff --git a/test/test_mma.py b/test/test_mma.py
@@ -125,7 +125,7 @@ def test_mma_fp8(tile_size, case):
     C = torch.ones((m, n), dtype=case.acc_dtype, device="cuda")
     scale = torch.tensor([1.0], dtype=torch.float32, device="cuda")
     try:
-        ref = torch._scaled_mm(A, B.T, scale, scale, out_dtype=C.dtype, use_fast_accum=True) + C
+        ref = torch._scaled_mm(A, B.T, scale, scale, out_dtype=C.dtype) + C
     except (RuntimeError, ValueError) as e:
         assert 'Multiplication of two Float8_e5m2 matrices is not supported' in str(e)
         ref = None
@@ -280,7 +280,7 @@ def test_matmul_fp8(tile_size, dtype):
     scale = torch.tensor([1.0], dtype=torch.float32, device="cuda")
     try:
         ref = torch._scaled_mm(A, B.T, scale, scale,
-                               out_dtype=torch.float16, use_fast_accum=True)
+                               out_dtype=torch.float16)
     except (RuntimeError, ValueError) as e:
         assert 'Multiplication of two Float8_e5m2 matrices is not supported' in str(e)
         ref = None

Original file line number	Diff line number	Diff line change
`@@ -571,6 +571,7 @@ def _find_compiler_bin() -> _CompilerBinary:`
`571`	`571`	`_SUPPORTED_VERSIONS = [`
`572`	`572`	`BytecodeVersion.V_13_1,`
`573`	`573`	`BytecodeVersion.V_13_2,`
	`574`	`+ BytecodeVersion.V_13_3,`
`574`	`575`	`]`
`575`	`576`
`576`	`577`
Original file line number	Diff line number	Diff line change
`@@ -181,8 +181,7 @@ def torch_batch_matmul(bs, A, B, C):`
`181`	`181`	`A_row = A[i].contiguous()`
`182`	`182`	`B_col = B[i].transpose(-2, -1).contiguous().transpose(-2, -1)`
`183`	`183`	`C[i] = torch._scaled_mm(`
`184`		`- A_row, B_col, scale_a=inv_sa, scale_b=inv_sb, out_dtype=torch.float32,`
`185`		`- use_fast_accum=True`
	`184`	`+ A_row, B_col, scale_a=inv_sa, scale_b=inv_sb, out_dtype=torch.float32`
`186`	`185`	`)`
`187`	`186`
`188`	`187`