Skip to content

Commit 286fb81

Browse files
committed
Fix fp8 matmul reference for sm90
Currently we need to pass `use_fast_accum=True` to `torch._scaled_mm` so that it matches the behavior of `ct.mma` and `ct.scaled_mma` with fp8 on sm90. Signed-off-by: Jay Gu <jagu@nvidia.com>
1 parent 7fb3407 commit 286fb81

File tree

4 files changed

+9
-5
lines changed

4 files changed

+9
-5
lines changed

samples/BatchMatMul.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,8 @@ def torch_batch_matmul_fp8(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
103103
A_row = A[i].contiguous()
104104
B_col = B[i].transpose(-2, -1).contiguous().transpose(-2, -1)
105105
C[i] = torch._scaled_mm(
106-
A_row, B_col, scale_a=inv_sa, scale_b=inv_sb, out_dtype=torch.float32
106+
A_row, B_col, scale_a=inv_sa, scale_b=inv_sb, out_dtype=torch.float32,
107+
use_fast_accum=True
107108
)
108109
return C
109110

samples/templates/BatchMatMul.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,8 @@ def torch_batch_matmul_fp8(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
6464
A_row = A[i].contiguous()
6565
B_col = B[i].transpose(-2, -1).contiguous().transpose(-2, -1)
6666
C[i] = torch._scaled_mm(
67-
A_row, B_col, scale_a=inv_sa, scale_b=inv_sb, out_dtype=torch.float32
67+
A_row, B_col, scale_a=inv_sa, scale_b=inv_sb, out_dtype=torch.float32,
68+
use_fast_accum=True
6869
)
6970
return C
7071

test/bench_matmul.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,8 @@ def torch_batch_matmul(bs, A, B, C):
181181
A_row = A[i].contiguous()
182182
B_col = B[i].transpose(-2, -1).contiguous().transpose(-2, -1)
183183
C[i] = torch._scaled_mm(
184-
A_row, B_col, scale_a=inv_sa, scale_b=inv_sb, out_dtype=torch.float32
184+
A_row, B_col, scale_a=inv_sa, scale_b=inv_sb, out_dtype=torch.float32,
185+
use_fast_accum=True
185186
)
186187

187188

test/test_mma.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ def test_mma_fp8(tile_size, case):
125125
C = torch.ones((m, n), dtype=case.acc_dtype, device="cuda")
126126
scale = torch.tensor([1.0], dtype=torch.float32, device="cuda")
127127
try:
128-
ref = torch._scaled_mm(A, B.T, scale, scale, out_dtype=C.dtype) + C
128+
ref = torch._scaled_mm(A, B.T, scale, scale, out_dtype=C.dtype, use_fast_accum=True) + C
129129
except (RuntimeError, ValueError) as e:
130130
assert 'Multiplication of two Float8_e5m2 matrices is not supported' in str(e)
131131
ref = None
@@ -279,7 +279,8 @@ def test_matmul_fp8(tile_size, dtype):
279279
C = torch.zeros((m, n), dtype=dtype, device="cuda")
280280
scale = torch.tensor([1.0], dtype=torch.float32, device="cuda")
281281
try:
282-
ref = torch._scaled_mm(A, B.T, scale, scale, out_dtype=torch.float16).to(dtype)
282+
ref = torch._scaled_mm(A, B.T, scale, scale,
283+
out_dtype=torch.float16, use_fast_accum=True).to(dtype)
283284
except (RuntimeError, ValueError) as e:
284285
assert 'Multiplication of two Float8_e5m2 matrices is not supported' in str(e)
285286
ref = None

0 commit comments

Comments (0)