Loosen tf32 matmul sample tolerance for Ampere

haijieg · haijieg · commit c56fa240a9ff · 2026-02-09T08:55:25.000-08:00
Signed-off-by: Jay Gu <jagu@nvidia.com>
diff --git a/samples/MatMul.py b/samples/MatMul.py
@@ -296,7 +296,7 @@ def cutile_matmul(A: torch.Tensor, B: torch.Tensor, persistent: bool = False) ->
 
     if torch.cuda.get_device_capability()[0] <= 8:
         # Ampere tfloat32 numerics is loose
-        atol, rtol = 5e-3, 5e-3
+        atol, rtol = 1e-2, 1e-2
     else:
         atol, rtol = 1e-4, 1e-3
 
diff --git a/samples/templates/MatMul.py b/samples/templates/MatMul.py
@@ -129,7 +129,7 @@ def cutile_matmul(A: torch.Tensor, B: torch.Tensor, persistent: bool = False) ->
 
     if torch.cuda.get_device_capability()[0] <= 8:
         # Ampere tfloat32 numerics is loose
-        atol, rtol = 5e-3, 5e-3
+        atol, rtol = 1e-2, 1e-2
     else:
         atol, rtol = 1e-4, 1e-3