Commit a0d6e9b

switch correctness checks to SNR-based assertion for cuda quant int4_matmul (#19300)
Replace torch.allclose(atol/rtol) with an SNR (signal-to-noise ratio)
assertion across all int4_matmul / int4_matvec / dequant-vs-fused tests.

Why:
- test_prefill_short was flaking on CI (A10G) with max_abs_err=1.0000.
  Root cause: bf16 GEMM with a K=2048 reduction produces output
  magnitudes up to ~200; at that scale the bf16 ULP gap is 0.5-1.0. The
  Triton fused kernel and cuBLAS reduce in different orders (and Triton
  autotune picks different tile configs on different hardware), so 1-ULP
  element-wise differences are unavoidable. atol/rtol false-fails on
  these outliers; SNR averages them out.
- atol/rtol thresholds also depend on size: a value tuned for K=2048 is
  too loose for K=64 and too tight for K=4096. SNR is size-invariant
  (||signal|| and ||noise|| both scale with sqrt(N) and sqrt(K),
  canceling in the ratio).

What:
- Add an _assert_snr(test_case, actual, expected, label) helper that
  asserts 20*log10(||expected|| / ||actual - expected||) >= 50 dB.
- Replace 4 call sites: TestInt4Matmul, TestInt4Matvec (x2),
  TestDequantThenMatmul.
- 50 dB ≈ 0.3% RMS error: well below observed clean noise (80-90 dB)
  and well above any real functional bug (<20 dB SNR for a wrong stride,
  flipped nibble, off-by-one group_idx, or missing mask).

Test plan:
python -m pytest backends/cuda/tests/test_int4_matmul.py -v -> 35/35 passed
1 parent ff25a2f commit a0d6e9b

1 file changed: backends/cuda/tests/test_int4_matmul.py
Lines changed: 39 additions & 25 deletions
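As a worked check of the dB figures quoted in the commit message (an illustration added here, not part of the commit): an SNR of x dB corresponds to an RMS error fraction of 10^(-x/20).

# Illustrative only (not from the commit): convert the quoted dB
# thresholds into RMS error fractions via 10**(-db / 20).
for db in (20.0, 50.0, 80.0):
    print(f"{db:.0f} dB -> RMS error {10 ** (-db / 20):.4%}")
# Output:
# 20 dB -> RMS error 10.0000%   (the <20 dB regime of a real bug)
# 50 dB -> RMS error 0.3162%    (the assertion threshold, ~0.3%)
# 80 dB -> RMS error 0.0100%    (the observed clean noise floor)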
@@ -19,7 +19,6 @@
 import unittest
 
 import torch
-
 from executorch.backends.cuda.triton.kernels.int4_matmul import (
     dequant_w4_to_bf16,
     int4_matmul,
@@ -28,6 +27,41 @@
 
 ATOL = 0.01
 DEVICE = "cuda"
+SNR_THRESHOLD_DB = 50.0
+
+
+def _assert_snr(test_case, actual, expected, label, threshold_db=SNR_THRESHOLD_DB):
+    """Assert signal-to-noise ratio (in dB) of `actual` vs `expected` >= threshold.
+
+    SNR = 20*log10(||expected||_2 / ||actual - expected||_2)
+
+    Why SNR rather than torch.allclose(atol/rtol):
+    * Size-invariant: ||signal|| and ||noise|| both scale with sqrt(N) and
+      with sqrt(K) (CLT + random-walk rounding), so the ratio is independent
+      of tensor size and reduction depth. The same threshold works for
+      K=64 and K=4096, M=1 and M=1024.
+    * Robust to bf16 ULP outliers: with K=2048 and output magnitudes ~200,
+      a single element can differ by ~1.0 just from differing reduction
+      orders (Triton fused vs cuBLAS). atol/rtol false-fails on these;
+      SNR averages them out.
+    * Sensitive to real bugs: wrong stride, flipped nibble, off-by-one
+      group_idx, or a missing mask all collapse SNR to <20 dB. The 50 dB
+      threshold (≈0.3% RMS error) sits comfortably between observed clean
+      noise floor (~80-90 dB) and any genuine functional break.
+    """
+    a = actual.float()
+    b = expected.float()
+    diff = a - b
+    signal = b.norm()
+    noise = diff.norm()
+    snr_db = (20.0 * torch.log10(signal / noise.clamp(min=1e-9))).item()
+    test_case.assertGreater(
+        snr_db,
+        threshold_db,
+        f"{label}: SNR={snr_db:.1f} dB (threshold {threshold_db:.1f} dB), "
+        f"max_abs_err={diff.abs().max().item():.4f}, "
+        f"signal_norm={signal.item():.2f}, noise_norm={noise.item():.4f}",
+    )
 
 
 def _quantize_simple(w_bf16, group_size):
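Aside, added for this writeup (not part of the diff): a minimal CPU-only sketch of the docstring's size-invariance claim. It simulates bf16 input rounding ahead of an fp32 matmul against an fp64 reference; the sizes and names here are illustrative, not taken from the test file.

import torch

def snr_db(actual, expected):
    # Same formula as the _assert_snr helper above.
    noise = (actual.float() - expected.float()).norm()
    return (20.0 * torch.log10(expected.float().norm() / noise.clamp(min=1e-9))).item()

torch.manual_seed(0)
for K in (64, 512, 4096):
    x = torch.randn(8, K)
    w = torch.randn(16, K)
    ref = (x.double() @ w.double().t()).float()              # high-precision reference
    lossy = x.bfloat16().float() @ w.bfloat16().float().t()  # bf16-rounded inputs, fp32 matmul
    print(f"K={K:5d}  max_abs_err={(lossy - ref).abs().max().item():.4f}  "
          f"SNR={snr_db(lossy, ref):.1f} dB")
# max_abs_err grows with K while SNR stays roughly constant across K,
# which is why one 50 dB threshold can fit every test size.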
@@ -118,12 +152,7 @@ def _run_matmul(self, M, N, K, group_size):
 
         self.assertEqual(out.shape, (M, N))
         self.assertEqual(out.dtype, torch.bfloat16)
-        self.assertTrue(
-            torch.allclose(out.float(), ref.float(), atol=ATOL, rtol=0.01),
-            f"int4_matmul M={M} [{N}x{K}] gs={group_size}: "
-            f"max_abs_err={(out.float() - ref.float()).abs().max().item():.4f}, "
-            f"max_rel_err={((out.float() - ref.float()).abs() / ref.float().abs().clamp(min=1e-6)).max().item():.4f}",
-        )
+        _assert_snr(self, out, ref, f"int4_matmul M={M} [{N}x{K}] gs={group_size}")
 
     # --- Decode (M=1) ---
     def test_decode_square(self):
@@ -189,13 +218,7 @@ def _run_matvec(self, N, K, group_size):
 
         self.assertEqual(out.shape, (1, N))
         self.assertEqual(out.dtype, torch.bfloat16)
-        # atol=1.0 for large accumulation across K, rtol=0.01 for relative
-        self.assertTrue(
-            torch.allclose(out.float(), ref.float(), atol=1.0, rtol=0.01),
-            f"int4_matvec [{N}x{K}] gs={group_size}: "
-            f"max_err={(out.float() - ref.float()).abs().max().item():.4f}, "
-            f"max_rel={((out.float()-ref.float()).abs()/(ref.float().abs().clamp(min=0.1))).max().item():.4f}",
-        )
+        _assert_snr(self, out, ref, f"int4_matvec [{N}x{K}] gs={group_size}")
 
     def test_qkv_proj(self):
         self._run_matvec(2048, 2048, 128)
@@ -226,10 +249,7 @@ def test_matches_int4_matmul(self):
         out_mv = int4_matvec(x, packed, scale, gs)
         out_mm = int4_matmul(x, packed, scale, gs)
 
-        self.assertTrue(
-            torch.allclose(out_mv.float(), out_mm.float(), atol=1.0, rtol=0.01),
-            f"matvec vs matmul: max_err={(out_mv.float() - out_mm.float()).abs().max().item():.4f}",
-        )
+        _assert_snr(self, out_mv, out_mm, "matvec vs matmul")
 
 
 class TestDequantThenMatmul(unittest.TestCase):
@@ -248,13 +268,7 @@ def _run(self, M, N, K, group_size):
         w_bf16 = dequant_w4_to_bf16(packed, scale, group_size)
         out_dequant = torch.nn.functional.linear(x, w_bf16)
 
-        self.assertTrue(
-            torch.allclose(
-                out_fused.float(), out_dequant.float(), atol=ATOL, rtol=0.01
-            ),
-            f"fused vs dequant M={M} [{N}x{K}]: "
-            f"max_abs_err={(out_fused.float() - out_dequant.float()).abs().max().item():.4f}",
-        )
+        _assert_snr(self, out_fused, out_dequant, f"fused vs dequant M={M} [{N}x{K}]")
 
     def test_decode(self):
         self._run(1, 2048, 2048, 128)
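A second aside (again added here, not part of the diff): the helper's docstring claims any real functional bug collapses SNR below 20 dB. Rather than reproduce a flipped nibble against the real int4 packing, this hedged sketch simulates an off-by-one group index by dropping one 128-column group's contribution; with K=2048 that yields roughly 10*log10(2048/128) ≈ 12 dB.

import torch

def snr_db(actual, expected):
    noise = (actual.float() - expected.float()).norm()
    return (20.0 * torch.log10(expected.float().norm() / noise.clamp(min=1e-9))).item()

torch.manual_seed(0)
x = torch.randn(4, 2048)
w = torch.randn(1024, 2048)
ref = x @ w.t()

# Simulated bug: one 128-wide group never contributes, standing in for an
# off-by-one group_idx or a missing mask (the real nibble packing is not
# reproduced here).
w_bug = w.clone()
w_bug[:, :128] = 0.0
print(f"simulated bug:   SNR={snr_db(x @ w_bug.t(), ref):.1f} dB")  # ~12 dB, fails 50 dB

# Rounding-scale perturbation for contrast: 0.1% RMS noise.
noisy = ref + 1e-3 * ref.norm() / ref.numel() ** 0.5 * torch.randn_like(ref)
print(f"rounding-scale:  SNR={snr_db(noisy, ref):.1f} dB")          # ~60 dB, passes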
