[Benchmark] Improve benchmark test reliability using cudagraph

haijieg · haijieg · commit abcdc7c6b748 · 2026-05-28T11:32:18.000-07:00
Using CUDA graphs eliminates host CPU overhead and jitter from the benchmark measurements.

Signed-off-by: Jay Gu <jagu@nvidia.com>
diff --git a/test/bench_attention.py b/test/bench_attention.py
@@ -67,11 +67,13 @@ def bench_fmha(qkv_shape, dtype, backend, benchmark):
 
     warmup_rounds, iterations, rounds = estimate_bench_iter(
         backend, (q, k, v, o, is_causal, enable_gqa),
+        cudagraph=True
     )
 
     benchmark.pedantic(
         backend, (q, k, v, o, is_causal, enable_gqa),
         rounds=rounds, warmup_rounds=warmup_rounds, iterations=iterations,
+        cudagraph=True
     )
 
     B, H, L, D = q.shape
diff --git a/test/bench_fft.py b/test/bench_fft.py
@@ -63,10 +63,11 @@ def bench_fft(shape, dtype, fft_backend, benchmark):
     y_ref = torch_fft(*args)
     l2error = (y_ref - y_test).norm() / (y_ref).norm()
     assert l2error < tolerance_map[dtype]
-    warmup_rounds, iterations, rounds = estimate_bench_iter(fft_backend, args)
+    warmup_rounds, iterations, rounds = estimate_bench_iter(fft_backend, args, cudagraph=True)
     benchmark.pedantic(
         fft_backend, args,
         rounds=rounds, warmup_rounds=warmup_rounds, iterations=iterations,
+        cudagraph=True
     )
 
     flop_count = 0  # TODO
diff --git a/test/bench_layer_norm.py b/test/bench_layer_norm.py
@@ -49,31 +49,37 @@ def bench_layer_norm(shape, dtype, mode, backend, benchmark):
         torch.bfloat16: (1e-2, 1e-2),
     }[dtype]
 
-    y = backend(x, weight, bias, eps)
-    y_ref = torch_layer_norm(x, weight, bias, eps)
-    if mode == "forward":
-        torch.testing.assert_close(y, y_ref, atol=atol, rtol=rtol)
-        bench_f, bench_args = backend, (x, weight, bias, eps)
-    else:
-        y.backward(dy, retain_graph=True)
-        dx, dw, db = [_.grad.clone() for _ in [x, weight, bias]]
-        x.grad, weight.grad, bias.grad = None, None, None
-
-        y_ref.backward(dy, retain_graph=True)
-        dx_ref, dw_ref, db_ref = [_.grad.clone() for _ in [x, weight, bias]]
-
-        torch.testing.assert_close(dx, dx_ref, atol=atol, rtol=rtol)
-        torch.testing.assert_close(dw, dw_ref, atol=atol, rtol=rtol)
-        torch.testing.assert_close(db, db_ref, atol=atol, rtol=rtol)
-
-        bench_f, bench_args = partial(y.backward, retain_graph=True), (dy,)
-
-    warmup_rounds, iterations, rounds = estimate_bench_iter(bench_f, bench_args)
-
-    benchmark.pedantic(
-        bench_f, bench_args,
-        rounds=rounds, warmup_rounds=warmup_rounds, iterations=iterations,
-    )
+    # Run in non default stream so backward graph can be captured without
+    # sync with default stream
+    s = torch.cuda.Stream()
+    s.wait_stream(torch.cuda.current_stream())
+    with torch.cuda.stream(s):
+        y = backend(x, weight, bias, eps)
+        y_ref = torch_layer_norm(x, weight, bias, eps)
+        if mode == "forward":
+            torch.testing.assert_close(y, y_ref, atol=atol, rtol=rtol)
+            bench_f, bench_args = backend, (x, weight, bias, eps)
+        else:
+            y.backward(dy, retain_graph=True)
+            dx, dw, db = [_.grad.clone() for _ in [x, weight, bias]]
+            x.grad, weight.grad, bias.grad = None, None, None
+
+            y_ref.backward(dy, retain_graph=True)
+            dx_ref, dw_ref, db_ref = [_.grad.clone() for _ in [x, weight, bias]]
+
+            torch.testing.assert_close(dx, dx_ref, atol=atol, rtol=rtol)
+            torch.testing.assert_close(dw, dw_ref, atol=atol, rtol=rtol)
+            torch.testing.assert_close(db, db_ref, atol=atol, rtol=rtol)
+
+            bench_f, bench_args = partial(y.backward, retain_graph=True), (dy,)
+
+        warmup_rounds, iterations, rounds = estimate_bench_iter(bench_f, bench_args, cudagraph=True)
+
+        benchmark.pedantic(
+            bench_f, bench_args,
+            rounds=rounds, warmup_rounds=warmup_rounds, iterations=iterations,
+            cudagraph=True
+        )
 
 
 class CuTileLayerNorm(torch.autograd.Function):
diff --git a/test/bench_matmul.py b/test/bench_matmul.py
@@ -21,7 +21,8 @@ def dtype(request):
     return request.param
 
 
-def _run_matmul_benchmark(shape, dtype, backend, benchmark, extra_args=(), atol=1e-3, rtol=1e-3):
+def _run_matmul_benchmark(shape, dtype, backend, benchmark,
+                          extra_args=(), atol=1e-3, rtol=1e-3):
     m, n, k = shape
     A = torch.rand((m, k), dtype=dtype, device="cuda")
     B = torch.rand((k, n), dtype=dtype, device="cuda")
@@ -34,10 +35,11 @@ def _run_matmul_benchmark(shape, dtype, backend, benchmark, extra_args=(), atol=
         torch.testing.assert_close(C, A @ B, atol=atol, rtol=rtol)
 
     torch.cuda.synchronize()
-    warmup_rounds, iterations, rounds = estimate_bench_iter(backend, args)
+    warmup_rounds, iterations, rounds = estimate_bench_iter(backend, args, cudagraph=True)
     benchmark.pedantic(
         backend, args,
         rounds=rounds, warmup_rounds=warmup_rounds, iterations=iterations,
+        cudagraph=True
     )
 
     flop_count = 2 * m * n * k
@@ -63,12 +65,12 @@ def _run_batch_matmul_benchmark(
             torch.testing.assert_close(C, ref, atol=atol, rtol=rtol)
 
     torch.cuda.synchronize()
-    warmup_rounds, iterations, rounds = estimate_bench_iter(backend, args)
+    warmup_rounds, iterations, rounds = estimate_bench_iter(backend, args, cudagraph=True)
     benchmark.pedantic(
         backend, args,
         rounds=rounds, warmup_rounds=warmup_rounds, iterations=iterations,
+        cudagraph=True,
     )
-
     flop_count = 2 * b * m * n * k
     bytes_rw = sum([t.numel() * t.dtype.itemsize for t in (A, B, C)])
     benchmark.extra_info['flop_count'] = flop_count
@@ -122,7 +124,8 @@ def bench_matmul_split_k(split_k_shape, dtype, backend, benchmark):
                         dtype=torch.int32, device="cuda")
     COUNTS = torch.zeros_like(LOCKS)
     extra_args = (LOCKS, COUNTS, tile_sizes)
-    _run_matmul_benchmark(split_k_shape, dtype, backend, benchmark, extra_args, rtol=2e-3)
+    _run_matmul_benchmark(split_k_shape, dtype, backend, benchmark,
+                          extra_args, rtol=2e-3)
 
 
 def cutile_matmul_split_k(A, B, C, LOCKS, COUNTS, tile_sizes):
@@ -172,8 +175,8 @@ def cutile_batch_matmul(bs, A, B, C):
 def torch_batch_matmul(bs, A, B, C):
     if A.dtype == torch.float8_e5m2:
         pytest.skip("float8_e5m2 matmul on torch is not supported")
-    inv_sa = torch.tensor(1.0, device=A.device, dtype=torch.float32)
-    inv_sb = torch.tensor(1.0, device=B.device, dtype=torch.float32)
+    inv_sa = torch.full((), 1.0, device=A.device, dtype=torch.float32)
+    inv_sb = torch.full((), 1.0, device=B.device, dtype=torch.float32)
     with torch_use_tf32_matmul():
         for i in range(bs):
             # Only multiplication of row-major and column-major matrices is supported by cuBLASLt
diff --git a/test/bench_rms_norm.py b/test/bench_rms_norm.py
@@ -72,11 +72,13 @@ def bench_rms_norm(shape, dtype, algo, backend, benchmark):
 
     warmup_rounds, iterations, rounds = estimate_bench_iter(
         backend, (x, weight, eps, static_persistent, gather),
+        cudagraph=True
     )
 
     benchmark.pedantic(
         backend, (x, weight, eps, static_persistent, gather),
         rounds=rounds, warmup_rounds=warmup_rounds, iterations=iterations,
+        cudagraph=True
     )
 
     M, N = x.shape
diff --git a/test/bench_transpose.py b/test/bench_transpose.py
@@ -27,10 +27,11 @@ def _run_transpose_benchmark(shape, dtype, backend, benchmark, atol=1e-3, rtol=1
     backend(A, B)
     torch.testing.assert_close(B, A.T, atol=atol, rtol=rtol)
     torch.cuda.synchronize()
-    warmup_rounds, iterations, rounds = estimate_bench_iter(backend, (A, B))
+    warmup_rounds, iterations, rounds = estimate_bench_iter(backend, (A, B), cudagraph=True)
     benchmark.pedantic(
         backend, (A, B),
         rounds=rounds, warmup_rounds=warmup_rounds, iterations=iterations,
+        cudagraph=True,
     )
 
     flop_count = m * n
diff --git a/test/bench_vec_add.py b/test/bench_vec_add.py
@@ -53,11 +53,13 @@ def bench_vec_add(shape, dtype, backend, use_gather, benchmark):
     torch.testing.assert_close(c, ref, atol=1e-3, rtol=1e-3)
     torch.cuda.synchronize()
 
-    warmup_rounds, iterations, rounds = estimate_bench_iter(backend, (a, b, use_gather))
+    warmup_rounds, iterations, rounds = estimate_bench_iter(backend, (a, b, use_gather),
+                                                            cudagraph=True)
 
     benchmark.pedantic(
         backend, (a, b, use_gather),
         rounds=rounds, warmup_rounds=warmup_rounds, iterations=iterations,
+        cudagraph=True
     )
 
     flop_count = 0
diff --git a/test/conftest.py b/test/conftest.py
@@ -4,20 +4,20 @@
 
 import torch
 import pytest
-import cuda_timer
 import subprocess
 import sys
 import math
 import tempfile
-from functools import cache
+from functools import cache, partial
 
 from cuda.tile._bytecode.version import BytecodeVersion
 from cuda.tile._compile import (
         _get_max_supported_bytecode_version,
         _SUPPORTED_VERSIONS,
         _find_compiler_bin)
 from cuda.tile._cext import dev_features_enabled
-from util import require_blackwell_or_newer, require_hopper_or_newer
+from util import (require_blackwell_or_newer, require_hopper_or_newer,
+                  benchmark_cudagraph_runner, benchmark_eager_runner)
 
 
 def pytest_addoption(parser):
@@ -160,11 +160,27 @@ def uint_dtype(request):
     return request.param
 
 
+def patch_benchmark_fixture(benchmark):
+    """Patch BenchmarkFixture to use custom runner: eager or cudagraph.
+    Extends the `pedantic` method to take additional `cudagraph` argument.
+    """
+
+    benchmark._make_runner = benchmark_eager_runner
+
+    def pedantic(original, *args, **kwargs):
+        if 'cudagraph' in kwargs:
+            cudagraph = kwargs.pop('cudagraph')
+            if cudagraph:
+                benchmark._make_runner = benchmark_cudagraph_runner
+        return original(*args, **kwargs)
+
+    benchmark.pedantic = partial(pedantic, benchmark.pedantic)
+
+
 # ----- For pytest benchmark
 @pytest.fixture
 def benchmark(benchmark):
-    # Patch benchmark fixture to use cuda timer
-    benchmark._timer = cuda_timer.time
+    patch_benchmark_fixture(benchmark)
     return benchmark
 
 
diff --git a/test/util.py b/test/util.py
@@ -148,23 +148,69 @@ def raises_autocast_error(launch, from_ty, to_ty) -> bool:
         return False
 
 
-def estimate_bench_iter(f, tuple_of_args):
+def benchmark_cudagraph_runner(f, args, kwargs):
+    # For patching BenchmarkFixture._make_runner
+    def runner(loops_range, **unused) -> float:
+        # run the regular function a few times to ensure kernel and memory states are stable
+        # before graph capture
+        for _ in range(3):
+            f(*args, **kwargs)
+
+        # cuda graph capture must happen on non-default stream
+        if torch.cuda.current_stream() == torch.cuda.default_stream():
+            stream = torch.cuda.Stream()
+            stream.wait_stream(torch.cuda.current_stream())
+        else:
+            stream = torch.cuda.current_stream()
+
+        with torch.cuda.stream(stream):
+            g = torch.cuda.CUDAGraph()
+            ev_start = torch.cuda.Event(enable_timing=True, external=True)
+            ev_end = torch.cuda.Event(enable_timing=True, external=True)
+            l2_size = torch.cuda.get_device_properties(0).L2_cache_size
+            cache_flush_tensor = torch.empty(l2_size, dtype=torch.uint8, device="cuda")
+
+            with torch.cuda.graph(g):
+                cache_flush_tensor.zero_()
+                ev_start.record()
+                f(*args, **kwargs)
+                ev_end.record()
+
+            torch.cuda.synchronize()
+            assert loops_range is not None
+            ret = 0
+            for _ in loops_range:
+                g.replay()
+                ev_end.synchronize()
+                ret += ev_start.elapsed_time(ev_end)
+            return ret / 1000  # secs
+    return runner
+
+
+def benchmark_eager_runner(f, args, kwargs):
+    def runner(loops_range, **unused) -> float:
+        assert loops_range is not None
+        torch.cuda.synchronize()
+        ev_start = torch.cuda.Event(enable_timing=True)
+        ev_end = torch.cuda.Event(enable_timing=True)
+        ev_start.record()
+        for _ in loops_range:
+            f(*args, **kwargs)
+        ev_end.record()
+        ev_end.synchronize()
+        return ev_start.elapsed_time(ev_end) / 1000
+    return runner
+
+
+def estimate_bench_iter(f, tuple_of_args, cudagraph=False):
     warmup_iter_guess = 5
     min_round_time_ms = 100
     rounds = 5
     warmup_rounds = 1
-
-    start = torch.cuda.Event(enable_timing=True)
-    end = torch.cuda.Event(enable_timing=True)
-    start.record()
-    for _ in range(warmup_iter_guess):
-        f(*tuple_of_args)
-    end.record()
-    torch.cuda.synchronize()
-    elapsed = start.elapsed_time(end) / warmup_iter_guess
-
-    main_iter = ceil(min_round_time_ms / elapsed)
-
+    runner = (benchmark_cudagraph_runner(f, tuple_of_args, {}) if cudagraph else
+              benchmark_eager_runner(f, tuple_of_args, {}))
+    time_per_iter = runner(range(warmup_iter_guess)) / warmup_iter_guess
+    main_iter = max(min(ceil(min_round_time_ms / (time_per_iter * 1000)), 200), warmup_iter_guess)
     return warmup_rounds, main_iter, rounds
 
 

Original file line number	Diff line number	Diff line change
`@@ -67,11 +67,13 @@ def bench_fmha(qkv_shape, dtype, backend, benchmark):`
`67`	`67`
`68`	`68`	`warmup_rounds, iterations, rounds = estimate_bench_iter(`
`69`	`69`	`backend, (q, k, v, o, is_causal, enable_gqa),`
	`70`	`+ cudagraph=True`
`70`	`71`	`)`
`71`	`72`
`72`	`73`	`benchmark.pedantic(`
`73`	`74`	`backend, (q, k, v, o, is_causal, enable_gqa),`
`74`	`75`	`rounds=rounds, warmup_rounds=warmup_rounds, iterations=iterations,`
	`76`	`+ cudagraph=True`
`75`	`77`	`)`
`76`	`78`
`77`	`79`	`B, H, L, D = q.shape`
Original file line number	Diff line number	Diff line change
`@@ -72,11 +72,13 @@ def bench_rms_norm(shape, dtype, algo, backend, benchmark):`
`72`	`72`
`73`	`73`	`warmup_rounds, iterations, rounds = estimate_bench_iter(`
`74`	`74`	`backend, (x, weight, eps, static_persistent, gather),`
	`75`	`+ cudagraph=True`
`75`	`76`	`)`
`76`	`77`
`77`	`78`	`benchmark.pedantic(`
`78`	`79`	`backend, (x, weight, eps, static_persistent, gather),`
`79`	`80`	`rounds=rounds, warmup_rounds=warmup_rounds, iterations=iterations,`
	`81`	`+ cudagraph=True`
`80`	`82`	`)`
`81`	`83`
`82`	`84`	`M, N = x.shape`