Skip to content

Commit ac597b7

Browse files
committed
Fix tensormap, add full overhead column to compareq
1 parent 6cb1265 commit ac597b7

4 files changed

Lines changed: 131 additions & 89 deletions

File tree

benchmarks/cuda_bindings/README.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,13 @@ Driver APIs through cuda.bindings, relative to a similar C++ baseline.
66
The goal is to benchmark how much overhead the Python layer adds to calling
77
CUDA APIs and what operations are not in our target of less than 1us of overhead.
88

9-
Each Python benchmark has a C++ counterpart, which is used to compare the
10-
operations. We try to make each implementation perform small operations
11-
and nearly the same work as possible and are run under similar conditions.
9+
Most Python benchmarks have a C++ counterpart that is used as a comparative
10+
baseline. We try to make each implementation perform small operations and
11+
do nearly the same work, and both are run under similar conditions.
12+
13+
A few benchmarks (e.g. in `bench_enum.py`) are intentionally Python-only
14+
because they measure costs with no direct C++ equivalent — such as enum
15+
construction and member access on `cuda.bindings` enum classes.
1216

1317
These are **not** throughput benchmarks to measure the overall performance
1418
of kernels and applications.

benchmarks/cuda_bindings/benchmarks/bench_tensormap.py

Lines changed: 58 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -44,78 +44,78 @@
4444

4545
_SUCCESS = cuda.CUresult.CUDA_SUCCESS
4646

47+
# Resolve bindings once at module load. A missing attribute (old binding that
48+
# predates a TMA API) is the only legitimate reason for a probe to skip —
49+
# everything else (signature mismatches, unexpected TypeError, etc.) should
50+
# surface loudly instead of being reclassified as "unsupported".
51+
_ENCODE_TILED = getattr(cuda, "cuTensorMapEncodeTiled", None)
52+
_ENCODE_IM2COL = getattr(cuda, "cuTensorMapEncodeIm2col", None)
53+
_ENCODE_IM2COL_WIDE = getattr(cuda, "cuTensorMapEncodeIm2colWide", None)
54+
_IM2COL_WIDE_MODE_CLS = getattr(cuda, "CUtensorMapIm2ColWideMode", None)
55+
4756

4857
def _probe_tiled() -> bool:
    """Return True when a tiled tensor-map encode call succeeds.

    Returns False when the binding predates ``cuTensorMapEncodeTiled``
    (resolved once at module load into ``_ENCODE_TILED``).
    """
    if _ENCODE_TILED is None:
        return False
    encode_args = (
        TILED_DTYPE,
        TILED_RANK,
        PTR,
        TILED_GLOBAL_DIM,
        TILED_GLOBAL_STRIDES,
        TILED_BOX_DIM,
        TILED_ELEMENT_STRIDES,
        TILED_INTERLEAVE,
        TILED_SWIZZLE,
        TILED_L2,
        TILED_OOB,
    )
    status, _ = _ENCODE_TILED(*encode_args)
    return status == _SUCCESS
6674

6775

6876
def _probe_im2col() -> bool:
    """Return True when an im2col tensor-map encode call succeeds.

    Returns False when the binding predates ``cuTensorMapEncodeIm2col``
    (resolved once at module load into ``_ENCODE_IM2COL``).
    """
    if _ENCODE_IM2COL is None:
        return False
    encode_args = (
        IM2COL_DTYPE,
        IM2COL_RANK,
        PTR,
        IM2COL_GLOBAL_DIM,
        IM2COL_GLOBAL_STRIDES,
        IM2COL_PIXEL_BOX_LOWER,
        IM2COL_PIXEL_BOX_UPPER,
        IM2COL_CHANNELS,
        IM2COL_PIXELS,
        IM2COL_ELEMENT_STRIDES,
        IM2COL_INTERLEAVE,
        IM2COL_SWIZZLE,
        IM2COL_L2,
        IM2COL_OOB,
    )
    status, _ = _ENCODE_IM2COL(*encode_args)
    return status == _SUCCESS
8996

9097

91-
_ENCODE_IM2COL_WIDE = getattr(cuda, "cuTensorMapEncodeIm2colWide", None)
92-
_IM2COL_WIDE_MODE_CLS = getattr(cuda, "CUtensorMapIm2ColWideMode", None)
93-
94-
9598
def _probe_im2col_wide() -> bool:
    """Return True when a wide-mode im2col tensor-map encode call succeeds.

    Requires both the ``cuTensorMapEncodeIm2colWide`` entry point and the
    ``CUtensorMapIm2ColWideMode`` enum (both resolved at module load);
    either one missing means the feature is unavailable.
    """
    if _ENCODE_IM2COL_WIDE is None or _IM2COL_WIDE_MODE_CLS is None:
        return False
    wide_mode = _IM2COL_WIDE_MODE_CLS.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W
    encode_args = (
        IM2COL_DTYPE,
        IM2COL_RANK,
        PTR,
        IM2COL_GLOBAL_DIM,
        IM2COL_GLOBAL_STRIDES,
        0,
        0,
        IM2COL_CHANNELS,
        IM2COL_PIXELS,
        IM2COL_ELEMENT_STRIDES,
        IM2COL_INTERLEAVE,
        wide_mode,
        cuda.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_128B,
        IM2COL_L2,
        IM2COL_OOB,
    )
    status, _ = _ENCODE_IM2COL_WIDE(*encode_args)
    return status == _SUCCESS
120120

121121

benchmarks/cuda_bindings/compare.py

Lines changed: 46 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,23 @@ def fmt_rsd(rsd: float | None) -> str:
5151

5252

5353
def fmt_ns(seconds: float) -> str:
    """Format *seconds* as whole nanoseconds with a thousands separator.

    Keeping a single unit across the whole table makes side-by-side
    comparison easier, even when some entries reach the multi-microsecond
    range.
    """
    nanoseconds = seconds * 1e9
    return format(nanoseconds, ",.0f")
60+
61+
62+
def fmt_overhead_ns(py_mean: float, cpp_mean: float) -> str:
    """Signed Python-minus-C++ overhead in nanoseconds, with separators."""
    delta_seconds = py_mean - cpp_mean
    return format(delta_seconds * 1e9, "+,.0f")
64+
65+
66+
def fmt_overhead_pct(py_mean: float, cpp_mean: float) -> str:
    """Signed Python-vs-C++ overhead as a percentage of the C++ mean.

    Returns "-" when the C++ mean is zero or negative, since a ratio
    against it would be meaningless.
    """
    if cpp_mean > 0.0:
        ratio = (py_mean - cpp_mean) / cpp_mean
        return format(ratio * 100, "+,.0f") + "%"
    return "-"
5871

5972

6073
def main() -> None:
@@ -90,22 +103,29 @@ def main() -> None:
90103
name_width = max(len(n) for n in all_names)
91104
name_width = max(name_width, len("Benchmark"))
92105

106+
# Right-aligned numeric columns. Widths chosen so header text fits and
107+
# multi-microsecond ns values with thousands separators still align.
108+
cpp_w = 12
109+
py_w = 12
110+
rsd_w = 8
111+
oh_ns_w = 12
112+
oh_pct_w = 10
113+
93114
# Header
94115
if cpp_benchmarks:
95116
header = (
96-
f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'C++ RSD':>8} "
97-
f"{'Python (mean)':>14} {'Py RSD':>7} {'Overhead':>10}"
117+
f"{'Benchmark':<{name_width}} "
118+
f"{'C++ (ns)':>{cpp_w}} {'C++ RSD':>{rsd_w}} "
119+
f"{'Python (ns)':>{py_w}} {'Py RSD':>{rsd_w}} "
120+
f"{'Overhead ns':>{oh_ns_w}} {'Overhead %':>{oh_pct_w}}"
98121
)
99-
sep = "-" * len(header)
100-
print(sep)
101-
print(header)
102-
print(sep)
103122
else:
104-
header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14} {'Py RSD':>7}"
105-
sep = "-" * len(header)
106-
print(sep)
107-
print(header)
108-
print(sep)
123+
header = f"{'Benchmark':<{name_width}} {'Python (ns)':>{py_w}} {'Py RSD':>{rsd_w}}"
124+
125+
sep = "-" * len(header)
126+
print(sep)
127+
print(header)
128+
print(sep)
109129

110130
for name in all_names:
111131
py_vals = py_benchmarks.get(name)
@@ -120,17 +140,21 @@ def main() -> None:
120140
cpp_rsd = fmt_rsd(cpp_stats[2]) if cpp_stats else "-"
121141

122142
if py_stats and cpp_stats:
123-
py_mean = py_stats[0]
124-
cpp_mean = cpp_stats[0]
125-
overhead_ns = (py_mean - cpp_mean) * 1e9
126-
overhead_str = f"{overhead_ns:+.0f} ns"
143+
overhead_ns_str = fmt_overhead_ns(py_stats[0], cpp_stats[0])
144+
overhead_pct_str = fmt_overhead_pct(py_stats[0], cpp_stats[0])
127145
else:
128-
overhead_str = "-"
146+
overhead_ns_str = "-"
147+
overhead_pct_str = "-"
129148

130149
if cpp_benchmarks:
131-
print(f"{name:<{name_width}} {cpp_str:>12} {cpp_rsd:>8} {py_str:>14} {py_rsd:>7} {overhead_str:>10}")
150+
print(
151+
f"{name:<{name_width}} "
152+
f"{cpp_str:>{cpp_w}} {cpp_rsd:>{rsd_w}} "
153+
f"{py_str:>{py_w}} {py_rsd:>{rsd_w}} "
154+
f"{overhead_ns_str:>{oh_ns_w}} {overhead_pct_str:>{oh_pct_w}}"
155+
)
132156
else:
133-
print(f"{name:<{name_width}} {py_str:>14} {py_rsd:>7}")
157+
print(f"{name:<{name_width}} {py_str:>{py_w}} {py_rsd:>{rsd_w}}")
134158

135159
print(sep)
136160

benchmarks/cuda_bindings/runner/main.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -253,12 +253,6 @@ def main() -> None:
253253
remaining_argv = ensure_pyperf_worker_env(remaining_argv)
254254
is_worker = "--worker" in remaining_argv
255255

256-
# Delete the file so this run starts fresh.
257-
if not is_worker:
258-
output_path.unlink(missing_ok=True)
259-
260-
sys.argv = [sys.argv[0], "--append", str(output_path), *remaining_argv]
261-
262256
# Drop benchmarks that the owning module has marked as unavailable on
263257
# this driver/device. Without this step a single unsupported bench
264258
# (e.g. TMA on a pre-Hopper GPU) would abort the whole pyperf run,
@@ -269,6 +263,26 @@ def main() -> None:
269263
print(f"Skipping {bench_id}: unsupported on this driver/device", file=sys.stderr)
270264
benchmark_ids = [bench_id for bench_id in benchmark_ids if bench_id not in skipped]
271265

266+
# If every selected benchmark was skipped, fail loudly instead of silently
267+
# printing "Results saved" with no output. Leave any existing output file
268+
# untouched so a prior successful run is not destroyed.
269+
if not benchmark_ids:
270+
if not is_worker:
271+
print(
272+
"No benchmarks to run: every selected benchmark is unsupported "
273+
"on this driver/device. Existing output file (if any) was left "
274+
"untouched.",
275+
file=sys.stderr,
276+
)
277+
sys.exit(1)
278+
279+
# Delete the file so this run starts fresh. Only destructive once we know
280+
# at least one benchmark will actually run.
281+
if not is_worker:
282+
output_path.unlink(missing_ok=True)
283+
284+
sys.argv = [sys.argv[0], "--append", str(output_path), *remaining_argv]
285+
272286
runner = pyperf.Runner()
273287
for bench_id in benchmark_ids:
274288
runner.bench_time_func(bench_id, registry[bench_id])

0 commit comments

Comments
 (0)