Skip to content

Commit ac597b7

Browse files
committed
Fix tensormap, add full overhead column to compareq
1 parent 6cb1265 commit ac597b7

4 files changed

Lines changed: 131 additions & 89 deletions

File tree

benchmarks/cuda_bindings/README.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,13 @@ Driver APIs through cuda.bindings, relative to a similar C++ baseline.
66
The goal is to benchmark how much overhead the Python layer adds to calling
77
CUDA APIs and what operations are not in our target of less than 1us of overhead.
88

9-
Each Python benchmark has a C++ counterpart, which is used to compare the
10-
operations. We try to make each implementation perform small operations
11-
and nearly the same work as possible and are run under similar conditions.
9+
Most Python benchmarks have a C++ counterpart that is used as a comparative
10+
baseline. We try to make each implementation perform small operations and
11+
do nearly the same work, and both are run under similar conditions.
12+
13+
A few benchmarks (e.g. in `bench_enum.py`) are intentionally Python-only
14+
because they measure costs with no direct C++ equivalent — such as enum
15+
construction and member access on `cuda.bindings` enum classes.
1216

1317
These are **not** throughput benchmarks to measure the overall performance
1418
of kernels and applications.

benchmarks/cuda_bindings/benchmarks/bench_tensormap.py

Lines changed: 58 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -44,78 +44,78 @@
4444

4545
_SUCCESS = cuda.CUresult.CUDA_SUCCESS
4646

47+
# Resolve bindings once at module load. A missing attribute (old binding that
48+
# predates a TMA API) is the only legitimate reason for a probe to skip —
49+
# everything else (signature mismatches, unexpected TypeError, etc.) should
50+
# surface loudly instead of being reclassified as "unsupported".
51+
_ENCODE_TILED = getattr(cuda, "cuTensorMapEncodeTiled", None)
52+
_ENCODE_IM2COL = getattr(cuda, "cuTensorMapEncodeIm2col", None)
53+
_ENCODE_IM2COL_WIDE = getattr(cuda, "cuTensorMapEncodeIm2colWide", None)
54+
_IM2COL_WIDE_MODE_CLS = getattr(cuda, "CUtensorMapIm2ColWideMode", None)
55+
4756

4857
def _probe_tiled() -> bool:
    """Return True when a tiled tensor-map encode call succeeds.

    Returns False when the binding predates ``cuTensorMapEncodeTiled``
    (resolved once at module load into ``_ENCODE_TILED``).
    """
    if _ENCODE_TILED is None:
        return False
    encode_args = (
        TILED_DTYPE,
        TILED_RANK,
        PTR,
        TILED_GLOBAL_DIM,
        TILED_GLOBAL_STRIDES,
        TILED_BOX_DIM,
        TILED_ELEMENT_STRIDES,
        TILED_INTERLEAVE,
        TILED_SWIZZLE,
        TILED_L2,
        TILED_OOB,
    )
    status, _ = _ENCODE_TILED(*encode_args)
    return status == _SUCCESS
6674

6775

6876
def _probe_im2col() -> bool:
    """Return True when an im2col tensor-map encode call succeeds.

    Returns False when the binding predates ``cuTensorMapEncodeIm2col``
    (resolved once at module load into ``_ENCODE_IM2COL``).
    """
    if _ENCODE_IM2COL is None:
        return False
    encode_args = (
        IM2COL_DTYPE,
        IM2COL_RANK,
        PTR,
        IM2COL_GLOBAL_DIM,
        IM2COL_GLOBAL_STRIDES,
        IM2COL_PIXEL_BOX_LOWER,
        IM2COL_PIXEL_BOX_UPPER,
        IM2COL_CHANNELS,
        IM2COL_PIXELS,
        IM2COL_ELEMENT_STRIDES,
        IM2COL_INTERLEAVE,
        IM2COL_SWIZZLE,
        IM2COL_L2,
        IM2COL_OOB,
    )
    status, _ = _ENCODE_IM2COL(*encode_args)
    return status == _SUCCESS
8996

9097

91-
_ENCODE_IM2COL_WIDE = getattr(cuda, "cuTensorMapEncodeIm2colWide", None)
92-
_IM2COL_WIDE_MODE_CLS = getattr(cuda, "CUtensorMapIm2ColWideMode", None)
93-
94-
9598
def _probe_im2col_wide() -> bool:
    """Return True when a wide-mode im2col tensor-map encode call succeeds.

    Requires both the ``cuTensorMapEncodeIm2colWide`` entry point and the
    ``CUtensorMapIm2ColWideMode`` enum (both resolved at module load);
    either one missing means the feature is unavailable.
    """
    if _ENCODE_IM2COL_WIDE is None or _IM2COL_WIDE_MODE_CLS is None:
        return False
    wide_mode = _IM2COL_WIDE_MODE_CLS.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W
    encode_args = (
        IM2COL_DTYPE,
        IM2COL_RANK,
        PTR,
        IM2COL_GLOBAL_DIM,
        IM2COL_GLOBAL_STRIDES,
        0,
        0,
        IM2COL_CHANNELS,
        IM2COL_PIXELS,
        IM2COL_ELEMENT_STRIDES,
        IM2COL_INTERLEAVE,
        wide_mode,
        cuda.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_128B,
        IM2COL_L2,
        IM2COL_OOB,
    )
    status, _ = _ENCODE_IM2COL_WIDE(*encode_args)
    return status == _SUCCESS
120120

121121

benchmarks/cuda_bindings/compare.py

Lines changed: 46 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,23 @@ def fmt_rsd(rsd: float | None) -> str:
5151

5252

5353
def fmt_ns(seconds: float) -> str:
    """Format *seconds* as whole nanoseconds with a thousands separator.

    Keeping a single unit across the whole table makes side-by-side
    comparison easier, even when some entries reach the multi-microsecond
    range.
    """
    nanoseconds = seconds * 1e9
    return format(nanoseconds, ",.0f")
60+
61+
62+
def fmt_overhead_ns(py_mean: float, cpp_mean: float) -> str:
    """Signed Python-minus-C++ overhead in nanoseconds, with separators."""
    delta_seconds = py_mean - cpp_mean
    return format(delta_seconds * 1e9, "+,.0f")
64+
65+
66+
def fmt_overhead_pct(py_mean: float, cpp_mean: float) -> str:
    """Signed Python-vs-C++ overhead as a percentage of the C++ mean.

    Returns "-" when the C++ mean is zero or negative, since a ratio
    against it would be meaningless.
    """
    if cpp_mean > 0.0:
        ratio = (py_mean - cpp_mean) / cpp_mean
        return format(ratio * 100, "+,.0f") + "%"
    return "-"
5871

5972

6073
def main() -> None:
@@ -90,22 +103,29 @@ def main() -> None:
90103
name_width = max(len(n) for n in all_names)
91104
name_width = max(name_width, len("Benchmark"))
92105

106+
# Right-aligned numeric columns. Widths chosen so header text fits and
107+
# multi-microsecond ns values with thousands separators still align.
108+
cpp_w = 12
109+
py_w = 12
110+
rsd_w = 8
111+
oh_ns_w = 12
112+
oh_pct_w = 10
113+
93114
# Header
94115
if cpp_benchmarks:
95116
header = (
96-
f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'C++ RSD':>8} "
97-
f"{'Python (mean)':>14} {'Py RSD':>7} {'Overhead':>10}"
117+
f"{'Benchmark':<{name_width}} "
118+
f"{'C++ (ns)':>{cpp_w}} {'C++ RSD':>{rsd_w}} "
119+
f"{'Python (ns)':>{py_w}} {'Py RSD':>{rsd_w}} "
120+
f"{'Overhead ns':>{oh_ns_w}} {'Overhead %':>{oh_pct_w}}"
98121
)
99-
sep = "-" * len(header)
100-
print(sep)
101-
print(header)
102-
print(sep)
103122
else:
104-
header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14} {'Py RSD':>7}"
105-
sep = "-" * len(header)
106-
print(sep)
107-
print(header)
108-
print(sep)
123+
header = f"{'Benchmark':<{name_width}} {'Python (ns)':>{py_w}} {'Py RSD':>{rsd_w}}"
124+
125+
sep = "-" * len(header)
126+
print(sep)
127+
print(header)
128+
print(sep)
109129

110130
for name in all_names:
111131
py_vals = py_benchmarks.get(name)
@@ -120,17 +140,21 @@ def main() -> None:
120140
cpp_rsd = fmt_rsd(cpp_stats[2]) if cpp_stats else "-"
121141

122142
if py_stats and cpp_stats:
123-
py_mean = py_stats[0]
124-
cpp_mean = cpp_stats[0]
125-
overhead_ns = (py_mean - cpp_mean) * 1e9
126-
overhead_str = f"{overhead_ns:+.0f} ns"
143+
overhead_ns_str = fmt_overhead_ns(py_stats[0], cpp_stats[0])
144+
overhead_pct_str = fmt_overhead_pct(py_stats[0], cpp_stats[0])
127145
else:
128-
overhead_str = "-"
146+
overhead_ns_str = "-"
147+
overhead_pct_str = "-"
129148

130149
if cpp_benchmarks:
131-
print(f"{name:<{name_width}} {cpp_str:>12} {cpp_rsd:>8} {py_str:>14} {py_rsd:>7} {overhead_str:>10}")
150+
print(
151+
f"{name:<{name_width}} "
152+
f"{cpp_str:>{cpp_w}} {cpp_rsd:>{rsd_w}} "
153+
f"{py_str:>{py_w}} {py_rsd:>{rsd_w}} "
154+
f"{overhead_ns_str:>{oh_ns_w}} {overhead_pct_str:>{oh_pct_w}}"
155+
)
132156
else:
133-
print(f"{name:<{name_width}} {py_str:>14} {py_rsd:>7}")
157+
print(f"{name:<{name_width}} {py_str:>{py_w}} {py_rsd:>{rsd_w}}")
134158

135159
print(sep)
136160

benchmarks/cuda_bindings/runner/main.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -253,12 +253,6 @@ def main() -> None:
253253
remaining_argv = ensure_pyperf_worker_env(remaining_argv)
254254
is_worker = "--worker" in remaining_argv
255255

256-
# Delete the file so this run starts fresh.
257-
if not is_worker:
258-
output_path.unlink(missing_ok=True)
259-
260-
sys.argv = [sys.argv[0], "--append", str(output_path), *remaining_argv]
261-
262256
# Drop benchmarks that the owning module has marked as unavailable on
263257
# this driver/device. Without this step a single unsupported bench
264258
# (e.g. TMA on a pre-Hopper GPU) would abort the whole pyperf run,
@@ -269,6 +263,26 @@ def main() -> None:
269263
print(f"Skipping {bench_id}: unsupported on this driver/device", file=sys.stderr)
270264
benchmark_ids = [bench_id for bench_id in benchmark_ids if bench_id not in skipped]
271265

266+
# If every selected benchmark was skipped, fail loudly instead of silently
267+
# printing "Results saved" with no output. Leave any existing output file
268+
# untouched so a prior successful run is not destroyed.
269+
if not benchmark_ids:
270+
if not is_worker:
271+
print(
272+
"No benchmarks to run: every selected benchmark is unsupported "
273+
"on this driver/device. Existing output file (if any) was left "
274+
"untouched.",
275+
file=sys.stderr,
276+
)
277+
sys.exit(1)
278+
279+
# Delete the file so this run starts fresh. Only destructive once we know
280+
# at least one benchmark will actually run.
281+
if not is_worker:
282+
output_path.unlink(missing_ok=True)
283+
284+
sys.argv = [sys.argv[0], "--append", str(output_path), *remaining_argv]
285+
272286
runner = pyperf.Runner()
273287
for bench_id in benchmark_ids:
274288
runner.bench_time_func(bench_id, registry[bench_id])

0 commit comments

Comments
 (0)