Simplify cuda_calc_xblock_count / cuda_calc_block_count via if constexpr (#5783)

q10 · meta-codesync[bot] · commit 15f7c24eb447 · 2026-05-26T13:23:42.000-07:00
Summary:
Pull Request resolved: #5783

The current `cuda_block_count.h` defines `cuda_calc_xblock_count` as **four SFINAE overloads** (signed/unsigned x signed/unsigned for the two integer parameters) plus a `cuda_calc_xblock_count_base` helper -- five functions in total -- purely to suppress "pointless comparison against zero" compiler warnings on unsigned integer types. The header itself documents this rationale at lines 28-32 of the pre-diff file: "This system prevents 'pointless comparison against zero' warnings from the compiler for unsigned types (simpler ways of suppressing this warning didn't work) while maintaining the various warnings."

The "simpler ways didn't work" comment dates from before `if constexpr` was widely available. With C++17/C++20, the entire five-function tower collapses to **one** function template that uses `if constexpr` to gate the signed-only `>= 0` checks at compile time. The unused branch is discarded entirely, so no warning is emitted on unsigned types.

`cuda_calc_block_count` (the y/z-dim wrapper that adds the 65535 cap) is similarly trimmed to a 4-line template that delegates to `cuda_calc_xblock_count`.

Net effect:
- Five functions reduced to two.
- File length: ~155 lines -> ~85 lines (~45% reduction).
- Public API unchanged: same names, same return types, same observable behaviour for valid inputs. TORCH_CHECK messages match the originals verbatim.
- Behaviour-preserving: every existing caller across fbgemm_gpu continues to compile and produces the same `uint32_t` result. One edge case is now stricter: the removed overloads were effectively selected by the signedness of `Integer2` alone, so a negative signed `num_items` combined with an unsigned `threads_per_block` previously skipped the `num_items >= 0` check; it is now rejected.

This is a prep diff for an upcoming change that introduces a `determine_grid_blocks` helper (with a `BlockCapPolicy` enum) on top of these primitives. Folding the SFINAE tower now keeps that follow-up diff's helper signature minimal.

Reviewed By: spcyppt
Differential Revision: D106262731
fbshipit-source-id: 3c8ed771812f552af548942fbd5f47acf865e789
diff --git a/fbgemm_gpu/include/fbgemm_gpu/utils/cuda_block_count.h b/fbgemm_gpu/include/fbgemm_gpu/utils/cuda_block_count.h
@@ -9,146 +9,76 @@
 #pragma once
 
 #include <ATen/ATen.h>
+#include <algorithm>
 #include <cstdint>
+#include <type_traits>
 
 /// Determine an appropriate CUDA block count along the x axis
 ///
 /// When launching CUDA kernels the number of blocks B is often calculated
 /// w.r.t. the number of threads T and items to be processed N as
 /// B=(N+T-1)/T - which is integer division rounding up.
 /// This function abstracts that calculation, performs it in an
-/// overflow-safe manner, and limits the return value appropriately.
+/// overflow-safe manner, and limits the return value to the CUDA grid-x
+/// dimension cap (2^31-1 for compute capability >= 3.5).
 ///
-/// This is a general function for all integral data types.
-/// The goal of this set of functions is to ensure correct calculations
-/// across a variety of data types without forcing the programmer to
-/// cast to an appropriate type (which is dangerous because we don't
-/// have conversion warnings enabled). The values of the variables
-/// can then be checked for correctness at run-time.
-/// Specialized functions below handle various combinations of signed
-/// and unsigned inputs. This system prevents "pointless comparison
-/// against zero" warnings from the compiler for unsigned types
-/// (simpler ways of suppressing this warning didn't work) while
-/// maintaining the various warnings.
-///
-/// Function is designed to facilitate run-time value checking.
-template <
-    typename Integer1,
-    typename Integer2,
-    std::enable_if_t<std::is_integral_v<Integer1>, bool> = true,
-    std::enable_if_t<std::is_integral_v<Integer2>, bool> = true>
-constexpr uint32_t cuda_calc_xblock_count_base(
+/// Accepts any pair of integral types. The `if constexpr` branches on
+/// signedness emit the `>= 0` TORCH_CHECKs only for signed types, which
+/// avoids "pointless comparison against zero" warnings on unsigned types
+/// without needing per-signedness SFINAE overloads.
+template <typename Integer1, typename Integer2>
+constexpr uint32_t cuda_calc_xblock_count(
     Integer1 num_items,
     Integer2 threads_per_block) {
-  // The number of threads can be as high as 2048 on some newer architectures,
-  // but this is not portable.
+  static_assert(
+      std::is_integral_v<Integer1>,
+      "cuda_calc_xblock_count: num_items must be an integral type");
+  static_assert(
+      std::is_integral_v<Integer2>,
+      "cuda_calc_xblock_count: threads_per_block must be an integral type");
+
+  // The number of threads can be as high as 2048 on some newer
+  // architectures, but this is not portable.
   TORCH_CHECK(threads_per_block <= 1024, "Number of threads must be <=1024!");
+
+  if constexpr (std::is_signed_v<Integer1>) {
+    TORCH_CHECK(
+        num_items >= 0,
+        "When calculating block counts, the number of items must be positive!");
+  }
+  if constexpr (std::is_signed_v<Integer2>) {
+    TORCH_CHECK(
+        threads_per_block >= 0,
+        "When calculating thread counts, the number of threads must be positive!");
+  }
+
   // The CUDA specification at
   // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
   // states that for compute capability 3.5-* the grid dimension of a kernel
-  // launch must must be <=2^31-1.
+  // launch must be <=2^31-1.
   constexpr uint64_t max_blocks = 2147483647;
   const auto u_num_items = static_cast<uint64_t>(num_items);
   const auto u_threads = static_cast<uint64_t>(threads_per_block);
-  // Overflow safe variant of (a + b - 1) / b
+  // Overflow-safe variant of (a + b - 1) / b.
   const uint64_t blocks =
       u_num_items / u_threads + (u_num_items % u_threads != 0);
   return static_cast<uint32_t>(std::min(blocks, max_blocks));
 }
 
-// See: cuda_calc_xblock_count_base
-template <
-    typename Integer1,
-    typename Integer2,
-    std::enable_if_t<
-        std::is_integral_v<Integer1> && std::is_signed_v<Integer2>,
-        bool> = true,
-    std::enable_if_t<
-        std::is_integral_v<Integer2> && std::is_unsigned_v<Integer2>,
-        bool> = true>
-constexpr uint32_t cuda_calc_xblock_count(
-    Integer1 num_items,
-    Integer2 threads_per_block) {
-  TORCH_CHECK(
-      num_items >= 0,
-      "When calculating block counts, the number of items must be positive!");
-  return cuda_calc_xblock_count_base(num_items, threads_per_block);
-}
-
-// See: cuda_calc_xblock_count_base
-template <
-    typename Integer1,
-    typename Integer2,
-    std::enable_if_t<
-        std::is_integral_v<Integer1> && std::is_unsigned_v<Integer2>,
-        bool> = true,
-    std::enable_if_t<
-        std::is_integral_v<Integer2> && std::is_signed_v<Integer2>,
-        bool> = true>
-constexpr uint32_t cuda_calc_xblock_count(
-    Integer1 num_items,
-    Integer2 threads_per_block) {
-  TORCH_CHECK(
-      threads_per_block >= 0,
-      "When calculating thread counts, the number of threads must be positive!");
-  return cuda_calc_xblock_count_base(num_items, threads_per_block);
-}
-
-// See: cuda_calc_xblock_count_base
-template <
-    typename Integer1,
-    typename Integer2,
-    std::enable_if_t<
-        std::is_integral_v<Integer1> && std::is_signed_v<Integer2>,
-        bool> = true,
-    std::enable_if_t<
-        std::is_integral_v<Integer2> && std::is_signed_v<Integer2>,
-        bool> = true>
-constexpr uint32_t cuda_calc_xblock_count(
-    Integer1 num_items,
-    Integer2 threads_per_block) {
-  TORCH_CHECK(
-      num_items >= 0,
-      "When calculating block counts, the number of items must be positive!");
-  TORCH_CHECK(
-      threads_per_block >= 0,
-      "When calculating thread counts, the number of threads must be positive!");
-  return cuda_calc_xblock_count_base(num_items, threads_per_block);
-}
-
-// See: cuda_calc_xblock_count_base
-template <
-    typename Integer1,
-    typename Integer2,
-    std::enable_if_t<
-        std::is_integral_v<Integer1> && std::is_unsigned_v<Integer2>,
-        bool> = true,
-    std::enable_if_t<
-        std::is_integral_v<Integer2> && std::is_unsigned_v<Integer2>,
-        bool> = true>
-constexpr uint32_t cuda_calc_xblock_count(
-    Integer1 num_items,
-    Integer2 threads_per_block) {
-  return cuda_calc_xblock_count_base(num_items, threads_per_block);
-}
-
-/// Determine an appropriate CUDA block count.
+/// Determine an appropriate CUDA block count for a y- or z-dim of the
+/// launch grid.
 ///
-/// See cuda_calc_xblock_count_base() for details.
-template <
-    typename Integer1,
-    typename Integer2,
-    std::enable_if_t<std::is_integral_v<Integer1>, bool> = true,
-    std::enable_if_t<std::is_integral_v<Integer2>, bool> = true>
+/// The CUDA specification states that the grid dimension of a kernel
+/// launch must generally be <=65535. (For compute capability 3.5-* the
+/// grid's x-dimension may be <=2^31-1; that larger limit is enforced
+/// by `cuda_calc_xblock_count` instead.) Because this function does not
+/// know which dimension is being calculated, it uses the smaller limit.
+///
+/// See `cuda_calc_xblock_count` for the underlying arithmetic.
+template <typename Integer1, typename Integer2>
 constexpr uint32_t cuda_calc_block_count(
     Integer1 num_items,
     Integer2 threads_per_block) {
-  // The CUDA specification at
-  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
-  // states that the grid dimension of a kernel launch must generally
-  // be <=65535. (For compute capability 3.5-* the grid's x-dimension must
-  // be <=2^31-1.) Because this function does not know which dimension
-  // is being calculated, we use the smaller limit.
   constexpr uint32_t max_blocks = 65535;
   return std::min(
       cuda_calc_xblock_count(num_items, threads_per_block), max_blocks);