Replace custom size kernel with cub::DeviceReduce::TransformReduce

PointKernel · PointKernel · commit c9219b584dd7 · 2026-05-01T18:16:05.000Z
diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh
@@ -707,43 +707,6 @@ CUCO_KERNEL void retrieve(InputProbeIt input_probe,
   }
 }
 
-/**
- * @brief Calculates the number of filled slots for the given bucket storage.
- *
- * @tparam BlockSize Number of threads in each block
- * @tparam StorageRef Type of non-owning ref allowing access to storage
- * @tparam Predicate Type of predicate indicating if the given slot is filled
- * @tparam AtomicT Atomic counter type
- *
- * @param storage Non-owning device ref used to access the slot storage
- * @param is_filled Predicate indicating if the given slot is filled
- * @param count Number of filled slots
- */
-template <int BlockSize, typename StorageRef, typename Predicate, typename AtomicT>
-CUCO_KERNEL __launch_bounds__(BlockSize) void size(StorageRef storage,
-                                                   Predicate is_filled,
-                                                   AtomicT* count)
-{
-  using size_type = typename StorageRef::size_type;
-
-  auto const loop_stride = cuco::detail::grid_stride();
-  auto idx               = cuco::detail::global_thread_id();
-
-  size_type thread_count = 0;
-  auto const n           = storage.capacity();
-
-  while (idx < n) {
-    thread_count += static_cast<size_type>(is_filled(*(storage.data() + idx)));
-
-    idx += loop_stride;
-  }
-
-  using BlockReduce = cub::BlockReduce<size_type, BlockSize>;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  auto const block_count = BlockReduce(temp_storage).Sum(thread_count);
-  if (threadIdx.x == 0) { count->fetch_add(block_count, cuda::std::memory_order_relaxed); }
-}
-
 template <int BlockSize, typename ContainerRef, typename Predicate>
 CUCO_KERNEL __launch_bounds__(BlockSize) void rehash(
   typename ContainerRef::storage_ref_type storage_ref,
diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh
@@ -31,6 +31,7 @@
 #include <cuco/utility/traits.hpp>
 
 #include <cub/device/device_for.cuh>
+#include <cub/device/device_reduce.cuh>
 #include <cub/device/device_select.cuh>
 #include <cuda/atomic>
 #include <cuda/iterator>
@@ -958,21 +959,79 @@ class open_addressing_impl : private open_addressing_compatible<Key, Value, Prob
    */
   [[nodiscard]] size_type size(cuda::stream_ref stream) const
   {
-    auto counter =
-      detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator(), stream};
-    counter.reset(stream);
+    using temp_allocator_type =
+      typename std::allocator_traits<allocator_type>::template rebind_alloc<char>;
+    auto temp_allocator = temp_allocator_type{this->allocator()};
 
-    auto const grid_size = cuco::detail::grid_size(this->capacity());
     auto const is_filled = detail::open_addressing_ns::slot_is_filled<has_payload, key_type>{
       this->empty_key_sentinel(), this->erased_key_sentinel()};
+    auto const slot_begin = cuda::make_transform_iterator(
+      cuda::counting_iterator{size_type{0}},
+      detail::open_addressing_ns::get_slot<has_payload, storage_ref_type>(this->storage_ref()));
+
+    // Scratch buffers for this call. They stay null until allocated so that the cleanup helper
+    // below can tell which of them actually exist.
+    size_type* d_count             = nullptr;
+    char* d_temp_storage           = nullptr;
+    std::size_t temp_storage_bytes = 0;
+
+    // Hands every buffer allocated so far back to the allocator.
+    auto const release_buffers = [&]() {
+      if (d_temp_storage != nullptr) {
+        temp_allocator.deallocate(d_temp_storage, temp_storage_bytes, stream);
+      }
+      if (d_count != nullptr) {
+        temp_allocator.deallocate(reinterpret_cast<char*>(d_count), sizeof(size_type), stream);
+      }
+    };
+
+    size_type h_count{};
+    try {
+      d_count =
+        reinterpret_cast<size_type*>(temp_allocator.allocate(sizeof(size_type), stream));
+
+      // First call is given a null temp-storage pointer; its purpose here is to obtain the
+      // required scratch size in temp_storage_bytes.
+      CUCO_CUDA_TRY(cub::DeviceReduce::TransformReduce(nullptr,
+                                                       temp_storage_bytes,
+                                                       slot_begin,
+                                                       d_count,
+                                                       this->capacity(),
+                                                       cuda::std::plus<size_type>{},
+                                                       is_filled,
+                                                       size_type{0},
+                                                       stream.get()));
+
+      d_temp_storage = temp_allocator.allocate(temp_storage_bytes, stream);
+
+      // Second call does the work: sums is_filled(slot) over all capacity() slots into d_count.
+      CUCO_CUDA_TRY(cub::DeviceReduce::TransformReduce(d_temp_storage,
+                                                       temp_storage_bytes,
+                                                       slot_begin,
+                                                       d_count,
+                                                       this->capacity(),
+                                                       cuda::std::plus<size_type>{},
+                                                       is_filled,
+                                                       size_type{0},
+                                                       stream.get()));
+
+      // Copy the result to the host and wait on the stream before h_count is read.
+      CUCO_CUDA_TRY(cuco::detail::memcpy_async(
+        &h_count, d_count, sizeof(size_type), cudaMemcpyDeviceToHost, stream));
+#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
+      stream.sync();
+#else
+      stream.wait();
+#endif
+    } catch (...) {
+      // Do not leak the device buffers if an allocation or CUDA call above throws.
+      release_buffers();
+      throw;
+    }
 
-    // TODO: custom kernel to be replaced by cub::DeviceReduce::Sum when cub version is bumped to
-    // v2.1.0
-    detail::open_addressing_ns::size<cuco::detail::default_block_size()>
-      <<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
-        storage_.ref(), is_filled, counter.data());
+    release_buffers();
 
-    return counter.load_to_host(stream);
+    return h_count;
   }
 
   /**