Skip to content

Commit c9219b5

Browse files
committed
Replace custom size kernel with cub::DeviceReduce::TransformReduce
1 parent b748b9d commit c9219b5

2 files changed

Lines changed: 46 additions & 47 deletions

File tree

include/cuco/detail/open_addressing/kernels.cuh

Lines changed: 0 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -707,43 +707,6 @@ CUCO_KERNEL void retrieve(InputProbeIt input_probe,
707707
}
708708
}
709709

710-
/**
711-
* @brief Calculates the number of filled slots for the given bucket storage.
712-
*
713-
* @tparam BlockSize Number of threads in each block
714-
* @tparam StorageRef Type of non-owning ref allowing access to storage
715-
* @tparam Predicate Type of predicate indicating if the given slot is filled
716-
* @tparam AtomicT Atomic counter type
717-
*
718-
* @param storage Non-owning device ref used to access the slot storage
719-
* @param is_filled Predicate indicating if the given slot is filled
720-
* @param count Number of filled slots
721-
*/
722-
template <int BlockSize, typename StorageRef, typename Predicate, typename AtomicT>
723-
CUCO_KERNEL __launch_bounds__(BlockSize) void size(StorageRef storage,
724-
Predicate is_filled,
725-
AtomicT* count)
726-
{
727-
using size_type = typename StorageRef::size_type;
728-
729-
auto const loop_stride = cuco::detail::grid_stride();
730-
auto idx = cuco::detail::global_thread_id();
731-
732-
size_type thread_count = 0;
733-
auto const n = storage.capacity();
734-
735-
while (idx < n) {
736-
thread_count += static_cast<size_type>(is_filled(*(storage.data() + idx)));
737-
738-
idx += loop_stride;
739-
}
740-
741-
using BlockReduce = cub::BlockReduce<size_type, BlockSize>;
742-
__shared__ typename BlockReduce::TempStorage temp_storage;
743-
auto const block_count = BlockReduce(temp_storage).Sum(thread_count);
744-
if (threadIdx.x == 0) { count->fetch_add(block_count, cuda::std::memory_order_relaxed); }
745-
}
746-
747710
template <int BlockSize, typename ContainerRef, typename Predicate>
748711
CUCO_KERNEL __launch_bounds__(BlockSize) void rehash(
749712
typename ContainerRef::storage_ref_type storage_ref,

include/cuco/detail/open_addressing/open_addressing_impl.cuh

Lines changed: 46 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include <cuco/utility/traits.hpp>
3232

3333
#include <cub/device/device_for.cuh>
34+
#include <cub/device/device_reduce.cuh>
3435
#include <cub/device/device_select.cuh>
3536
#include <cuda/atomic>
3637
#include <cuda/iterator>
@@ -958,21 +959,56 @@ class open_addressing_impl : private open_addressing_compatible<Key, Value, Prob
958959
*/
959960
[[nodiscard]] size_type size(cuda::stream_ref stream) const
960961
{
961-
auto counter =
962-
detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator(), stream};
963-
counter.reset(stream);
962+
using temp_allocator_type =
963+
typename std::allocator_traits<allocator_type>::template rebind_alloc<char>;
964+
auto temp_allocator = temp_allocator_type{this->allocator()};
965+
966+
auto* d_count =
967+
reinterpret_cast<size_type*>(temp_allocator.allocate(sizeof(size_type), stream));
964968

965-
auto const grid_size = cuco::detail::grid_size(this->capacity());
966969
auto const is_filled = detail::open_addressing_ns::slot_is_filled<has_payload, key_type>{
967970
this->empty_key_sentinel(), this->erased_key_sentinel()};
971+
auto const slot_begin = cuda::make_transform_iterator(
972+
cuda::counting_iterator{size_type{0}},
973+
detail::open_addressing_ns::get_slot<has_payload, storage_ref_type>(this->storage_ref()));
974+
975+
std::size_t temp_storage_bytes = 0;
976+
977+
CUCO_CUDA_TRY(cub::DeviceReduce::TransformReduce(nullptr,
978+
temp_storage_bytes,
979+
slot_begin,
980+
d_count,
981+
this->capacity(),
982+
cuda::std::plus<size_type>{},
983+
is_filled,
984+
size_type{0},
985+
stream.get()));
986+
987+
auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes, stream);
988+
989+
CUCO_CUDA_TRY(cub::DeviceReduce::TransformReduce(d_temp_storage,
990+
temp_storage_bytes,
991+
slot_begin,
992+
d_count,
993+
this->capacity(),
994+
cuda::std::plus<size_type>{},
995+
is_filled,
996+
size_type{0},
997+
stream.get()));
998+
999+
size_type h_count;
1000+
CUCO_CUDA_TRY(cuco::detail::memcpy_async(
1001+
&h_count, d_count, sizeof(size_type), cudaMemcpyDeviceToHost, stream));
1002+
#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
1003+
stream.sync();
1004+
#else
1005+
stream.wait();
1006+
#endif
9681007

969-
// TODO: custom kernel to be replaced by cub::DeviceReduce::Sum when cub version is bumped to
970-
// v2.1.0
971-
detail::open_addressing_ns::size<cuco::detail::default_block_size()>
972-
<<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
973-
storage_.ref(), is_filled, counter.data());
1008+
temp_allocator.deallocate(d_temp_storage, temp_storage_bytes, stream);
1009+
temp_allocator.deallocate(reinterpret_cast<char*>(d_count), sizeof(size_type), stream);
9741010

975-
return counter.load_to_host(stream);
1011+
return h_count;
9761012
}
9771013

9781014
/**

0 commit comments

Comments
 (0)