From 7e00710328888a4ca9d565662da79f0d5a051593 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 1 May 2026 16:52:29 +0000
Subject: [PATCH] Revert PR #580 streaming workaround (CCCL #1422 resolved)

---
 .../open_addressing/open_addressing_impl.cuh  | 80 ++++++++----------
 1 file changed, 34 insertions(+), 46 deletions(-)

diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh
index c1f97ae42..b4e0d528f 100644
--- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh
+++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh
@@ -829,60 +829,48 @@ class open_addressing_impl : private open_addressing_compatible<Key,
     using temp_allocator_type =
       typename std::allocator_traits<allocator_type>::template rebind_alloc<char>;
-    cuco::detail::index_type constexpr stride = std::numeric_limits<int32_t>::max();
-
-    cuco::detail::index_type h_num_out{0};
     auto temp_allocator = temp_allocator_type{this->allocator()};
     auto d_num_out = reinterpret_cast<size_type*>(temp_allocator.allocate(sizeof(size_type), stream));
-    // TODO: PR #580 to be reverted once https://github.com/NVIDIA/cccl/issues/1422 is resolved
-    for (cuco::detail::index_type offset = 0;
-         offset < static_cast<cuco::detail::index_type>(this->capacity());
-         offset += stride) {
-      auto const num_items =
-        std::min(static_cast<cuco::detail::index_type>(this->capacity()) - offset, stride);
-      auto const begin = cuda::make_transform_iterator(
-        cuda::counting_iterator{static_cast<size_type>(offset)},
-        detail::open_addressing_ns::get_slot(this->storage_ref()));
-      auto const is_filled = detail::open_addressing_ns::slot_is_filled{
-        this->empty_key_sentinel(), this->erased_key_sentinel()};
-
-      std::size_t temp_storage_bytes = 0;
-
-      CUCO_CUDA_TRY(cub::DeviceSelect::If(nullptr,
-                                          temp_storage_bytes,
-                                          begin,
-                                          output_begin + h_num_out,
-                                          d_num_out,
-                                          static_cast<int32_t>(num_items),
-                                          is_filled,
-                                          stream.get()));
-
-      // Allocate temporary storage
-      auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes, stream);
-
-      CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage,
-                                          temp_storage_bytes,
-                                          begin,
-                                          output_begin + h_num_out,
-                                          d_num_out,
-                                          static_cast<int32_t>(num_items),
-                                          is_filled,
-                                          stream.get()));
-
-      size_type temp_count;
-      CUCO_CUDA_TRY(cuco::detail::memcpy_async(
-        &temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream));
+    auto const begin = cuda::make_transform_iterator(
+      cuda::counting_iterator{size_type{0}},
+      detail::open_addressing_ns::get_slot(this->storage_ref()));
+    auto const is_filled = detail::open_addressing_ns::slot_is_filled{
+      this->empty_key_sentinel(), this->erased_key_sentinel()};
+
+    std::size_t temp_storage_bytes = 0;
+
+    CUCO_CUDA_TRY(cub::DeviceSelect::If(nullptr,
+                                        temp_storage_bytes,
+                                        begin,
+                                        output_begin,
+                                        d_num_out,
+                                        this->capacity(),
+                                        is_filled,
+                                        stream.get()));
+
+    auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes, stream);
+
+    CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage,
+                                        temp_storage_bytes,
+                                        begin,
+                                        output_begin,
+                                        d_num_out,
+                                        this->capacity(),
+                                        is_filled,
+                                        stream.get()));
+
+    size_type h_num_out;
+    CUCO_CUDA_TRY(cuco::detail::memcpy_async(
+      &h_num_out, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream));
 #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
-      stream.sync();
+    stream.sync();
 #else
-      stream.wait();
+    stream.wait();
 #endif
-      h_num_out += temp_count;
-      temp_allocator.deallocate(d_temp_storage, temp_storage_bytes, stream);
-    }
+    temp_allocator.deallocate(d_temp_storage, temp_storage_bytes, stream);
     temp_allocator.deallocate(reinterpret_cast<char*>(d_num_out), sizeof(size_type), stream);

     return output_begin + h_num_out;