Skip to content

Commit a4b8172

Browse files
committed
Revert PR #580 streaming workaround (CCCL #1422 resolved)
1 parent b748b9d commit a4b8172

1 file changed

Lines changed: 34 additions & 46 deletions

File tree

include/cuco/detail/open_addressing/open_addressing_impl.cuh

Lines changed: 34 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -829,60 +829,48 @@ class open_addressing_impl : private open_addressing_compatible<Key, Value, Prob
829829
using temp_allocator_type =
830830
typename std::allocator_traits<allocator_type>::template rebind_alloc<char>;
831831

832-
cuco::detail::index_type constexpr stride = std::numeric_limits<std::int32_t>::max();
833-
834-
cuco::detail::index_type h_num_out{0};
835832
auto temp_allocator = temp_allocator_type{this->allocator()};
836833
auto d_num_out =
837834
reinterpret_cast<size_type*>(temp_allocator.allocate(sizeof(size_type), stream));
838835

839-
// TODO: PR #580 to be reverted once https://github.com/NVIDIA/cccl/issues/1422 is resolved
840-
for (cuco::detail::index_type offset = 0;
841-
offset < static_cast<cuco::detail::index_type>(this->capacity());
842-
offset += stride) {
843-
auto const num_items =
844-
std::min(static_cast<cuco::detail::index_type>(this->capacity()) - offset, stride);
845-
auto const begin = cuda::make_transform_iterator(
846-
cuda::counting_iterator{static_cast<size_type>(offset)},
847-
detail::open_addressing_ns::get_slot<has_payload, storage_ref_type>(this->storage_ref()));
848-
auto const is_filled = detail::open_addressing_ns::slot_is_filled<has_payload, key_type>{
849-
this->empty_key_sentinel(), this->erased_key_sentinel()};
850-
851-
std::size_t temp_storage_bytes = 0;
852-
853-
CUCO_CUDA_TRY(cub::DeviceSelect::If(nullptr,
854-
temp_storage_bytes,
855-
begin,
856-
output_begin + h_num_out,
857-
d_num_out,
858-
static_cast<std::int32_t>(num_items),
859-
is_filled,
860-
stream.get()));
861-
862-
// Allocate temporary storage
863-
auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes, stream);
864-
865-
CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage,
866-
temp_storage_bytes,
867-
begin,
868-
output_begin + h_num_out,
869-
d_num_out,
870-
static_cast<std::int32_t>(num_items),
871-
is_filled,
872-
stream.get()));
873-
874-
size_type temp_count;
875-
CUCO_CUDA_TRY(cuco::detail::memcpy_async(
876-
&temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream));
836+
auto const begin = cuda::make_transform_iterator(
837+
cuda::counting_iterator{size_type{0}},
838+
detail::open_addressing_ns::get_slot<has_payload, storage_ref_type>(this->storage_ref()));
839+
auto const is_filled = detail::open_addressing_ns::slot_is_filled<has_payload, key_type>{
840+
this->empty_key_sentinel(), this->erased_key_sentinel()};
841+
842+
std::size_t temp_storage_bytes = 0;
843+
844+
CUCO_CUDA_TRY(cub::DeviceSelect::If(nullptr,
845+
temp_storage_bytes,
846+
begin,
847+
output_begin,
848+
d_num_out,
849+
this->capacity(),
850+
is_filled,
851+
stream.get()));
852+
853+
auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes, stream);
854+
855+
CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage,
856+
temp_storage_bytes,
857+
begin,
858+
output_begin,
859+
d_num_out,
860+
this->capacity(),
861+
is_filled,
862+
stream.get()));
863+
864+
size_type h_num_out;
865+
CUCO_CUDA_TRY(cuco::detail::memcpy_async(
866+
&h_num_out, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream));
877867
#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
878-
stream.sync();
868+
stream.sync();
879869
#else
880-
stream.wait();
870+
stream.wait();
881871
#endif
882-
h_num_out += temp_count;
883-
temp_allocator.deallocate(d_temp_storage, temp_storage_bytes, stream);
884-
}
885872

873+
temp_allocator.deallocate(d_temp_storage, temp_storage_bytes, stream);
886874
temp_allocator.deallocate(reinterpret_cast<char*>(d_num_out), sizeof(size_type), stream);
887875

888876
return output_begin + h_num_out;

0 commit comments

Comments
 (0)