@@ -829,60 +829,48 @@ class open_addressing_impl : private open_addressing_compatible<Key, Value, Prob
829829 using temp_allocator_type =
830830 typename std::allocator_traits<allocator_type>::template rebind_alloc<char >;
831831
832- cuco::detail::index_type constexpr stride = std::numeric_limits<std::int32_t >::max ();
833-
834- cuco::detail::index_type h_num_out{0 };
835832 auto temp_allocator = temp_allocator_type{this ->allocator ()};
836833 auto d_num_out =
837834 reinterpret_cast <size_type*>(temp_allocator.allocate (sizeof (size_type), stream));
838835
839- // TODO: PR #580 to be reverted once https://github.com/NVIDIA/cccl/issues/1422 is resolved
840- for (cuco::detail::index_type offset = 0 ;
841- offset < static_cast <cuco::detail::index_type>(this ->capacity ());
842- offset += stride) {
843- auto const num_items =
844- std::min (static_cast <cuco::detail::index_type>(this ->capacity ()) - offset, stride);
845- auto const begin = cuda::make_transform_iterator (
846- cuda::counting_iterator{static_cast <size_type>(offset)},
847- detail::open_addressing_ns::get_slot<has_payload, storage_ref_type>(this ->storage_ref ()));
848- auto const is_filled = detail::open_addressing_ns::slot_is_filled<has_payload, key_type>{
849- this ->empty_key_sentinel (), this ->erased_key_sentinel ()};
850-
851- std::size_t temp_storage_bytes = 0 ;
852-
853- CUCO_CUDA_TRY (cub::DeviceSelect::If (nullptr ,
854- temp_storage_bytes,
855- begin,
856- output_begin + h_num_out,
857- d_num_out,
858- static_cast <std::int32_t >(num_items),
859- is_filled,
860- stream.get ()));
861-
862- // Allocate temporary storage
863- auto d_temp_storage = temp_allocator.allocate (temp_storage_bytes, stream);
864-
865- CUCO_CUDA_TRY (cub::DeviceSelect::If (d_temp_storage,
866- temp_storage_bytes,
867- begin,
868- output_begin + h_num_out,
869- d_num_out,
870- static_cast <std::int32_t >(num_items),
871- is_filled,
872- stream.get ()));
873-
874- size_type temp_count;
875- CUCO_CUDA_TRY (cuco::detail::memcpy_async (
876- &temp_count, d_num_out, sizeof (size_type), cudaMemcpyDeviceToHost, stream));
836+ auto const begin = cuda::make_transform_iterator (
837+ cuda::counting_iterator{size_type{0 }},
838+ detail::open_addressing_ns::get_slot<has_payload, storage_ref_type>(this ->storage_ref ()));
839+ auto const is_filled = detail::open_addressing_ns::slot_is_filled<has_payload, key_type>{
840+ this ->empty_key_sentinel (), this ->erased_key_sentinel ()};
841+
842+ std::size_t temp_storage_bytes = 0 ;
843+
844+ CUCO_CUDA_TRY (cub::DeviceSelect::If (nullptr ,
845+ temp_storage_bytes,
846+ begin,
847+ output_begin,
848+ d_num_out,
849+ this ->capacity (),
850+ is_filled,
851+ stream.get ()));
852+
853+ auto d_temp_storage = temp_allocator.allocate (temp_storage_bytes, stream);
854+
855+ CUCO_CUDA_TRY (cub::DeviceSelect::If (d_temp_storage,
856+ temp_storage_bytes,
857+ begin,
858+ output_begin,
859+ d_num_out,
860+ this ->capacity (),
861+ is_filled,
862+ stream.get ()));
863+
864+ size_type h_num_out;
865+ CUCO_CUDA_TRY (cuco::detail::memcpy_async (
866+ &h_num_out, d_num_out, sizeof (size_type), cudaMemcpyDeviceToHost, stream));
877867#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
878- stream.sync ();
868+ stream.sync ();
879869#else
880- stream.wait ();
870+ stream.wait ();
881871#endif
882- h_num_out += temp_count;
883- temp_allocator.deallocate (d_temp_storage, temp_storage_bytes, stream);
884- }
885872
873+ temp_allocator.deallocate (d_temp_storage, temp_storage_bytes, stream);
886874 temp_allocator.deallocate (reinterpret_cast <char *>(d_num_out), sizeof (size_type), stream);
887875
888876 return output_begin + h_num_out;
0 commit comments