From 7e00710328888a4ca9d565662da79f0d5a051593 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 1 May 2026 16:52:29 +0000
Subject: [PATCH] Revert PR #580 streaming workaround (CCCL #1422 resolved)

---
 .../open_addressing/open_addressing_impl.cuh  | 80 ++++++++----------
 1 file changed, 34 insertions(+), 46 deletions(-)

diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh
index c1f97ae42..b4e0d528f 100644
--- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh
+++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh
@@ -829,60 +829,48 @@ class open_addressing_impl : private open_addressing_compatible<Key,
     using temp_allocator_type =
       typename std::allocator_traits<allocator_type>::template rebind_alloc<char>;
-    cuco::detail::index_type constexpr stride = std::numeric_limits<int32_t>::max();
-
-    cuco::detail::index_type h_num_out{0};
     auto temp_allocator = temp_allocator_type{this->allocator()};
     auto d_num_out = reinterpret_cast<size_type*>(temp_allocator.allocate(sizeof(size_type), stream));
-    // TODO: PR #580 to be reverted once https://github.com/NVIDIA/cccl/issues/1422 is resolved
-    for (cuco::detail::index_type offset = 0;
-         offset < static_cast<cuco::detail::index_type>(this->capacity());
-         offset += stride) {
-      auto const num_items =
-        std::min(static_cast<cuco::detail::index_type>(this->capacity()) - offset, stride);
-      auto const begin = cuda::make_transform_iterator(
-        cuda::counting_iterator{static_cast<size_type>(offset)},
-        detail::open_addressing_ns::get_slot(this->storage_ref()));
-      auto const is_filled = detail::open_addressing_ns::slot_is_filled{
-        this->empty_key_sentinel(), this->erased_key_sentinel()};
-
-      std::size_t temp_storage_bytes = 0;
-
-      CUCO_CUDA_TRY(cub::DeviceSelect::If(nullptr,
-                                          temp_storage_bytes,
-                                          begin,
-                                          output_begin + h_num_out,
-                                          d_num_out,
-                                          static_cast<int32_t>(num_items),
-                                          is_filled,
-                                          stream.get()));
-
-      // Allocate temporary storage
-      auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes, stream);
-
-      CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage,
-                                          temp_storage_bytes,
-                                          begin,
-                                          output_begin + h_num_out,
-                                          d_num_out,
-                                          static_cast<int32_t>(num_items),
-                                          is_filled,
-                                          stream.get()));
-
-      size_type temp_count;
-      CUCO_CUDA_TRY(cuco::detail::memcpy_async(
-        &temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream));
+    auto const begin = cuda::make_transform_iterator(
+      cuda::counting_iterator{size_type{0}},
+      detail::open_addressing_ns::get_slot(this->storage_ref()));
+    auto const is_filled = detail::open_addressing_ns::slot_is_filled{
+      this->empty_key_sentinel(), this->erased_key_sentinel()};
+
+    std::size_t temp_storage_bytes = 0;
+
+    CUCO_CUDA_TRY(cub::DeviceSelect::If(nullptr,
+                                        temp_storage_bytes,
+                                        begin,
+                                        output_begin,
+                                        d_num_out,
+                                        this->capacity(),
+                                        is_filled,
+                                        stream.get()));
+
+    auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes, stream);
+
+    CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage,
+                                        temp_storage_bytes,
+                                        begin,
+                                        output_begin,
+                                        d_num_out,
+                                        this->capacity(),
+                                        is_filled,
+                                        stream.get()));
+
+    size_type h_num_out;
+    CUCO_CUDA_TRY(cuco::detail::memcpy_async(
+      &h_num_out, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream));
 #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
-      stream.sync();
+    stream.sync();
 #else
-      stream.wait();
+    stream.wait();
 #endif
-      h_num_out += temp_count;
-      temp_allocator.deallocate(d_temp_storage, temp_storage_bytes, stream);
-    }
+    temp_allocator.deallocate(d_temp_storage, temp_storage_bytes, stream);
     temp_allocator.deallocate(reinterpret_cast<char*>(d_num_out), sizeof(size_type), stream);

     return output_begin + h_num_out;