Skip to content

Commit fa595c1

Browse files
committed
Extend 128-bit atomics: helper dedup + bench types + skipped tests
1 parent 640f775 commit fa595c1

10 files changed

Lines changed: 94 additions & 70 deletions

File tree

benchmarks/benchmark_defaults.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@
2525

2626
namespace cuco::benchmark::defaults {
2727

28-
using KEY_TYPE_RANGE = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;
29-
using VALUE_TYPE_RANGE = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;
28+
using KEY_TYPE_RANGE = nvbench::type_list<nvbench::int32_t, nvbench::int64_t, __int128_t>;
29+
using VALUE_TYPE_RANGE = nvbench::type_list<nvbench::int32_t, nvbench::int64_t, __int128_t>;
3030
using HASH_RANGE = nvbench::type_list<cuco::identity_hash<char>,
3131
cuco::xxhash_32<char>,
3232
cuco::xxhash_64<char>,

benchmarks/benchmark_utils.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,3 +92,5 @@ NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::uniform,
9292
NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::gaussian,
9393
"GAUSSIAN",
9494
"distribution::gaussian");
95+
96+
NVBENCH_DECLARE_TYPE_STRINGS(__int128_t, "I128", "__int128_t");

include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh

Lines changed: 34 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -545,41 +545,17 @@ class open_addressing_ref_impl {
545545

546546
// If the key is already in the container, return false
547547
if (eq_res == detail::equal_result::EQUAL) {
548-
if constexpr (has_payload and sizeof(value_type) > 8) {
549-
#if (__CUDA_ARCH__ >= 900)
550-
if constexpr (not cuco::detail::is_packable<value_type>()) {
551-
this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel());
552-
}
553-
#else
554-
this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel());
555-
#endif
556-
}
548+
this->maybe_wait_for_payload(slot_ptr);
557549
return {iterator{slot_ptr}, false};
558550
}
559551
if (eq_res == detail::equal_result::AVAILABLE) {
560552
switch (this->attempt_insert_stable(slot_ptr, bucket_slots[i], val)) {
561553
case insert_result::SUCCESS: {
562-
if constexpr (has_payload and sizeof(value_type) > 8) {
563-
#if (__CUDA_ARCH__ >= 900)
564-
if constexpr (not cuco::detail::is_packable<value_type>()) {
565-
this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel());
566-
}
567-
#else
568-
this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel());
569-
#endif
570-
}
554+
this->maybe_wait_for_payload(slot_ptr);
571555
return {iterator{slot_ptr}, true};
572556
}
573557
case insert_result::DUPLICATE: {
574-
if constexpr (has_payload and sizeof(value_type) > 8) {
575-
#if (__CUDA_ARCH__ >= 900)
576-
if constexpr (not cuco::detail::is_packable<value_type>()) {
577-
this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel());
578-
}
579-
#else
580-
this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel());
581-
#endif
582-
}
558+
this->maybe_wait_for_payload(slot_ptr);
583559
return {iterator{slot_ptr}, false};
584560
}
585561
default: continue;
@@ -647,17 +623,7 @@ class open_addressing_ref_impl {
647623
if (group_finds_equal) {
648624
auto const src_lane = __ffs(group_finds_equal) - 1;
649625
auto const res = group.shfl(reinterpret_cast<intptr_t>(slot_ptr), src_lane);
650-
if (group.thread_rank() == src_lane) {
651-
if constexpr (has_payload and sizeof(value_type) > 8) {
652-
#if (__CUDA_ARCH__ >= 900)
653-
if constexpr (not cuco::detail::is_packable<value_type>()) {
654-
this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel());
655-
}
656-
#else
657-
this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel());
658-
#endif
659-
}
660-
}
626+
if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); }
661627
group.sync();
662628
return {iterator{reinterpret_cast<value_type*>(res)}, false};
663629
}
@@ -673,32 +639,12 @@ class open_addressing_ref_impl {
673639

674640
switch (group.shfl(status, src_lane)) {
675641
case insert_result::SUCCESS: {
676-
if (group.thread_rank() == src_lane) {
677-
if constexpr (has_payload and sizeof(value_type) > 8) {
678-
#if (__CUDA_ARCH__ >= 900)
679-
if constexpr (not cuco::detail::is_packable<value_type>()) {
680-
this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel());
681-
}
682-
#else
683-
this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel());
684-
#endif
685-
}
686-
}
642+
if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); }
687643
group.sync();
688644
return {iterator{reinterpret_cast<value_type*>(res)}, true};
689645
}
690646
case insert_result::DUPLICATE: {
691-
if (group.thread_rank() == src_lane) {
692-
if constexpr (has_payload and sizeof(value_type) > 8) {
693-
#if (__CUDA_ARCH__ >= 900)
694-
if constexpr (not cuco::detail::is_packable<value_type>()) {
695-
this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel());
696-
}
697-
#else
698-
this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel());
699-
#endif
700-
}
701-
}
647+
if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); }
702648
group.sync();
703649
return {iterator{reinterpret_cast<value_type*>(res)}, false};
704650
}
@@ -1973,6 +1919,34 @@ class open_addressing_ref_impl {
19731919
} while (cuco::detail::bitwise_compare(current, sentinel));
19741920
}
19751921

1922+
/**
1923+
* @brief Conditionally spin-waits for the payload of a non-atomically inserted slot to become
1924+
* visible.
1925+
*
1926+
* For containers where the key and value are inserted by separate instructions
1927+
* (`cas_dependent_write` / `back_to_back_cas`), an observer thread may see the key before the
1928+
* payload. This helper spins until the payload is visible. For atomic single-CAS paths (slot
1929+
* size <= 8 bytes, or a packable slot on sm_90+ via `atom.cas.b128`), the payload is already
1930+
* visible and this is a no-op. It is also a no-op for containers without a payload.
1931+
*
1932+
* @tparam SlotPtr Pointer-like type to a slot holding a `.second` payload member
1933+
*
1934+
* @param slot_ptr Pointer to the slot whose payload may need waiting on
1935+
*/
1936+
template <typename SlotPtr>
1937+
__device__ void maybe_wait_for_payload(SlotPtr slot_ptr) noexcept
1938+
{
1939+
if constexpr (has_payload and sizeof(value_type) > 8) {
1940+
#if (__CUDA_ARCH__ >= 900)
1941+
if constexpr (not cuco::detail::is_packable<value_type>()) {
1942+
this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel());
1943+
}
1944+
#else
1945+
this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel());
1946+
#endif
1947+
}
1948+
}
1949+
19761950
// TODO: Clean up the sentinel handling since it's duplicated in ref and equal wrapper
19771951
value_type empty_slot_sentinel_; ///< Sentinel value indicating an empty slot
19781952
detail::equal_wrapper<key_type, key_equal, allows_duplicates>

tests/static_map/heterogeneous_lookup_test.cu

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
#include <test_utils.hpp>
1818

19+
#include <cuco/detail/__config>
1920
#include <cuco/static_map.cuh>
2021

2122
#include <cuda/functional>
@@ -88,6 +89,10 @@ TEMPLATE_TEST_CASE_SIG("static_map heterogeneous lookup tests",
8889
(int64_t, 1),
8990
(int64_t, 2),
9091
#endif
92+
#if defined(CUCO_HAS_128BIT_ATOMICS)
93+
(__int128_t, 1),
94+
(__int128_t, 2),
95+
#endif
9196

9297
(int32_t, 1),
9398
(int32_t, 2))

tests/static_map/shared_memory_test.cu

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
#include <test_utils.hpp>
1818

19+
#include <cuco/detail/__config>
1920
#include <cuco/static_map.cuh>
2021

2122
#include <cuda/functional>
@@ -72,7 +73,12 @@ TEMPLATE_TEST_CASE_SIG("static_map shared memory tests",
7273
(int32_t, int32_t),
7374
(int32_t, int64_t),
7475
(int64_t, int32_t),
75-
(int64_t, int64_t))
76+
(int64_t, int64_t)
77+
#if defined(CUCO_HAS_128BIT_ATOMICS)
78+
,
79+
(__int128_t, __int128_t)
80+
#endif
81+
)
7682
{
7783
constexpr std::size_t number_of_maps = 1000;
7884
constexpr std::size_t elements_in_map = 500;

tests/static_multimap/heterogeneous_lookup_test.cu

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,9 @@ struct custom_key_equal {
7878
}
7979
};
8080

81+
// TODO: extend with __int128_t once the multimap can handle slots larger than
82+
// 32 bytes (key_pair<__int128_t> is 32 bytes, so pair<key_pair<__int128_t>, V>
83+
// exceeds the current slot-size budget).
8184
TEMPLATE_TEST_CASE(
8285
"static_multimap heterogeneous lookup tests",
8386
"",

tests/static_multiset/large_input_test.cu

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
#include <test_utils.hpp>
1818

19+
#include <cuco/detail/__config>
1920
#include <cuco/static_multiset.cuh>
2021

2122
#include <cuda/iterator>
@@ -56,9 +57,17 @@ TEMPLATE_TEST_CASE_SIG(
5657
"",
5758
((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize),
5859
(int64_t, cuco::test::probe_sequence::double_hashing, 1),
59-
(int64_t, cuco::test::probe_sequence::double_hashing, 2))
60+
(int64_t, cuco::test::probe_sequence::double_hashing, 2)
61+
#if defined(CUCO_HAS_128BIT_ATOMICS)
62+
,
63+
(__int128_t, cuco::test::probe_sequence::double_hashing, 1),
64+
(__int128_t, cuco::test::probe_sequence::double_hashing, 2)
65+
#endif
66+
)
6067
{
61-
constexpr std::size_t num_keys{1'200'000'000};
68+
// Reduce the key count for 16-byte keys to stay within GPU memory.
69+
// 1.2B * 8B * 2 (capacity) = 19.2GB; 300M * 16B * 2 = 9.6GB.
70+
constexpr std::size_t num_keys = (sizeof(Key) >= 16) ? 300'000'000 : 1'200'000'000;
6271

6372
using extent_type = cuco::extent<std::size_t>;
6473
using probe = cuco::double_hashing<CGSize, cuco::default_hash_function<Key>>;

tests/static_set/heterogeneous_lookup_test.cu

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
#include <test_utils.hpp>
1818

19+
#include <cuco/detail/__config>
1920
#include <cuco/static_set.cuh>
2021

2122
#include <cuda/functional>
@@ -84,7 +85,13 @@ TEMPLATE_TEST_CASE_SIG("static_set heterogeneous lookup tests",
8485
"",
8586
((typename T, int CGSize), T, CGSize),
8687
(int32_t, 1),
87-
(int32_t, 2))
88+
(int32_t, 2)
89+
#if defined(CUCO_HAS_128BIT_ATOMICS)
90+
,
91+
(__int128_t, 1),
92+
(__int128_t, 2)
93+
#endif
94+
)
8895
{
8996
using Key = T;
9097
using InsertKey = key_pair<T>;

tests/static_set/large_input_test.cu

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
#include <test_utils.hpp>
1818

19+
#include <cuco/detail/__config>
1920
#include <cuco/static_set.cuh>
2021

2122
#include <cuda/iterator>
@@ -74,9 +75,17 @@ TEMPLATE_TEST_CASE_SIG(
7475
"",
7576
((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize),
7677
(int64_t, cuco::test::probe_sequence::double_hashing, 1),
77-
(int64_t, cuco::test::probe_sequence::double_hashing, 2))
78+
(int64_t, cuco::test::probe_sequence::double_hashing, 2)
79+
#if defined(CUCO_HAS_128BIT_ATOMICS)
80+
,
81+
(__int128_t, cuco::test::probe_sequence::double_hashing, 1),
82+
(__int128_t, cuco::test::probe_sequence::double_hashing, 2)
83+
#endif
84+
)
7885
{
79-
constexpr std::size_t num_keys{1'200'000'000};
86+
// Reduce the key count for 16-byte keys to stay within GPU memory.
87+
// 1.2B * 8B * 2 (capacity) = 19.2GB; 300M * 16B * 2 = 9.6GB.
88+
constexpr std::size_t num_keys = (sizeof(Key) >= 16) ? 300'000'000 : 1'200'000'000;
8089

8190
using extent_type = cuco::extent<std::size_t>;
8291
using probe = cuco::double_hashing<CGSize, cuco::default_hash_function<Key>>;

tests/static_set/shared_memory_test.cu

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
#include <test_utils.hpp>
1818

19+
#include <cuco/detail/__config>
1920
#include <cuco/static_set.cuh>
2021

2122
#include <cuda/functional>
@@ -63,8 +64,16 @@ __global__ void shared_memory_test_kernel(Ref* sets,
6364
}
6465
}
6566

66-
TEMPLATE_TEST_CASE_SIG(
67-
"static_set shared memory tests", "", ((typename Key), Key), (int32_t), (int64_t))
67+
TEMPLATE_TEST_CASE_SIG("static_set shared memory tests",
68+
"",
69+
((typename Key), Key),
70+
(int32_t),
71+
(int64_t)
72+
#if defined(CUCO_HAS_128BIT_ATOMICS)
73+
,
74+
(__int128_t)
75+
#endif
76+
)
6877
{
6978
constexpr std::size_t number_of_sets = 1000;
7079
constexpr std::size_t elements_in_set = 500;

0 commit comments

Comments
 (0)