Skip to content

Commit c4ab17f

Browse files
authored
Enable 128bit atomics and unify static type checks (#803)
Uses `cuda::atomic_ref` 128-bit CAS on Hopper and newer (sm_90+, via `atom.cas.b128`) to lift key/value size limits and add a faster atomic path for 16-byte slots.

## What this enables on sm_90+

- `static_set` with keys up to 16 bytes (e.g., `__uint128_t`, UUID-like structs). Previously capped at 8 bytes.
- `static_map` with keys and/or mapped types up to 16 bytes. The total slot can grow to 32 bytes (e.g., `cuco::pair<__uint128_t, __uint128_t>`).
- Single-CAS atomic writes for packable 16-byte slots. `static_map` with `cuco::pair<K, V>` where `sizeof(K) + sizeof(V) == 16` and no padding (e.g., `cuco::pair<uint64_t, uint64_t>`) now uses one `atom.cas.b128` instead of `back_to_back_cas`. This removes the `wait_for_payload` spin required by the split-CAS path, so observers never see a half-written slot.
- Tests and benchmarks now cover 16-byte keys and 32-byte slots.

## Static Type Checks

- Open-addressing size and type constraints are now centralized in shared implementation checks, so owning containers and ref types enforce the same requirements.
- Public documentation now states the relevant constraints explicitly: key size, slot size, payload size, and `cuco::is_bitwise_comparable_v` requirements.
- Public max-size constants are exposed via `cuco/constraints.cuh` as `cuco::open_addressing_max_key_size`, `cuco::open_addressing_max_payload_size`, and `cuco::open_addressing_max_slot_size`.

## What changes on pre-sm_90

Nothing. The existing `packed_cas` (≤ 8 B), `back_to_back_cas` (Volta+), and `cas_dependent_write` (pre-Volta) paths are unchanged. Size limits remain 8 B key / 16 B slot.
1 parent 8576506 commit c4ab17f

66 files changed

Lines changed: 1012 additions & 211 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
1717

1818
set(rapids-cmake-version 26.04)
19+
set(rapids-cmake-branch "release/${rapids-cmake-version}")
1920
if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake)
2021
file(DOWNLOAD
2122
https://raw.githubusercontent.com/rapidsai/rapids-cmake/release/${rapids-cmake-version}/RAPIDS.cmake
@@ -74,7 +75,7 @@ rapids_find_package(
7475
###################################################################################################
7576
# - find packages we depend on --------------------------------------------------------------------
7677

77-
rapids_cpm_init()
78+
rapids_cpm_init(OVERRIDE "${CMAKE_CURRENT_SOURCE_DIR}/cmake/cccl_override.json")
7879

7980
include(cmake/thirdparty/get_cccl.cmake)
8081

benchmarks/benchmark_defaults.hpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
#pragma once
1818

19+
#include <cuco/detail/__config>
1920
#include <cuco/hash_functions.cuh>
2021

2122
#include <nvbench/nvbench.cuh>
@@ -25,12 +26,17 @@
2526

2627
namespace cuco::benchmark::defaults {
2728

29+
#if defined(CUCO_HAS_128BIT_ATOMICS)
30+
using KEY_TYPE_RANGE = nvbench::type_list<nvbench::int32_t, nvbench::int64_t, __int128_t>;
31+
using VALUE_TYPE_RANGE = nvbench::type_list<nvbench::int32_t, nvbench::int64_t, __int128_t>;
32+
#else
2833
using KEY_TYPE_RANGE = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;
2934
using VALUE_TYPE_RANGE = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;
30-
using HASH_RANGE = nvbench::type_list<cuco::identity_hash<char>,
31-
cuco::xxhash_32<char>,
32-
cuco::xxhash_64<char>,
33-
cuco::murmurhash3_32<char>>; //,
35+
#endif
36+
using HASH_RANGE = nvbench::type_list<cuco::identity_hash<char>,
37+
cuco::xxhash_32<char>,
38+
cuco::xxhash_64<char>,
39+
cuco::murmurhash3_32<char>>; //,
3440
// cuco::murmurhash3_x86_128<char>,
3541
// cuco::murmurhash3_x64_128<char>>; // TODO handle tuple-like hash value
3642

benchmarks/benchmark_utils.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
#pragma once
1818

19+
#include <cuco/detail/__config>
1920
#include <cuco/detail/error.hpp>
2021
#include <cuco/utility/key_generator.cuh>
2122

@@ -92,3 +93,7 @@ NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::uniform,
9293
NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::gaussian,
9394
"GAUSSIAN",
9495
"distribution::gaussian");
96+
97+
#if defined(CUCO_HAS_128BIT_ATOMICS)
98+
NVBENCH_DECLARE_TYPE_STRINGS(__int128_t, "I128", "__int128_t");
99+
#endif

ci/matrix.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ devcontainer_version: '26.04'
3737
pull_request:
3838
nvcc:
3939
- {cuda: *cuda_oldest, os: 'ubuntu22.04', cpu: 'amd64', compiler: {name: 'gcc', version: '11', exe: 'g++'}, gpu_build_archs: '70,80', std: [17], jobs: ['build', 'test']}
40-
- {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'gcc', version: '14', exe: 'g++'}, gpu_build_archs: '80,90,100', std: [17], jobs: ['build', 'test']}
40+
- {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'gcc', version: '14', exe: 'g++'}, gpu_build_archs: '90,100', std: [17], jobs: ['build', 'test']}
4141
- {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'arm64', compiler: {name: 'gcc', version: '14', exe: 'g++'}, gpu_build_archs: '80,90,100', std: [17], jobs: ['build']}
4242
- {cuda: *cuda_oldest, os: 'ubuntu20.04', cpu: 'amd64', compiler: {name: 'llvm', version: '14', exe: 'clang++'}, gpu_build_archs: '70', std: [17], jobs: ['build']}
4343
- {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'llvm', version: '21', exe: 'clang++'}, gpu_build_archs: '100', std: [17], jobs: ['build']}

cmake/cccl_override.json

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
"packages": {
3+
"CCCL": {
4+
"version": "3.3.0",
5+
"git_url": "https://github.com/NVIDIA/cccl.git",
6+
"git_tag": "09094af138841ef521de1adbbdd18ab8b3dad47b",
7+
"git_shallow": false,
8+
"patches": [
9+
{
10+
"file": "${current_json_dir}/patches/cccl_fix_128bit_cas.patch",
11+
"issue": "Fix 128-bit atomic CAS operand indices [https://github.com/NVIDIA/cccl/issues/8402]",
12+
"fixed_in": "3.3.2"
13+
}
14+
]
15+
}
16+
}
17+
}
cmake/patches/cccl_fix_128bit_cas.patch

Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
From 1898944000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2+
From: Daniel Juenger <sleeepyjack@users.noreply.github.com>
3+
Date: Tue, 14 Apr 2026 00:00:00 +0000
4+
Subject: [PATCH] Fix codegen in 128bit atomic CAS (#8403)
5+
6+
Fix wrong inline asm operand indices in all atom.cas.*.b128 variants.
7+
See https://github.com/NVIDIA/cccl/issues/8402
8+
---
9+
.../cuda/std/__atomic/functions/cuda_ptx_generated.h | 80 ++++++++++----------
10+
1 file changed, 40 insertions(+), 40 deletions(-)
11+
12+
diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h
13+
index f3e30d53039..479815f4136 100644
14+
--- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h
15+
+++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h
16+
@@ -1585,8 +1585,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
17+
{
18+
.reg .b128 _d;
19+
.reg .b128 _v;
20+
- mov.b128 _d, {%0, %1};
21+
- mov.b128 _v, {%4, %5};
22+
+ mov.b128 _d, {%3, %4};
23+
+ mov.b128 _v, {%5, %6};
24+
atom.cas.acquire.cta.b128 _d,[%2],_d,_v;
25+
mov.b128 {%0, %1}, _d;
26+
}
27+
@@ -1604,8 +1604,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
28+
{
29+
.reg .b128 _d;
30+
.reg .b128 _v;
31+
- mov.b128 _d, {%0, %1};
32+
- mov.b128 _v, {%4, %5};
33+
+ mov.b128 _d, {%3, %4};
34+
+ mov.b128 _v, {%5, %6};
35+
atom.cas.acquire.cluster.b128 _d,[%2],_d,_v;
36+
mov.b128 {%0, %1}, _d;
37+
}
38+
@@ -1623,8 +1623,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
39+
{
40+
.reg .b128 _d;
41+
.reg .b128 _v;
42+
- mov.b128 _d, {%0, %1};
43+
- mov.b128 _v, {%4, %5};
44+
+ mov.b128 _d, {%3, %4};
45+
+ mov.b128 _v, {%5, %6};
46+
atom.cas.acquire.gpu.b128 _d,[%2],_d,_v;
47+
mov.b128 {%0, %1}, _d;
48+
}
49+
@@ -1642,8 +1642,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
50+
{
51+
.reg .b128 _d;
52+
.reg .b128 _v;
53+
- mov.b128 _d, {%0, %1};
54+
- mov.b128 _v, {%4, %5};
55+
+ mov.b128 _d, {%3, %4};
56+
+ mov.b128 _v, {%5, %6};
57+
atom.cas.acquire.sys.b128 _d,[%2],_d,_v;
58+
mov.b128 {%0, %1}, _d;
59+
}
60+
@@ -1661,8 +1661,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
61+
{
62+
.reg .b128 _d;
63+
.reg .b128 _v;
64+
- mov.b128 _d, {%0, %1};
65+
- mov.b128 _v, {%4, %5};
66+
+ mov.b128 _d, {%3, %4};
67+
+ mov.b128 _v, {%5, %6};
68+
atom.cas.relaxed.cta.b128 _d,[%2],_d,_v;
69+
mov.b128 {%0, %1}, _d;
70+
}
71+
@@ -1680,8 +1680,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
72+
{
73+
.reg .b128 _d;
74+
.reg .b128 _v;
75+
- mov.b128 _d, {%0, %1};
76+
- mov.b128 _v, {%4, %5};
77+
+ mov.b128 _d, {%3, %4};
78+
+ mov.b128 _v, {%5, %6};
79+
atom.cas.relaxed.cluster.b128 _d,[%2],_d,_v;
80+
mov.b128 {%0, %1}, _d;
81+
}
82+
@@ -1699,8 +1699,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
83+
{
84+
.reg .b128 _d;
85+
.reg .b128 _v;
86+
- mov.b128 _d, {%0, %1};
87+
- mov.b128 _v, {%4, %5};
88+
+ mov.b128 _d, {%3, %4};
89+
+ mov.b128 _v, {%5, %6};
90+
atom.cas.relaxed.gpu.b128 _d,[%2],_d,_v;
91+
mov.b128 {%0, %1}, _d;
92+
}
93+
@@ -1718,8 +1718,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
94+
{
95+
.reg .b128 _d;
96+
.reg .b128 _v;
97+
- mov.b128 _d, {%0, %1};
98+
- mov.b128 _v, {%4, %5};
99+
+ mov.b128 _d, {%3, %4};
100+
+ mov.b128 _v, {%5, %6};
101+
atom.cas.relaxed.sys.b128 _d,[%2],_d,_v;
102+
mov.b128 {%0, %1}, _d;
103+
}
104+
@@ -1737,8 +1737,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
105+
{
106+
.reg .b128 _d;
107+
.reg .b128 _v;
108+
- mov.b128 _d, {%0, %1};
109+
- mov.b128 _v, {%4, %5};
110+
+ mov.b128 _d, {%3, %4};
111+
+ mov.b128 _v, {%5, %6};
112+
atom.cas.release.cta.b128 _d,[%2],_d,_v;
113+
mov.b128 {%0, %1}, _d;
114+
}
115+
@@ -1756,8 +1756,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
116+
{
117+
.reg .b128 _d;
118+
.reg .b128 _v;
119+
- mov.b128 _d, {%0, %1};
120+
- mov.b128 _v, {%4, %5};
121+
+ mov.b128 _d, {%3, %4};
122+
+ mov.b128 _v, {%5, %6};
123+
atom.cas.release.cluster.b128 _d,[%2],_d,_v;
124+
mov.b128 {%0, %1}, _d;
125+
}
126+
@@ -1775,8 +1775,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
127+
{
128+
.reg .b128 _d;
129+
.reg .b128 _v;
130+
- mov.b128 _d, {%0, %1};
131+
- mov.b128 _v, {%4, %5};
132+
+ mov.b128 _d, {%3, %4};
133+
+ mov.b128 _v, {%5, %6};
134+
atom.cas.release.gpu.b128 _d,[%2],_d,_v;
135+
mov.b128 {%0, %1}, _d;
136+
}
137+
@@ -1794,8 +1794,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
138+
{
139+
.reg .b128 _d;
140+
.reg .b128 _v;
141+
- mov.b128 _d, {%0, %1};
142+
- mov.b128 _v, {%4, %5};
143+
+ mov.b128 _d, {%3, %4};
144+
+ mov.b128 _v, {%5, %6};
145+
atom.cas.release.sys.b128 _d,[%2],_d,_v;
146+
mov.b128 {%0, %1}, _d;
147+
}
148+
@@ -1813,8 +1813,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
149+
{
150+
.reg .b128 _d;
151+
.reg .b128 _v;
152+
- mov.b128 _d, {%0, %1};
153+
- mov.b128 _v, {%4, %5};
154+
+ mov.b128 _d, {%3, %4};
155+
+ mov.b128 _v, {%5, %6};
156+
atom.cas.acq_rel.cta.b128 _d,[%2],_d,_v;
157+
mov.b128 {%0, %1}, _d;
158+
}
159+
@@ -1832,8 +1832,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
160+
{
161+
.reg .b128 _d;
162+
.reg .b128 _v;
163+
- mov.b128 _d, {%0, %1};
164+
- mov.b128 _v, {%4, %5};
165+
+ mov.b128 _d, {%3, %4};
166+
+ mov.b128 _v, {%5, %6};
167+
atom.cas.acq_rel.cluster.b128 _d,[%2],_d,_v;
168+
mov.b128 {%0, %1}, _d;
169+
}
170+
@@ -1851,8 +1851,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
171+
{
172+
.reg .b128 _d;
173+
.reg .b128 _v;
174+
- mov.b128 _d, {%0, %1};
175+
- mov.b128 _v, {%4, %5};
176+
+ mov.b128 _d, {%3, %4};
177+
+ mov.b128 _v, {%5, %6};
178+
atom.cas.acq_rel.gpu.b128 _d,[%2],_d,_v;
179+
mov.b128 {%0, %1}, _d;
180+
}
181+
@@ -1870,8 +1870,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
182+
{
183+
.reg .b128 _d;
184+
.reg .b128 _v;
185+
- mov.b128 _d, {%0, %1};
186+
- mov.b128 _v, {%4, %5};
187+
+ mov.b128 _d, {%3, %4};
188+
+ mov.b128 _v, {%5, %6};
189+
atom.cas.acq_rel.sys.b128 _d,[%2],_d,_v;
190+
mov.b128 {%0, %1}, _d;
191+
}
192+
@@ -1889,8 +1889,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
193+
{
194+
.reg .b128 _d;
195+
.reg .b128 _v;
196+
- mov.b128 _d, {%0, %1};
197+
- mov.b128 _v, {%4, %5};
198+
+ mov.b128 _d, {%3, %4};
199+
+ mov.b128 _v, {%5, %6};
200+
atom.cas.cta.b128 _d,[%2],_d,_v;
201+
mov.b128 {%0, %1}, _d;
202+
}
203+
@@ -1908,8 +1908,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
204+
{
205+
.reg .b128 _d;
206+
.reg .b128 _v;
207+
- mov.b128 _d, {%0, %1};
208+
- mov.b128 _v, {%4, %5};
209+
+ mov.b128 _d, {%3, %4};
210+
+ mov.b128 _v, {%5, %6};
211+
atom.cas.cluster.b128 _d,[%2],_d,_v;
212+
mov.b128 {%0, %1}, _d;
213+
}
214+
@@ -1927,8 +1927,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
215+
{
216+
.reg .b128 _d;
217+
.reg .b128 _v;
218+
- mov.b128 _d, {%0, %1};
219+
- mov.b128 _v, {%4, %5};
220+
+ mov.b128 _d, {%3, %4};
221+
+ mov.b128 _v, {%5, %6};
222+
atom.cas.gpu.b128 _d,[%2],_d,_v;
223+
mov.b128 {%0, %1}, _d;
224+
}
225+
@@ -1946,8 +1946,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
226+
{
227+
.reg .b128 _d;
228+
.reg .b128 _v;
229+
- mov.b128 _d, {%0, %1};
230+
- mov.b128 _v, {%4, %5};
231+
+ mov.b128 _d, {%3, %4};
232+
+ mov.b128 _v, {%5, %6};
233+
atom.cas.sys.b128 _d,[%2],_d,_v;
234+
mov.b128 {%0, %1}, _d;
235+
}
236+
--
237+
2.45.2
238+

include/cuco/constraints.cuh

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
/*
2+
* Copyright (c) 2026, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include <cuco/detail/__config>
20+
21+
#include <cstddef>
22+
23+
namespace cuco {
24+
25+
/// Maximum supported key size (in bytes) for open-addressing containers.
26+
inline constexpr std::size_t open_addressing_max_key_size =
27+
#if defined(CUCO_HAS_128BIT_ATOMICS)
28+
16;
29+
#else
30+
8;
31+
#endif
32+
33+
/// Maximum supported payload/mapped type size (in bytes) for open-addressing containers.
34+
/// Tied to `open_addressing_max_key_size`: a slot stores at most a key plus an equally-sized
35+
/// payload.
36+
inline constexpr std::size_t open_addressing_max_payload_size = open_addressing_max_key_size;
37+
38+
/// Maximum supported slot size (in bytes) for open-addressing containers.
39+
/// Tied to `open_addressing_max_key_size`: a slot stores at most a key plus an equally-sized
40+
/// payload.
41+
inline constexpr std::size_t open_addressing_max_slot_size =
42+
open_addressing_max_key_size + open_addressing_max_payload_size;
43+
44+
} // namespace cuco

include/cuco/detail/__config

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,10 @@
5252
#define CUCO_HAS_INT128
5353
#endif
5454

55+
#if defined(CUCO_HAS_INT128) && (CUCO_CUDA_MINIMUM_ARCH >= 900)
56+
#define CUCO_HAS_128BIT_ATOMICS
57+
#endif
58+
5559
#if defined(CUDART_VERSION) && (CUDART_VERSION >= 12000)
5660
#define CUCO_HAS_CG_REDUCE_UPDATE_ASYNC
5761
#endif

0 commit comments

Comments
 (0)