Is this a duplicate?
Type of Bug
Silent Failure
Describe the bug
@nmetzinger and I were debugging some piece of code and found the issue to be wrong results retrieved from a static_multimap.
We are using a custom hasher / equality operator for retrieve to associate additional data with the retrieved values via the output_probe buffer. However, retrieve results are only partially correct.
Below is the reproducer. Hash table contents are loaded from file. The file is included in the attached self-contained example. With the current cuCollections commit, CUDA 13.3 with its included CCCL, we get the following output on sm_120.
loaded 7909 pairs
d_numHitsPerKey:
11 0 0 0 0 0 0 0 0 0 6 3 0
numReads 1
numKeys 13
numHits 20
with original keys
d_keysOfHits:
23846566 23846566 23846566 23846566 23846566 23846566 24892615 24892615 24892615 8083701 8083701 8083701 8083701 8083701 8083701 8083701 8083701 8083701 8083701 8083701
d_hits:
4412 6106 3995 4756 7175 2621 4443 3996 7176 1426 2821 1669 3370 6454 6592 4039 5454 6546 3541 4345
with custom keys
querying h_customKeys:
(8083701,0) (23797427,1) (23320168,2) (8047878,3) (22651094,4) (20191956,5) (8608406,6) (7033559,7) (23747319,8) (21716725,9) (23846566,10) (24892615,11) (24811765,12)
d_customKeysOfHits:
(23846566,10) (23846566,10) (23846566,10) (23846566,10) (23846566,10) (23846566,10) (24892615,11) (24892615,11) (24892615,11) (8083701,0) (8083701,0) (8083701,0) (8083701,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0)
d_hits:
4412 6106 3995 4756 7175 2621 4443 3996 7176 1426 2821 1669 3370 0 0 0 0 0 0 0
Note that when using the custom approach, only 4 out of 11 values for key 8083701 are retrieved correctly.
// nvcc -std=c++17 -O0 -g -lineinfo --extended-lambda --expt-relaxed-constexpr -gencode=arch=compute_120,code=sm_120 -I cuCollections/include/ reproducer.cu -o reproducer
#include <cooperative_groups.h>
#include <cuco/static_multimap.cuh>
#include <fstream>
#include <iostream>
#include <limits>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <vector>
using key_type = uint64_t;
struct Location {
Location() = default;
__host__ __device__ constexpr Location(uint64_t data_)
: data(data_){}
uint64_t data;
};
std::ostream& operator<<(std::ostream& os, const Location& loc) {
os << loc.data;
return os;
}
constexpr key_type empty_key_sentinel = std::numeric_limits<key_type>::max();
constexpr Location empty_value_sentinel{ std::numeric_limits<uint64_t>::max()};
using HashTable = cuco::static_multimap<key_type, Location>;
using CustomRetrieveKeyType = cuda::std::tuple<key_type, int>;
struct CustomKeyEqual {
__host__ __device__ bool operator()(const CustomRetrieveKeyType& customKey,
const key_type& key) const {
return cuda::std::get<0>(customKey) == key;
}
};
template<int hasherNum>
struct CustomKeyHash {
using Hasher = typename HashTable::hasher;
__host__ __device__ auto operator()(const CustomRetrieveKeyType& customKey) const {
auto key = cuda::std::get<0>(customKey);
auto hasher = cuda::std::get<hasherNum>(Hasher{});
return hasher(key);
}
};
template <typename MapRef, typename InputIterator, typename OutputIterator>
__global__ void getNumHitsPerKeyKernel(MapRef map, InputIterator keys,
std::size_t n,
OutputIterator numHitsPerKey) {
namespace cg = cooperative_groups;
constexpr auto cg_size = MapRef::cg_size;
auto tile =
cg::tiled_partition<cg_size, cg::thread_block>(cg::this_thread_block());
int64_t const loop_stride = gridDim.x * blockDim.x / cg_size;
int64_t idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;
while (idx < n) {
int keycount = map.count(tile, *(keys + idx));
keycount = cg::reduce(tile, keycount, cg::plus<int>{});
if (tile.thread_rank() == 0) {
// uint64_t key = *(keys + idx);
// printf("idx %lu, key %lu, count %d\n", idx, key, keycount);
numHitsPerKey[idx] = keycount;
}
idx += loop_stride;
}
}
template <typename InputIterator, typename OutputIterator>
void callGetNumHitsPerKeyKernel(HashTable& map, InputIterator d_keys,
std::size_t numKeys,
OutputIterator d_numHitsPerKey,
cudaStream_t stream) {
auto map_ref_count = map.ref(cuco::count);
int cg_size = decltype(map_ref_count)::cg_size;
int blocksize = 128;
int gridsize = cuda::ceil_div(numKeys, blocksize / cg_size);
getNumHitsPerKeyKernel<<<gridsize, blocksize, 0, stream>>>(
map_ref_count, d_keys, numKeys, d_numHitsPerKey);
}
int main() {
std::string filename = "reference_kvpairs.bin";
using KVPair = cuco::pair<key_type, Location>;
std::vector<KVPair> reference_kvpairs;
std::ifstream infile(filename);
size_t numrefpairs;
infile.read((char*)&numrefpairs, sizeof(size_t));
reference_kvpairs.resize(numrefpairs);
infile.read((char*)reference_kvpairs.data(), sizeof(KVPair) * numrefpairs);
std::cout << "loaded " << numrefpairs << " pairs\n";
double loadFactor = 0.5;
HashTable gpuHashTable(numrefpairs / loadFactor,
cuco::empty_key{empty_key_sentinel},
cuco::empty_value{empty_value_sentinel});
thrust::device_vector<KVPair> d_kvpairs = reference_kvpairs;
gpuHashTable.insert(d_kvpairs.begin(), d_kvpairs.end());
// querying
thrust::device_vector<key_type> d_queryKeys{
8083701ull, 23797427ull, 23320168ull, 8047878ull, 22651094ull,
20191956ull, 8608406ull, 7033559ull, 23747319ull, 21716725ull,
23846566ull, 24892615ull, 24811765ull};
thrust::device_vector<int> d_queryOtherData{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
assert(d_queryKeys.size() == d_queryOtherData.size());
// thrust::reverse(d_queryKeys.begin(), d_queryKeys.end());
// thrust::reverse(d_queryOtherData.begin(), d_queryOtherData.end());
const int numReads = 1;
const int numKeys = d_queryKeys.size();
thrust::device_vector<int> d_numHitsPerKey(numKeys);
callGetNumHitsPerKeyKernel(gpuHashTable, d_queryKeys.begin(), numKeys,
d_numHitsPerKey.begin(), (cudaStream_t)0);
std::cout << "d_numHitsPerKey:\n";
for (int i = 0; i < numKeys; i++) {
std::cout << d_numHitsPerKey[i] << " ";
}
std::cout << "\n";
const int numHits =
thrust::reduce(d_numHitsPerKey.begin(), d_numHitsPerKey.end());
std::cout << "numReads " << numReads << "\n";
std::cout << "numKeys " << numKeys << "\n";
std::cout << "numHits " << numHits << "\n";
std::cout << "\n";
std::cout << "with original keys\n";
{
thrust::device_vector<Location> d_hits(numHits);
thrust::device_vector<key_type> d_keysOfHits(numHits);
gpuHashTable.retrieve(
d_queryKeys.begin(),
d_queryKeys.end(),
d_keysOfHits.begin(),
cuda::make_transform_output_iterator(
d_hits.begin(),
[] __device__(auto cucopair) -> Location { return cucopair.second; }
)
);
cudaDeviceSynchronize();
std::cout << "d_keysOfHits:\n";
for (int i = 0; i < numHits; i++) {
std::cout << d_keysOfHits[i] << " ";
}
std::cout << "\n";
std::cout << "d_hits:\n";
for (int i = 0; i < numHits; i++) {
std::cout << d_hits[i] << " ";
}
std::cout << "\n";
}
std::cout << "\n";
std::cout << "\n";
std::cout << "with custom keys\n";
{
auto customKeys = thrust::make_zip_iterator(d_queryKeys.begin(),
d_queryOtherData.begin());
std::vector<CustomRetrieveKeyType> h_customKeys(numKeys);
thrust::copy(customKeys, customKeys + numKeys, h_customKeys.begin());
std::cout << "querying h_customKeys:\n";
for (int i = 0; i < numKeys; i++) {
CustomRetrieveKeyType tup = h_customKeys[i];
std::cout << "(" << cuda::std::get<0>(tup) << ","
<< cuda::std::get<1>(tup) << ") ";
}
std::cout << "\n";
std::cout << "\n";
thrust::device_vector<Location> d_hits(numHits);
thrust::device_vector<CustomRetrieveKeyType>
d_customKeysOfHits(numHits);
CustomKeyEqual probe_eq;
CustomKeyHash<0> probe_hash_0;
CustomKeyHash<1> probe_hash_1;
auto probe_doublehasher = cuda::std::make_tuple(probe_hash_0, probe_hash_1);
gpuHashTable.retrieve(customKeys,
customKeys + numKeys,
probe_eq,
probe_doublehasher,
d_customKeysOfHits.begin(),
cuda::make_transform_output_iterator(
d_hits.begin(),
[] __device__(auto cucopair) -> Location { return cucopair.second; }
)
);
cudaDeviceSynchronize();
std::cout << "d_customKeysOfHits:\n";
for (int i = 0; i < numHits; i++) {
CustomRetrieveKeyType tup = d_customKeysOfHits[i];
std::cout << "(" << cuda::std::get<0>(tup) << ","
<< cuda::std::get<1>(tup) << ") ";
}
std::cout << "\n";
std::cout << "d_hits:\n";
for (int i = 0; i < numHits; i++) {
std::cout << d_hits[i] << " ";
}
std::cout << "\n";
}
std::cout << "\n";
std::cout << "\n";
}
How to Reproduce
cuco_reproducer.zip
Expected behavior
All results should be correct
Reproduction link
No response
Operating System
Linux
nvidia-smi output
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 610.43.02 KMD Version: 610.43.02 CUDA UMD Version: 13.3 |
+-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA RTX PRO 6000 Blac... On | 00000000:C1:00.0 Off | 0 |
| N/A 29C P8 34W / 600W | 0MiB / 97887MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
NVCC version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2026 NVIDIA Corporation
Built on Fri_Apr_24_07:22:02_PM_PDT_2026
Cuda compilation tools, release 13.3, V13.3.33
Build cuda_13.3.r13.3/compiler.37862127_0
Is this a duplicate?
Type of Bug
Silent Failure
Describe the bug
@nmetzinger and I were debugging some piece of code and found the issue to be wrong results retrieved from a static_multimap.
We are using a custom hasher / equality operator for retrieve to associate additional data with the retrieved values via the
output_probebuffer. However, retrieve results are only partially correct.Below is the reproducer. Hash table contents are loaded from file. The file is included in the attached self-contained example. With the current cuCollections commit, CUDA 13.3 with its included CCCL, we get the following output on sm_120.
Note that when using the custom approach, only 4 out of 11 values for key 8083701 are retrieved correctly.
How to Reproduce
cuco_reproducer.zip
Expected behavior
All results should be correct
Reproduction link
No response
Operating System
Linux
nvidia-smi output
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 610.43.02 KMD Version: 610.43.02 CUDA UMD Version: 13.3 |
+-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA RTX PRO 6000 Blac... On | 00000000:C1:00.0 Off | 0 |
| N/A 29C P8 34W / 600W | 0MiB / 97887MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
NVCC version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2026 NVIDIA Corporation
Built on Fri_Apr_24_07:22:02_PM_PDT_2026
Cuda compilation tools, release 13.3, V13.3.33
Build cuda_13.3.r13.3/compiler.37862127_0