[BUG]: static_multimap::retrieve gives wrong output with custom equality/hasher

### Is this a duplicate?

- [x] I confirmed there appear to be no duplicate issues for this bug (https://github.com/NVIDIA/cuCollections/issues)

### Type of Bug

Silent Failure

### Describe the bug

@nmetzinger and I were debugging some piece of code and found the issue to be wrong results retrieved from a static_multimap.


We are using a custom hasher / equality operator for retrieve to associate additional data with the retrieved values via the `output_probe` buffer. However, retrieve results are only partially correct.


Below is the reproducer. Hash table contents are loaded from file. The file is included in the attached self-contained example. With the current cuCollections commit, CUDA 13.3 with its included CCCL, we get the following output on sm_120.

```
loaded 7909 pairs
d_numHitsPerKey:
11 0 0 0 0 0 0 0 0 0 6 3 0 
numReads 1
numKeys 13
numHits 20

with original keys
d_keysOfHits:
23846566 23846566 23846566 23846566 23846566 23846566 24892615 24892615 24892615 8083701 8083701 8083701 8083701 8083701 8083701 8083701 8083701 8083701 8083701 8083701 
d_hits:
4412 6106 3995 4756 7175 2621 4443 3996 7176 1426 2821 1669 3370 6454 6592 4039 5454 6546 3541 4345 


with custom keys
querying h_customKeys:
(8083701,0) (23797427,1) (23320168,2) (8047878,3) (22651094,4) (20191956,5) (8608406,6) (7033559,7) (23747319,8) (21716725,9) (23846566,10) (24892615,11) (24811765,12) 

d_customKeysOfHits:
(23846566,10) (23846566,10) (23846566,10) (23846566,10) (23846566,10) (23846566,10) (24892615,11) (24892615,11) (24892615,11) (8083701,0) (8083701,0) (8083701,0) (8083701,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) 
d_hits:
4412 6106 3995 4756 7175 2621 4443 3996 7176 1426 2821 1669 3370 0 0 0 0 0 0 0 
```

Note that when using the custom approach, only 4 out of 11 values for key 8083701 are retrieved correctly.


```
// nvcc -std=c++17 -O0 -g -lineinfo --extended-lambda --expt-relaxed-constexpr -gencode=arch=compute_120,code=sm_120 -I cuCollections/include/ reproducer.cu -o reproducer
#include <cooperative_groups.h>
#include <cuco/static_multimap.cuh>
#include <fstream>
#include <iostream>
#include <limits>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <vector>

using key_type = uint64_t;

struct Location {
    Location() = default;

    __host__ __device__ constexpr Location(uint64_t data_)
: data(data_){}

    uint64_t data;
};

std::ostream& operator<<(std::ostream& os, const Location& loc) {
    os << loc.data;
    return os;
}

constexpr key_type empty_key_sentinel = std::numeric_limits<key_type>::max();
constexpr Location empty_value_sentinel{ std::numeric_limits<uint64_t>::max()};

using HashTable = cuco::static_multimap<key_type, Location>;

using CustomRetrieveKeyType = cuda::std::tuple<key_type, int>;


struct CustomKeyEqual {
    __host__ __device__ bool operator()(const CustomRetrieveKeyType& customKey,
                                        const key_type& key) const {
        return cuda::std::get<0>(customKey) == key;
    }

};

template<int hasherNum>
struct CustomKeyHash {

    using Hasher = typename HashTable::hasher;

    __host__ __device__ auto operator()(const CustomRetrieveKeyType& customKey) const {
        auto key = cuda::std::get<0>(customKey);

        auto hasher = cuda::std::get<hasherNum>(Hasher{});
        return hasher(key);
    }
};



template <typename MapRef, typename InputIterator, typename OutputIterator>
__global__ void getNumHitsPerKeyKernel(MapRef map, InputIterator keys,
                                       std::size_t n,
                                       OutputIterator numHitsPerKey) {
    namespace cg = cooperative_groups;

    constexpr auto cg_size = MapRef::cg_size;

    auto tile =
        cg::tiled_partition<cg_size, cg::thread_block>(cg::this_thread_block());

    int64_t const loop_stride = gridDim.x * blockDim.x / cg_size;
    int64_t idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;

    while (idx < n) {
        int keycount = map.count(tile, *(keys + idx));
        keycount = cg::reduce(tile, keycount, cg::plus<int>{});
        if (tile.thread_rank() == 0) {
            // uint64_t key = *(keys + idx);
            // printf("idx %lu, key %lu, count %d\n", idx, key, keycount);
            numHitsPerKey[idx] = keycount;
        }
        idx += loop_stride;
    }
}


template <typename InputIterator, typename OutputIterator>
void callGetNumHitsPerKeyKernel(HashTable& map, InputIterator d_keys,
                                std::size_t numKeys,
                                OutputIterator d_numHitsPerKey,
                                cudaStream_t stream) {
    auto map_ref_count = map.ref(cuco::count);
    int cg_size = decltype(map_ref_count)::cg_size;
    int blocksize = 128;
    int gridsize = cuda::ceil_div(numKeys, blocksize / cg_size);
    getNumHitsPerKeyKernel<<<gridsize, blocksize, 0, stream>>>(
        map_ref_count, d_keys, numKeys, d_numHitsPerKey);
}


int main() {
    std::string filename = "reference_kvpairs.bin";

    using KVPair = cuco::pair<key_type, Location>;

    std::vector<KVPair> reference_kvpairs;
    std::ifstream infile(filename);
    size_t numrefpairs;
    infile.read((char*)&numrefpairs, sizeof(size_t));
    reference_kvpairs.resize(numrefpairs);
    infile.read((char*)reference_kvpairs.data(), sizeof(KVPair) * numrefpairs);

    std::cout << "loaded " << numrefpairs << " pairs\n";

    double loadFactor = 0.5;
    HashTable gpuHashTable(numrefpairs / loadFactor,
                           cuco::empty_key{empty_key_sentinel},
                           cuco::empty_value{empty_value_sentinel});

    thrust::device_vector<KVPair> d_kvpairs = reference_kvpairs;

    gpuHashTable.insert(d_kvpairs.begin(), d_kvpairs.end());

    // querying
    thrust::device_vector<key_type> d_queryKeys{
        8083701ull,  23797427ull, 23320168ull, 8047878ull,  22651094ull,
        20191956ull, 8608406ull,  7033559ull,  23747319ull, 21716725ull,
        23846566ull, 24892615ull, 24811765ull};
    thrust::device_vector<int> d_queryOtherData{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
    assert(d_queryKeys.size() == d_queryOtherData.size());

    // thrust::reverse(d_queryKeys.begin(), d_queryKeys.end());
    // thrust::reverse(d_queryOtherData.begin(), d_queryOtherData.end());

    const int numReads = 1;
    const int numKeys = d_queryKeys.size();
    thrust::device_vector<int> d_numHitsPerKey(numKeys);

    callGetNumHitsPerKeyKernel(gpuHashTable, d_queryKeys.begin(), numKeys,
                               d_numHitsPerKey.begin(), (cudaStream_t)0);

    std::cout << "d_numHitsPerKey:\n";
    for (int i = 0; i < numKeys; i++) {
        std::cout << d_numHitsPerKey[i] << " ";
    }
    std::cout << "\n";

    const int numHits =
        thrust::reduce(d_numHitsPerKey.begin(), d_numHitsPerKey.end());

    std::cout << "numReads " << numReads << "\n";
    std::cout << "numKeys " << numKeys << "\n";
    std::cout << "numHits " << numHits << "\n";

    std::cout << "\n";

    std::cout << "with original keys\n";
    {
        thrust::device_vector<Location> d_hits(numHits);
        thrust::device_vector<key_type> d_keysOfHits(numHits);

        gpuHashTable.retrieve(
            d_queryKeys.begin(),
            d_queryKeys.end(),
            d_keysOfHits.begin(),
            cuda::make_transform_output_iterator(
                d_hits.begin(),
                [] __device__(auto cucopair) -> Location { return cucopair.second; }
            )
        );

        cudaDeviceSynchronize();

        std::cout << "d_keysOfHits:\n";
        for (int i = 0; i < numHits; i++) {
            std::cout << d_keysOfHits[i] << " ";
        }
        std::cout << "\n";

        std::cout << "d_hits:\n";
        for (int i = 0; i < numHits; i++) {
            std::cout << d_hits[i] << " ";
        }
        std::cout << "\n";
    }

    std::cout << "\n";
    std::cout << "\n";


    std::cout << "with custom keys\n";

    {
        auto customKeys = thrust::make_zip_iterator(d_queryKeys.begin(),
                                                    d_queryOtherData.begin());

        std::vector<CustomRetrieveKeyType> h_customKeys(numKeys);
        thrust::copy(customKeys, customKeys + numKeys, h_customKeys.begin());

        std::cout << "querying h_customKeys:\n";
        for (int i = 0; i < numKeys; i++) {
            CustomRetrieveKeyType tup = h_customKeys[i];
            std::cout << "(" << cuda::std::get<0>(tup) << ","
                      << cuda::std::get<1>(tup) << ") ";
        }
        std::cout << "\n";
        std::cout << "\n";

        thrust::device_vector<Location> d_hits(numHits);
        thrust::device_vector<CustomRetrieveKeyType>
            d_customKeysOfHits(numHits);

        CustomKeyEqual probe_eq;
        CustomKeyHash<0> probe_hash_0;
        CustomKeyHash<1> probe_hash_1;
        auto probe_doublehasher = cuda::std::make_tuple(probe_hash_0, probe_hash_1);
        gpuHashTable.retrieve(customKeys,
            customKeys + numKeys,
            probe_eq,
            probe_doublehasher,
            d_customKeysOfHits.begin(),
            cuda::make_transform_output_iterator(
                d_hits.begin(),
                [] __device__(auto cucopair) -> Location { return cucopair.second; }
            )
        );

        cudaDeviceSynchronize();

        std::cout << "d_customKeysOfHits:\n";
        for (int i = 0; i < numHits; i++) {
            CustomRetrieveKeyType tup = d_customKeysOfHits[i];
            std::cout << "(" << cuda::std::get<0>(tup) << ","
                      << cuda::std::get<1>(tup) << ") ";
        }
        std::cout << "\n";

        std::cout << "d_hits:\n";
        for (int i = 0; i < numHits; i++) {
            std::cout << d_hits[i] << " ";
        }
        std::cout << "\n";
    }

    std::cout << "\n";
    std::cout << "\n";



}
```

### How to Reproduce

[cuco_reproducer.zip](https://github.com/user-attachments/files/28971640/cuco_reproducer.zip)

### Expected behavior

All results should be correct

### Reproduction link

_No response_

### Operating System

Linux

### nvidia-smi output

+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 610.43.02              KMD Version: 610.43.02     CUDA UMD Version: 13.3     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA RTX PRO 6000 Blac...    On  |   00000000:C1:00.0 Off |                    0 |
| N/A   29C    P8             34W /  600W |       0MiB /  97887MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+

### NVCC version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2026 NVIDIA Corporation
Built on Fri_Apr_24_07:22:02_PM_PDT_2026
Cuda compilation tools, release 13.3, V13.3.33
Build cuda_13.3.r13.3/compiler.37862127_0

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[BUG]: static_multimap::retrieve gives wrong output with custom equality/hasher #822

Is this a duplicate?

Type of Bug

Describe the bug

How to Reproduce

Expected behavior

Reproduction link

Operating System

nvidia-smi output

NVCC version

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

[BUG]: static_multimap::retrieve gives wrong output with custom equality/hasher #822

Description

Is this a duplicate?

Type of Bug

Describe the bug

How to Reproduce

Expected behavior

Reproduction link

Operating System

nvidia-smi output

NVCC version

Metadata

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Issue actions