Skip to content

Commit 080c04a

Browse files
authored
Add approx_distinct_count (rapidsai#20735)
Closes rapidsai#20774 This PR adds an `approx_distinct_count` class which uses Hyperloglog++ under the hood to provide approximate distinct count estimation. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Bradley Dice (https://github.com/bdice) - Muhammad Haseeb (https://github.com/mhaseeb123) - Nghia Truong (https://github.com/ttnghia) URL: rapidsai#20735
1 parent 570d95c commit 080c04a

6 files changed

Lines changed: 1501 additions & 0 deletions

File tree

cpp/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -629,6 +629,7 @@ add_library(
629629
src/quantiles/quantiles.cu
630630
src/reductions/all.cu
631631
src/reductions/any.cu
632+
src/reductions/approx_distinct_count.cu
632633
src/reductions/argmax.cu
633634
src/reductions/argmin.cu
634635
src/reductions/bitwise.cu
Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
#pragma once
7+
8+
#include <cudf/table/table_view.hpp>
9+
#include <cudf/types.hpp>
10+
#include <cudf/utilities/default_stream.hpp>
11+
#include <cudf/utilities/export.hpp>
12+
13+
#include <rmm/cuda_stream_view.hpp>
14+
15+
#include <cuda/std/span>
16+
17+
#include <cstddef>
18+
#include <cstdint>
19+
#include <memory>
20+
21+
namespace CUDF_EXPORT cudf {
22+
23+
// Forward declarations
24+
namespace hashing::detail {
25+
template <typename Key>
26+
struct XXHash_64;
27+
}
28+
29+
namespace detail {
30+
template <template <typename> class Hasher>
31+
class approx_distinct_count;
32+
}
33+
34+
/**
35+
* @brief Object-oriented HyperLogLog sketch for approximate distinct counting.
36+
*
37+
* This class provides an object-oriented interface to HyperLogLog sketches, allowing
38+
* incremental addition of data and cardinality estimation.
39+
*
40+
* The implementation uses XXHash64 to hash table rows into 64-bit values, which are
41+
* then added to the HyperLogLog sketch without additional hashing (identity function).
42+
*
43+
* @par HyperLogLog Precision Parameter
44+
* The precision parameter (p) is the number of bits used to index into the register array.
45+
* It determines the number of registers (m = 2^p) in the HLL sketch:
46+
* - Memory usage: 2^p * 4 bytes (m registers of 4 bytes each for GPU atomics)
47+
* - Standard error: 1.04 / sqrt(m) = 1.04 / sqrt(2^p)
48+
*
49+
* Common precision values:
50+
* - p = 10: m = 1,024 registers, ~3.2% standard error, 4KB memory
51+
* - p = 12 (default): m = 4,096 registers, ~1.6% standard error, 16KB memory
52+
* - p = 14: m = 16,384 registers, ~0.8% standard error, 64KB memory
53+
* - p = 16: m = 65,536 registers, ~0.4% standard error, 256KB memory
54+
*
55+
* Valid range: p ∈ [4, 18]. This is not a hard theoretical limit but an empirically
56+
* recommended range:
57+
* - Below 4: Too few registers for HLL's statistical assumptions, resulting in high
58+
* variance and unstable estimates.
59+
* - Above 18: Rapidly diminishing accuracy gains while incurring significant memory
60+
* growth, making the structure no longer space-efficient for approximate counting.
61+
*
62+
* This range represents a practical engineering compromise from HLL++ and is widely
63+
* adopted by systems such as Apache Spark. The default of 12 aligns with Spark's
64+
* configuration and is the largest precision that fits efficiently in GPU shared memory,
65+
* enabling optimal performance for our implementation.
66+
*
67+
* Example usage:
68+
* @code{.cpp}
69+
* auto adc = cudf::approx_distinct_count(table1);
70+
* auto count1 = adc.estimate();
71+
*
72+
* adc.add(table2);
73+
* auto count2 = adc.estimate();
74+
* @endcode
75+
*/
76+
class approx_distinct_count {
77+
public:
78+
using impl_type =
79+
cudf::detail::approx_distinct_count<cudf::hashing::detail::XXHash_64>; ///< Implementation type
80+
81+
/**
82+
* @brief Constructs an approximate distinct count sketch from a table
83+
*
84+
* @param input Table whose rows will be added to the sketch
85+
* @param precision The precision parameter for HyperLogLog (4-18). Higher precision gives
86+
* better accuracy but uses more memory. Default is 12.
87+
* @param null_handling `INCLUDE` or `EXCLUDE` rows with nulls (default: `EXCLUDE`)
88+
* @param nan_handling `NAN_IS_VALID` or `NAN_IS_NULL` (default: `NAN_IS_NULL`)
89+
* @param stream CUDA stream used for device memory operations and kernel launches
90+
*/
91+
approx_distinct_count(table_view const& input,
92+
std::int32_t precision = 12,
93+
null_policy null_handling = null_policy::EXCLUDE,
94+
nan_policy nan_handling = nan_policy::NAN_IS_NULL,
95+
rmm::cuda_stream_view stream = cudf::get_default_stream());
96+
97+
/**
98+
* @brief Constructs an approximate distinct count sketch from serialized sketch bytes
99+
*
100+
* This constructor enables distributed distinct counting by allowing sketches to be
101+
* constructed from serialized data. The sketch data is copied into the newly created
102+
* object, which then owns its own independent storage.
103+
*
104+
* @warning The precision parameter must match the precision used to create the original
105+
* sketch. The size of the sketch span must be exactly 2^precision * 4 bytes (2^precision
106+
* registers of 4 bytes each). The null and NaN handling policies must match those used
107+
* when creating the original sketch. Providing incompatible parameters will produce
* incorrect results or errors.
108+
*
109+
* @param sketch_span The serialized sketch bytes to reconstruct from
110+
* @param precision The precision parameter that was used to create the sketch (4-18)
111+
* @param null_handling `INCLUDE` or `EXCLUDE` rows with nulls (default: `EXCLUDE`)
112+
* @param nan_handling `NAN_IS_VALID` or `NAN_IS_NULL` (default: `NAN_IS_NULL`)
113+
* @param stream CUDA stream used for device memory operations and kernel launches
114+
*/
115+
approx_distinct_count(cuda::std::span<cuda::std::byte> sketch_span,
116+
std::int32_t precision,
117+
null_policy null_handling = null_policy::EXCLUDE,
118+
nan_policy nan_handling = nan_policy::NAN_IS_NULL,
119+
rmm::cuda_stream_view stream = cudf::get_default_stream());
120+
121+
~approx_distinct_count(); ///< Destructor; declared without an inline body (impl_type is only forward-declared in this header)
122+
123+
approx_distinct_count(approx_distinct_count const&) = delete; ///< Not copy-constructible
124+
approx_distinct_count& operator=(approx_distinct_count const&) = delete; ///< Not copy-assignable
125+
approx_distinct_count(approx_distinct_count&&) = default; ///< Default move constructor
126+
/**
127+
* @brief Move assignment operator
128+
*
129+
* @return A reference to this object
130+
*/
131+
approx_distinct_count& operator=(approx_distinct_count&&) = default;
132+
133+
/**
134+
* @brief Adds rows from a table to the sketch
135+
*
136+
* @param input Table whose rows will be added
137+
* @param stream CUDA stream used for device memory operations and kernel launches
138+
*/
139+
void add(table_view const& input, rmm::cuda_stream_view stream = cudf::get_default_stream());
140+
141+
/**
142+
* @brief Merges another sketch into this sketch
143+
*
144+
* After merging, this sketch will contain the combined distinct count estimate of both sketches.
145+
*
146+
* @throw std::invalid_argument if the sketches have different precision values
147+
* @throw std::invalid_argument if the sketches have different null handling policies
148+
* @throw std::invalid_argument if the sketches have different NaN handling policies
149+
*
150+
* @param other The sketch to merge into this sketch
151+
* @param stream CUDA stream used for device memory operations and kernel launches
152+
*/
153+
void merge(approx_distinct_count const& other,
154+
rmm::cuda_stream_view stream = cudf::get_default_stream());
155+
156+
/**
157+
* @brief Merges a sketch from raw bytes into this sketch
158+
*
159+
* This allows merging sketches that have been serialized or created elsewhere, enabling
160+
* distributed distinct counting scenarios.
161+
*
162+
* @warning It is the caller's responsibility to ensure that the provided sketch span was created
163+
* with the same approx_distinct_count configuration (precision, null/NaN handling, etc.) as this
164+
* sketch. Merging incompatible sketches will produce incorrect results.
165+
*
166+
* @param sketch_span The sketch bytes to merge into this sketch
167+
* @param stream CUDA stream used for device memory operations and kernel launches
168+
*/
169+
void merge(cuda::std::span<cuda::std::byte> sketch_span,
170+
rmm::cuda_stream_view stream = cudf::get_default_stream());
171+
172+
/**
173+
* @brief Estimates the approximate number of distinct rows in the sketch
174+
*
175+
* @param stream CUDA stream used for device memory operations and kernel launches
176+
* @return Approximate number of distinct rows
177+
*/
178+
[[nodiscard]] std::size_t estimate(
179+
rmm::cuda_stream_view stream = cudf::get_default_stream()) const;
180+
181+
/**
182+
* @brief Gets the raw sketch bytes for serialization or external merging
183+
*
184+
* The returned span provides access to the internal sketch storage.
185+
* This can be used to serialize the sketch, transfer it between processes,
186+
* or merge it with other sketches using the span-based merge API.
*
* NOTE(review): the span is a non-owning view of internal storage, so presumably it is only
* valid while this object is alive and not moved from -- confirm.
187+
*
188+
* @return A span view of the sketch bytes
189+
*/
190+
[[nodiscard]] cuda::std::span<cuda::std::byte> sketch() noexcept;
191+
192+
/**
193+
* @brief Gets the raw sketch bytes for serialization or external merging (const overload)
194+
*
195+
* The returned span provides access to the internal sketch storage.
196+
* This can be used to serialize the sketch, transfer it between processes,
197+
* or merge it with other sketches using the span-based merge API.
*
* NOTE(review): the span is a non-owning, read-only view of internal storage, so presumably
* it is only valid while this object is alive and not moved from -- confirm.
198+
*
199+
* @return A span view of the sketch bytes
200+
*/
201+
[[nodiscard]] cuda::std::span<cuda::std::byte const> sketch() const noexcept;
202+
203+
/**
204+
* @brief Gets the null handling policy for this sketch
205+
*
206+
* @return The null policy set at construction
207+
*/
208+
[[nodiscard]] null_policy null_handling() const noexcept;
209+
210+
/**
211+
* @brief Gets the NaN handling policy for this sketch
212+
*
213+
* @return The NaN policy set at construction
214+
*/
215+
[[nodiscard]] nan_policy nan_handling() const noexcept;
216+
217+
/**
218+
* @brief Gets the precision parameter for this sketch
219+
*
220+
* @return The precision value set at construction
221+
*/
222+
[[nodiscard]] std::int32_t precision() const noexcept;
223+
224+
private:
225+
std::unique_ptr<impl_type> _impl; ///< Owning pointer to the implementation object
226+
};
227+
228+
} // namespace CUDF_EXPORT cudf
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
#pragma once
7+
8+
#include <cudf/hashing.hpp>
9+
#include <cudf/table/table_view.hpp>
10+
#include <cudf/types.hpp>
11+
#include <cudf/utilities/default_stream.hpp>
12+
#include <cudf/utilities/export.hpp>
13+
14+
#include <rmm/cuda_stream_view.hpp>
15+
#include <rmm/mr/polymorphic_allocator.hpp>
16+
17+
#include <cuco/hyperloglog.cuh>
18+
#include <cuda/functional>
19+
#include <cuda/std/span>
20+
21+
#include <cstddef>
22+
#include <cstdint>
23+
24+
namespace CUDF_EXPORT cudf {
25+
namespace detail {
26+
27+
/**
28+
* @brief HyperLogLog-based approximate distinct count sketch for use by the public API
29+
*
30+
* This detail implementation provides the core HyperLogLog functionality used by the
31+
* public `cudf::approx_distinct_count` class. It maintains a cuco::hyperloglog sketch
32+
* for cardinality estimation.
33+
*
34+
* @tparam Hasher The hash function template to use for hashing table rows. Must be compatible
35+
* with cudf's row_hasher device_hasher interface (a template taking a Key type).
36+
*/
37+
template <template <typename> class Hasher>
38+
class approx_distinct_count {
39+
public:
40+
/**
41+
* @brief Constructs an approximate distinct count sketch from a table
42+
*
43+
* @param input Table whose rows will be added to the sketch
44+
* @param precision The precision parameter for HyperLogLog (4-18). Higher precision gives
45+
* better accuracy but uses more memory.
46+
* @param null_handling `INCLUDE` or `EXCLUDE` rows with nulls
47+
* @param nan_handling `NAN_IS_VALID` or `NAN_IS_NULL`
48+
* @param stream CUDA stream used for device memory operations and kernel launches
49+
*/
50+
approx_distinct_count(table_view const& input,
51+
std::int32_t precision,
52+
null_policy null_handling,
53+
nan_policy nan_handling,
54+
rmm::cuda_stream_view stream);
55+
56+
/**
57+
* @brief Constructs an approximate distinct count sketch from serialized sketch bytes
58+
*
59+
* @param sketch_span The serialized sketch bytes to reconstruct from
60+
* @param precision The precision parameter that was used to create the sketch (4-18)
61+
* @param null_handling `INCLUDE` or `EXCLUDE` rows with nulls
62+
* @param nan_handling `NAN_IS_VALID` or `NAN_IS_NULL`
63+
* @param stream CUDA stream used for device memory operations and kernel launches
64+
*/
65+
approx_distinct_count(cuda::std::span<cuda::std::byte> sketch_span,
66+
std::int32_t precision,
67+
null_policy null_handling,
68+
nan_policy nan_handling,
69+
rmm::cuda_stream_view stream);
70+
71+
approx_distinct_count() = delete; ///< No default construction; a table or serialized sketch is required
72+
~approx_distinct_count(); ///< Destructor (declared here without an inline body)
73+
approx_distinct_count(approx_distinct_count const&) = delete; ///< Not copy-constructible
74+
approx_distinct_count& operator=(approx_distinct_count const&) = delete; ///< Not copy-assignable
75+
approx_distinct_count(approx_distinct_count&&) noexcept = default; ///< Default move constructor
76+
approx_distinct_count& operator=(approx_distinct_count&&) = delete; ///< Not move-assignable
77+
78+
/**
79+
* @brief Adds rows from a table to the sketch
80+
*
81+
* Uses the null and NaN handling policies set at construction time.
82+
*
83+
* @param input Table whose rows will be added
84+
* @param stream CUDA stream used for device memory operations and kernel launches
85+
*/
86+
void add(table_view const& input, rmm::cuda_stream_view stream);
87+
88+
/**
89+
* @brief Merges another sketch into this sketch
90+
*
* NOTE(review): the public `cudf::approx_distinct_count::merge` documents
* `std::invalid_argument` for mismatched precision or null/NaN policies; presumably that
* check lives in this function -- confirm against the implementation.
*
91+
* @param other The sketch to merge into this sketch
92+
* @param stream CUDA stream used for device memory operations and kernel launches
93+
*/
94+
void merge(approx_distinct_count const& other, rmm::cuda_stream_view stream);
95+
96+
/**
97+
* @brief Merges a sketch from raw bytes into this sketch
98+
*
99+
* @warning It is the caller's responsibility to ensure that the provided sketch span was created
100+
* with the same approx_distinct_count configuration (precision, null/NaN handling, etc.) as this
101+
* sketch. Merging incompatible sketches will produce incorrect results.
102+
*
103+
* @param sketch_span The sketch bytes to merge into this sketch
104+
* @param stream CUDA stream used for device memory operations and kernel launches
105+
*/
106+
void merge(cuda::std::span<cuda::std::byte> sketch_span, rmm::cuda_stream_view stream);
107+
108+
/**
109+
* @brief Estimates the approximate number of distinct rows in the sketch
110+
*
111+
* @param stream CUDA stream used for device memory operations and kernel launches
112+
* @return Approximate number of distinct rows
113+
*/
114+
[[nodiscard]] std::size_t estimate(rmm::cuda_stream_view stream) const;
115+
116+
/**
117+
* @brief Gets the raw sketch bytes
118+
*
119+
* @return A span view of the sketch bytes
120+
*/
121+
[[nodiscard]] cuda::std::span<cuda::std::byte> sketch() noexcept;
122+
123+
/**
124+
* @brief Gets the raw sketch bytes (const overload)
125+
*
126+
* @return A span view of the sketch bytes
127+
*/
128+
[[nodiscard]] cuda::std::span<cuda::std::byte const> sketch() const noexcept;
129+
130+
/**
131+
* @brief Gets the null handling policy for this sketch
132+
*
133+
* @return The null policy set at construction
134+
*/
135+
[[nodiscard]] null_policy null_handling() const noexcept;
136+
137+
/**
138+
* @brief Gets the NaN handling policy for this sketch
139+
*
140+
* @return The NaN policy set at construction
141+
*/
142+
[[nodiscard]] nan_policy nan_handling() const noexcept;
143+
144+
/**
145+
* @brief Gets the precision parameter for this sketch
146+
*
* NOTE(review): this class stores no separate precision member, so the value is presumably
* derived from the underlying sketch (`_impl`) -- confirm against the implementation.
*
147+
* @return The precision value set at construction
148+
*/
149+
[[nodiscard]] std::int32_t precision() const noexcept;
150+
151+
private:
152+
// Underlying sketch type: a cuco HyperLogLog over 64-bit values with device thread scope,
// an identity hash, and an rmm polymorphic allocator of bytes.
// NOTE(review): the identity hash suggests rows are hashed with `Hasher` before insertion,
// so the sketch applies no further hashing -- confirm against the implementation.
using hll_type = cuco::hyperloglog<uint64_t,
153+
cuda::thread_scope_device,
154+
cuda::std::identity,
155+
rmm::mr::polymorphic_allocator<cuda::std::byte>>;
156+
hll_type _impl; ///< Underlying cuco HyperLogLog sketch
157+
null_policy _null_handling; ///< Immutable null handling policy
158+
nan_policy _nan_handling; ///< Immutable NaN handling policy
159+
};
160+
161+
} // namespace detail
162+
} // namespace CUDF_EXPORT cudf

0 commit comments

Comments
 (0)