|
30 | 30 | #include <cuda/std/__algorithm/max.h> // TODO #include <cuda/std/algorithm> once available |
31 | 31 | #include <cuda/std/bit> |
32 | 32 | #include <cuda/std/cstddef> |
| 33 | +#include <cuda/std/limits> |
33 | 34 | #include <cuda/std/span> |
| 35 | +#include <cuda/std/type_traits> |
34 | 36 | #include <cuda/std/utility> |
35 | 37 | #include <cuda/stream_ref> |
36 | 38 | #include <thrust/iterator/constant_iterator.h> |
@@ -62,6 +64,9 @@ class hyperloglog_impl { |
62 | 64 | using fp_type = double; ///< Floating point type used for reduction |
63 | 65 | using hash_value_type = cuda::std::remove_cvref_t<decltype(cuda::std::declval<Hash>()( |
64 | 66 | cuda::std::declval<T>()))>; ///< Hash value type |
| 67 | + static_assert(cuda::std::is_unsigned_v<hash_value_type>, |
| 68 | + "HyperLogLog requires an unsigned hash value type"); |
| 69 | + |
65 | 70 | public: |
66 | 71 | static constexpr auto thread_scope = Scope; ///< CUDA thread scope |
67 | 72 |
|
@@ -153,13 +158,11 @@ class hyperloglog_impl { |
153 | 158 | */ |
154 | 159 | __device__ constexpr void add(T const& item) noexcept |
155 | 160 | { |
156 | | - auto const h = this->hash_(item); |
157 | | - auto const reg = h & this->register_mask(); |
158 | | - auto const zeroes = cuda::std::countl_zero(h | this->register_mask()) + 1; // __clz |
159 | | - |
160 | | - // reversed order (same one as Spark uses) |
161 | | - // auto const reg = h >> ((sizeof(hash_value_type) * 8) - this->precision_); |
162 | | - // auto const zeroes = cuda::std::countl_zero(h << this->precision_) + 1; |
| 161 | + constexpr auto hash_bits = cuda::std::numeric_limits<hash_value_type>::digits; |
| 162 | + auto const h = this->hash_(item); |
| 163 | + auto const reg = static_cast<int>(h >> (hash_bits - this->precision_)); |
| 164 | + auto const w_padding = hash_value_type{1} << static_cast<hash_value_type>(this->precision_ - 1); |
| 165 | + auto const zeroes = cuda::std::countl_zero((h << this->precision_) | w_padding) + 1; // __clz |
163 | 166 |
|
164 | 167 | this->update_max(reg, zeroes); |
165 | 168 | } |
@@ -405,7 +408,7 @@ class hyperloglog_impl { |
405 | 408 | int thread_zeroes = 0; |
406 | 409 | for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) { |
407 | 410 | auto const reg = this->sketch_[i]; |
408 | | - thread_sum += fp_type{1} / static_cast<fp_type>(1 << reg); |
| 411 | + thread_sum += fp_type{1} / static_cast<fp_type>(1ull << reg); |
409 | 412 | thread_zeroes += reg == 0; |
410 | 413 | } |
411 | 414 |
|
@@ -626,16 +629,6 @@ class hyperloglog_impl { |
626 | 629 | } |
627 | 630 | } |
628 | 631 |
|
629 | | - /** |
630 | | - * @brief Gets the register mask used to separate register index from count. |
631 | | - * |
632 | | - * @return The register mask |
633 | | - */ |
634 | | - __host__ __device__ constexpr hash_value_type register_mask() const noexcept |
635 | | - { |
636 | | - return (1ull << this->precision_) - 1; |
637 | | - } |
638 | | - |
639 | 632 | hasher hash_; ///< Hash function used to hash items |
640 | 633 | int32_t precision_; ///< HLL precision parameter |
641 | 634 | cuda::std::span<register_type> sketch_; ///< HLL sketch storage |
|
0 commit comments