@@ -63,10 +63,12 @@ template <class Hash,
6363 uint32_t GroupsPerBlock = WordsPerBlock>
6464class parametric_filter_policy {
6565 public:
66- using hasher = Hash;
67- using word_type = Word;
68- using hash_argument_type = typename hasher::argument_type;
69- using hash_result_type = decltype (std::declval<hasher>()(std::declval<hash_argument_type>()));
66+ using hasher = Hash; // /< 64-bit hash functor type
67+ using word_type = Word; // /< Underlying filter-block word type
68+ using hash_argument_type = typename hasher::argument_type; // /< Hash function input type
69+ using hash_result_type =
70+ decltype (std::declval<hasher>()(std::declval<hash_argument_type>())); // /< Hash function
71+ // /< output type
7072
7173 private:
7274 static constexpr uint32_t max_salts = 64 ;
@@ -84,8 +86,8 @@ class parametric_filter_policy {
8486 static constexpr uint32_t word_bits = cuda::std::numeric_limits<word_type>::digits;
8587
8688 public:
87- static constexpr uint32_t words_per_block = WordsPerBlock;
88- static constexpr uint32_t pattern_bits = PatternBits;
89+ static constexpr uint32_t words_per_block = WordsPerBlock; // /< Number of words per filter block
90+ static constexpr uint32_t pattern_bits = PatternBits; // /< Fingerprint bits per key
8991
9092 static constexpr uint32_t add_horizontal_layout =
9193 AddHorizontalLayout; // /< horizontal vectorization layout for add operation
@@ -96,31 +98,39 @@ class parametric_filter_policy {
9698 static constexpr uint32_t contains_vertical_layout =
9799 ContainsVerticalLayout; // /< vertical vectorization layout for contains operation
98100
99- static constexpr size_t max_filter_blocks = cuda::std::numeric_limits< uint32_t >::max();
100- // This ensures each word in the block has at least one bit set; otherwise we would never
101- // use some of the words
101+ static constexpr size_t max_filter_blocks =
102+ cuda::std::numeric_limits< uint32_t >::max(); // /< Upper bound on the number of filter blocks
103+ // / Lower bound on `pattern_bits`: at least one bit per word so every word contributes.
102104 static constexpr auto min_pattern_bits = words_per_block;
103- // The maximum number of bits to be set for a key is capped by the total number of bits in
104- // the filter block, as well as the number of available salts
105+ // / Upper bound on `pattern_bits`: the total number of bits in a filter block, capped by the
106+ // / number of available salts.
105107 static constexpr auto max_pattern_bits = cuda::std::min(word_bits * words_per_block, max_salts);
106108
107109 // ===----------Cache-Sectorized----------===//
108- static constexpr uint32_t groups_per_block = GroupsPerBlock;
109- static constexpr bool is_cache_sectorized = groups_per_block != words_per_block ? true : false ;
110- static constexpr uint32_t words_per_group = words_per_block / groups_per_block;
110+ static constexpr uint32_t groups_per_block =
111+ GroupsPerBlock; // /< Cache-sectorization groups per block (paper's z)
112+ static constexpr bool is_cache_sectorized =
113+ groups_per_block != words_per_block ? true : false ; // /< CSBF mode flag
114+ static constexpr uint32_t words_per_group =
115+ words_per_block / groups_per_block; // /< Words per cache-sectorization group
111116 // TODO: when `pattern_bits % groups_per_block != 0`, using a ceil packs all remainder bits into
112117 // the first `pattern_bits / max_bits_per_group` groups, leaving later groups with a zero
113118 // expected pattern. This wastes block capacity and inflates FPR. Distribute floor bits to every
114119 // group plus one extra bit to the first `pattern_bits % groups_per_block` groups, and update
115120 // the salt-to-group mapping in `set_bits` accordingly.
116- static constexpr uint32_t max_bits_per_group =
117- cuco::detail::int_div_ceil (pattern_bits, groups_per_block);
118- static constexpr uint32_t add_groups_per_vertical_layout = add_vertical_layout / words_per_group;
121+ static constexpr uint32_t max_bits_per_group = cuco::detail::int_div_ceil(
122+ pattern_bits, groups_per_block); // /< CSBF: max fingerprint bits set per group per key
123+ static constexpr uint32_t add_groups_per_vertical_layout =
124+ add_vertical_layout / words_per_group; // /< CSBF: groups touched per add vertical step
119125 static constexpr uint32_t contains_groups_per_vertical_layout =
120- contains_vertical_layout / words_per_group;
121- static constexpr uint32_t group_index_salt = 0x5bd1e995U ;
122- static constexpr uint32_t group_index_width = cuda::std::bit_width(words_per_group - 1 );
123- static constexpr uint32_t group_index_mask = words_per_group - 1 ;
126+ contains_vertical_layout /
127+ words_per_group; // /< CSBF: groups touched per contains vertical step
128+ static constexpr uint32_t group_index_salt =
129+ 0x5bd1e995U ; // /< CSBF: salt for selecting one word per group
130+ static constexpr uint32_t group_index_width = cuda::std::bit_width(
131+ words_per_group - 1 ); // /< CSBF: bits needed to encode an in-group word index
132+ static constexpr uint32_t group_index_mask =
133+ words_per_group - 1 ; // /< CSBF: mask for selecting an in-group word index
124134
125135 private:
126136 static constexpr uint32_t bit_index_width = cuda::std::bit_width(word_bits - 1 );
@@ -135,6 +145,11 @@ class parametric_filter_policy {
135145 cuco::detail::int_div_ceil (pattern_bits, words_per_block);
136146
137147 public:
148+ /* *
149+ * @brief Constructs a parametric filter policy.
150+ *
151+ * @param hash Hash function used to generate fingerprints.
152+ */
138153 __host__ __device__ constexpr parametric_filter_policy (Hash hash = {}) : hash_{hash}
139154 {
140155 static_assert (pattern_bits >= min_pattern_bits,
@@ -178,28 +193,69 @@ class parametric_filter_policy {
178193 " within 32 bits" );
179194 }
180195
181- // Return {upper 32b, lower 32b} of 64b hash
196+ /* *
197+ * @brief Splits the 64-bit hash of a key into its upper and lower 32 bits.
198+ *
199+ * The upper half is used for block selection (via multiply-shift); the lower half drives the
200+ * per-word fingerprint pattern via salt-based multiplicative hashing.
201+ *
202+ * @param key Key to hash.
203+ *
204+ * @return `{upper 32 bits, lower 32 bits}` of the 64-bit hash.
205+ */
182206 __device__ constexpr cuda::std::pair<uint32_t , uint32_t > split_hash (hash_argument_type key) const
183207 {
184208 auto const hash_value = hash_ (key);
185209 return {static_cast <uint32_t >(hash_value >> 32 ), static_cast <uint32_t >(hash_value)};
186210 }
187211
212+ /* *
213+ * @brief Determines the filter block a key maps to via fast multiply-shift modulo.
214+ *
215+ * @tparam Extent Size type used to determine the number of blocks in the filter.
216+ *
217+ * @param upper_hash_value Upper 32 bits of the key's hash.
218+ * @param num_blocks Number of blocks in the filter.
219+ *
220+ * @return Block index in `[0, num_blocks)`.
221+ */
188222 template <class Extent >
189223 __device__ constexpr auto block_index (uint32_t upper_hash_value, Extent num_blocks) const
190224 {
191- // return upper_hash_value % num_blocks;
192225 return static_cast <uint32_t >((static_cast <uint64_t >(upper_hash_value) *
193226 static_cast <typename Extent::value_type>(num_blocks)) >>
194227 32 );
195228 }
196229
230+ /* *
231+ * @brief Generates the per-word fingerprint pattern for a key when the horizontal layout is 1.
232+ *
233+ * @tparam LoopIndex Outer-loop iteration index when `words_per_block / VerticalLayout > 1`.
234+ * @tparam VerticalLayout Number of contiguous words this call produces.
235+ *
236+ * @param lower_hash_value Lower 32 bits of the key's hash.
237+ *
238+ * @return Array of `VerticalLayout` (or `groups_per_vertical_layout` in CSBF mode) words.
239+ */
197240 template <uint32_t LoopIndex, uint32_t VerticalLayout>
198241 __device__ constexpr auto array_pattern (uint32_t lower_hash_value) const
199242 {
200243 return pattern_impl<LoopIndex, VerticalLayout>(lower_hash_value);
201244 }
202245
246+ /* *
247+ * @brief Generates the per-word fingerprint pattern for a key when the horizontal layout is > 1.
248+ *
249+ * @tparam LoopIndex Outer-loop iteration index.
250+ * @tparam HorizontalLayout Cooperative-group size cooperating on a single key.
251+ * @tparam VerticalLayout Number of contiguous words this call produces.
252+ *
253+ * @param lower_hash_value Lower 32 bits of the key's hash.
254+ * @param thread_index Caller's rank within the cooperative group.
255+ *
256+ * @return Array of `VerticalLayout` (or `groups_per_vertical_layout` in CSBF mode) words owned
257+ * by the calling thread.
258+ */
203259 template <uint32_t LoopIndex, uint32_t HorizontalLayout, uint32_t VerticalLayout>
204260 __device__ constexpr auto array_pattern (uint32_t lower_hash_value, uint32_t thread_index) const
205261 {
0 commit comments