Skip to content

Commit 91e8b8e

Browse files
Fix biased hash-list union estimate by routing it through the merger
1 parent 9392d19 commit 91e8b8e

4 files changed

Lines changed: 55 additions & 123 deletions

File tree

benches/hyperloglog_union.rs

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,47 @@ fn bench_hyperloglog_union(c: &mut Criterion) {
6060
group.finish();
6161
}
6262

63-
criterion_group!(benches, bench_hyperloglog_union);
63+
type HLL14 = HyperLogLog<Precision14, Bits6, <Precision14 as PackedRegister<Bits6>>::Array, XxHash64>;
64+
65+
/// Builds the largest counter that is still in hash-list mode (stops just before the
66+
/// insertion that would convert it to a fully-fledged HyperLogLog).
67+
fn build_full_hash_list(seed: u64) -> HLL14 {
68+
let mut hll = HLL14::default();
69+
for value in iter_random_values::<u64>(1_000_000, None, Some(seed)) {
70+
let mut candidate = hll.clone();
71+
candidate.insert(&value);
72+
if !candidate.is_hash_list() {
73+
break;
74+
}
75+
hll = candidate;
76+
}
77+
assert!(hll.is_hash_list());
78+
hll
79+
}
80+
81+
/// Compares the two ways of estimating a union cardinality when both operands are large
82+
/// hash lists (the regime where inclusion-exclusion is biased): the current
83+
/// inclusion-exclusion path versus the merger-based estimate.
84+
fn bench_union_estimate(c: &mut Criterion) {
85+
let mut group = c.benchmark_group("union_estimate_hash_list_p14");
86+
87+
let left = build_full_hash_list(0x00A1_1CE0);
88+
let right = build_full_hash_list(0x0000_B0B0);
89+
assert!(left.is_hash_list() && right.is_hash_list());
90+
91+
// Current: inclusion-exclusion (fast, biased low in this regime).
92+
group.bench_function("inclusion_exclusion", |b| {
93+
b.iter(|| black_box(black_box(&left).estimate_union_cardinality(black_box(&right))));
94+
});
95+
96+
// Proposed: build the merged counter and estimate its cardinality (accurate).
97+
group.bench_function("merger", |b| {
98+
b.iter(|| black_box((black_box(&left) | black_box(&right)).estimate_cardinality()));
99+
});
100+
101+
group.finish();
102+
}
103+
104+
criterion_group!(benches, bench_hyperloglog_union, bench_union_estimate);
64105

65106
criterion_main!(benches);

src/hyperloglog.rs

Lines changed: 13 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -475,48 +475,19 @@ impl<P: Precision, B: Bits, R: Registers<P, B>, H: HasherType> HyperLogLog<P, B,
475475
) -> f64 {
476476
match (self.is_hash_list(), other.is_hash_list()) {
477477
(true, true) => {
478-
let left_hash_bits = self.get_hash_bits().unwrap();
479-
let right_hash_bits = other.get_hash_bits().unwrap();
480-
assert!(left_hash_bits >= GapHash::<P, B>::SMALLEST_VIABLE_HASH_BITS);
481-
assert!(right_hash_bits >= GapHash::<P, B>::SMALLEST_VIABLE_HASH_BITS);
482-
483-
let left_shift = if left_hash_bits <= right_hash_bits {
484-
0
485-
} else {
486-
left_hash_bits - right_hash_bits
487-
};
488-
let right_shift = if right_hash_bits <= left_hash_bits {
489-
0
490-
} else {
491-
right_hash_bits - left_hash_bits
492-
};
493-
494-
let left_hashes = self.registers.as_ref();
495-
let right_hashes = other.registers.as_ref();
496-
let left_bit_index = self.get_writer_tell();
497-
let right_bit_index = other.get_writer_tell();
498-
499-
let intersection_cardinality = f64::from(intersection_from_sorted_iterators(
500-
GapHash::<P, B>::downgraded(
501-
left_hashes,
502-
self.get_number_of_hashes().unwrap(),
503-
left_hash_bits,
504-
left_bit_index,
505-
left_shift,
506-
),
507-
GapHash::<P, B>::downgraded(
508-
right_hashes,
509-
other.get_number_of_hashes().unwrap(),
510-
right_hash_bits,
511-
right_bit_index,
512-
right_shift,
513-
),
514-
));
515-
516-
let union_cardinality =
517-
self_cardinality + other_cardinality - intersection_cardinality;
518-
519-
correct_union_estimate(self_cardinality, other_cardinality, union_cardinality)
478+
// Build the union as a hash list and estimate its cardinality directly, so the
479+
// birthday-paradox correction is applied to the union the same way it is to a
480+
// single counter. Inclusion-exclusion (A + B - intersection) would subtract a
481+
// raw, uncorrected count of coinciding downgraded hashes; for sets with little
482+
// real overlap those coincidences are dominated by spurious birthday collisions,
483+
// which biases the union estimate low and increasingly so at higher precisions.
484+
let mut union = self.clone();
485+
union.merge(other);
486+
correct_union_estimate(
487+
self_cardinality,
488+
other_cardinality,
489+
union.estimate_cardinality(),
490+
)
520491
}
521492
(true, false) => {
522493
let hash_bits = self.get_hash_bits().unwrap();

src/utils.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,13 @@
99
1010
mod constants;
1111
mod hasher_type;
12-
mod intersection_from_sorted_iterators;
1312
mod matrix;
1413
mod number;
1514
mod random;
1615
mod variable_word;
1716

1817
pub use constants::*;
1918
pub use hasher_type::HasherType;
20-
pub(crate) use intersection_from_sorted_iterators::intersection_from_sorted_iterators;
2119
pub use matrix::Matrix;
2220
pub(crate) use number::{FloatOps, Number, PositiveInteger};
2321
pub use random::*;

src/utils/intersection_from_sorted_iterators.rs

Lines changed: 0 additions & 78 deletions
This file was deleted.

0 commit comments

Comments
 (0)