Skip to content

Commit abdb021

Browse files
Port Ertl secant-method cardinality MLE behind the mle feature
1 parent d59cb00 commit abdb021

2 files changed

Lines changed: 163 additions & 0 deletions

File tree

src/mle.rs

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,30 @@ impl<P: Precision, B: Bits, R: Registers<P, B>, H: HasherType> HyperLogLog<P, B,
4343
self.mle_union_from_registers(other)
4444
}
4545

46+
/// Returns the cardinality estimated with the single-counter Maximum Likelihood Estimation.
47+
///
48+
/// # Implementative details
49+
/// This is Ertl's secant-method maximum-likelihood estimator over the register
50+
/// multiplicities. It operates on the HyperLogLog register representation, so a hash-list
51+
/// operand is materialized into registers first. It is provided for completeness and
52+
/// comparison: it is less accurate, and substantially slower, than the default
53+
/// [`HyperLogLog::estimate_cardinality`] (HyperLogLog++ corrected) estimate.
54+
#[inline]
55+
pub fn estimate_cardinality_mle(&self) -> f64 {
56+
if self.is_hash_list() {
57+
let mut counter = self.clone();
58+
counter.convert_hash_list_to_hyperloglog().unwrap();
59+
return counter.estimate_cardinality_mle();
60+
}
61+
62+
mle_cardinality::<P, B>(
63+
self.registers.iter_registers(),
64+
self.harmonic_sum,
65+
self.is_full(),
66+
2,
67+
)
68+
}
69+
4670
/// Joint MLE union estimate assuming both counters are in HyperLogLog (register) mode.
4771
fn mle_union_from_registers(&self, other: &Self) -> f64 {
4872
// Maps a union harmonic sum to the HyperLogLog++ corrected cardinality, exactly as the
@@ -255,6 +279,117 @@ fn mle_union_cardinality<P: Precision, B: Bits, I: ExactSizeIterator<Item = [u8;
255279
phis[0].exp() + phis[1].exp() + phis[2].exp()
256280
}
257281

282+
#[allow(clippy::too_many_lines)]
283+
/// Single-counter cardinality via Ertl's secant-method Maximum Likelihood Estimation.
284+
///
285+
/// # Arguments
286+
/// * `registers` - Iterator over the counter's register values.
287+
/// * `harmonic_sum` - The counter's harmonic sum (sum of `2^-register`).
288+
/// * `is_full` - Whether the counter is saturated (returns infinity).
289+
/// * `error_exponent` - The secant method stops once the relative step is below
290+
/// `10^-error_exponent` scaled by the precision.
291+
fn mle_cardinality<P: Precision, B: Bits>(
292+
registers: impl Iterator<Item = u8>,
293+
harmonic_sum: f64,
294+
is_full: bool,
295+
error_exponent: i32,
296+
) -> f64 {
297+
if is_full {
298+
return f64::INFINITY;
299+
}
300+
301+
let multiplicities_len = 1_usize << B::NUMBER_OF_BITS;
302+
let mut multiplicities = vec![f64::ZERO; multiplicities_len];
303+
let q = multiplicities_len as u32 - 2;
304+
305+
let mut smallest_register_value: u32 = q;
306+
let mut largest_register_value: u32 = 0;
307+
308+
for register in registers {
309+
let register = u32::from(register);
310+
if register > 0 {
311+
smallest_register_value = smallest_register_value.min(register);
312+
}
313+
largest_register_value = largest_register_value.max(register);
314+
multiplicities[register as usize] += 1.0;
315+
}
316+
317+
smallest_register_value = smallest_register_value.max(1);
318+
largest_register_value = largest_register_value.min(q).max(1);
319+
320+
let number_of_registers = f64::integer_exp2(P::EXPONENT);
321+
322+
let c =
323+
multiplicities[multiplicities_len - 1] + multiplicities[largest_register_value as usize];
324+
325+
let mut g_prev: f64 = 0.0;
326+
let number_of_zero_registers = multiplicities[0];
327+
let reciprocal_saturated_registers = multiplicities[multiplicities_len - 1]
328+
* f64::integer_exp2_minus((multiplicities_len - 1) as u8);
329+
330+
let harmonic_sum_minus_zero_and_saturated =
331+
harmonic_sum - (number_of_zero_registers + reciprocal_saturated_registers);
332+
333+
let a = harmonic_sum_minus_zero_and_saturated + number_of_zero_registers;
334+
let b = harmonic_sum_minus_zero_and_saturated
335+
+ multiplicities[multiplicities_len - 1] * f64::integer_exp2_minus(q as u8);
336+
337+
let number_of_non_zero_registers = number_of_registers - number_of_zero_registers;
338+
339+
let mut x = if b <= 1.5 * a {
340+
number_of_non_zero_registers / (0.5 * b + a)
341+
} else {
342+
(number_of_non_zero_registers / b) * (b / a).ln_1p()
343+
};
344+
345+
// We begin the secant method iterations.
346+
let mut delta_x = x;
347+
let relative_error_limit = 10.0_f64.powi(-error_exponent) / number_of_registers.sqrt();
348+
349+
let forty_five_recip = 1.0 / 45.0;
350+
let four_seventy_two_point_five_recip = 1.0 / 472.5;
351+
352+
let taylor = |x_first: f64, h: f64| -> f64 { (x_first + h * (1.0 - h)) / (x_first + 1.0 - h) };
353+
354+
while delta_x > x * relative_error_limit {
355+
// Equivalent to `2 + floor(log2(x))`, saturating non-positive exponents to 0.
356+
let k: u32 = 2 + (x.log2().floor().max(0.0) as u32);
357+
358+
let maximal = largest_register_value.max(k);
359+
let mut x_first = x * f64::integer_exp2_minus((maximal + 1) as u8);
360+
let x_second = x_first * x_first;
361+
let x_forth = x_second * x_second;
362+
let mut taylor_series_approximation = x_first - x_second / 3.0
363+
+ x_forth * (forty_five_recip - x_second * four_seventy_two_point_five_recip);
364+
365+
for _ in largest_register_value..k {
366+
taylor_series_approximation = taylor(x_first, taylor_series_approximation);
367+
x_first *= 2.0;
368+
}
369+
370+
let mut g = c * taylor_series_approximation;
371+
372+
for register_value in (smallest_register_value..largest_register_value).rev() {
373+
taylor_series_approximation = taylor(x_first, taylor_series_approximation);
374+
g += multiplicities[register_value as usize] * taylor_series_approximation;
375+
x_first *= 2.0;
376+
}
377+
378+
g += x * a;
379+
380+
if g > g_prev && number_of_non_zero_registers >= g {
381+
delta_x *= (number_of_non_zero_registers - g) / (g - g_prev);
382+
} else {
383+
delta_x = 0.0;
384+
}
385+
386+
x += delta_x;
387+
g_prev = g;
388+
}
389+
390+
number_of_registers * x
391+
}
392+
258393
/// Trait for element-wise multiplication.
259394
trait ElementWiseMultiplication<Rhs = Self> {
260395
/// Element-wise multiplication.

tests/test_hll.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,3 +308,31 @@ fn test_mle_union_matches_exact() {
308308
Precision10::error_rate(),
309309
);
310310
}
311+
312+
/// The single-counter Maximum Likelihood Estimation of the cardinality (Ertl's secant-method
313+
/// estimator) must estimate a fully-fledged HyperLogLog counter within the precision's error
314+
/// rate. (It is known to be less accurate and much slower than the default HyperLogLog++
315+
/// corrected estimate, and its advantage over the uncorrected estimate is an average-over-
316+
/// cardinalities property rather than a per-point one; the benchmark quantifies both.)
317+
#[cfg(feature = "mle")]
318+
#[test]
319+
fn test_mle_cardinality_reasonable() {
320+
type Counter =
321+
HyperLogLog<Precision12, Bits6, <Precision12 as PackedRegister<Bits6>>::Array, XxHash>;
322+
323+
let mut hll: Counter = Default::default();
324+
for element in 0..100_000_u64 {
325+
hll.insert(&element);
326+
}
327+
assert!(!hll.is_hash_list());
328+
329+
let exact = 100_000.0_f64;
330+
let mle = hll.estimate_cardinality_mle();
331+
let mle_error = (mle - exact).abs() / exact;
332+
333+
assert!(
334+
mle_error <= Precision12::error_rate(),
335+
"MLE cardinality estimate {mle} differs from exact {exact} by {mle_error}, exceeding the error rate {}.",
336+
Precision12::error_rate(),
337+
);
338+
}

0 commit comments

Comments
 (0)