@@ -475,48 +475,19 @@ impl<P: Precision, B: Bits, R: Registers<P, B>, H: HasherType> HyperLogLog<P, B,
475475 ) -> f64 {
476476 match ( self . is_hash_list ( ) , other. is_hash_list ( ) ) {
477477 ( true , true ) => {
478- let left_hash_bits = self . get_hash_bits ( ) . unwrap ( ) ;
479- let right_hash_bits = other. get_hash_bits ( ) . unwrap ( ) ;
480- assert ! ( left_hash_bits >= GapHash :: <P , B >:: SMALLEST_VIABLE_HASH_BITS ) ;
481- assert ! ( right_hash_bits >= GapHash :: <P , B >:: SMALLEST_VIABLE_HASH_BITS ) ;
482-
483- let left_shift = if left_hash_bits <= right_hash_bits {
484- 0
485- } else {
486- left_hash_bits - right_hash_bits
487- } ;
488- let right_shift = if right_hash_bits <= left_hash_bits {
489- 0
490- } else {
491- right_hash_bits - left_hash_bits
492- } ;
493-
494- let left_hashes = self . registers . as_ref ( ) ;
495- let right_hashes = other. registers . as_ref ( ) ;
496- let left_bit_index = self . get_writer_tell ( ) ;
497- let right_bit_index = other. get_writer_tell ( ) ;
498-
499- let intersection_cardinality = f64:: from ( intersection_from_sorted_iterators (
500- GapHash :: < P , B > :: downgraded (
501- left_hashes,
502- self . get_number_of_hashes ( ) . unwrap ( ) ,
503- left_hash_bits,
504- left_bit_index,
505- left_shift,
506- ) ,
507- GapHash :: < P , B > :: downgraded (
508- right_hashes,
509- other. get_number_of_hashes ( ) . unwrap ( ) ,
510- right_hash_bits,
511- right_bit_index,
512- right_shift,
513- ) ,
514- ) ) ;
515-
516- let union_cardinality =
517- self_cardinality + other_cardinality - intersection_cardinality;
518-
519- correct_union_estimate ( self_cardinality, other_cardinality, union_cardinality)
478+ // Both operands are hash lists: merge `other` into a clone of `self` and estimate the
479+ // union's cardinality directly, so the birthday-paradox correction is applied to the
480+ // union the same way it is to a single counter. Inclusion-exclusion (A + B - intersection)
481+ // would subtract a raw, uncorrected count of coinciding downgraded hashes; for sets with
482+ // little real overlap those coincidences are dominated by spurious birthday collisions,
483+ // which biases the union estimate low, and increasingly so at higher precisions.
484+ let mut union = self . clone ( ) ;
485+ union. merge ( other) ;
486+ correct_union_estimate (
487+ self_cardinality,
488+ other_cardinality,
489+ union. estimate_cardinality ( ) ,
490+ )
520491 }
521492 ( true , false ) => {
522493 let hash_bits = self . get_hash_bits ( ) . unwrap ( ) ;
0 commit comments