From 440afa11bce02c96470816fe6cf916d56bc382fa Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Fri, 18 Jul 2025 14:07:12 +0100 Subject: [PATCH 01/12] initial cut of customizable buildhashers in geo filters crate --- crates/geo_filters/evaluation/accuracy.rs | 17 ++--- crates/geo_filters/evaluation/performance.rs | 9 +-- crates/geo_filters/src/config.rs | 6 +- crates/geo_filters/src/diff_count.rs | 62 ++++++++++++------- crates/geo_filters/src/diff_count/sim_hash.rs | 16 ++--- crates/geo_filters/src/distinct_count.rs | 60 +++++++++++------- crates/geo_filters/src/evaluation/hll.rs | 12 ++-- .../geo_filters/src/evaluation/simulation.rs | 23 +++---- crates/geo_filters/src/lib.rs | 19 +++--- crates/geo_filters/tests/public_api.rs | 18 +++--- 10 files changed, 142 insertions(+), 100 deletions(-) diff --git a/crates/geo_filters/evaluation/accuracy.rs b/crates/geo_filters/evaluation/accuracy.rs index c3151b7..3db898f 100644 --- a/crates/geo_filters/evaluation/accuracy.rs +++ b/crates/geo_filters/evaluation/accuracy.rs @@ -2,6 +2,7 @@ use std::fs::File; use std::path::PathBuf; use clap::Parser; +use fnv::FnvBuildHasher; use geo_filters::config::VariableConfig; use itertools::Itertools; use once_cell::sync::Lazy; @@ -157,19 +158,19 @@ static SIMULATION_CONFIG_FROM_STR: Lazy> = Lazy::new match t { BucketType::U8 => { let c = VariableConfig::<_, u8>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDiffCount::new(c.clone()))) + Box::new(move || Box::new(GeoDiffCount::new(c.clone(), FnvBuildHasher::default()))) } BucketType::U16 => { let c = VariableConfig::<_, u16>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDiffCount::new(c.clone()))) + Box::new(move || Box::new(GeoDiffCount::new(c.clone(), FnvBuildHasher::default()))) } BucketType::U32 => { let c = VariableConfig::<_, u32>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDiffCount::new(c.clone()))) + Box::new(move || Box::new(GeoDiffCount::new(c.clone(), FnvBuildHasher::default()))) } BucketType::U64 => { let c = VariableConfig::<_, u64>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDiffCount::new(c.clone()))) + Box::new(move || Box::new(GeoDiffCount::new(c.clone(), FnvBuildHasher::default()))) } } }), @@ -186,19 +187,19 @@ static SIMULATION_CONFIG_FROM_STR: Lazy> = Lazy::new match t { BucketType::U8 => { let c = VariableConfig::<_, u8>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDistinctCount::new(c.clone()))) + Box::new(move || Box::new(GeoDistinctCount::new(c.clone(), FnvBuildHasher::default()))) } BucketType::U16 => { let c = VariableConfig::<_, u16>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDistinctCount::new(c.clone()))) + Box::new(move || Box::new(GeoDistinctCount::new(c.clone(), FnvBuildHasher::default()))) } BucketType::U32 => { let c = VariableConfig::<_, u32>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDistinctCount::new(c.clone()))) + Box::new(move || Box::new(GeoDistinctCount::new(c.clone(), FnvBuildHasher::default()))) } BucketType::U64 => { let c = VariableConfig::<_, u64>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDistinctCount::new(c.clone()))) + Box::new(move || Box::new(GeoDistinctCount::new(c.clone(), FnvBuildHasher::default()))) } } }), diff --git a/crates/geo_filters/evaluation/performance.rs b/crates/geo_filters/evaluation/performance.rs index 4d7d706..eac4d65 100644 --- a/crates/geo_filters/evaluation/performance.rs +++ b/crates/geo_filters/evaluation/performance.rs @@ -1,4 +1,5 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use fnv::FnvBuildHasher; use geo_filters::config::VariableConfig; use geo_filters::diff_count::{GeoDiffCount, GeoDiffCount13}; use geo_filters::distinct_count::GeoDistinctCount13; @@ -22,7 +23,7 @@ fn criterion_benchmark(c: &mut Criterion) { group.bench_function("geo_diff_count_var_13", |b| { let c = VariableConfig::<_, u32>::new(13, 7680, 256); b.iter(move || { - let mut gc = GeoDiffCount::new(c.clone()); + let mut gc = GeoDiffCount::new(c.clone(), FnvBuildHasher::default()); for i in 0..*size { gc.push(i); } @@ -61,7 +62,7 @@ fn criterion_benchmark(c: &mut Criterion) { group.bench_function("geo_diff_count_var_13", |b| { let c = VariableConfig::<_, u32>::new(13, 7680, 256); b.iter(move || { - let mut gc = GeoDiffCount::new(c.clone()); + let mut gc = GeoDiffCount::new(c.clone(), FnvBuildHasher::default()); for i in 0..*size { gc.push(i); black_box(gc.size()); @@ -106,8 +107,8 @@ fn criterion_benchmark(c: &mut Criterion) { group.bench_function("geo_diff_count_var_13", |b| { let c = VariableConfig::<_, u32>::new(13, 7680, 256); b.iter(move || { - let mut gc1 = GeoDiffCount::new(c.clone()); - let mut gc2 = GeoDiffCount::new(c.clone()); + let mut gc1 = GeoDiffCount::new(c.clone(), FnvBuildHasher::default()); + let mut gc2 = GeoDiffCount::new(c.clone(), FnvBuildHasher::default()); for i in 0..size { gc1.push(i * 2); gc2.push(i * 2 + 1); diff --git a/crates/geo_filters/src/config.rs b/crates/geo_filters/src/config.rs index e22d6df..013c4ce 100644 --- a/crates/geo_filters/src/config.rs +++ b/crates/geo_filters/src/config.rs @@ -306,12 +306,14 @@ pub(crate) fn take_ref(iter: &mut I, n: usize) -> impl Iterator>(f: impl Fn() -> C) -> (f32, f32) { + pub(crate) fn test_estimate>(f: impl Fn() -> C) -> (f32, f32) { let mut rnd = rand::rngs::StdRng::from_os_rng(); let cnt = 10000usize; let mut avg_precision = 0.0; diff --git a/crates/geo_filters/src/diff_count.rs b/crates/geo_filters/src/diff_count.rs index 0c46263..07e0b12 100644 --- a/crates/geo_filters/src/diff_count.rs +++ b/crates/geo_filters/src/diff_count.rs @@ -2,6 +2,7 @@ use std::borrow::Cow; use std::cmp::Ordering; +use std::hash::BuildHasher; use std::mem::{size_of, size_of_val}; use crate::config::{ @@ -16,12 +17,13 @@ mod sim_hash; use bitvec::*; pub use config::{GeoDiffConfig13, GeoDiffConfig7}; +use fnv::FnvBuildHasher; /// Diff count filter with a relative error standard deviation of ~0.125. -pub type GeoDiffCount7<'a> = GeoDiffCount<'a, GeoDiffConfig7>; +pub type GeoDiffCount7<'a> = GeoDiffCount<'a, GeoDiffConfig7, FnvBuildHasher>; /// Diff count filter with a relative error standard deviation of ~0.015. -pub type GeoDiffCount13<'a> = GeoDiffCount<'a, GeoDiffConfig13>; +pub type GeoDiffCount13<'a> = GeoDiffCount<'a, GeoDiffConfig13, FnvBuildHasher>; /// Probabilistic diff count data structure based on geometric filters. /// @@ -57,14 +59,15 @@ pub type GeoDiffCount13<'a> = GeoDiffCount<'a, GeoDiffConfig13>; /// 1500 bytes achieves a precision of ~0.022 and could estimate the 10k items with an error /// of +/-22k items in the best case which is 20 times worse despite using 5 times more space! #[derive(Clone, Default, PartialEq, Eq, PartialOrd, Ord)] -pub struct GeoDiffCount<'a, C: GeoConfig> { +pub struct GeoDiffCount<'a, C: GeoConfig, H: BuildHasher + Clone> { config: C, /// The bit positions are stored from largest to smallest. msb: Cow<'a, [C::BucketType]>, lsb: BitVec<'a>, + hash_builder: H, } -impl> std::fmt::Debug for GeoDiffCount<'_, C> { +impl, H: BuildHasher + Clone> std::fmt::Debug for GeoDiffCount<'_, C, H> { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!( f, @@ -76,12 +79,13 @@ impl> std::fmt::Debug for GeoDiffCount<'_, C> { } } -impl> GeoDiffCount<'_, C> { - pub fn new(config: C) -> Self { +impl, H: BuildHasher + Clone> GeoDiffCount<'_, C, H> { + pub fn new(config: C, hash_builder: H) -> Self { Self { config, msb: Default::default(), lsb: Default::default(), + hash_builder, } } @@ -92,7 +96,7 @@ impl> GeoDiffCount<'_, C> { /// /// Note: we need a peekable iterator, such that we can extract the most significant bits without /// having to construct another iterator with the remaining `BitChunk`s. - fn from_bit_chunks>(config: C, chunks: I) -> Self { + fn from_bit_chunks>(config: C, hash_builder: H, chunks: I) -> Self { let mut ones = iter_ones::(chunks.peekable()); let mut msb = Vec::default(); @@ -113,6 +117,7 @@ impl> GeoDiffCount<'_, C> { config, msb: Cow::from(msb), lsb, + hash_builder, }; result.debug_assert_invariants(); result @@ -248,11 +253,12 @@ impl> GeoDiffCount<'_, C> { // Consumers the current GeoDiffCount and produces an owned GeoDiffCount that // can live arbitrarily. - pub fn into_owned(self) -> GeoDiffCount<'static, C> { + pub fn into_owned(self) -> GeoDiffCount<'static, C, H> { GeoDiffCount { config: self.config, lsb: self.lsb.into_owned(), msb: Cow::Owned(self.msb.into_owned()), + hash_builder: self.hash_builder.clone(), } } @@ -290,13 +296,14 @@ impl> GeoDiffCount<'_, C> { /// after compression : 01 0 10 1 00 0 /// bitset of the returned filter : 010 101000 #[cfg(test)] -pub(crate) fn masked>( - diff_count: &GeoDiffCount<'_, C>, +pub(crate) fn masked, H: BuildHasher + Clone>( + diff_count: &GeoDiffCount<'_, C, H>, mask: usize, modulus: usize, -) -> GeoDiffCount<'static, C> { +) -> GeoDiffCount<'static, C, H> { GeoDiffCount::from_bit_chunks( diff_count.config.clone(), + diff_count.hash_builder.clone(), mask_bit_chunks(diff_count.bit_chunks(), mask as u64, modulus).peekable(), ) } @@ -304,21 +311,33 @@ pub(crate) fn masked>( /// Computes an xor of the two underlying bitsets. /// This operation corresponds to computing the symmetric difference of the two /// sets represented by the GeoDiffCounts. -pub(crate) fn xor>( - diff_count: &GeoDiffCount<'_, C>, - other: &GeoDiffCount<'_, C>, -) -> GeoDiffCount<'static, C> { +/// +/// SAFETY: The two GeoDiffCounts must have the same configuration and hash builder. +pub(crate) fn xor, H: BuildHasher + Clone>( + diff_count: &GeoDiffCount<'_, C, H>, + other: &GeoDiffCount<'_, C, H>, +) -> GeoDiffCount<'static, C, H> { assert!( diff_count.config == other.config, "combined filters must have the same configuration" ); + + // It would be really nice to be able to asser that our hash builders have + // the same state here, but from what I can see it is not common + // for hash builders to implement `PartialEq` or `Eq`. + GeoDiffCount::from_bit_chunks( diff_count.config.clone(), + diff_count.hash_builder.clone(), xor_bit_chunks(diff_count.bit_chunks(), other.bit_chunks()).peekable(), ) } -impl> Count for GeoDiffCount<'_, C> { +impl, H: BuildHasher + Clone> Count for GeoDiffCount<'_, C, H> { + fn hasher_builder(&self) -> &H { + &self.hash_builder + } + fn push_hash(&mut self, hash: u64) { self.xor_bit(self.config.hash_to_bucket(hash)); } @@ -340,8 +359,8 @@ impl> Count for GeoDiffCount<'_, C> { } fn bytes_in_memory(&self) -> usize { - let Self { config, msb, lsb } = self; - size_of_val(config) + msb.len() * size_of::() + lsb.bytes_in_memory() + let Self { config, msb, lsb, hash_builder } = self; + size_of_val(config) + msb.len() * size_of::() + lsb.bytes_in_memory() + size_of_val(hash_builder) } } @@ -359,7 +378,7 @@ mod tests { // // scripts/accuracy -n 10000 geo_diff/u16/b=7/bytes=50/msb=10 // - type GeoDiffCount7_50<'a> = GeoDiffCount<'a, FixedConfig>; + type GeoDiffCount7_50<'a> = GeoDiffCount<'a, FixedConfig, FnvBuildHasher>; #[test] fn test_geo_count() { @@ -535,6 +554,7 @@ mod tests { } let actual = GeoDiffCount::from_bit_chunks( expected.config.clone(), + expected.hash_builder.clone(), expected.bit_chunks().peekable(), ); assert_eq!(expected, actual); @@ -550,9 +570,9 @@ mod tests { assert_eq!(vec![17, 11, 7], a.msb.iter().copied().collect_vec()); } - impl> GeoDiffCount<'_, C> { + impl> GeoDiffCount<'_, C, FnvBuildHasher> { fn from_ones(config: C, ones: impl IntoIterator) -> Self { - let mut result = Self::new(config); + let mut result = Self::new(config, FnvBuildHasher::default()); for one in ones { result.xor_bit(one); } diff --git a/crates/geo_filters/src/diff_count/sim_hash.rs b/crates/geo_filters/src/diff_count/sim_hash.rs index cee2370..32ca0ae 100644 --- a/crates/geo_filters/src/diff_count/sim_hash.rs +++ b/crates/geo_filters/src/diff_count/sim_hash.rs @@ -1,6 +1,6 @@ //! Similarity hashing allows quickly finding similar sets with a reverse index. -use std::hash::{Hash, Hasher}; +use std::hash::{BuildHasher, Hash, Hasher}; use std::ops::Range; use fnv::FnvHasher; @@ -42,7 +42,7 @@ impl SimHash { } } -impl> GeoDiffCount<'_, C> { +impl, H: BuildHasher + Clone> GeoDiffCount<'_, C, H> { /// Given the expected size of a diff, this function returns the range of bucket ids which should /// be searched for in order to find geometric filters of the desired similarity. If at least half /// of the buckets in the range match, one found a match that has the expected diff size (or better). @@ -107,14 +107,14 @@ impl> GeoDiffCount<'_, C> { /// /// Property 2 is used to construct all sim hashes efficiently. /// Property 3 is used to search for similar GeoDiffCounts. -struct SimHashIterator<'a, C: GeoConfig> { - filter: &'a GeoDiffCount<'a, C>, +struct SimHashIterator<'a, C: GeoConfig, H: BuildHasher + Clone> { + filter: &'a GeoDiffCount<'a, C, H>, prev_bucket_id: BucketId, sim_hash: [u64; SIM_BUCKETS], } -impl<'a, C: GeoConfig> SimHashIterator<'a, C> { - pub fn new(filter: &'a GeoDiffCount<'a, C>) -> Self { +impl<'a, C: GeoConfig, H: BuildHasher + Clone> SimHashIterator<'a, C, H> { + pub fn new(filter: &'a GeoDiffCount<'a, C, H>) -> Self { let msb = filter.nth_most_significant_one(0); let prev_bucket_id = msb.map(|b| b.into_usize() / SIM_BUCKET_SIZE).unwrap_or(0) + SIM_BUCKETS; @@ -126,7 +126,7 @@ impl<'a, C: GeoConfig> SimHashIterator<'a, C> { } } -impl> Iterator for SimHashIterator<'_, C> { +impl, H: BuildHasher + Clone> Iterator for SimHashIterator<'_, C, H> { type Item = (BucketId, SimHash); fn next(&mut self) -> Option { @@ -148,7 +148,7 @@ impl> Iterator for SimHashIterator<'_, C> { } } -impl> GeoDiffCount<'_, C> { +impl, H: BuildHasher + Clone> GeoDiffCount<'_, C, H> { /// n specifies the desired zero-based index of the most significant one. /// The zero-based index of the desired one bit is returned. fn nth_most_significant_one(&self, mut n: usize) -> Option { diff --git a/crates/geo_filters/src/distinct_count.rs b/crates/geo_filters/src/distinct_count.rs index 6795cb8..50296bf 100644 --- a/crates/geo_filters/src/distinct_count.rs +++ b/crates/geo_filters/src/distinct_count.rs @@ -1,6 +1,7 @@ //! Geometric filter implementation for distinct count. use std::collections::VecDeque; +use std::hash::BuildHasher; use std::mem::{size_of, size_of_val}; use crate::config::{ @@ -14,33 +15,35 @@ mod bitdeque; mod config; pub use config::{GeoDistinctConfig13, GeoDistinctConfig7}; +use fnv::FnvBuildHasher; /// Distinct count filter with a relative error standard deviation of ~0.065. /// Uses at most 168 bytes of memory. -pub type GeoDistinctCount7<'a> = GeoDistinctCount<'a, GeoDistinctConfig7>; +pub type GeoDistinctCount7<'a> = GeoDistinctCount<'a, GeoDistinctConfig7, FnvBuildHasher>; /// Distinct count filter with a relative error standard deviation of ~0.0075. /// Uses at most 9248 bytes of memory. -pub type GeoDistinctCount13<'a> = GeoDistinctCount<'a, GeoDistinctConfig13>; +pub type GeoDistinctCount13<'a> = GeoDistinctCount<'a, GeoDistinctConfig13, FnvBuildHasher>; /// Probabilistic distinct count data structure based on geometric filters. /// /// The [`GeoDistinctCount`] falls into the category of probabilistic set size estimators. /// It has some similar properties as related filters like HyperLogLog, MinHash, etc, but uses less space. #[derive(Eq, PartialEq)] -pub struct GeoDistinctCount<'a, C: GeoConfig> { +pub struct GeoDistinctCount<'a, C: GeoConfig, H: BuildHasher> { config: C, + hash_builder: H, msb: VecDeque, lsb: BitDeque<'a>, } -impl + Default> Default for GeoDistinctCount<'_, C> { +impl + Default, H: BuildHasher + Clone + Default> Default for GeoDistinctCount<'_, C, H> { fn default() -> Self { - Self::new(C::default()) + Self::new(C::default(), H::default()) } } -impl> std::fmt::Debug for GeoDistinctCount<'_, C> { +impl, H: BuildHasher + Clone> std::fmt::Debug for GeoDistinctCount<'_, C, H> { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!( f, @@ -52,17 +55,17 @@ impl> std::fmt::Debug for GeoDistinctCount<'_, C> { } } -impl> GeoDistinctCount<'_, C> { - pub fn new(config: C) -> Self { +impl, H: BuildHasher + Clone> GeoDistinctCount<'_, C, H> { + pub fn new(config: C, hash_builder: H) -> Self { let msb = Default::default(); let lsb = BitDeque::new(max_lsb_bytes::( config.max_bytes(), config.max_msb_len(), )); - Self { config, msb, lsb } + Self { config, msb, lsb, hash_builder } } - fn from_bit_chunks>(config: C, chunks: I) -> Self { + fn from_bit_chunks>(config: C, hash_builder: H, chunks: I) -> Self { let mut ones = iter_ones::(chunks.peekable()); let mut msb = VecDeque::default(); @@ -83,7 +86,7 @@ impl> GeoDistinctCount<'_, C> { max_lsb_bytes::(config.max_bytes(), config.max_msb_len()), ); - let result = Self { config, msb, lsb }; + let result = Self { config, msb, lsb, hash_builder }; result.debug_assert_invariants(); result } @@ -128,7 +131,11 @@ impl> GeoDistinctCount<'_, C> { } } -impl> Count for GeoDistinctCount<'_, C> { +impl, H: BuildHasher + Clone> Count for GeoDistinctCount<'_, C, H> { + fn hasher_builder(&self) -> &H { + &self.hash_builder + } + fn push_hash(&mut self, hash: u64) { self.set_bit(self.config.hash_to_bucket(hash)); } @@ -174,12 +181,13 @@ impl> Count for GeoDistinctCount<'_, C> { } fn bytes_in_memory(&self) -> usize { - let Self { config, msb, lsb } = self; - size_of_val(config) + msb.len() * size_of::() + lsb.bytes_in_memory() + let Self { config, msb, lsb, hash_builder } = self; + size_of_val(config) + msb.len() * size_of::() + lsb.bytes_in_memory() + + size_of_val(hash_builder) } } -impl> GeoDistinctCount<'_, C> { +impl, H: BuildHasher + Clone> GeoDistinctCount<'_, C, H> { fn insert_into_lsb(&mut self, bucket: usize) { if !self.lsb.test_bit(bucket) { self.lsb.set_bit(bucket); @@ -208,16 +216,22 @@ impl> GeoDistinctCount<'_, C> { } } -fn or>( - a: &GeoDistinctCount<'_, C>, - b: &GeoDistinctCount<'_, C>, -) -> GeoDistinctCount<'static, C> { +fn or, H: BuildHasher + Clone>( + a: &GeoDistinctCount<'_, C, H>, + b: &GeoDistinctCount<'_, C, H>, +) -> GeoDistinctCount<'static, C, H> { assert!( a.config == b.config, "combined filters must have the same configuration" ); + + // It would be really nice to be able to asser that our hash builders have + // the same state here, but from what I can see it is not common + // for hash builders to implement `PartialEq` or `Eq`. + GeoDistinctCount::from_bit_chunks( a.config.clone(), + a.hash_builder.clone(), or_bit_chunks(a.bit_chunks(), b.bit_chunks()).peekable(), ) } @@ -362,7 +376,7 @@ mod tests { 13, 7800, (7800 - (msb.round() as usize) * 8) / 3, - ))) + ), FnvBuildHasher::default())) }, 3000, &[100000], @@ -381,7 +395,7 @@ mod tests { expected.push_hash(rnd.next_u64()); } let actual = - GeoDistinctCount::from_bit_chunks(expected.config.clone(), expected.bit_chunks()); + GeoDistinctCount::from_bit_chunks(expected.config.clone(), expected.hash_builder.clone(), expected.bit_chunks()); assert_eq!(expected, actual); } } @@ -395,9 +409,9 @@ mod tests { assert_eq!(vec![17, 11, 7], a.msb.iter().copied().collect_vec()); } - impl> GeoDistinctCount<'_, C> { + impl> GeoDistinctCount<'_, C, FnvBuildHasher> { fn from_ones(config: C, ones: impl IntoIterator) -> Self { - let mut result = Self::new(config); + let mut result = Self::new(config, FnvBuildHasher::default()); for one in ones { result.set_bit(one); } diff --git a/crates/geo_filters/src/evaluation/hll.rs b/crates/geo_filters/src/evaluation/hll.rs index 30b74bf..299db81 100644 --- a/crates/geo_filters/src/evaluation/hll.rs +++ b/crates/geo_filters/src/evaluation/hll.rs @@ -87,7 +87,7 @@ impl Hasher for NoopHasher { #[inline] fn write(&mut self, _: &[u8]) { - todo!("") + unimplemented!("NoopHasher only supports writing u64 values"); } #[inline] @@ -104,13 +104,17 @@ impl BuildHasher for BuildNoopHasher { } } -impl Count for Hll { +impl Count for Hll { + fn hasher_builder(&self) -> &BuildNoopHasher { + unimplemented!() + } + fn push_hash(&mut self, hash: u64) { self.inner.borrow_mut().insert(&hash) } fn push_sketch(&mut self, _other: &Self) { - todo!() + unimplemented!() } fn size(&self) -> f32 { @@ -118,7 +122,7 @@ impl Count for Hll { } fn size_with_sketch(&self, _other: &Self) -> f32 { - todo!() + unimplemented!() } fn bytes_in_memory(&self) -> usize { diff --git a/crates/geo_filters/src/evaluation/simulation.rs b/crates/geo_filters/src/evaluation/simulation.rs index ef736fd..1698b74 100644 --- a/crates/geo_filters/src/evaluation/simulation.rs +++ b/crates/geo_filters/src/evaluation/simulation.rs @@ -1,6 +1,7 @@ use std::io::Write; use std::time::Instant; +use fnv::FnvBuildHasher; use itertools::Itertools; use rand::{RngCore, SeedableRng}; use rayon::prelude::{IntoParallelIterator, ParallelIterator}; @@ -20,37 +21,37 @@ pub trait SimulationCount { fn size(&self) -> f32; fn bytes_in_memory(&self) -> usize; } -impl + Clone> SimulationCount for GeoDiffCount<'_, C> { +impl + Clone> SimulationCount for GeoDiffCount<'_, C, FnvBuildHasher> { fn push_hash(&mut self, hash: u64) { - >::push_hash(self, hash) + >::push_hash(self, hash) } fn size(&self) -> f32 { - >::size(self) + >::size(self) } fn bytes_in_memory(&self) -> usize { - >::bytes_in_memory(self) + >::bytes_in_memory(self) } } -impl> SimulationCount for GeoDistinctCount<'_, C> { +impl> SimulationCount for GeoDistinctCount<'_, C, FnvBuildHasher> { fn push_hash(&mut self, hash: u64) { - >::push_hash(self, hash) + >::push_hash(self, hash) } fn size(&self) -> f32 { - >::size(self) + >::size(self) } fn bytes_in_memory(&self) -> usize { - >::bytes_in_memory(self) + >::bytes_in_memory(self) } } impl SimulationCount for Hll { fn push_hash(&mut self, hash: u64) { - >::push_hash(self, hash) + >::push_hash(self, hash) } fn size(&self) -> f32 { - >::size(self) + >::size(self) } fn bytes_in_memory(&self) -> usize { - >::bytes_in_memory(self) + >::bytes_in_memory(self) } } diff --git a/crates/geo_filters/src/lib.rs b/crates/geo_filters/src/lib.rs index 819421b..04b7c52 100644 --- a/crates/geo_filters/src/lib.rs +++ b/crates/geo_filters/src/lib.rs @@ -13,7 +13,7 @@ pub mod distinct_count; #[cfg(feature = "evaluation")] pub mod evaluation; -use std::hash::{Hash, Hasher}; +use std::hash::{BuildHasher, Hash}; /// Marker trait to indicate the variant implemented by a [`Count`] instance. pub trait Method: Clone + Eq + PartialEq + Send + Sync {} @@ -29,13 +29,17 @@ pub struct Distinct {} impl Method for Distinct {} /// Trait for types solving the set cardinality estimation problem. -pub trait Count { +pub trait Count { + /// Get the current hash builder + fn hasher_builder(&self) -> &H; + /// Add the given hash to the set. fn push_hash(&mut self, hash: u64); /// Add the hash of the given item, computed with the default hasher, to the set. fn push(&mut self, item: I) { - self.push_hash(item_to_hash(item)) + let hash_builder = self.hasher_builder(); + self.push_hash(hash_builder.hash_one(item)); } /// Add the given sketch to this one. @@ -51,11 +55,4 @@ pub trait Count { /// Returns the number of bytes in memory used to represent this filter. fn bytes_in_memory(&self) -> usize; -} - -fn item_to_hash(item: I) -> u64 { - // TODO: replace with a stable hashing function! - let mut hasher = std::collections::hash_map::DefaultHasher::new(); - item.hash(&mut hasher); - hasher.finish() -} +} \ No newline at end of file diff --git a/crates/geo_filters/tests/public_api.rs b/crates/geo_filters/tests/public_api.rs index d705d2d..2e0482f 100644 --- a/crates/geo_filters/tests/public_api.rs +++ b/crates/geo_filters/tests/public_api.rs @@ -1,3 +1,5 @@ +use fnv::FnvBuildHasher; + #[test] fn can_use_predefined_diff_count() { use geo_filters::diff_count::GeoDiffCount7; @@ -11,7 +13,7 @@ fn can_use_predefined_diff_count() { fn can_use_custom_diff_count() { use geo_filters::diff_count::{GeoDiffConfig7, GeoDiffCount}; use geo_filters::Count; - let mut f = GeoDiffCount::::default(); + let mut f = GeoDiffCount::::default(); f.push(42); f.size(); } @@ -21,7 +23,7 @@ fn can_use_diff_count_with_predefined_config_value() { use geo_filters::diff_count::{GeoDiffConfig7, GeoDiffCount}; use geo_filters::Count; let c = GeoDiffConfig7::default(); - let mut f = GeoDiffCount::new(c); + let mut f = GeoDiffCount::new(c, FnvBuildHasher::default()); f.push(42); f.size(); } @@ -32,7 +34,7 @@ fn can_use_diff_count_with_fixed_config_value() { use geo_filters::diff_count::GeoDiffCount; use geo_filters::Count; let c = FixedConfig::<_, u16, 7, 128, 10>::default(); - let mut f = GeoDiffCount::new(c); + let mut f = GeoDiffCount::new(c, FnvBuildHasher::default()); f.push(42); f.size(); } @@ -43,7 +45,7 @@ fn can_use_diff_count_with_variable_config_value() { use geo_filters::diff_count::GeoDiffCount; use geo_filters::Count; let c = VariableConfig::<_, u16>::new(7, 128, 10); - let mut f = GeoDiffCount::new(c); + let mut f = GeoDiffCount::new(c, FnvBuildHasher::default()); f.push(42); f.size(); } @@ -61,7 +63,7 @@ fn can_use_predefined_distinct_count() { fn can_use_custom_distinct_count() { use geo_filters::distinct_count::{GeoDistinctConfig7, GeoDistinctCount}; use geo_filters::Count; - let mut f = GeoDistinctCount::::default(); + let mut f = GeoDistinctCount::::default(); f.push(42); f.size(); } @@ -71,7 +73,7 @@ fn can_use_distinct_count_with_predefined_config_value() { use geo_filters::distinct_count::{GeoDistinctConfig7, GeoDistinctCount}; use geo_filters::Count; let c = GeoDistinctConfig7::default(); - let mut f = GeoDistinctCount::new(c); + let mut f = GeoDistinctCount::new(c, FnvBuildHasher::default()); f.push(42); f.size(); } @@ -82,7 +84,7 @@ fn can_use_distinct_count_with_fixed_config_value() { use geo_filters::distinct_count::GeoDistinctCount; use geo_filters::Count; let c = FixedConfig::<_, u16, 7, 118, 11>::default(); - let mut f = GeoDistinctCount::new(c); + let mut f = GeoDistinctCount::new(c, FnvBuildHasher::default()); f.push(42); f.size(); } @@ -93,7 +95,7 @@ fn can_use_distinct_count_with_variable_config_value() { use geo_filters::distinct_count::GeoDistinctCount; use geo_filters::Count; let c = VariableConfig::<_, u16>::new(7, 118, 11); - let mut f = GeoDistinctCount::new(c); + let mut f = GeoDistinctCount::new(c, FnvBuildHasher::default()); f.push(42); f.size(); } From 77d67b5fbd0aefc0d0064e434946aaff0538ecf6 Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Fri, 18 Jul 2025 14:21:36 +0100 Subject: [PATCH 02/12] update hardcoded test values --- crates/geo_filters/src/distinct_count.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/crates/geo_filters/src/distinct_count.rs b/crates/geo_filters/src/distinct_count.rs index 50296bf..9253fcd 100644 --- a/crates/geo_filters/src/distinct_count.rs +++ b/crates/geo_filters/src/distinct_count.rs @@ -259,15 +259,20 @@ mod tests { #[test] fn test_geo_count() { + // Pairs of (n, expected) where n is the number of inserted items + // and expected is the expected size of the GeoDistinctCount. + // The output matching the expected values is dependent on the configuration + // and hashing function. Changes to these will lead to different results and the + // test will need to be updated. for (n, result) in [ (10, 10.0021105), (100, 100.21153), - (1000, 1001.81635), - (10000, 9951.017), - (30000, 29927.705), - (100000, 99553.24), - (1000000, 1003824.1), - (10000000, 10071972.0), + (1000, 1021.64075), + (10000, 11064.973), + (30000, 34484.45), + (100000, 112042.44), + (1000000, 827151.1), + (10000000, 3110472.8), ] { let mut geo_count = GeoDistinctCount13::default(); (0..n).for_each(|i| geo_count.push(i)); From 4259f654d0dd9c0529de172672152bc0325e7c0f Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Mon, 21 Jul 2025 09:32:47 +0100 Subject: [PATCH 03/12] wip save point --- crates/geo_filters/evaluation/accuracy.rs | 17 ++-- crates/geo_filters/evaluation/performance.rs | 8 +- crates/geo_filters/src/build_hasher.rs | 27 ++++++ crates/geo_filters/src/config.rs | 83 ++++++++++++++---- crates/geo_filters/src/diff_count.rs | 73 ++++++++++------ crates/geo_filters/src/diff_count/config.rs | 5 +- crates/geo_filters/src/diff_count/sim_hash.rs | 13 +-- crates/geo_filters/src/distinct_count.rs | 85 ++++++++++++++----- .../geo_filters/src/distinct_count/config.rs | 5 +- crates/geo_filters/src/evaluation/hll.rs | 1 + crates/geo_filters/src/lib.rs | 7 +- crates/geo_filters/tests/public_api.rs | 4 +- 12 files changed, 238 insertions(+), 90 deletions(-) create mode 100644 crates/geo_filters/src/build_hasher.rs diff --git a/crates/geo_filters/evaluation/accuracy.rs b/crates/geo_filters/evaluation/accuracy.rs index 3db898f..2deea38 100644 --- a/crates/geo_filters/evaluation/accuracy.rs +++ b/crates/geo_filters/evaluation/accuracy.rs @@ -3,6 +3,7 @@ use std::path::PathBuf; use clap::Parser; use fnv::FnvBuildHasher; +use geo_filters::build_hasher::DefaultBuildHasher; use geo_filters::config::VariableConfig; use itertools::Itertools; use once_cell::sync::Lazy; @@ -158,19 +159,19 @@ static SIMULATION_CONFIG_FROM_STR: Lazy> = Lazy::new match t { BucketType::U8 => { let c = VariableConfig::<_, u8>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDiffCount::new(c.clone(), FnvBuildHasher::default()))) + Box::new(move || Box::new(GeoDiffCount::new(c.clone(), DefaultBuildHasher::default()))) } BucketType::U16 => { let c = VariableConfig::<_, u16>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDiffCount::new(c.clone(), FnvBuildHasher::default()))) + Box::new(move || Box::new(GeoDiffCount::new(c.clone(), DefaultBuildHasher::default()))) } BucketType::U32 => { let c = VariableConfig::<_, u32>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDiffCount::new(c.clone(), FnvBuildHasher::default()))) + Box::new(move || Box::new(GeoDiffCount::new(c.clone(), DefaultBuildHasher::default()))) } BucketType::U64 => { let c = VariableConfig::<_, u64>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDiffCount::new(c.clone(), FnvBuildHasher::default()))) + Box::new(move || Box::new(GeoDiffCount::new(c.clone(), DefaultBuildHasher::default()))) } } }), @@ -187,19 +188,19 @@ static SIMULATION_CONFIG_FROM_STR: Lazy> = Lazy::new match t { BucketType::U8 => { let c = VariableConfig::<_, u8>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDistinctCount::new(c.clone(), FnvBuildHasher::default()))) + Box::new(move || Box::new(GeoDistinctCount::new(c.clone(), DefaultBuildHasher::default()))) } BucketType::U16 => { let c = VariableConfig::<_, u16>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDistinctCount::new(c.clone(), FnvBuildHasher::default()))) + Box::new(move || Box::new(GeoDistinctCount::new(c.clone(), DefaultBuildHasher::default()))) } BucketType::U32 => { let c = VariableConfig::<_, u32>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDistinctCount::new(c.clone(), FnvBuildHasher::default()))) + Box::new(move || Box::new(GeoDistinctCount::new(c.clone(), DefaultBuildHasher::default()))) } BucketType::U64 => { let c = VariableConfig::<_, u64>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDistinctCount::new(c.clone(), FnvBuildHasher::default()))) + Box::new(move || Box::new(GeoDistinctCount::new(c.clone(), DefaultBuildHasher::default()))) } } }), diff --git a/crates/geo_filters/evaluation/performance.rs b/crates/geo_filters/evaluation/performance.rs index eac4d65..2a493bd 100644 --- a/crates/geo_filters/evaluation/performance.rs +++ b/crates/geo_filters/evaluation/performance.rs @@ -23,7 +23,7 @@ fn criterion_benchmark(c: &mut Criterion) { group.bench_function("geo_diff_count_var_13", |b| { let c = VariableConfig::<_, u32>::new(13, 7680, 256); b.iter(move || { - let mut gc = GeoDiffCount::new(c.clone(), FnvBuildHasher::default()); + let mut gc = GeoDiffCount::new(c.clone()); for i in 0..*size { gc.push(i); } @@ -62,7 +62,7 @@ fn criterion_benchmark(c: &mut Criterion) { group.bench_function("geo_diff_count_var_13", |b| { let c = VariableConfig::<_, u32>::new(13, 7680, 256); b.iter(move || { - let mut gc = GeoDiffCount::new(c.clone(), FnvBuildHasher::default()); + let mut gc = GeoDiffCount::new(c.clone()); for i in 0..*size { gc.push(i); black_box(gc.size()); @@ -107,8 +107,8 @@ fn criterion_benchmark(c: &mut Criterion) { group.bench_function("geo_diff_count_var_13", |b| { let c = VariableConfig::<_, u32>::new(13, 7680, 256); b.iter(move || { - let mut gc1 = GeoDiffCount::new(c.clone(), FnvBuildHasher::default()); - let mut gc2 = GeoDiffCount::new(c.clone(), FnvBuildHasher::default()); + let mut gc1 = GeoDiffCount::new(c.clone()); + let mut gc2 = GeoDiffCount::new(c.clone()); for i in 0..size { gc1.push(i * 2); gc2.push(i * 2 + 1); diff --git a/crates/geo_filters/src/build_hasher.rs b/crates/geo_filters/src/build_hasher.rs new file mode 100644 index 0000000..f861f22 --- /dev/null +++ b/crates/geo_filters/src/build_hasher.rs @@ -0,0 +1,27 @@ +use std::hash::{BuildHasher, BuildHasherDefault, DefaultHasher, Hasher as _}; + +/// Trait for a hasher factory that can be used to produce hashers +/// for use with geometric filters. +/// +/// It is a super set of [`BuildHasher`], enforcing additional requirements +/// on the hasher builder that are required for the geometric filters and +/// surrounding code. +/// +/// When performing operations across two different geometric filters, +/// the hashers must be equal, i.e. they must produce the same hash for the +/// same input. This is checked by the `hasher_eq` method. +pub trait GeoFilterBuildHasher: BuildHasher + Default + Clone + Send + Sync { + fn hasher_eq(&self, other: &Self) -> bool { + let v1 = self.build_hasher().finish(); + let v2 = other.build_hasher().finish(); + v1 == v2 + } +} + +impl GeoFilterBuildHasher for T +where + T: BuildHasher + Default + Clone + Send + Sync, +{ +} + +pub type DefaultBuildHasher = BuildHasherDefault; \ No newline at end of file diff --git a/crates/geo_filters/src/config.rs b/crates/geo_filters/src/config.rs index 013c4ce..aaaac6c 100644 --- a/crates/geo_filters/src/config.rs +++ b/crates/geo_filters/src/config.rs @@ -2,7 +2,7 @@ use std::{marker::PhantomData, sync::Arc}; -use crate::Method; +use crate::{build_hasher::GeoFilterBuildHasher, Method}; mod bitchunks; mod buckets; @@ -79,8 +79,16 @@ pub trait GeoConfig: Clone + Eq + Sized + Send + Sync { /// Instantiating this type may panic if `T` is too small to hold the maximum possible /// bucket id determined by `B`, or `B` is larger than the largest statically defined /// lookup table. -#[derive(Clone, Eq, PartialEq)] -pub struct FixedConfig { +#[derive(Clone)] +pub struct FixedConfig< + M: Method, + T, + const B: usize, + const BYTES: usize, + const MSB: usize, + H: GeoFilterBuildHasher, +> { + build_hasher: H, _phantom: PhantomData<(M, T)>, } @@ -90,7 +98,8 @@ impl< const B: usize, const BYTES: usize, const MSB: usize, - > GeoConfig for FixedConfig + H: GeoFilterBuildHasher, + > GeoConfig for FixedConfig { type BucketType = T; @@ -148,46 +157,87 @@ impl< const B: usize, const BYTES: usize, const MSB: usize, - > Default for FixedConfig + H: GeoFilterBuildHasher, + > Default for FixedConfig { fn default() -> Self { assert_bucket_type_large_enough::(B); assert_buckets_within_estimation_bound(B, BYTES * BITS_PER_BYTE); + assert!( B < M::get_lookups().len(), "B = {} is not available for fixed config, requires B < {}", B, M::get_lookups().len() ); + Self { _phantom: PhantomData, + build_hasher: H::default(), } } } +impl< + M: Method + Lookups, + T: IsBucketType, + const B: usize, + const BYTES: usize, + const MSB: usize, + H: GeoFilterBuildHasher, + > PartialEq for FixedConfig +{ + fn eq(&self, other: &Self) -> bool { + self.build_hasher.hasher_eq(&other.build_hasher) + } +} + +impl< + M: Method + Lookups, + T: IsBucketType, + const B: usize, + const BYTES: usize, + const MSB: usize, + H: GeoFilterBuildHasher, + > Eq for FixedConfig +{ +} + /// Geometric filter configuration using dynamic lookup tables. #[derive(Clone)] -pub struct VariableConfig { +pub struct VariableConfig { b: usize, bytes: usize, msb: usize, _phantom: PhantomData<(M, T)>, lookup: Arc, + build_hasher: H, } -impl Eq for VariableConfig {} +impl Eq for VariableConfig {} -impl PartialEq for VariableConfig { +impl PartialEq for VariableConfig { fn eq(&self, other: &Self) -> bool { - self.b == other.b && self.bytes == other.bytes && self.msb == other.msb + self.b == other.b + && self.bytes == other.bytes + && self.msb == other.msb + && self.build_hasher.hasher_eq(&other.build_hasher) } } -impl VariableConfig { +impl VariableConfig { /// Returns a new configuration value. See [`FixedConfig`] for the meaning /// of the parameters. This functions computes a new lookup table every time /// it is invoked, so make sure to share the resulting value as much as possible. pub fn new(b: usize, bytes: usize, msb: usize) -> Self { + Self::with_hasher(b, bytes, msb, H::default()) + } + + /// Returns a new configuration value, specifying the [`GeoFilterBuildHasher`]. + /// Useful, for example, if you wish to use a custom seed in your hashers. + /// + /// See [`Self::new`] for more. + pub fn with_hasher(b: usize, bytes: usize, msb: usize, build_hasher: H) -> Self { assert_bucket_type_large_enough::(b); assert_buckets_within_estimation_bound(b, bytes * BITS_PER_BYTE); Self { @@ -196,6 +246,7 @@ impl VariableConfig { msb, _phantom: PhantomData, lookup: Arc::new(M::new_lookup(b)), + build_hasher, } } @@ -205,7 +256,9 @@ impl VariableConfig { } } -impl GeoConfig for VariableConfig { +impl GeoConfig + for VariableConfig +{ type BucketType = T; #[inline] @@ -306,14 +359,14 @@ pub(crate) fn take_ref(iter: &mut I, n: usize) -> impl Iterator>(f: impl Fn() -> C) -> (f32, f32) { + pub(crate) fn test_estimate>( + f: impl Fn() -> C, + ) -> (f32, f32) { let mut rnd = rand::rngs::StdRng::from_os_rng(); let cnt = 10000usize; let mut avg_precision = 0.0; diff --git a/crates/geo_filters/src/diff_count.rs b/crates/geo_filters/src/diff_count.rs index 07e0b12..1255776 100644 --- a/crates/geo_filters/src/diff_count.rs +++ b/crates/geo_filters/src/diff_count.rs @@ -2,9 +2,9 @@ use std::borrow::Cow; use std::cmp::Ordering; -use std::hash::BuildHasher; use std::mem::{size_of, size_of_val}; +use crate::build_hasher::{DefaultBuildHasher, GeoFilterBuildHasher}; use crate::config::{ count_ones_from_bitchunks, count_ones_from_msb_and_lsb, iter_bit_chunks, iter_ones, mask_bit_chunks, take_ref, xor_bit_chunks, BitChunk, GeoConfig, IsBucketType, @@ -17,13 +17,12 @@ mod sim_hash; use bitvec::*; pub use config::{GeoDiffConfig13, GeoDiffConfig7}; -use fnv::FnvBuildHasher; /// Diff count filter with a relative error standard deviation of ~0.125. -pub type GeoDiffCount7<'a> = GeoDiffCount<'a, GeoDiffConfig7, FnvBuildHasher>; +pub type GeoDiffCount7<'a> = GeoDiffCount<'a, GeoDiffConfig7, DefaultBuildHasher>; /// Diff count filter with a relative error standard deviation of ~0.015. -pub type GeoDiffCount13<'a> = GeoDiffCount<'a, GeoDiffConfig13, FnvBuildHasher>; +pub type GeoDiffCount13<'a> = GeoDiffCount<'a, GeoDiffConfig13, DefaultBuildHasher>; /// Probabilistic diff count data structure based on geometric filters. /// @@ -59,15 +58,15 @@ pub type GeoDiffCount13<'a> = GeoDiffCount<'a, GeoDiffConfig13, FnvBuildHasher>; /// 1500 bytes achieves a precision of ~0.022 and could estimate the 10k items with an error /// of +/-22k items in the best case which is 20 times worse despite using 5 times more space! #[derive(Clone, Default, PartialEq, Eq, PartialOrd, Ord)] -pub struct GeoDiffCount<'a, C: GeoConfig, H: BuildHasher + Clone> { +pub struct GeoDiffCount<'a, C: GeoConfig, H: GeoFilterBuildHasher = DefaultBuildHasher> { config: C, /// The bit positions are stored from largest to smallest. msb: Cow<'a, [C::BucketType]>, lsb: BitVec<'a>, - hash_builder: H, + build_hasher: H, } -impl, H: BuildHasher + Clone> std::fmt::Debug for GeoDiffCount<'_, C, H> { +impl, H: GeoFilterBuildHasher> std::fmt::Debug for GeoDiffCount<'_, C, H> { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!( f, @@ -79,13 +78,22 @@ impl, H: BuildHasher + Clone> std::fmt::Debug for GeoDiffCoun } } -impl, H: BuildHasher + Clone> GeoDiffCount<'_, C, H> { - pub fn new(config: C, hash_builder: H) -> Self { +impl, H: GeoFilterBuildHasher> GeoDiffCount<'_, C, H> { + pub fn new(config: C) -> Self { Self { config, msb: Default::default(), lsb: Default::default(), - hash_builder, + build_hasher: Default::default(), + } + } + + pub fn with_build_hasher(config: C, build_hasher: H) -> Self { + Self { + config, + msb: Default::default(), + lsb: Default::default(), + build_hasher, } } @@ -96,7 +104,11 @@ impl, H: BuildHasher + Clone> GeoDiffCount<'_, C, H> { /// /// Note: we need a peekable iterator, such that we can extract the most significant bits without /// having to construct another iterator with the remaining `BitChunk`s. - fn from_bit_chunks>(config: C, hash_builder: H, chunks: I) -> Self { + fn from_bit_chunks>( + config: C, + hash_builder: H, + chunks: I, + ) -> Self { let mut ones = iter_ones::(chunks.peekable()); let mut msb = Vec::default(); @@ -117,7 +129,7 @@ impl, H: BuildHasher + Clone> GeoDiffCount<'_, C, H> { config, msb: Cow::from(msb), lsb, - hash_builder, + build_hasher: hash_builder, }; result.debug_assert_invariants(); result @@ -258,7 +270,7 @@ impl, H: BuildHasher + Clone> GeoDiffCount<'_, C, H> { config: self.config, lsb: self.lsb.into_owned(), msb: Cow::Owned(self.msb.into_owned()), - hash_builder: self.hash_builder.clone(), + build_hasher: self.build_hasher.clone(), } } @@ -296,14 +308,14 @@ impl, H: BuildHasher + Clone> GeoDiffCount<'_, C, H> { /// after compression : 01 0 10 1 00 0 /// bitset of the returned filter : 010 101000 #[cfg(test)] -pub(crate) fn masked, H: BuildHasher + Clone>( +pub(crate) fn masked, H: GeoFilterBuildHasher>( diff_count: &GeoDiffCount<'_, C, H>, mask: usize, modulus: usize, ) -> GeoDiffCount<'static, C, H> { GeoDiffCount::from_bit_chunks( diff_count.config.clone(), - diff_count.hash_builder.clone(), + diff_count.build_hasher.clone(), mask_bit_chunks(diff_count.bit_chunks(), mask as u64, modulus).peekable(), ) } @@ -311,9 +323,9 @@ pub(crate) fn masked, H: BuildHasher + Clone>( /// Computes an xor of the two underlying bitsets. /// This operation corresponds to computing the symmetric difference of the two /// sets represented by the GeoDiffCounts. -/// +/// /// SAFETY: The two GeoDiffCounts must have the same configuration and hash builder. -pub(crate) fn xor, H: BuildHasher + Clone>( +pub(crate) fn xor, H: GeoFilterBuildHasher>( diff_count: &GeoDiffCount<'_, C, H>, other: &GeoDiffCount<'_, C, H>, ) -> GeoDiffCount<'static, C, H> { @@ -328,14 +340,14 @@ pub(crate) fn xor, H: BuildHasher + Clone>( GeoDiffCount::from_bit_chunks( diff_count.config.clone(), - diff_count.hash_builder.clone(), + diff_count.build_hasher.clone(), xor_bit_chunks(diff_count.bit_chunks(), other.bit_chunks()).peekable(), ) } -impl, H: BuildHasher + Clone> Count for GeoDiffCount<'_, C, H> { +impl, H: GeoFilterBuildHasher> Count for GeoDiffCount<'_, C, H> { fn hasher_builder(&self) -> &H { - &self.hash_builder + &self.build_hasher } fn push_hash(&mut self, hash: u64) { @@ -359,8 +371,16 @@ impl, H: BuildHasher + Clone> Count for GeoDiffCount } fn bytes_in_memory(&self) -> usize { - let Self { config, msb, lsb, hash_builder } = self; - size_of_val(config) + msb.len() * size_of::() + lsb.bytes_in_memory() + size_of_val(hash_builder) + let Self { + config, + msb, + lsb, + build_hasher: hash_builder, + } = self; + size_of_val(config) + + msb.len() * size_of::() + + lsb.bytes_in_memory() + + size_of_val(hash_builder) } } @@ -378,7 +398,8 @@ mod tests { // // scripts/accuracy -n 10000 geo_diff/u16/b=7/bytes=50/msb=10 // - type GeoDiffCount7_50<'a> = GeoDiffCount<'a, FixedConfig, FnvBuildHasher>; + type GeoDiffCount7_50<'a> = + GeoDiffCount<'a, FixedConfig>; #[test] fn test_geo_count() { @@ -554,7 +575,7 @@ mod tests { } let actual = GeoDiffCount::from_bit_chunks( expected.config.clone(), - expected.hash_builder.clone(), + expected.build_hasher.clone(), expected.bit_chunks().peekable(), ); assert_eq!(expected, actual); @@ -570,9 +591,9 @@ mod tests { assert_eq!(vec![17, 11, 7], a.msb.iter().copied().collect_vec()); } - impl> GeoDiffCount<'_, C, FnvBuildHasher> { + impl> GeoDiffCount<'_, C, DefaultBuildHasher> { fn from_ones(config: C, ones: impl IntoIterator) -> Self { - let mut result = Self::new(config, FnvBuildHasher::default()); + let mut result = Self::new(config); for one in ones { result.xor_bit(one); } diff --git a/crates/geo_filters/src/diff_count/config.rs b/crates/geo_filters/src/diff_count/config.rs index 7ad7447..ecb4dc9 100644 --- a/crates/geo_filters/src/diff_count/config.rs +++ b/crates/geo_filters/src/diff_count/config.rs @@ -1,5 +1,6 @@ use once_cell::sync::Lazy; +use crate::build_hasher::DefaultBuildHasher; use crate::config::EstimationLookup; use crate::config::FixedConfig; use crate::config::HashToBucketLookup; @@ -17,7 +18,7 @@ use crate::Diff; // // scripts/accuracy -n 10000 geo_diff/u16/b=7/bytes=112/msb={8,12,16,20} // -pub type GeoDiffConfig7 = FixedConfig; +pub type GeoDiffConfig7 = FixedConfig; /// Diff count configuration with a relative error standard deviation of ~0.015. // @@ -29,7 +30,7 @@ pub type GeoDiffConfig7 = FixedConfig; // // scripts/accuracy -n 1000 geo_diff/u32/b=13/bytes=7138/msb={128,192,256,384,512} // -pub type GeoDiffConfig13 = FixedConfig; +pub type GeoDiffConfig13 = FixedConfig; impl Lookups for Diff { #[inline] diff --git a/crates/geo_filters/src/diff_count/sim_hash.rs b/crates/geo_filters/src/diff_count/sim_hash.rs index 32ca0ae..8050b47 100644 --- a/crates/geo_filters/src/diff_count/sim_hash.rs +++ b/crates/geo_filters/src/diff_count/sim_hash.rs @@ -1,10 +1,11 @@ //! Similarity hashing allows quickly finding similar sets with a reverse index. -use std::hash::{BuildHasher, Hash, Hasher}; +use std::hash::{ Hash, Hasher}; use std::ops::Range; use fnv::FnvHasher; +use crate::build_hasher::GeoFilterBuildHasher; use crate::config::{BitChunk, GeoConfig, IsBucketType}; use crate::diff_count::GeoDiffCount; use crate::Diff; @@ -42,7 +43,7 @@ impl SimHash { } } -impl, H: BuildHasher + Clone> GeoDiffCount<'_, C, H> { +impl, H: GeoFilterBuildHasher> GeoDiffCount<'_, C, H> { /// Given the expected size of a diff, this function returns the range of bucket ids which should /// be searched for in order to find geometric filters of the desired similarity. If at least half /// of the buckets in the range match, one found a match that has the expected diff size (or better). @@ -107,13 +108,13 @@ impl, H: BuildHasher + Clone> GeoDiffCount<'_, C, H> { /// /// Property 2 is used to construct all sim hashes efficiently. /// Property 3 is used to search for similar GeoDiffCounts. -struct SimHashIterator<'a, C: GeoConfig, H: BuildHasher + Clone> { +struct SimHashIterator<'a, C: GeoConfig, H: GeoFilterBuildHasher> { filter: &'a GeoDiffCount<'a, C, H>, prev_bucket_id: BucketId, sim_hash: [u64; SIM_BUCKETS], } -impl<'a, C: GeoConfig, H: BuildHasher + Clone> SimHashIterator<'a, C, H> { +impl<'a, C: GeoConfig, H: GeoFilterBuildHasher> SimHashIterator<'a, C, H> { pub fn new(filter: &'a GeoDiffCount<'a, C, H>) -> Self { let msb = filter.nth_most_significant_one(0); let prev_bucket_id = @@ -126,7 +127,7 @@ impl<'a, C: GeoConfig, H: BuildHasher + Clone> SimHashIterator<'a, C, H> { } } -impl, H: BuildHasher + Clone> Iterator for SimHashIterator<'_, C, H> { +impl, H: GeoFilterBuildHasher> Iterator for SimHashIterator<'_, C, H> { type Item = (BucketId, SimHash); fn next(&mut self) -> Option { @@ -148,7 +149,7 @@ impl, H: BuildHasher + Clone> Iterator for SimHashIterator<'_ } } -impl, H: BuildHasher + Clone> GeoDiffCount<'_, C, H> { +impl, H: GeoFilterBuildHasher> GeoDiffCount<'_, C, H> { /// n specifies the desired zero-based index of the most significant one. /// The zero-based index of the desired one bit is returned. fn nth_most_significant_one(&self, mut n: usize) -> Option { diff --git a/crates/geo_filters/src/distinct_count.rs b/crates/geo_filters/src/distinct_count.rs index 9253fcd..f8d29ff 100644 --- a/crates/geo_filters/src/distinct_count.rs +++ b/crates/geo_filters/src/distinct_count.rs @@ -1,9 +1,9 @@ //! Geometric filter implementation for distinct count. use std::collections::VecDeque; -use std::hash::BuildHasher; use std::mem::{size_of, size_of_val}; +use crate::build_hasher::GeoFilterBuildHasher; use crate::config::{ count_ones_from_bitchunks, iter_bit_chunks, iter_ones, max_lsb_bytes, or_bit_chunks, take_ref, BitChunk, GeoConfig, IsBucketType, @@ -30,20 +30,24 @@ pub type GeoDistinctCount13<'a> = GeoDistinctCount<'a, GeoDistinctConfig13, FnvB /// The [`GeoDistinctCount`] falls into the category of probabilistic set size estimators. /// It has some similar properties as related filters like HyperLogLog, MinHash, etc, but uses less space. #[derive(Eq, PartialEq)] -pub struct GeoDistinctCount<'a, C: GeoConfig, H: BuildHasher> { +pub struct GeoDistinctCount<'a, C: GeoConfig, H: GeoFilterBuildHasher> { config: C, hash_builder: H, msb: VecDeque, lsb: BitDeque<'a>, } -impl + Default, H: BuildHasher + Clone + Default> Default for GeoDistinctCount<'_, C, H> { +impl + Default, H: GeoFilterBuildHasher> Default + for GeoDistinctCount<'_, C, H> +{ fn default() -> Self { - Self::new(C::default(), H::default()) + Self::with_build_hasher(C::default(), H::default()) } } -impl, H: BuildHasher + Clone> std::fmt::Debug for GeoDistinctCount<'_, C, H> { +impl, H: GeoFilterBuildHasher> std::fmt::Debug + for GeoDistinctCount<'_, C, H> +{ fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!( f, @@ -55,17 +59,30 @@ impl, H: BuildHasher + Clone> std::fmt::Debug for GeoDist } } -impl, H: BuildHasher + Clone> GeoDistinctCount<'_, C, H> { - pub fn new(config: C, hash_builder: H) -> Self { +impl, H: GeoFilterBuildHasher> GeoDistinctCount<'_, C, H> { + pub fn new(config: C) -> Self { + Self::with_build_hasher(config, H::default()) + } + + pub fn with_build_hasher(config: C, hash_builder: H) -> Self { let msb = Default::default(); let lsb = BitDeque::new(max_lsb_bytes::( config.max_bytes(), config.max_msb_len(), )); - Self { config, msb, lsb, hash_builder } + Self { + config, + msb, + lsb, + hash_builder, + } } - fn from_bit_chunks>(config: C, hash_builder: H, chunks: I) -> Self { + fn from_bit_chunks>( + config: C, + hash_builder: H, + chunks: I, + ) -> Self { let mut ones = iter_ones::(chunks.peekable()); let mut msb = VecDeque::default(); @@ -86,7 +103,12 @@ impl, H: BuildHasher + Clone> GeoDistinctCount<'_, C, H> max_lsb_bytes::(config.max_bytes(), config.max_msb_len()), ); - let result = Self { config, msb, lsb, hash_builder }; + let result = Self { + config, + msb, + lsb, + hash_builder, + }; result.debug_assert_invariants(); result } @@ -131,7 +153,9 @@ impl, H: BuildHasher + Clone> GeoDistinctCount<'_, C, H> } } -impl, H: BuildHasher + Clone> Count for GeoDistinctCount<'_, C, H> { +impl, H: GeoFilterBuildHasher> Count + for GeoDistinctCount<'_, C, H> +{ fn hasher_builder(&self) -> &H { &self.hash_builder } @@ -181,13 +205,20 @@ impl, H: BuildHasher + Clone> Count for GeoD } fn bytes_in_memory(&self) -> usize { - let Self { config, msb, lsb, hash_builder } = self; - size_of_val(config) + msb.len() * size_of::() + lsb.bytes_in_memory() + - size_of_val(hash_builder) + let Self { + config, + msb, + lsb, + hash_builder, + } = self; + size_of_val(config) + + msb.len() * size_of::() + + lsb.bytes_in_memory() + + size_of_val(hash_builder) } } -impl, H: BuildHasher + Clone> GeoDistinctCount<'_, C, H> { +impl, H: GeoFilterBuildHasher> GeoDistinctCount<'_, C, H> { fn insert_into_lsb(&mut self, bucket: usize) { if !self.lsb.test_bit(bucket) { self.lsb.set_bit(bucket); @@ -216,7 +247,7 @@ impl, H: BuildHasher + Clone> GeoDistinctCount<'_, C, H> } } -fn or, H: BuildHasher + Clone>( +fn or, H: GeoFilterBuildHasher>( a: &GeoDistinctCount<'_, C, H>, b: &GeoDistinctCount<'_, C, H>, ) -> GeoDistinctCount<'static, C, H> { @@ -241,6 +272,7 @@ mod tests { use itertools::Itertools; use rand::{RngCore, SeedableRng}; + use crate::build_hasher::DefaultBuildHasher; use crate::config::{iter_ones, tests::test_estimate, FixedConfig, VariableConfig}; use crate::evaluation::simulation::simulate; @@ -248,7 +280,7 @@ mod tests { #[test] fn test_lookup_table() { - let c = FixedConfig::::default(); + let c = FixedConfig::::default(); for i in 0..c.max_bytes() * 4 { let hash = (c.phi_f64().powf(i as f64 + 0.5) * u64::MAX as f64).round() as u64; let a = c.hash_to_bucket(hash); @@ -262,7 +294,7 @@ mod tests { // Pairs of (n, expected) where n is the number of inserted items // and expected is the expected size of the GeoDistinctCount. // The output matching the expected values is dependent on the configuration - // and hashing function. Changes to these will lead to different results and the + // and hashing function. Changes to these will lead to different results and the // test will need to be updated. for (n, result) in [ (10, 10.0021105), @@ -377,11 +409,15 @@ mod tests { let msb = golden_section_min(1.0, 1000.0, |msb| { simulate( || { - Box::new(GeoDistinctCount::new(VariableConfig::<_, u32>::new( + Box::new(GeoDistinctCount::new(VariableConfig::< + _, + u32, + DefaultBuildHasher, + >::new( 13, 7800, (7800 - (msb.round() as usize) * 8) / 3, - ), FnvBuildHasher::default())) + ))) }, 3000, &[100000], @@ -399,8 +435,11 @@ mod tests { for _ in 0..1000 { expected.push_hash(rnd.next_u64()); } - let actual = - GeoDistinctCount::from_bit_chunks(expected.config.clone(), expected.hash_builder.clone(), expected.bit_chunks()); + let actual = GeoDistinctCount::from_bit_chunks( + expected.config.clone(), + expected.hash_builder.clone(), + expected.bit_chunks(), + ); assert_eq!(expected, actual); } } @@ -416,7 +455,7 @@ mod tests { impl> GeoDistinctCount<'_, C, FnvBuildHasher> { fn from_ones(config: C, ones: impl IntoIterator) -> Self { - let mut result = Self::new(config, FnvBuildHasher::default()); + let mut result = Self::new(config); for one in ones { result.set_bit(one); } diff --git a/crates/geo_filters/src/distinct_count/config.rs b/crates/geo_filters/src/distinct_count/config.rs index 6498fe5..8118c91 100644 --- a/crates/geo_filters/src/distinct_count/config.rs +++ b/crates/geo_filters/src/distinct_count/config.rs @@ -1,5 +1,6 @@ use once_cell::sync::Lazy; +use crate::build_hasher::DefaultBuildHasher; use crate::config::EstimationLookup; use crate::config::FixedConfig; use crate::config::HashToBucketLookup; @@ -18,7 +19,7 @@ use crate::Distinct; // // scripts/accuracy -n 10000 geo_distinct/u16/b=7/bytes=136/msb={8,16,32,64} // -pub type GeoDistinctConfig7 = FixedConfig; +pub type GeoDistinctConfig7 = FixedConfig; /// Distinct count configuration with a relative error standard deviation of ~0.0075. /// Uses at most 9248 bytes of memory. @@ -31,7 +32,7 @@ pub type GeoDistinctConfig7 = FixedConfig; // // scripts/accuracy -n 10000 geo_distinct/u32/b=13/bytes=9216/msb={128,192,256,320,512,640} // -pub type GeoDistinctConfig13 = FixedConfig; +pub type GeoDistinctConfig13 = FixedConfig; impl Lookups for Distinct { #[inline] diff --git a/crates/geo_filters/src/evaluation/hll.rs b/crates/geo_filters/src/evaluation/hll.rs index 299db81..254f749 100644 --- a/crates/geo_filters/src/evaluation/hll.rs +++ b/crates/geo_filters/src/evaluation/hll.rs @@ -77,6 +77,7 @@ pub struct NoopHasher { hash: u64, } +#[derive(Clone, Default)] pub struct BuildNoopHasher {} impl Hasher for NoopHasher { diff --git a/crates/geo_filters/src/lib.rs b/crates/geo_filters/src/lib.rs index 04b7c52..6e85136 100644 --- a/crates/geo_filters/src/lib.rs +++ b/crates/geo_filters/src/lib.rs @@ -12,8 +12,11 @@ pub mod diff_count; pub mod distinct_count; #[cfg(feature = "evaluation")] pub mod evaluation; +pub mod build_hasher; -use std::hash::{BuildHasher, Hash}; +use std::hash::Hash; + +use crate::build_hasher::GeoFilterBuildHasher; /// Marker trait to indicate the variant implemented by a [`Count`] instance. pub trait Method: Clone + Eq + PartialEq + Send + Sync {} @@ -29,7 +32,7 @@ pub struct Distinct {} impl Method for Distinct {} /// Trait for types solving the set cardinality estimation problem. -pub trait Count { +pub trait Count { /// Get the current hash builder fn hasher_builder(&self) -> &H; diff --git a/crates/geo_filters/tests/public_api.rs b/crates/geo_filters/tests/public_api.rs index 2e0482f..5849856 100644 --- a/crates/geo_filters/tests/public_api.rs +++ b/crates/geo_filters/tests/public_api.rs @@ -23,7 +23,7 @@ fn can_use_diff_count_with_predefined_config_value() { use geo_filters::diff_count::{GeoDiffConfig7, GeoDiffCount}; use geo_filters::Count; let c = GeoDiffConfig7::default(); - let mut f = GeoDiffCount::new(c, FnvBuildHasher::default()); + let mut f = GeoDiffCount::new(c); f.push(42); f.size(); } @@ -34,7 +34,7 @@ fn can_use_diff_count_with_fixed_config_value() { use geo_filters::diff_count::GeoDiffCount; use geo_filters::Count; let c = FixedConfig::<_, u16, 7, 128, 10>::default(); - let mut f = GeoDiffCount::new(c, FnvBuildHasher::default()); + let mut f = GeoDiffCount::new(c); f.push(42); f.size(); } From 7e35a6bf0e89eb852413b3580ee8970647cbafdd Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Mon, 21 Jul 2025 12:46:56 +0100 Subject: [PATCH 04/12] clean up type parameters --- crates/geo_filters/evaluation/accuracy.rs | 33 +++--- crates/geo_filters/evaluation/performance.rs | 8 +- crates/geo_filters/src/build_hasher.rs | 36 +++--- crates/geo_filters/src/config.rs | 62 +++++------ crates/geo_filters/src/diff_count.rs | 80 ++++---------- crates/geo_filters/src/diff_count/sim_hash.rs | 17 ++- crates/geo_filters/src/distinct_count.rs | 103 +++++------------- crates/geo_filters/src/evaluation/hll.rs | 11 +- .../geo_filters/src/evaluation/simulation.rs | 5 +- crates/geo_filters/src/lib.rs | 17 ++- crates/geo_filters/tests/public_api.rs | 22 ++-- 11 files changed, 156 insertions(+), 238 deletions(-) diff --git a/crates/geo_filters/evaluation/accuracy.rs b/crates/geo_filters/evaluation/accuracy.rs index 2deea38..6ae6df3 100644 --- a/crates/geo_filters/evaluation/accuracy.rs +++ b/crates/geo_filters/evaluation/accuracy.rs @@ -2,7 +2,6 @@ use std::fs::File; use std::path::PathBuf; use clap::Parser; -use fnv::FnvBuildHasher; use geo_filters::build_hasher::DefaultBuildHasher; use geo_filters::config::VariableConfig; use itertools::Itertools; @@ -158,20 +157,20 @@ static SIMULATION_CONFIG_FROM_STR: Lazy> = Lazy::new let [b, bytes, msb] = capture_usizes(&c, [2, 3, 4]); match t { BucketType::U8 => { - let c = VariableConfig::<_, u8>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDiffCount::new(c.clone(), DefaultBuildHasher::default()))) + let c = VariableConfig::<_, u8, DefaultBuildHasher>::new(b, bytes, msb); + Box::new(move || Box::new(GeoDiffCount::new(c.clone()))) } BucketType::U16 => { - let c = VariableConfig::<_, u16>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDiffCount::new(c.clone(), DefaultBuildHasher::default()))) + let c = VariableConfig::<_, u16, DefaultBuildHasher>::new(b, bytes, msb); + Box::new(move || Box::new(GeoDiffCount::new(c.clone()))) } BucketType::U32 => { - let c = VariableConfig::<_, u32>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDiffCount::new(c.clone(), DefaultBuildHasher::default()))) + let c = VariableConfig::<_, u32, DefaultBuildHasher>::new(b, bytes, msb); + Box::new(move || Box::new(GeoDiffCount::new(c.clone()))) } BucketType::U64 => { - let c = VariableConfig::<_, u64>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDiffCount::new(c.clone(), DefaultBuildHasher::default()))) + let c = VariableConfig::<_, u64, DefaultBuildHasher>::new(b, bytes, msb); + Box::new(move || Box::new(GeoDiffCount::new(c.clone()))) } } }), @@ -187,20 +186,20 @@ static SIMULATION_CONFIG_FROM_STR: Lazy> = Lazy::new match t { BucketType::U8 => { - let c = VariableConfig::<_, u8>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDistinctCount::new(c.clone(), DefaultBuildHasher::default()))) + let c = VariableConfig::<_, u8, DefaultBuildHasher>::new(b, bytes, msb); + Box::new(move || Box::new(GeoDistinctCount::new(c.clone()))) } BucketType::U16 => { - let c = VariableConfig::<_, u16>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDistinctCount::new(c.clone(), DefaultBuildHasher::default()))) + let c = VariableConfig::<_, u16, DefaultBuildHasher>::new(b, bytes, msb); + Box::new(move || Box::new(GeoDistinctCount::new(c.clone()))) } BucketType::U32 => { - let c = VariableConfig::<_, u32>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDistinctCount::new(c.clone(), DefaultBuildHasher::default()))) + let c = VariableConfig::<_, u32, DefaultBuildHasher>::new(b, bytes, msb); + Box::new(move || Box::new(GeoDistinctCount::new(c.clone()))) } BucketType::U64 => { - let c = VariableConfig::<_, u64>::new(b, bytes, msb); - Box::new(move || Box::new(GeoDistinctCount::new(c.clone(), DefaultBuildHasher::default()))) + let c = VariableConfig::<_, u64, DefaultBuildHasher>::new(b, bytes, msb); + Box::new(move || Box::new(GeoDistinctCount::new(c.clone()))) } } }), diff --git a/crates/geo_filters/evaluation/performance.rs b/crates/geo_filters/evaluation/performance.rs index 2a493bd..947eecb 100644 --- a/crates/geo_filters/evaluation/performance.rs +++ b/crates/geo_filters/evaluation/performance.rs @@ -1,5 +1,5 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use fnv::FnvBuildHasher; +use geo_filters::build_hasher::DefaultBuildHasher; use geo_filters::config::VariableConfig; use geo_filters::diff_count::{GeoDiffCount, GeoDiffCount13}; use geo_filters::distinct_count::GeoDistinctCount13; @@ -21,7 +21,7 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); group.bench_function("geo_diff_count_var_13", |b| { - let c = VariableConfig::<_, u32>::new(13, 7680, 256); + let c = VariableConfig::<_, u32, DefaultBuildHasher>::new(13, 7680, 256); b.iter(move || { let mut gc = GeoDiffCount::new(c.clone()); for i in 0..*size { @@ -60,7 +60,7 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); group.bench_function("geo_diff_count_var_13", |b| { - let c = VariableConfig::<_, u32>::new(13, 7680, 256); + let c = VariableConfig::<_, u32, DefaultBuildHasher>::new(13, 7680, 256); b.iter(move || { let mut gc = GeoDiffCount::new(c.clone()); for i in 0..*size { @@ -105,7 +105,7 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); group.bench_function("geo_diff_count_var_13", |b| { - let c = VariableConfig::<_, u32>::new(13, 7680, 256); + let c = VariableConfig::<_, u32, DefaultBuildHasher>::new(13, 7680, 256); b.iter(move || { let mut gc1 = GeoDiffCount::new(c.clone()); let mut gc2 = GeoDiffCount::new(c.clone()); diff --git a/crates/geo_filters/src/build_hasher.rs b/crates/geo_filters/src/build_hasher.rs index f861f22..dd7e37a 100644 --- a/crates/geo_filters/src/build_hasher.rs +++ b/crates/geo_filters/src/build_hasher.rs @@ -2,26 +2,34 @@ use std::hash::{BuildHasher, BuildHasherDefault, DefaultHasher, Hasher as _}; /// Trait for a hasher factory that can be used to produce hashers /// for use with geometric filters. -/// +/// /// It is a super set of [`BuildHasher`], enforcing additional requirements /// on the hasher builder that are required for the geometric filters and /// surrounding code. -/// +/// /// When performing operations across two different geometric filters, /// the hashers must be equal, i.e. they must produce the same hash for the -/// same input. This is checked by the `hasher_eq` method. -pub trait GeoFilterBuildHasher: BuildHasher + Default + Clone + Send + Sync { - fn hasher_eq(&self, other: &Self) -> bool { - let v1 = self.build_hasher().finish(); - let v2 = other.build_hasher().finish(); - v1 == v2 +/// same input. +pub trait ReproducibleBuildHasher: BuildHasher + Default + Clone { + #[inline] + fn debug_assert_hashers_eq() { + // In debug builds we check that hash outputs are the same for + // self and other. The library user should only have implemented + // our build hasher trait if this is already true, but we check + // here in case they have implemented the trait in error. + debug_assert_eq!( + Self::default().build_hasher().finish(), + Self::default().build_hasher().finish(), + "Hashers produced by GeoFilterBuildHasher do not produce the same output with the same input" + ); } } -impl GeoFilterBuildHasher for T -where - T: BuildHasher + Default + Clone + Send + Sync, -{ -} +// Note that this `BuildHasher` has a consistent implementation of `Default` +// but is NOT stable across releases of Rust. It is therefore dangerous +// to use if you plan on serializing the geofilters and reusing them due +// to the fact that you can serialize a filter made with one version and +// deserialize with another version of the hasher factor. +pub type DefaultBuildHasher = BuildHasherDefault; -pub type DefaultBuildHasher = BuildHasherDefault; \ No newline at end of file +impl ReproducibleBuildHasher for DefaultBuildHasher {} diff --git a/crates/geo_filters/src/config.rs b/crates/geo_filters/src/config.rs index aaaac6c..f1e2fa2 100644 --- a/crates/geo_filters/src/config.rs +++ b/crates/geo_filters/src/config.rs @@ -2,7 +2,7 @@ use std::{marker::PhantomData, sync::Arc}; -use crate::{build_hasher::GeoFilterBuildHasher, Method}; +use crate::{build_hasher::ReproducibleBuildHasher, Method}; mod bitchunks; mod buckets; @@ -30,8 +30,9 @@ use once_cell::sync::Lazy; /// Those conversions can be shared across multiple geo filter instances. This way, the /// conversions can also be optimized via e.g. lookup tables without paying the cost with every /// new geo filter instance again and again. -pub trait GeoConfig: Clone + Eq + Sized + Send + Sync { +pub trait GeoConfig: Clone + Eq + Sized { type BucketType: IsBucketType + 'static; + type BuildHasher: ReproducibleBuildHasher; /// The number of most-significant bits that are stored sparsely as positions. fn max_msb_len(&self) -> usize; @@ -86,10 +87,9 @@ pub struct FixedConfig< const B: usize, const BYTES: usize, const MSB: usize, - H: GeoFilterBuildHasher, + H: ReproducibleBuildHasher, > { - build_hasher: H, - _phantom: PhantomData<(M, T)>, + _phantom: PhantomData<(M, T, H)>, } impl< @@ -98,10 +98,11 @@ impl< const B: usize, const BYTES: usize, const MSB: usize, - H: GeoFilterBuildHasher, + H: ReproducibleBuildHasher, > GeoConfig for FixedConfig { type BucketType = T; + type BuildHasher = H; #[inline] fn max_msb_len(&self) -> usize { @@ -157,7 +158,7 @@ impl< const B: usize, const BYTES: usize, const MSB: usize, - H: GeoFilterBuildHasher, + H: ReproducibleBuildHasher, > Default for FixedConfig { fn default() -> Self { @@ -173,7 +174,6 @@ impl< Self { _phantom: PhantomData, - build_hasher: H::default(), } } } @@ -184,11 +184,15 @@ impl< const B: usize, const BYTES: usize, const MSB: usize, - H: GeoFilterBuildHasher, + H: ReproducibleBuildHasher, > PartialEq for FixedConfig { - fn eq(&self, other: &Self) -> bool { - self.build_hasher.hasher_eq(&other.build_hasher) + fn eq(&self, _other: &Self) -> bool { + H::debug_assert_hashers_eq(); + + // The values of the fixed config are provided at compile time + // so no runtime computation is required + true } } @@ -198,46 +202,36 @@ impl< const B: usize, const BYTES: usize, const MSB: usize, - H: GeoFilterBuildHasher, + H: ReproducibleBuildHasher, > Eq for FixedConfig { } /// Geometric filter configuration using dynamic lookup tables. #[derive(Clone)] -pub struct VariableConfig { +pub struct VariableConfig { b: usize, bytes: usize, msb: usize, - _phantom: PhantomData<(M, T)>, + _phantom: PhantomData<(M, T, H)>, lookup: Arc, - build_hasher: H, } -impl Eq for VariableConfig {} +impl Eq for VariableConfig {} -impl PartialEq for VariableConfig { +impl PartialEq for VariableConfig { fn eq(&self, other: &Self) -> bool { - self.b == other.b - && self.bytes == other.bytes - && self.msb == other.msb - && self.build_hasher.hasher_eq(&other.build_hasher) + H::debug_assert_hashers_eq(); + + self.b == other.b && self.bytes == other.bytes && self.msb == other.msb } } -impl VariableConfig { +impl VariableConfig { /// Returns a new configuration value. See [`FixedConfig`] for the meaning /// of the parameters. This functions computes a new lookup table every time /// it is invoked, so make sure to share the resulting value as much as possible. pub fn new(b: usize, bytes: usize, msb: usize) -> Self { - Self::with_hasher(b, bytes, msb, H::default()) - } - - /// Returns a new configuration value, specifying the [`GeoFilterBuildHasher`]. - /// Useful, for example, if you wish to use a custom seed in your hashers. - /// - /// See [`Self::new`] for more. - pub fn with_hasher(b: usize, bytes: usize, msb: usize, build_hasher: H) -> Self { assert_bucket_type_large_enough::(b); assert_buckets_within_estimation_bound(b, bytes * BITS_PER_BYTE); Self { @@ -246,7 +240,6 @@ impl VariableConf msb, _phantom: PhantomData, lookup: Arc::new(M::new_lookup(b)), - build_hasher, } } @@ -256,10 +249,11 @@ impl VariableConf } } -impl GeoConfig +impl GeoConfig for VariableConfig { type BucketType = T; + type BuildHasher = H; #[inline] fn max_msb_len(&self) -> usize { @@ -361,10 +355,10 @@ pub(crate) fn take_ref(iter: &mut I, n: usize) -> impl Iterator>( + pub(crate) fn test_estimate>( f: impl Fn() -> C, ) -> (f32, f32) { let mut rnd = rand::rngs::StdRng::from_os_rng(); diff --git a/crates/geo_filters/src/diff_count.rs b/crates/geo_filters/src/diff_count.rs index 1255776..f5b366d 100644 --- a/crates/geo_filters/src/diff_count.rs +++ b/crates/geo_filters/src/diff_count.rs @@ -4,7 +4,6 @@ use std::borrow::Cow; use std::cmp::Ordering; use std::mem::{size_of, size_of_val}; -use crate::build_hasher::{DefaultBuildHasher, GeoFilterBuildHasher}; use crate::config::{ count_ones_from_bitchunks, count_ones_from_msb_and_lsb, iter_bit_chunks, iter_ones, mask_bit_chunks, take_ref, xor_bit_chunks, BitChunk, GeoConfig, IsBucketType, @@ -19,10 +18,10 @@ use bitvec::*; pub use config::{GeoDiffConfig13, GeoDiffConfig7}; /// Diff count filter with a relative error standard deviation of ~0.125. -pub type GeoDiffCount7<'a> = GeoDiffCount<'a, GeoDiffConfig7, DefaultBuildHasher>; +pub type GeoDiffCount7<'a> = GeoDiffCount<'a, GeoDiffConfig7>; /// Diff count filter with a relative error standard deviation of ~0.015. -pub type GeoDiffCount13<'a> = GeoDiffCount<'a, GeoDiffConfig13, DefaultBuildHasher>; +pub type GeoDiffCount13<'a> = GeoDiffCount<'a, GeoDiffConfig13>; /// Probabilistic diff count data structure based on geometric filters. /// @@ -58,15 +57,14 @@ pub type GeoDiffCount13<'a> = GeoDiffCount<'a, GeoDiffConfig13, DefaultBuildHash /// 1500 bytes achieves a precision of ~0.022 and could estimate the 10k items with an error /// of +/-22k items in the best case which is 20 times worse despite using 5 times more space! #[derive(Clone, Default, PartialEq, Eq, PartialOrd, Ord)] -pub struct GeoDiffCount<'a, C: GeoConfig, H: GeoFilterBuildHasher = DefaultBuildHasher> { +pub struct GeoDiffCount<'a, C: GeoConfig> { config: C, /// The bit positions are stored from largest to smallest. msb: Cow<'a, [C::BucketType]>, lsb: BitVec<'a>, - build_hasher: H, } -impl, H: GeoFilterBuildHasher> std::fmt::Debug for GeoDiffCount<'_, C, H> { +impl> std::fmt::Debug for GeoDiffCount<'_, C> { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!( f, @@ -78,22 +76,12 @@ impl, H: GeoFilterBuildHasher> std::fmt::Debug for GeoDiffCou } } -impl, H: GeoFilterBuildHasher> GeoDiffCount<'_, C, H> { +impl> GeoDiffCount<'_, C> { pub fn new(config: C) -> Self { Self { config, msb: Default::default(), lsb: Default::default(), - build_hasher: Default::default(), - } - } - - pub fn with_build_hasher(config: C, build_hasher: H) -> Self { - Self { - config, - msb: Default::default(), - lsb: Default::default(), - build_hasher, } } @@ -104,11 +92,7 @@ impl, H: GeoFilterBuildHasher> GeoDiffCount<'_, C, H> { /// /// Note: we need a peekable iterator, such that we can extract the most significant bits without /// having to construct another iterator with the remaining `BitChunk`s. - fn from_bit_chunks>( - config: C, - hash_builder: H, - chunks: I, - ) -> Self { + fn from_bit_chunks>(config: C, chunks: I) -> Self { let mut ones = iter_ones::(chunks.peekable()); let mut msb = Vec::default(); @@ -129,7 +113,6 @@ impl, H: GeoFilterBuildHasher> GeoDiffCount<'_, C, H> { config, msb: Cow::from(msb), lsb, - build_hasher: hash_builder, }; result.debug_assert_invariants(); result @@ -265,12 +248,11 @@ impl, H: GeoFilterBuildHasher> GeoDiffCount<'_, C, H> { // Consumers the current GeoDiffCount and produces an owned GeoDiffCount that // can live arbitrarily. - pub fn into_owned(self) -> GeoDiffCount<'static, C, H> { + pub fn into_owned(self) -> GeoDiffCount<'static, C> { GeoDiffCount { config: self.config, lsb: self.lsb.into_owned(), msb: Cow::Owned(self.msb.into_owned()), - build_hasher: self.build_hasher.clone(), } } @@ -308,14 +290,13 @@ impl, H: GeoFilterBuildHasher> GeoDiffCount<'_, C, H> { /// after compression : 01 0 10 1 00 0 /// bitset of the returned filter : 010 101000 #[cfg(test)] -pub(crate) fn masked, H: GeoFilterBuildHasher>( - diff_count: &GeoDiffCount<'_, C, H>, +pub(crate) fn masked>( + diff_count: &GeoDiffCount<'_, C>, mask: usize, modulus: usize, -) -> GeoDiffCount<'static, C, H> { +) -> GeoDiffCount<'static, C> { GeoDiffCount::from_bit_chunks( diff_count.config.clone(), - diff_count.build_hasher.clone(), mask_bit_chunks(diff_count.bit_chunks(), mask as u64, modulus).peekable(), ) } @@ -325,31 +306,22 @@ pub(crate) fn masked, H: GeoFilterBuildHasher>( /// sets represented by the GeoDiffCounts. /// /// SAFETY: The two GeoDiffCounts must have the same configuration and hash builder. -pub(crate) fn xor, H: GeoFilterBuildHasher>( - diff_count: &GeoDiffCount<'_, C, H>, - other: &GeoDiffCount<'_, C, H>, -) -> GeoDiffCount<'static, C, H> { +pub(crate) fn xor>( + diff_count: &GeoDiffCount<'_, C>, + other: &GeoDiffCount<'_, C>, +) -> GeoDiffCount<'static, C> { assert!( diff_count.config == other.config, "combined filters must have the same configuration" ); - // It would be really nice to be able to asser that our hash builders have - // the same state here, but from what I can see it is not common - // for hash builders to implement `PartialEq` or `Eq`. - GeoDiffCount::from_bit_chunks( diff_count.config.clone(), - diff_count.build_hasher.clone(), xor_bit_chunks(diff_count.bit_chunks(), other.bit_chunks()).peekable(), ) } -impl, H: GeoFilterBuildHasher> Count for GeoDiffCount<'_, C, H> { - fn hasher_builder(&self) -> &H { - &self.build_hasher - } - +impl> Count for GeoDiffCount<'_, C> { fn push_hash(&mut self, hash: u64) { self.xor_bit(self.config.hash_to_bucket(hash)); } @@ -371,16 +343,9 @@ impl, H: GeoFilterBuildHasher> Count for GeoDiffCoun } fn bytes_in_memory(&self) -> usize { - let Self { - config, - msb, - lsb, - build_hasher: hash_builder, - } = self; - size_of_val(config) - + msb.len() * size_of::() - + lsb.bytes_in_memory() - + size_of_val(hash_builder) + let Self { config, msb, lsb } = self; + + size_of_val(config) + msb.len() * size_of::() + lsb.bytes_in_memory() } } @@ -389,7 +354,10 @@ mod tests { use itertools::Itertools; use rand::{RngCore, SeedableRng}; - use crate::config::{iter_ones, tests::test_estimate, FixedConfig}; + use crate::{ + build_hasher::DefaultBuildHasher, + config::{iter_ones, tests::test_estimate, FixedConfig}, + }; use super::*; @@ -414,6 +382,7 @@ mod tests { (10000000, 10194611.0), ] { let mut geo_count = GeoDiffCount13::default(); + (0..n).for_each(|i| geo_count.push(i)); assert_eq!(result, geo_count.size()); } @@ -575,7 +544,6 @@ mod tests { } let actual = GeoDiffCount::from_bit_chunks( expected.config.clone(), - expected.build_hasher.clone(), expected.bit_chunks().peekable(), ); assert_eq!(expected, actual); @@ -591,7 +559,7 @@ mod tests { assert_eq!(vec![17, 11, 7], a.msb.iter().copied().collect_vec()); } - impl> GeoDiffCount<'_, C, DefaultBuildHasher> { + impl> GeoDiffCount<'_, C> { fn from_ones(config: C, ones: impl IntoIterator) -> Self { let mut result = Self::new(config); for one in ones { diff --git a/crates/geo_filters/src/diff_count/sim_hash.rs b/crates/geo_filters/src/diff_count/sim_hash.rs index 8050b47..cee2370 100644 --- a/crates/geo_filters/src/diff_count/sim_hash.rs +++ b/crates/geo_filters/src/diff_count/sim_hash.rs @@ -1,11 +1,10 @@ //! Similarity hashing allows quickly finding similar sets with a reverse index. -use std::hash::{ Hash, Hasher}; +use std::hash::{Hash, Hasher}; use std::ops::Range; use fnv::FnvHasher; -use crate::build_hasher::GeoFilterBuildHasher; use crate::config::{BitChunk, GeoConfig, IsBucketType}; use crate::diff_count::GeoDiffCount; use crate::Diff; @@ -43,7 +42,7 @@ impl SimHash { } } -impl, H: GeoFilterBuildHasher> GeoDiffCount<'_, C, H> { +impl> GeoDiffCount<'_, C> { /// Given the expected size of a diff, this function returns the range of bucket ids which should /// be searched for in order to find geometric filters of the desired similarity. If at least half /// of the buckets in the range match, one found a match that has the expected diff size (or better). @@ -108,14 +107,14 @@ impl, H: GeoFilterBuildHasher> GeoDiffCount<'_, C, H> { /// /// Property 2 is used to construct all sim hashes efficiently. /// Property 3 is used to search for similar GeoDiffCounts. -struct SimHashIterator<'a, C: GeoConfig, H: GeoFilterBuildHasher> { - filter: &'a GeoDiffCount<'a, C, H>, +struct SimHashIterator<'a, C: GeoConfig> { + filter: &'a GeoDiffCount<'a, C>, prev_bucket_id: BucketId, sim_hash: [u64; SIM_BUCKETS], } -impl<'a, C: GeoConfig, H: GeoFilterBuildHasher> SimHashIterator<'a, C, H> { - pub fn new(filter: &'a GeoDiffCount<'a, C, H>) -> Self { +impl<'a, C: GeoConfig> SimHashIterator<'a, C> { + pub fn new(filter: &'a GeoDiffCount<'a, C>) -> Self { let msb = filter.nth_most_significant_one(0); let prev_bucket_id = msb.map(|b| b.into_usize() / SIM_BUCKET_SIZE).unwrap_or(0) + SIM_BUCKETS; @@ -127,7 +126,7 @@ impl<'a, C: GeoConfig, H: GeoFilterBuildHasher> SimHashIterator<'a, C, H> } } -impl, H: GeoFilterBuildHasher> Iterator for SimHashIterator<'_, C, H> { +impl> Iterator for SimHashIterator<'_, C> { type Item = (BucketId, SimHash); fn next(&mut self) -> Option { @@ -149,7 +148,7 @@ impl, H: GeoFilterBuildHasher> Iterator for SimHashIterator<' } } -impl, H: GeoFilterBuildHasher> GeoDiffCount<'_, C, H> { +impl> GeoDiffCount<'_, C> { /// n specifies the desired zero-based index of the most significant one. /// The zero-based index of the desired one bit is returned. fn nth_most_significant_one(&self, mut n: usize) -> Option { diff --git a/crates/geo_filters/src/distinct_count.rs b/crates/geo_filters/src/distinct_count.rs index f8d29ff..cdb68f9 100644 --- a/crates/geo_filters/src/distinct_count.rs +++ b/crates/geo_filters/src/distinct_count.rs @@ -3,7 +3,6 @@ use std::collections::VecDeque; use std::mem::{size_of, size_of_val}; -use crate::build_hasher::GeoFilterBuildHasher; use crate::config::{ count_ones_from_bitchunks, iter_bit_chunks, iter_ones, max_lsb_bytes, or_bit_chunks, take_ref, BitChunk, GeoConfig, IsBucketType, @@ -15,39 +14,33 @@ mod bitdeque; mod config; pub use config::{GeoDistinctConfig13, GeoDistinctConfig7}; -use fnv::FnvBuildHasher; /// Distinct count filter with a relative error standard deviation of ~0.065. /// Uses at most 168 bytes of memory. -pub type GeoDistinctCount7<'a> = GeoDistinctCount<'a, GeoDistinctConfig7, FnvBuildHasher>; +pub type GeoDistinctCount7<'a> = GeoDistinctCount<'a, GeoDistinctConfig7>; /// Distinct count filter with a relative error standard deviation of ~0.0075. /// Uses at most 9248 bytes of memory. -pub type GeoDistinctCount13<'a> = GeoDistinctCount<'a, GeoDistinctConfig13, FnvBuildHasher>; +pub type GeoDistinctCount13<'a> = GeoDistinctCount<'a, GeoDistinctConfig13>; /// Probabilistic distinct count data structure based on geometric filters. /// /// The [`GeoDistinctCount`] falls into the category of probabilistic set size estimators. /// It has some similar properties as related filters like HyperLogLog, MinHash, etc, but uses less space. #[derive(Eq, PartialEq)] -pub struct GeoDistinctCount<'a, C: GeoConfig, H: GeoFilterBuildHasher> { +pub struct GeoDistinctCount<'a, C: GeoConfig> { config: C, - hash_builder: H, msb: VecDeque, lsb: BitDeque<'a>, } -impl + Default, H: GeoFilterBuildHasher> Default - for GeoDistinctCount<'_, C, H> -{ +impl + Default> Default for GeoDistinctCount<'_, C> { fn default() -> Self { - Self::with_build_hasher(C::default(), H::default()) + Self::new(C::default()) } } -impl, H: GeoFilterBuildHasher> std::fmt::Debug - for GeoDistinctCount<'_, C, H> -{ +impl> std::fmt::Debug for GeoDistinctCount<'_, C> { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!( f, @@ -59,30 +52,17 @@ impl, H: GeoFilterBuildHasher> std::fmt::Debug } } -impl, H: GeoFilterBuildHasher> GeoDistinctCount<'_, C, H> { +impl> GeoDistinctCount<'_, C> { pub fn new(config: C) -> Self { - Self::with_build_hasher(config, H::default()) - } - - pub fn with_build_hasher(config: C, hash_builder: H) -> Self { let msb = Default::default(); let lsb = BitDeque::new(max_lsb_bytes::( config.max_bytes(), config.max_msb_len(), )); - Self { - config, - msb, - lsb, - hash_builder, - } + Self { config, msb, lsb } } - fn from_bit_chunks>( - config: C, - hash_builder: H, - chunks: I, - ) -> Self { + fn from_bit_chunks>(config: C, chunks: I) -> Self { let mut ones = iter_ones::(chunks.peekable()); let mut msb = VecDeque::default(); @@ -103,12 +83,7 @@ impl, H: GeoFilterBuildHasher> GeoDistinctCount<'_, C, H> max_lsb_bytes::(config.max_bytes(), config.max_msb_len()), ); - let result = Self { - config, - msb, - lsb, - hash_builder, - }; + let result = Self { config, msb, lsb }; result.debug_assert_invariants(); result } @@ -153,13 +128,7 @@ impl, H: GeoFilterBuildHasher> GeoDistinctCount<'_, C, H> } } -impl, H: GeoFilterBuildHasher> Count - for GeoDistinctCount<'_, C, H> -{ - fn hasher_builder(&self) -> &H { - &self.hash_builder - } - +impl> Count for GeoDistinctCount<'_, C> { fn push_hash(&mut self, hash: u64) { self.set_bit(self.config.hash_to_bucket(hash)); } @@ -205,20 +174,12 @@ impl, H: GeoFilterBuildHasher> Count } fn bytes_in_memory(&self) -> usize { - let Self { - config, - msb, - lsb, - hash_builder, - } = self; - size_of_val(config) - + msb.len() * size_of::() - + lsb.bytes_in_memory() - + size_of_val(hash_builder) + let Self { config, msb, lsb } = self; + size_of_val(config) + msb.len() * size_of::() + lsb.bytes_in_memory() } } -impl, H: GeoFilterBuildHasher> GeoDistinctCount<'_, C, H> { +impl> GeoDistinctCount<'_, C> { fn insert_into_lsb(&mut self, bucket: usize) { if !self.lsb.test_bit(bucket) { self.lsb.set_bit(bucket); @@ -247,22 +208,17 @@ impl, H: GeoFilterBuildHasher> GeoDistinctCount<'_, C, H> } } -fn or, H: GeoFilterBuildHasher>( - a: &GeoDistinctCount<'_, C, H>, - b: &GeoDistinctCount<'_, C, H>, -) -> GeoDistinctCount<'static, C, H> { +fn or>( + a: &GeoDistinctCount<'_, C>, + b: &GeoDistinctCount<'_, C>, +) -> GeoDistinctCount<'static, C> { assert!( a.config == b.config, "combined filters must have the same configuration" ); - // It would be really nice to be able to asser that our hash builders have - // the same state here, but from what I can see it is not common - // for hash builders to implement `PartialEq` or `Eq`. - - GeoDistinctCount::from_bit_chunks( + GeoDistinctCount::<'static, C>::from_bit_chunks( a.config.clone(), - a.hash_builder.clone(), or_bit_chunks(a.bit_chunks(), b.bit_chunks()).peekable(), ) } @@ -299,12 +255,12 @@ mod tests { for (n, result) in [ (10, 10.0021105), (100, 100.21153), - (1000, 1021.64075), - (10000, 11064.973), - (30000, 34484.45), - (100000, 112042.44), - (1000000, 827151.1), - (10000000, 3110472.8), + (1000, 1001.81635), + (10000, 9951.017), + (30000, 29927.705), + (100000, 99553.24), + (1000000, 1003824.1), + (10000000, 10071972.0), ] { let mut geo_count = GeoDistinctCount13::default(); (0..n).for_each(|i| geo_count.push(i)); @@ -435,11 +391,8 @@ mod tests { for _ in 0..1000 { expected.push_hash(rnd.next_u64()); } - let actual = GeoDistinctCount::from_bit_chunks( - expected.config.clone(), - expected.hash_builder.clone(), - expected.bit_chunks(), - ); + let actual = + GeoDistinctCount::from_bit_chunks(expected.config.clone(), expected.bit_chunks()); assert_eq!(expected, actual); } } @@ -453,7 +406,7 @@ mod tests { assert_eq!(vec![17, 11, 7], a.msb.iter().copied().collect_vec()); } - impl> GeoDistinctCount<'_, C, FnvBuildHasher> { + impl> GeoDistinctCount<'_, C> { fn from_ones(config: C, ones: impl IntoIterator) -> Self { let mut result = Self::new(config); for one in ones { diff --git a/crates/geo_filters/src/evaluation/hll.rs b/crates/geo_filters/src/evaluation/hll.rs index 254f749..d6210f9 100644 --- a/crates/geo_filters/src/evaluation/hll.rs +++ b/crates/geo_filters/src/evaluation/hll.rs @@ -8,7 +8,10 @@ use std::{ use hyperloglogplus::{HyperLogLog, HyperLogLogPlus}; -use crate::{Count, Distinct}; +use crate::{ + build_hasher::{DefaultBuildHasher, ReproducibleBuildHasher}, + Count, Distinct, +}; /// Uses at most 192 bytes. /// The relative error has a standard deviation of ~0.065. @@ -105,11 +108,9 @@ impl BuildHasher for BuildNoopHasher { } } -impl Count for Hll { - fn hasher_builder(&self) -> &BuildNoopHasher { - unimplemented!() - } +impl ReproducibleBuildHasher for BuildNoopHasher {} +impl Count for Hll { fn push_hash(&mut self, hash: u64) { self.inner.borrow_mut().insert(&hash) } diff --git a/crates/geo_filters/src/evaluation/simulation.rs b/crates/geo_filters/src/evaluation/simulation.rs index 1698b74..badedfe 100644 --- a/crates/geo_filters/src/evaluation/simulation.rs +++ b/crates/geo_filters/src/evaluation/simulation.rs @@ -1,7 +1,6 @@ use std::io::Write; use std::time::Instant; -use fnv::FnvBuildHasher; use itertools::Itertools; use rand::{RngCore, SeedableRng}; use rayon::prelude::{IntoParallelIterator, ParallelIterator}; @@ -21,7 +20,7 @@ pub trait SimulationCount { fn size(&self) -> f32; fn bytes_in_memory(&self) -> usize; } -impl + Clone> SimulationCount for GeoDiffCount<'_, C, FnvBuildHasher> { +impl + Clone> SimulationCount for GeoDiffCount<'_, C> { fn push_hash(&mut self, hash: u64) { >::push_hash(self, hash) } @@ -32,7 +31,7 @@ impl + Clone> SimulationCount for GeoDiffCount<'_, C, FnvBuil >::bytes_in_memory(self) } } -impl> SimulationCount for GeoDistinctCount<'_, C, FnvBuildHasher> { +impl> SimulationCount for GeoDistinctCount<'_, C> { fn push_hash(&mut self, hash: u64) { >::push_hash(self, hash) } diff --git a/crates/geo_filters/src/lib.rs b/crates/geo_filters/src/lib.rs index 6e85136..4d97e23 100644 --- a/crates/geo_filters/src/lib.rs +++ b/crates/geo_filters/src/lib.rs @@ -7,16 +7,16 @@ //! Supports estimating the size of the union of two sets with a precision related to the estimated size. //! It has some similar properties as related filters like HyperLogLog, MinHash, etc, but uses less space. +pub mod build_hasher; pub mod config; pub mod diff_count; pub mod distinct_count; #[cfg(feature = "evaluation")] pub mod evaluation; -pub mod build_hasher; use std::hash::Hash; -use crate::build_hasher::GeoFilterBuildHasher; +use crate::build_hasher::ReproducibleBuildHasher; /// Marker trait to indicate the variant implemented by a [`Count`] instance. pub trait Method: Clone + Eq + PartialEq + Send + Sync {} @@ -32,17 +32,14 @@ pub struct Distinct {} impl Method for Distinct {} /// Trait for types solving the set cardinality estimation problem. -pub trait Count { - /// Get the current hash builder - fn hasher_builder(&self) -> &H; - +pub trait Count { /// Add the given hash to the set. fn push_hash(&mut self, hash: u64); - /// Add the hash of the given item, computed with the default hasher, to the set. + /// Add the hash of the given item, computed with the configured hasher, to the set. fn push(&mut self, item: I) { - let hash_builder = self.hasher_builder(); - self.push_hash(hash_builder.hash_one(item)); + let build_hasher = H::default(); + self.push_hash(build_hasher.hash_one(item)); } /// Add the given sketch to this one. @@ -58,4 +55,4 @@ pub trait Count { /// Returns the number of bytes in memory used to represent this filter. fn bytes_in_memory(&self) -> usize; -} \ No newline at end of file +} diff --git a/crates/geo_filters/tests/public_api.rs b/crates/geo_filters/tests/public_api.rs index 5849856..5656286 100644 --- a/crates/geo_filters/tests/public_api.rs +++ b/crates/geo_filters/tests/public_api.rs @@ -1,4 +1,4 @@ -use fnv::FnvBuildHasher; +use geo_filters::build_hasher::DefaultBuildHasher; #[test] fn can_use_predefined_diff_count() { @@ -13,7 +13,7 @@ fn can_use_predefined_diff_count() { fn can_use_custom_diff_count() { use geo_filters::diff_count::{GeoDiffConfig7, GeoDiffCount}; use geo_filters::Count; - let mut f = GeoDiffCount::::default(); + let mut f = GeoDiffCount::::default(); f.push(42); f.size(); } @@ -33,7 +33,7 @@ fn can_use_diff_count_with_fixed_config_value() { use geo_filters::config::FixedConfig; use geo_filters::diff_count::GeoDiffCount; use geo_filters::Count; - let c = FixedConfig::<_, u16, 7, 128, 10>::default(); + let c = FixedConfig::<_, u16, 7, 128, 10, DefaultBuildHasher>::default(); let mut f = GeoDiffCount::new(c); f.push(42); f.size(); @@ -44,8 +44,8 @@ fn can_use_diff_count_with_variable_config_value() { use geo_filters::config::VariableConfig; use geo_filters::diff_count::GeoDiffCount; use geo_filters::Count; - let c = VariableConfig::<_, u16>::new(7, 128, 10); - let mut f = GeoDiffCount::new(c, FnvBuildHasher::default()); + let c = VariableConfig::<_, u16, DefaultBuildHasher>::new(7, 128, 10); + let mut f = GeoDiffCount::new(c); f.push(42); f.size(); } @@ -63,7 +63,7 @@ fn can_use_predefined_distinct_count() { fn can_use_custom_distinct_count() { use geo_filters::distinct_count::{GeoDistinctConfig7, GeoDistinctCount}; use geo_filters::Count; - let mut f = GeoDistinctCount::::default(); + let mut f = GeoDistinctCount::::default(); f.push(42); f.size(); } @@ -73,7 +73,7 @@ fn can_use_distinct_count_with_predefined_config_value() { use geo_filters::distinct_count::{GeoDistinctConfig7, GeoDistinctCount}; use geo_filters::Count; let c = GeoDistinctConfig7::default(); - let mut f = GeoDistinctCount::new(c, FnvBuildHasher::default()); + let mut f = GeoDistinctCount::new(c); f.push(42); f.size(); } @@ -83,8 +83,8 @@ fn can_use_distinct_count_with_fixed_config_value() { use geo_filters::config::FixedConfig; use geo_filters::distinct_count::GeoDistinctCount; use geo_filters::Count; - let c = FixedConfig::<_, u16, 7, 118, 11>::default(); - let mut f = GeoDistinctCount::new(c, FnvBuildHasher::default()); + let c = FixedConfig::<_, u16, 7, 118, 11, DefaultBuildHasher>::default(); + let mut f = GeoDistinctCount::new(c); f.push(42); f.size(); } @@ -94,8 +94,8 @@ fn can_use_distinct_count_with_variable_config_value() { use geo_filters::config::VariableConfig; use geo_filters::distinct_count::GeoDistinctCount; use geo_filters::Count; - let c = VariableConfig::<_, u16>::new(7, 118, 11); - let mut f = GeoDistinctCount::new(c, FnvBuildHasher::default()); + let c = VariableConfig::<_, u16, DefaultBuildHasher>::new(7, 118, 11); + let mut f = GeoDistinctCount::new(c); f.push(42); f.size(); } From 7f450d73aa92f0ac12ca76c006ff27430c384f45 Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Mon, 21 Jul 2025 12:53:24 +0100 Subject: [PATCH 05/12] new clippy rule --- crates/geo_filters/src/config/buckets.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/geo_filters/src/config/buckets.rs b/crates/geo_filters/src/config/buckets.rs index 0d673c0..55cac2b 100644 --- a/crates/geo_filters/src/config/buckets.rs +++ b/crates/geo_filters/src/config/buckets.rs @@ -21,8 +21,7 @@ where fn into_block(self) -> u64 { assert!( self.into_usize() < BITS_PER_BLOCK, - "position in block must be less then 64, got {:?}", - self + "position in block must be less then 64, got {self:?}", ); 1u64 << self.into_usize() } From 86e75826276b5079eba47142024ae3be9df0497b Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Mon, 21 Jul 2025 14:18:19 +0100 Subject: [PATCH 06/12] docs --- .../docs/choosing-a-hash-function.md | 113 ++++++++++++++++++ crates/geo_filters/src/build_hasher.rs | 3 + 2 files changed, 116 insertions(+) create mode 100644 crates/geo_filters/docs/choosing-a-hash-function.md diff --git a/crates/geo_filters/docs/choosing-a-hash-function.md b/crates/geo_filters/docs/choosing-a-hash-function.md new file mode 100644 index 0000000..c486238 --- /dev/null +++ b/crates/geo_filters/docs/choosing-a-hash-function.md @@ -0,0 +1,113 @@ +# Choosing a hash function + +## Reproducibility + +This library uses hash functions to assign values to buckets deterministically. The same item +will hash to the same value, and modify the same bit in the geofilter. + +When comparing geofilters it is important that the same hash functions, using the same seed +values, have been used for *both* filters. Attempting to compare geofilters which have been +produced using different hash functions or the same hash function with different seeds will +produce nonsensical results. + +Similar to the Rust standard library, this crate uses the `BuildHasher` trait and creates +a new `Hasher` for every item processed. + +To help prevent mistakes caused by mismatching hash functions or seeds we introduce a trait +`ReproducibleBuildHasher` which you must implement if you wish to use a custom hashing function. +By marking a `BuildHasher` with this trait you're asserting that `Hasher`s produced using +`Default::default` will hash identical items to the same `u64` value across multiple calls +to `BuildHasher::hash_one`. + +The following is an example of some incorrect code which produces nonsense results: + +```rust +// Implement our marker trait for `RandomState`. +// You should _NOT_ do this as `RandomState::default` does not produce +// reproducible hashers. +impl ReproducibleBuildHasher for RandomState {} + +#[test] +fn test_different_hash_functions() { + // The last parameter in this FixedConfig means we're using RandomState as the BuildHasher + pub type FixedConfigRandom = FixedConfig; + + let mut a = GeoDiffCount::new(FixedConfigRandom::default()); + let mut b = GeoDiffCount::new(FixedConfigRandom::default()); + + // Add our values + for n in 0..100 { + a.push(n); + b.push(n); + } + + // We have inserted the same items into both filters so we'd expect the + // symmetric difference to be zero if all is well. + let diff_size = a.size_with_sketch(&b); + + // But all is not well. This assertion fails! + assert_eq!(diff_size, 0.0); +} +``` + +The actual value returned in this example is ~200. This makes sense because the geofilter +thinks that there are 100 unique values in each of the filters, so the difference is approximated +as being ~200. If we were to rerun the above example with a genuinely reproducable `BuildHasher` +then the resulting diff size would be `0`. + +In debug builds, as part of the config's `eq` implementation, our library will assert that the `BuildHasher`s +produce the same `u64` value when given the same input but this is not enabled in release builds. + +## Stability + +Following from this, it might be important that your hash functions and seed values are stable, meaning, +that they won't change from one release to another. + +The default function provided in this library is *NOT* stable as it is based on the Rust standard libraries +`DefaultHasher` which does not have a specified algorithm and may change across releases of Rust. + +Stability is especially important to consider if you are using serialized geofilters which may have +been created in a previous version of the Rust standard library. + +This library provides an implementation of `ReproducibleBuildHasher` for the `FnvBuildHasher` provided +by the `fnv` crate version `1.0`. This is a _stable_ hash function in that it won't change unexpectedly +but it doesn't have good diffusion properties. This means if your input items have low entropy (for +example numbers from `0..10000`) you will find that the geofilter is not able to produce accurate estimations. + +## Uniformity and Diffusion + +In order to produce accurate estimations it is important that your hash function is able to produce evenly +distributed outputs for your input items. + +This property must be balanced against the performance requirements of your system as stronger hashing +algorithms are often slower. + +Depending on your input data, different functions may be more or less appropriate. For example, if your input +items have high entropy (e.g. SHA256 values) then the diffusion of your hash function might matter less. + +## Implementing your own `ReproducibleBuildHasher` type + +If you are using a hash function that you have not implemented yourself you will not be able to implement +`ReproducibleBuildHasher` on that type directly due to Rust's orphan rules. The easiest way to get around this +is to create a newtype which proxies the underlying `BuildHasher`. + +In addition to `BuildHasher` `ReproducibleBuildHasher` needs `Default` and `Clone`, which is usually implemented +on `BuildHasher`s, so you can probably just `#[derive(...)]` those. If your `BuildHasher` doesn't have those +traits then you may need to implement them manually. + +Here is an example of how to use new types to mark your `BuildHasher` as reproducible. + +```rust +#[derive(Clone, Default)] +pub struct MyBuildHasher(BuildHasherDefault); + +impl BuildHasher for MyBuildHasher { + type Hasher = DefaultHasher; + + fn build_hasher(&self) -> Self::Hasher { + self.0.build_hasher() + } +} + +impl ReproducibleBuildHasher for MyBuildHasher {} +``` diff --git a/crates/geo_filters/src/build_hasher.rs b/crates/geo_filters/src/build_hasher.rs index dd7e37a..5fd9775 100644 --- a/crates/geo_filters/src/build_hasher.rs +++ b/crates/geo_filters/src/build_hasher.rs @@ -1,5 +1,7 @@ use std::hash::{BuildHasher, BuildHasherDefault, DefaultHasher, Hasher as _}; +use fnv::FnvBuildHasher; + /// Trait for a hasher factory that can be used to produce hashers /// for use with geometric filters. /// @@ -33,3 +35,4 @@ pub trait ReproducibleBuildHasher: BuildHasher + Default + Clone { pub type DefaultBuildHasher = BuildHasherDefault; impl ReproducibleBuildHasher for DefaultBuildHasher {} +impl ReproducibleBuildHasher for FnvBuildHasher {} From 885d8dbc3e5a21f8de39f06ef4da6af3a13d66e7 Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Mon, 21 Jul 2025 14:36:32 +0100 Subject: [PATCH 07/12] fix test lints --- crates/geo_filters/src/config/lookup.rs | 4 ++-- crates/geo_filters/src/evaluation/simulation.rs | 4 ++-- crates/string-offsets/src/lib.rs | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/geo_filters/src/config/lookup.rs b/crates/geo_filters/src/config/lookup.rs index 1e73757..2081f72 100644 --- a/crates/geo_filters/src/config/lookup.rs +++ b/crates/geo_filters/src/config/lookup.rs @@ -54,13 +54,13 @@ mod tests { #[test] fn test_lookup_7() { let var = lookup_random_hashes_variance::<7>(1 << 16); - assert!(var < 1e-4, "variance {} too large", var); + assert!(var < 1e-4, "variance {var} too large"); } #[test] fn test_lookup_13() { let var = lookup_random_hashes_variance::<13>(1 << 16); - assert!(var < 1e-4, "variance {} too large", var); + assert!(var < 1e-4, "variance {var} too large"); } fn lookup_random_hashes_variance(n: u64) -> f64 { diff --git a/crates/geo_filters/src/evaluation/simulation.rs b/crates/geo_filters/src/evaluation/simulation.rs index badedfe..d6987c9 100644 --- a/crates/geo_filters/src/evaluation/simulation.rs +++ b/crates/geo_filters/src/evaluation/simulation.rs @@ -98,7 +98,7 @@ pub fn run_simulations( println!("Parameters:"); println!(); println!(" number of configs = {}", configs.len()); - println!(" number of samples = {}", samples); + println!(" number of samples = {samples}"); println!(" number of sets = {}", set_sizes.len()); println!(); @@ -107,7 +107,7 @@ pub fn run_simulations( let results = configs .iter() .map(|(name, f)| { - print!(" {} ... ", name); + print!(" {name} ... "); std::io::stdout().flush().expect("stdout can be flushed"); let t = Instant::now(); let result = simulate(f, samples, set_sizes); diff --git a/crates/string-offsets/src/lib.rs b/crates/string-offsets/src/lib.rs index 02a26aa..35900ad 100644 --- a/crates/string-offsets/src/lib.rs +++ b/crates/string-offsets/src/lib.rs @@ -449,7 +449,7 @@ mod tests { 0 => 0, 1..=3 => 1, 4 => 2, - _ => panic!("invalid utf8 char width: {}", len), + _ => panic!("invalid utf8 char width: {len}"), } } From be373ee6a6116b117e07982b395eeff24e4ab9a9 Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Mon, 21 Jul 2025 15:09:54 +0100 Subject: [PATCH 08/12] reworks --- .../docs/choosing-a-hash-function.md | 2 ++ crates/geo_filters/src/build_hasher.rs | 12 ++++++------ crates/geo_filters/src/config.rs | 6 ++---- crates/geo_filters/src/diff_count.rs | 8 ++++++-- crates/geo_filters/src/distinct_count.rs | 4 +++- crates/geo_filters/src/evaluation/hll.rs | 8 ++++++-- .../geo_filters/src/evaluation/simulation.rs | 18 +++++++++--------- crates/geo_filters/src/lib.rs | 8 +++++--- 8 files changed, 39 insertions(+), 27 deletions(-) diff --git a/crates/geo_filters/docs/choosing-a-hash-function.md b/crates/geo_filters/docs/choosing-a-hash-function.md index c486238..e2bb0c7 100644 --- a/crates/geo_filters/docs/choosing-a-hash-function.md +++ b/crates/geo_filters/docs/choosing-a-hash-function.md @@ -22,6 +22,8 @@ to `BuildHasher::hash_one`. The following is an example of some incorrect code which produces nonsense results: ```rust +use std::hash::RandomState; + // Implement our marker trait for `RandomState`. // You should _NOT_ do this as `RandomState::default` does not produce // reproducible hashers. diff --git a/crates/geo_filters/src/build_hasher.rs b/crates/geo_filters/src/build_hasher.rs index 5fd9775..9310ac3 100644 --- a/crates/geo_filters/src/build_hasher.rs +++ b/crates/geo_filters/src/build_hasher.rs @@ -22,16 +22,16 @@ pub trait ReproducibleBuildHasher: BuildHasher + Default + Clone { debug_assert_eq!( Self::default().build_hasher().finish(), Self::default().build_hasher().finish(), - "Hashers produced by GeoFilterBuildHasher do not produce the same output with the same input" + "Hashers produced by ReproducibleBuildHasher do not produce the same output with the same input" ); } } -// Note that this `BuildHasher` has a consistent implementation of `Default` -// but is NOT stable across releases of Rust. It is therefore dangerous -// to use if you plan on serializing the geofilters and reusing them due -// to the fact that you can serialize a filter made with one version and -// deserialize with another version of the hasher factor. +/// Note that this `BuildHasher` has a consistent implementation of `Default` +/// but is NOT stable across releases of Rust. It is therefore dangerous +/// to use if you plan on serializing the geofilters and reusing them due +/// to the fact that you can serialize a filter made with one version and +/// deserialize with another version of the hasher factor. pub type DefaultBuildHasher = BuildHasherDefault; impl ReproducibleBuildHasher for DefaultBuildHasher {} diff --git a/crates/geo_filters/src/config.rs b/crates/geo_filters/src/config.rs index f1e2fa2..d66da44 100644 --- a/crates/geo_filters/src/config.rs +++ b/crates/geo_filters/src/config.rs @@ -355,12 +355,10 @@ pub(crate) fn take_ref(iter: &mut I, n: usize) -> impl Iterator>( - f: impl Fn() -> C, - ) -> (f32, f32) { + pub(crate) fn test_estimate>(f: impl Fn() -> C) -> (f32, f32) { let mut rnd = rand::rngs::StdRng::from_os_rng(); let cnt = 10000usize; let mut avg_precision = 0.0; diff --git a/crates/geo_filters/src/diff_count.rs b/crates/geo_filters/src/diff_count.rs index f5b366d..63d945e 100644 --- a/crates/geo_filters/src/diff_count.rs +++ b/crates/geo_filters/src/diff_count.rs @@ -305,7 +305,9 @@ pub(crate) fn masked>( /// This operation corresponds to computing the symmetric difference of the two /// sets represented by the GeoDiffCounts. /// -/// SAFETY: The two GeoDiffCounts must have the same configuration and hash builder. +/// # Panics +/// +/// Panics if the configuration of the geofilters is not identical. pub(crate) fn xor>( diff_count: &GeoDiffCount<'_, C>, other: &GeoDiffCount<'_, C>, @@ -321,7 +323,9 @@ pub(crate) fn xor>( ) } -impl> Count for GeoDiffCount<'_, C> { +impl> Count for GeoDiffCount<'_, C> { + type BuildHasher = C::BuildHasher; + fn push_hash(&mut self, hash: u64) { self.xor_bit(self.config.hash_to_bucket(hash)); } diff --git a/crates/geo_filters/src/distinct_count.rs b/crates/geo_filters/src/distinct_count.rs index cdb68f9..0cff6b6 100644 --- a/crates/geo_filters/src/distinct_count.rs +++ b/crates/geo_filters/src/distinct_count.rs @@ -128,7 +128,9 @@ impl> GeoDistinctCount<'_, C> { } } -impl> Count for GeoDistinctCount<'_, C> { +impl> Count for GeoDistinctCount<'_, C> { + type BuildHasher = C::BuildHasher; + fn push_hash(&mut self, hash: u64) { self.set_bit(self.config.hash_to_bucket(hash)); } diff --git a/crates/geo_filters/src/evaluation/hll.rs b/crates/geo_filters/src/evaluation/hll.rs index d6210f9..1284e73 100644 --- a/crates/geo_filters/src/evaluation/hll.rs +++ b/crates/geo_filters/src/evaluation/hll.rs @@ -91,7 +91,9 @@ impl Hasher for NoopHasher { #[inline] fn write(&mut self, _: &[u8]) { - unimplemented!("NoopHasher only supports writing u64 values"); + unimplemented!( + "NoopHasher does not support arbitrary byte sequences. Use write_u64 instead" + ); } #[inline] @@ -110,7 +112,9 @@ impl BuildHasher for BuildNoopHasher { impl ReproducibleBuildHasher for BuildNoopHasher {} -impl Count for Hll { +impl Count for Hll { + type BuildHasher = DefaultBuildHasher; + fn push_hash(&mut self, hash: u64) { self.inner.borrow_mut().insert(&hash) } diff --git a/crates/geo_filters/src/evaluation/simulation.rs b/crates/geo_filters/src/evaluation/simulation.rs index d6987c9..eedb7d7 100644 --- a/crates/geo_filters/src/evaluation/simulation.rs +++ b/crates/geo_filters/src/evaluation/simulation.rs @@ -22,35 +22,35 @@ pub trait SimulationCount { } impl + Clone> SimulationCount for GeoDiffCount<'_, C> { fn push_hash(&mut self, hash: u64) { - >::push_hash(self, hash) + >::push_hash(self, hash) } fn size(&self) -> f32 { - >::size(self) + >::size(self) } fn bytes_in_memory(&self) -> usize { - >::bytes_in_memory(self) + >::bytes_in_memory(self) } } impl> SimulationCount for GeoDistinctCount<'_, C> { fn push_hash(&mut self, hash: u64) { - >::push_hash(self, hash) + >::push_hash(self, hash) } fn size(&self) -> f32 { - >::size(self) + >::size(self) } fn bytes_in_memory(&self) -> usize { - >::bytes_in_memory(self) + >::bytes_in_memory(self) } } impl SimulationCount for Hll { fn push_hash(&mut self, hash: u64) { - >::push_hash(self, hash) + >::push_hash(self, hash) } fn size(&self) -> f32 { - >::size(self) + >::size(self) } fn bytes_in_memory(&self) -> usize { - >::bytes_in_memory(self) + >::bytes_in_memory(self) } } diff --git a/crates/geo_filters/src/lib.rs b/crates/geo_filters/src/lib.rs index 4d97e23..13cf5bf 100644 --- a/crates/geo_filters/src/lib.rs +++ b/crates/geo_filters/src/lib.rs @@ -14,7 +14,7 @@ pub mod distinct_count; #[cfg(feature = "evaluation")] pub mod evaluation; -use std::hash::Hash; +use std::hash::{BuildHasher as _, Hash}; use crate::build_hasher::ReproducibleBuildHasher; @@ -32,13 +32,15 @@ pub struct Distinct {} impl Method for Distinct {} /// Trait for types solving the set cardinality estimation problem. -pub trait Count { +pub trait Count { + type BuildHasher: ReproducibleBuildHasher; + /// Add the given hash to the set. fn push_hash(&mut self, hash: u64); /// Add the hash of the given item, computed with the configured hasher, to the set. fn push(&mut self, item: I) { - let build_hasher = H::default(); + let build_hasher = Self::BuildHasher::default(); self.push_hash(build_hasher.hash_one(item)); } From 4eb1f73eaf6ca473bf4799fd83b9adbe3cd02667 Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Mon, 21 Jul 2025 15:43:23 +0100 Subject: [PATCH 09/12] Further reworks - Parametize config type aliases - Remove dependency on BuildHasher from Count - Add readme test - Clarify in type alias that default build hasher is unstable --- crates/geo_filters/evaluation/accuracy.rs | 24 +++++++++----- crates/geo_filters/evaluation/performance.rs | 8 ++--- crates/geo_filters/src/build_hasher.rs | 4 +-- crates/geo_filters/src/diff_count.rs | 12 ++++--- crates/geo_filters/src/diff_count/config.rs | 12 +++---- crates/geo_filters/src/distinct_count.rs | 15 ++++++--- .../geo_filters/src/distinct_count/config.rs | 12 ++++--- crates/geo_filters/src/evaluation/hll.rs | 9 +++-- crates/geo_filters/src/lib.rs | 11 ++----- crates/geo_filters/tests/public_api.rs | 33 +++++++++++++++---- 10 files changed, 86 insertions(+), 54 deletions(-) diff --git a/crates/geo_filters/evaluation/accuracy.rs b/crates/geo_filters/evaluation/accuracy.rs index 6ae6df3..6248f07 100644 --- a/crates/geo_filters/evaluation/accuracy.rs +++ b/crates/geo_filters/evaluation/accuracy.rs @@ -2,7 +2,7 @@ use std::fs::File; use std::path::PathBuf; use clap::Parser; -use geo_filters::build_hasher::DefaultBuildHasher; +use geo_filters::build_hasher::UnstableDefaultBuildHasher; use geo_filters::config::VariableConfig; use itertools::Itertools; use once_cell::sync::Lazy; @@ -157,19 +157,22 @@ static SIMULATION_CONFIG_FROM_STR: Lazy> = Lazy::new let [b, bytes, msb] = capture_usizes(&c, [2, 3, 4]); match t { BucketType::U8 => { - let c = VariableConfig::<_, u8, DefaultBuildHasher>::new(b, bytes, msb); + let c = VariableConfig::<_, u8, UnstableDefaultBuildHasher>::new(b, bytes, msb); Box::new(move || Box::new(GeoDiffCount::new(c.clone()))) } BucketType::U16 => { - let c = VariableConfig::<_, u16, DefaultBuildHasher>::new(b, bytes, msb); + let c = + VariableConfig::<_, u16, UnstableDefaultBuildHasher>::new(b, bytes, msb); Box::new(move || Box::new(GeoDiffCount::new(c.clone()))) } BucketType::U32 => { - let c = VariableConfig::<_, u32, DefaultBuildHasher>::new(b, bytes, msb); + let c = + VariableConfig::<_, u32, UnstableDefaultBuildHasher>::new(b, bytes, msb); Box::new(move || Box::new(GeoDiffCount::new(c.clone()))) } BucketType::U64 => { - let c = VariableConfig::<_, u64, DefaultBuildHasher>::new(b, bytes, msb); + let c = + VariableConfig::<_, u64, UnstableDefaultBuildHasher>::new(b, bytes, msb); Box::new(move || Box::new(GeoDiffCount::new(c.clone()))) } } @@ -186,19 +189,22 @@ static SIMULATION_CONFIG_FROM_STR: Lazy> = Lazy::new match t { BucketType::U8 => { - let c = VariableConfig::<_, u8, DefaultBuildHasher>::new(b, bytes, msb); + let c = VariableConfig::<_, u8, UnstableDefaultBuildHasher>::new(b, bytes, msb); Box::new(move || Box::new(GeoDistinctCount::new(c.clone()))) } BucketType::U16 => { - let c = VariableConfig::<_, u16, DefaultBuildHasher>::new(b, bytes, msb); + let c = + VariableConfig::<_, u16, UnstableDefaultBuildHasher>::new(b, bytes, msb); Box::new(move || Box::new(GeoDistinctCount::new(c.clone()))) } BucketType::U32 => { - let c = VariableConfig::<_, u32, DefaultBuildHasher>::new(b, bytes, msb); + let c = + VariableConfig::<_, u32, UnstableDefaultBuildHasher>::new(b, bytes, msb); Box::new(move || Box::new(GeoDistinctCount::new(c.clone()))) } BucketType::U64 => { - let c = VariableConfig::<_, u64, DefaultBuildHasher>::new(b, bytes, msb); + let c = + VariableConfig::<_, u64, UnstableDefaultBuildHasher>::new(b, bytes, msb); Box::new(move || Box::new(GeoDistinctCount::new(c.clone()))) } } diff --git a/crates/geo_filters/evaluation/performance.rs b/crates/geo_filters/evaluation/performance.rs index 947eecb..31a3fee 100644 --- a/crates/geo_filters/evaluation/performance.rs +++ b/crates/geo_filters/evaluation/performance.rs @@ -1,5 +1,5 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use geo_filters::build_hasher::DefaultBuildHasher; +use geo_filters::build_hasher::UnstableDefaultBuildHasher; use geo_filters::config::VariableConfig; use geo_filters::diff_count::{GeoDiffCount, GeoDiffCount13}; use geo_filters::distinct_count::GeoDistinctCount13; @@ -21,7 +21,7 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); group.bench_function("geo_diff_count_var_13", |b| { - let c = VariableConfig::<_, u32, DefaultBuildHasher>::new(13, 7680, 256); + let c = VariableConfig::<_, u32, UnstableDefaultBuildHasher>::new(13, 7680, 256); b.iter(move || { let mut gc = GeoDiffCount::new(c.clone()); for i in 0..*size { @@ -60,7 +60,7 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); group.bench_function("geo_diff_count_var_13", |b| { - let c = VariableConfig::<_, u32, DefaultBuildHasher>::new(13, 7680, 256); + let c = VariableConfig::<_, u32, UnstableDefaultBuildHasher>::new(13, 7680, 256); b.iter(move || { let mut gc = GeoDiffCount::new(c.clone()); for i in 0..*size { @@ -105,7 +105,7 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); group.bench_function("geo_diff_count_var_13", |b| { - let c = VariableConfig::<_, u32, DefaultBuildHasher>::new(13, 7680, 256); + let c = VariableConfig::<_, u32, UnstableDefaultBuildHasher>::new(13, 7680, 256); b.iter(move || { let mut gc1 = GeoDiffCount::new(c.clone()); let mut gc2 = GeoDiffCount::new(c.clone()); diff --git a/crates/geo_filters/src/build_hasher.rs b/crates/geo_filters/src/build_hasher.rs index 9310ac3..0509bfd 100644 --- a/crates/geo_filters/src/build_hasher.rs +++ b/crates/geo_filters/src/build_hasher.rs @@ -32,7 +32,7 @@ pub trait ReproducibleBuildHasher: BuildHasher + Default + Clone { /// to use if you plan on serializing the geofilters and reusing them due /// to the fact that you can serialize a filter made with one version and /// deserialize with another version of the hasher factor. -pub type DefaultBuildHasher = BuildHasherDefault; +pub type UnstableDefaultBuildHasher = BuildHasherDefault; -impl ReproducibleBuildHasher for DefaultBuildHasher {} +impl ReproducibleBuildHasher for UnstableDefaultBuildHasher {} impl ReproducibleBuildHasher for FnvBuildHasher {} diff --git a/crates/geo_filters/src/diff_count.rs b/crates/geo_filters/src/diff_count.rs index 63d945e..7e784c6 100644 --- a/crates/geo_filters/src/diff_count.rs +++ b/crates/geo_filters/src/diff_count.rs @@ -2,6 +2,7 @@ use std::borrow::Cow; use std::cmp::Ordering; +use std::hash::BuildHasher as _; use std::mem::{size_of, size_of_val}; use crate::config::{ @@ -324,12 +325,15 @@ pub(crate) fn xor>( } impl> Count for GeoDiffCount<'_, C> { - type BuildHasher = C::BuildHasher; - fn push_hash(&mut self, hash: u64) { self.xor_bit(self.config.hash_to_bucket(hash)); } + fn push(&mut self, item: I) { + let build_hasher = C::BuildHasher::default(); + self.push_hash(build_hasher.hash_one(item)); + } + fn push_sketch(&mut self, other: &Self) { *self = xor(self, other); } @@ -359,7 +363,7 @@ mod tests { use rand::{RngCore, SeedableRng}; use crate::{ - build_hasher::DefaultBuildHasher, + build_hasher::UnstableDefaultBuildHasher, config::{iter_ones, tests::test_estimate, FixedConfig}, }; @@ -371,7 +375,7 @@ mod tests { // scripts/accuracy -n 10000 geo_diff/u16/b=7/bytes=50/msb=10 // type GeoDiffCount7_50<'a> = - GeoDiffCount<'a, FixedConfig>; + GeoDiffCount<'a, FixedConfig>; #[test] fn test_geo_count() { diff --git a/crates/geo_filters/src/diff_count/config.rs b/crates/geo_filters/src/diff_count/config.rs index ecb4dc9..365c04d 100644 --- a/crates/geo_filters/src/diff_count/config.rs +++ b/crates/geo_filters/src/diff_count/config.rs @@ -1,6 +1,6 @@ use once_cell::sync::Lazy; -use crate::build_hasher::DefaultBuildHasher; +use crate::build_hasher::UnstableDefaultBuildHasher; use crate::config::EstimationLookup; use crate::config::FixedConfig; use crate::config::HashToBucketLookup; @@ -18,7 +18,7 @@ use crate::Diff; // // scripts/accuracy -n 10000 geo_diff/u16/b=7/bytes=112/msb={8,12,16,20} // -pub type GeoDiffConfig7 = FixedConfig; +pub type GeoDiffConfig7 = FixedConfig; /// Diff count configuration with a relative error standard deviation of ~0.015. // @@ -30,7 +30,7 @@ pub type GeoDiffConfig7 = FixedConfig // // scripts/accuracy -n 1000 geo_diff/u32/b=13/bytes=7138/msb={128,192,256,384,512} // -pub type GeoDiffConfig13 = FixedConfig; +pub type GeoDiffConfig13 = FixedConfig; impl Lookups for Diff { #[inline] @@ -125,7 +125,7 @@ mod tests { #[test] fn test_bit_from_hash() { - let config = GeoDiffConfig7::default(); + let config = GeoDiffConfig7::::default(); assert_eq!(config.hash_to_bucket(u64::MAX), 0); assert_eq!( config.hash_to_bucket(0) as usize, @@ -170,7 +170,7 @@ mod tests { #[test] fn test_estimation_lut_7() { - let c = GeoDiffConfig7::default(); + let c = GeoDiffConfig7::::default(); let err = (0..600) .step_by(1) .map(|i| { @@ -189,7 +189,7 @@ mod tests { #[test] fn test_estimation_lut_13() { - let c = GeoDiffConfig13::default(); + let c = GeoDiffConfig13::::default(); let err = (0..24000) .step_by(100) .map(|i| { diff --git a/crates/geo_filters/src/distinct_count.rs b/crates/geo_filters/src/distinct_count.rs index 0cff6b6..8c2465c 100644 --- a/crates/geo_filters/src/distinct_count.rs +++ b/crates/geo_filters/src/distinct_count.rs @@ -1,6 +1,7 @@ //! Geometric filter implementation for distinct count. use std::collections::VecDeque; +use std::hash::BuildHasher as _; use std::mem::{size_of, size_of_val}; use crate::config::{ @@ -129,12 +130,15 @@ impl> GeoDistinctCount<'_, C> { } impl> Count for GeoDistinctCount<'_, C> { - type BuildHasher = C::BuildHasher; - fn push_hash(&mut self, hash: u64) { self.set_bit(self.config.hash_to_bucket(hash)); } + fn push(&mut self, item: I) { + let build_hasher = C::BuildHasher::default(); + self.push_hash(build_hasher.hash_one(item)); + } + fn push_sketch(&mut self, other: &Self) { *self = or(self, other) } @@ -230,7 +234,7 @@ mod tests { use itertools::Itertools; use rand::{RngCore, SeedableRng}; - use crate::build_hasher::DefaultBuildHasher; + use crate::build_hasher::UnstableDefaultBuildHasher; use crate::config::{iter_ones, tests::test_estimate, FixedConfig, VariableConfig}; use crate::evaluation::simulation::simulate; @@ -238,7 +242,8 @@ mod tests { #[test] fn test_lookup_table() { - let c = FixedConfig::::default(); + let c = + FixedConfig::::default(); for i in 0..c.max_bytes() * 4 { let hash = (c.phi_f64().powf(i as f64 + 0.5) * u64::MAX as f64).round() as u64; let a = c.hash_to_bucket(hash); @@ -370,7 +375,7 @@ mod tests { Box::new(GeoDistinctCount::new(VariableConfig::< _, u32, - DefaultBuildHasher, + UnstableDefaultBuildHasher, >::new( 13, 7800, diff --git a/crates/geo_filters/src/distinct_count/config.rs b/crates/geo_filters/src/distinct_count/config.rs index 8118c91..1ac5163 100644 --- a/crates/geo_filters/src/distinct_count/config.rs +++ b/crates/geo_filters/src/distinct_count/config.rs @@ -1,6 +1,6 @@ use once_cell::sync::Lazy; -use crate::build_hasher::DefaultBuildHasher; +use crate::build_hasher::UnstableDefaultBuildHasher; use crate::config::EstimationLookup; use crate::config::FixedConfig; use crate::config::HashToBucketLookup; @@ -19,7 +19,8 @@ use crate::Distinct; // // scripts/accuracy -n 10000 geo_distinct/u16/b=7/bytes=136/msb={8,16,32,64} // -pub type GeoDistinctConfig7 = FixedConfig; +pub type GeoDistinctConfig7 = + FixedConfig; /// Distinct count configuration with a relative error standard deviation of ~0.0075. /// Uses at most 9248 bytes of memory. @@ -32,7 +33,8 @@ pub type GeoDistinctConfig7 = FixedConfig; +pub type GeoDistinctConfig13 = + FixedConfig; impl Lookups for Distinct { #[inline] @@ -108,7 +110,7 @@ mod tests { #[test] fn test_estimation_lut_7() { - let c = GeoDistinctConfig7::default(); + let c = GeoDistinctConfig7::::default(); let err = (0..600) .step_by(1) .map(|i| { @@ -127,7 +129,7 @@ mod tests { #[test] fn test_estimation_lut_13() { - let c = GeoDistinctConfig13::default(); + let c = GeoDistinctConfig13::::default(); let err = (0..24000) .step_by(1) .map(|i| { diff --git a/crates/geo_filters/src/evaluation/hll.rs b/crates/geo_filters/src/evaluation/hll.rs index 1284e73..8c506c0 100644 --- a/crates/geo_filters/src/evaluation/hll.rs +++ b/crates/geo_filters/src/evaluation/hll.rs @@ -9,7 +9,7 @@ use std::{ use hyperloglogplus::{HyperLogLog, HyperLogLogPlus}; use crate::{ - build_hasher::{DefaultBuildHasher, ReproducibleBuildHasher}, + build_hasher::{ReproducibleBuildHasher, UnstableDefaultBuildHasher}, Count, Distinct, }; @@ -113,12 +113,15 @@ impl BuildHasher for BuildNoopHasher { impl ReproducibleBuildHasher for BuildNoopHasher {} impl Count for Hll { - type BuildHasher = DefaultBuildHasher; - fn push_hash(&mut self, hash: u64) { self.inner.borrow_mut().insert(&hash) } + fn push(&mut self, item: I) { + let build_hasher = UnstableDefaultBuildHasher::default(); + self.push_hash(build_hasher.hash_one(item)); + } + fn push_sketch(&mut self, _other: &Self) { unimplemented!() } diff --git a/crates/geo_filters/src/lib.rs b/crates/geo_filters/src/lib.rs index 13cf5bf..8ec8fb3 100644 --- a/crates/geo_filters/src/lib.rs +++ b/crates/geo_filters/src/lib.rs @@ -14,9 +14,7 @@ pub mod distinct_count; #[cfg(feature = "evaluation")] pub mod evaluation; -use std::hash::{BuildHasher as _, Hash}; - -use crate::build_hasher::ReproducibleBuildHasher; +use std::hash::Hash; /// Marker trait to indicate the variant implemented by a [`Count`] instance. pub trait Method: Clone + Eq + PartialEq + Send + Sync {} @@ -33,16 +31,11 @@ impl Method for Distinct {} /// Trait for types solving the set cardinality estimation problem. pub trait Count { - type BuildHasher: ReproducibleBuildHasher; - /// Add the given hash to the set. fn push_hash(&mut self, hash: u64); /// Add the hash of the given item, computed with the configured hasher, to the set. - fn push(&mut self, item: I) { - let build_hasher = Self::BuildHasher::default(); - self.push_hash(build_hasher.hash_one(item)); - } + fn push(&mut self, item: I); /// Add the given sketch to this one. /// If only the size of the combined set is needed, [`Self::size_with_sketch`] is more efficient and should be used. diff --git a/crates/geo_filters/tests/public_api.rs b/crates/geo_filters/tests/public_api.rs index 5656286..82d435d 100644 --- a/crates/geo_filters/tests/public_api.rs +++ b/crates/geo_filters/tests/public_api.rs @@ -1,4 +1,23 @@ -use geo_filters::build_hasher::DefaultBuildHasher; +use geo_filters::build_hasher::UnstableDefaultBuildHasher; + +// Mirrors what we have in the README. +// If this breaks then it should be fixed an the readme updated. +#[test] +fn readme() { + use geo_filters::distinct_count::GeoDistinctCount13; + use geo_filters::Count; + + let mut c1 = GeoDistinctCount13::default(); + c1.push(1); + c1.push(2); + + let mut c2 = GeoDistinctCount13::default(); + c2.push(2); + c2.push(3); + + let estimated_size = c1.size_with_sketch(&c2); + assert!(estimated_size >= 3.0_f32 * 0.9 && estimated_size <= 3.0_f32 * 1.1); +} #[test] fn can_use_predefined_diff_count() { @@ -22,7 +41,7 @@ fn can_use_custom_diff_count() { fn can_use_diff_count_with_predefined_config_value() { use geo_filters::diff_count::{GeoDiffConfig7, GeoDiffCount}; use geo_filters::Count; - let c = GeoDiffConfig7::default(); + let c = GeoDiffConfig7::::default(); let mut f = GeoDiffCount::new(c); f.push(42); f.size(); @@ -33,7 +52,7 @@ fn can_use_diff_count_with_fixed_config_value() { use geo_filters::config::FixedConfig; use geo_filters::diff_count::GeoDiffCount; use geo_filters::Count; - let c = FixedConfig::<_, u16, 7, 128, 10, DefaultBuildHasher>::default(); + let c = FixedConfig::<_, u16, 7, 128, 10, UnstableDefaultBuildHasher>::default(); let mut f = GeoDiffCount::new(c); f.push(42); f.size(); @@ -44,7 +63,7 @@ fn can_use_diff_count_with_variable_config_value() { use geo_filters::config::VariableConfig; use geo_filters::diff_count::GeoDiffCount; use geo_filters::Count; - let c = VariableConfig::<_, u16, DefaultBuildHasher>::new(7, 128, 10); + let c = VariableConfig::<_, u16, UnstableDefaultBuildHasher>::new(7, 128, 10); let mut f = GeoDiffCount::new(c); f.push(42); f.size(); @@ -72,7 +91,7 @@ fn can_use_custom_distinct_count() { fn can_use_distinct_count_with_predefined_config_value() { use geo_filters::distinct_count::{GeoDistinctConfig7, GeoDistinctCount}; use geo_filters::Count; - let c = GeoDistinctConfig7::default(); + let c = GeoDistinctConfig7::::default(); let mut f = GeoDistinctCount::new(c); f.push(42); f.size(); @@ -83,7 +102,7 @@ fn can_use_distinct_count_with_fixed_config_value() { use geo_filters::config::FixedConfig; use geo_filters::distinct_count::GeoDistinctCount; use geo_filters::Count; - let c = FixedConfig::<_, u16, 7, 118, 11, DefaultBuildHasher>::default(); + let c = FixedConfig::<_, u16, 7, 118, 11, UnstableDefaultBuildHasher>::default(); let mut f = GeoDistinctCount::new(c); f.push(42); f.size(); @@ -94,7 +113,7 @@ fn can_use_distinct_count_with_variable_config_value() { use geo_filters::config::VariableConfig; use geo_filters::distinct_count::GeoDistinctCount; use geo_filters::Count; - let c = VariableConfig::<_, u16, DefaultBuildHasher>::new(7, 118, 11); + let c = VariableConfig::<_, u16, UnstableDefaultBuildHasher>::new(7, 118, 11); let mut f = GeoDistinctCount::new(c); f.push(42); f.size(); From 132a244da96ec978586e2ebcfb2050542d982e9b Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Mon, 21 Jul 2025 16:17:01 +0100 Subject: [PATCH 10/12] more clippy --- crates/geo_filters/tests/public_api.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/geo_filters/tests/public_api.rs b/crates/geo_filters/tests/public_api.rs index 82d435d..8a2de2e 100644 --- a/crates/geo_filters/tests/public_api.rs +++ b/crates/geo_filters/tests/public_api.rs @@ -16,7 +16,7 @@ fn readme() { c2.push(3); let estimated_size = c1.size_with_sketch(&c2); - assert!(estimated_size >= 3.0_f32 * 0.9 && estimated_size <= 3.0_f32 * 1.1); + assert!((3.0_f32 * 0.9..=3.0_f32 * 1.1).contains(&estimated_size)); } #[test] From ea774b51dda476608c52f4a52e08ac4e342ce8b9 Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Mon, 21 Jul 2025 16:58:54 +0100 Subject: [PATCH 11/12] remove readme test and make readme doc tests --- crates/geo_filters/src/lib.rs | 4 ++++ crates/geo_filters/tests/public_api.rs | 19 ------------------- 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/crates/geo_filters/src/lib.rs b/crates/geo_filters/src/lib.rs index 8ec8fb3..c810877 100644 --- a/crates/geo_filters/src/lib.rs +++ b/crates/geo_filters/src/lib.rs @@ -51,3 +51,7 @@ pub trait Count { /// Returns the number of bytes in memory used to represent this filter. fn bytes_in_memory(&self) -> usize; } + +#[doc = include_str!("../README.md")] +#[cfg(doctest)] +pub struct ReadmeDocTests; diff --git a/crates/geo_filters/tests/public_api.rs b/crates/geo_filters/tests/public_api.rs index 8a2de2e..4afff72 100644 --- a/crates/geo_filters/tests/public_api.rs +++ b/crates/geo_filters/tests/public_api.rs @@ -1,24 +1,5 @@ use geo_filters::build_hasher::UnstableDefaultBuildHasher; -// Mirrors what we have in the README. -// If this breaks then it should be fixed an the readme updated. -#[test] -fn readme() { - use geo_filters::distinct_count::GeoDistinctCount13; - use geo_filters::Count; - - let mut c1 = GeoDistinctCount13::default(); - c1.push(1); - c1.push(2); - - let mut c2 = GeoDistinctCount13::default(); - c2.push(2); - c2.push(3); - - let estimated_size = c1.size_with_sketch(&c2); - assert!((3.0_f32 * 0.9..=3.0_f32 * 1.1).contains(&estimated_size)); -} - #[test] fn can_use_predefined_diff_count() { use geo_filters::diff_count::GeoDiffCount7; From b9b0baec3ea1ce0ed2f8dfa3ec71324d0a9301e6 Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Mon, 21 Jul 2025 17:16:33 +0100 Subject: [PATCH 12/12] fix spelling mistake --- crates/geo_filters/docs/choosing-a-hash-function.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/geo_filters/docs/choosing-a-hash-function.md b/crates/geo_filters/docs/choosing-a-hash-function.md index e2bb0c7..5115eeb 100644 --- a/crates/geo_filters/docs/choosing-a-hash-function.md +++ b/crates/geo_filters/docs/choosing-a-hash-function.md @@ -54,7 +54,7 @@ fn test_different_hash_functions() { The actual value returned in this example is ~200. This makes sense because the geofilter thinks that there are 100 unique values in each of the filters, so the difference is approximated -as being ~200. If we were to rerun the above example with a genuinely reproducable `BuildHasher` +as being ~200. If we were to rerun the above example with a genuinely reproducible `BuildHasher` then the resulting diff size would be `0`. In debug builds, as part of the config's `eq` implementation, our library will assert that the `BuildHasher`s