From ebc338c5de8faf64979477d9a62ab1fae100d2ad Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Wed, 30 Jul 2025 20:13:28 +0100 Subject: [PATCH 1/7] Add min_matches method to GeoDiffCount --- crates/geo_filters/src/diff_count/sim_hash.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/crates/geo_filters/src/diff_count/sim_hash.rs b/crates/geo_filters/src/diff_count/sim_hash.rs index dc8ff5b..8b1978a 100644 --- a/crates/geo_filters/src/diff_count/sim_hash.rs +++ b/crates/geo_filters/src/diff_count/sim_hash.rs @@ -49,6 +49,11 @@ impl SimHash { } impl> GeoDiffCount<'_, C> { + /// TODO document, and maybe get better name + pub fn min_matches(&self) -> usize { + SIM_BUCKETS / 2 + } + /// Given the expected size of a diff, this function returns the range of bucket ids which should /// be searched for in order to find geometric filters of the desired similarity. If at least half /// of the buckets in the range match, one found a match that has the expected diff size (or better). From 04e1d127873b681dc7ac2acd58e3e16e88088bd7 Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Wed, 30 Jul 2025 20:15:29 +0100 Subject: [PATCH 2/7] Restrict sim_hash constants to module visibility --- crates/geo_filters/src/diff_count.rs | 2 +- crates/geo_filters/src/diff_count/sim_hash.rs | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/geo_filters/src/diff_count.rs b/crates/geo_filters/src/diff_count.rs index 8c5c4bf..6b2d04c 100644 --- a/crates/geo_filters/src/diff_count.rs +++ b/crates/geo_filters/src/diff_count.rs @@ -18,7 +18,7 @@ mod sim_hash; use bitvec::*; pub use config::{GeoDiffConfig13, GeoDiffConfig7}; -pub use sim_hash::{SimHash, SIM_BUCKETS, SIM_BUCKET_SIZE}; +pub use sim_hash::SimHash; /// Diff count filter with a relative error standard deviation of ~0.125. pub type GeoDiffCount7<'a> = GeoDiffCount<'a, GeoDiffConfig7>; diff --git a/crates/geo_filters/src/diff_count/sim_hash.rs b/crates/geo_filters/src/diff_count/sim_hash.rs index 8b1978a..c10e4f1 100644 --- a/crates/geo_filters/src/diff_count/sim_hash.rs +++ b/crates/geo_filters/src/diff_count/sim_hash.rs @@ -12,13 +12,13 @@ use crate::Diff; use super::BitVec; // TODO migrate these const values to be defined in configuration -// The current values are only really appropriate for smaller -// configurations +// The current values are only really appropriate for the smaller +// diff configuration. /// Number of bits covered by each SimHash bucket. -pub const SIM_BUCKET_SIZE: usize = 6; +const SIM_BUCKET_SIZE: usize = 6; /// Number of consecutive SimHash buckets used for searching. -pub const SIM_BUCKETS: usize = 20; +const SIM_BUCKETS: usize = 20; pub type BucketId = usize; From 10591b0ba6bdd2b985ad36849330030eba833811 Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Fri, 1 Aug 2025 13:45:32 +0100 Subject: [PATCH 3/7] Refactor sim_hashes_search to return min_matches and iterator --- crates/geo_filters/src/diff_count/sim_hash.rs | 35 ++++++++++++++----- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/crates/geo_filters/src/diff_count/sim_hash.rs b/crates/geo_filters/src/diff_count/sim_hash.rs index c10e4f1..9e38d1e 100644 --- a/crates/geo_filters/src/diff_count/sim_hash.rs +++ b/crates/geo_filters/src/diff_count/sim_hash.rs @@ -49,11 +49,6 @@ impl SimHash { } impl> GeoDiffCount<'_, C> { - /// TODO document, and maybe get better name - pub fn min_matches(&self) -> usize { - SIM_BUCKETS / 2 - } - /// Given the expected size of a diff, this function returns the range of bucket ids which should /// be searched for in order to find geometric filters of the desired similarity. If at least half /// of the buckets in the range match, one found a match that has the expected diff size (or better). @@ -82,7 +77,7 @@ impl> GeoDiffCount<'_, C> { /// The first argument in the tuple is the bucket id of the `SimHash` which can be used /// to select a certain subset of `SimHashes`. SimHashes are returned in decreasing order /// of bucket ids, since that's their natural construction order. - pub fn sim_hashes(&self) -> impl Iterator + '_ { + pub fn sim_hashes(&self) -> impl ExactSizeIterator + '_ { SimHashIterator::new(self) } @@ -94,15 +89,31 @@ impl> GeoDiffCount<'_, C> { .map(|(_, sim_hash)| sim_hash) } + /// Get the `SimHash`es for this filter for the purpose of performing a search. + /// + /// Returns an iterator of the `SimHash`es and a number representing the minimum number + /// of matches required to consider this filter a match to a given filter, given + /// the expected diff size. + /// + /// The geo_filter can be used to do an "exact" search by setting expected_diff_size to zero. + /// in this case, all the buckets must match. Similarly, small differences can be found by + /// requiring (SIM_BUCKETS - expected_diff_size) many buckets to match. For larger differences + /// SIM_BUCKETS / 2 many buckets have to match. pub fn sim_hashes_search( &self, expected_diff_size: usize, - ) -> impl Iterator + '_ { + ) -> (impl Iterator + '_, usize) { let range = self.sim_hash_range(expected_diff_size); - self.sim_hashes() + let sim_hash_iter = self.sim_hashes(); + let min_matches = sim_hash_iter + .len() + .saturating_sub(expected_diff_size) + .max(SIM_BUCKETS / 2); + let filtered_iter = sim_hash_iter .skip_while(move |(bucket_id, _)| *bucket_id >= range.end) .take_while(move |(bucket_id, _)| *bucket_id >= range.start) - .map(|(_, sim_hash)| sim_hash) + .map(|(_, sim_hash)| sim_hash); + (filtered_iter, min_matches) } } @@ -157,8 +168,14 @@ impl> Iterator for SimHashIterator<'_, C> { SimHash::new(self.prev_bucket_id, self.sim_hash[bucket]), )) } + + fn size_hint(&self) -> (usize, Option) { + (self.prev_bucket_id, Some(self.prev_bucket_id)) + } } +impl> ExactSizeIterator for SimHashIterator<'_, C> {} + impl> GeoDiffCount<'_, C> { /// n specifies the desired zero-based index of the most significant one. /// The zero-based index of the desired one bit is returned. From 04b84be0b72089eabd5c13e6ad0ad7b840db24f3 Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Fri, 1 Aug 2025 14:03:12 +0100 Subject: [PATCH 4/7] Fix sim_hashes_search filtering and add unit test --- crates/geo_filters/src/diff_count/sim_hash.rs | 31 +++++++++++++++++-- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/crates/geo_filters/src/diff_count/sim_hash.rs b/crates/geo_filters/src/diff_count/sim_hash.rs index 9e38d1e..747777d 100644 --- a/crates/geo_filters/src/diff_count/sim_hash.rs +++ b/crates/geo_filters/src/diff_count/sim_hash.rs @@ -104,12 +104,12 @@ impl> GeoDiffCount<'_, C> { expected_diff_size: usize, ) -> (impl Iterator + '_, usize) { let range = self.sim_hash_range(expected_diff_size); - let sim_hash_iter = self.sim_hashes(); - let min_matches = sim_hash_iter + let min_matches = range .len() .saturating_sub(expected_diff_size) .max(SIM_BUCKETS / 2); - let filtered_iter = sim_hash_iter + let filtered_iter = self + .sim_hashes() .skip_while(move |(bucket_id, _)| *bucket_id >= range.end) .take_while(move |(bucket_id, _)| *bucket_id >= range.start) .map(|(_, sim_hash)| sim_hash); @@ -222,3 +222,28 @@ impl BitVec<'_> { None } } + +#[cfg(test)] +mod tests { + use rand::Rng as _; + + use crate::{ + diff_count::{sim_hash::SIM_BUCKETS, GeoDiffCount7}, + test_rng::prng_test_harness, + }; + + #[test] + fn sim_hash_iter_min_matches() { + prng_test_harness(100, |rng| { + let i = rng.random_range(0..1000); + let filter = GeoDiffCount7::pseudorandom_filter(i); + let expected_diff = rng.random_range(0..i); + let (iter, min_matches) = filter.sim_hashes_search(expected_diff); + let actual_count = iter.count(); + let expected_min_matches = actual_count + .saturating_sub(expected_diff) + .max(SIM_BUCKETS / 2); + assert_eq!(min_matches, expected_min_matches) + }); + } +} From 04160acb0d7ff143620a2d7faa0558cb9a50bc52 Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Fri, 1 Aug 2025 14:03:42 +0100 Subject: [PATCH 5/7] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- crates/geo_filters/src/diff_count/sim_hash.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/geo_filters/src/diff_count/sim_hash.rs b/crates/geo_filters/src/diff_count/sim_hash.rs index 747777d..b1f3df2 100644 --- a/crates/geo_filters/src/diff_count/sim_hash.rs +++ b/crates/geo_filters/src/diff_count/sim_hash.rs @@ -96,7 +96,7 @@ impl> GeoDiffCount<'_, C> { /// the expected diff size. /// /// The geo_filter can be used to do an "exact" search by setting expected_diff_size to zero. - /// in this case, all the buckets must match. Similarly, small differences can be found by + /// In this case, all the buckets must match. Similarly, small differences can be found by /// requiring (SIM_BUCKETS - expected_diff_size) many buckets to match. For larger differences /// SIM_BUCKETS / 2 many buckets have to match. pub fn sim_hashes_search( From 0ae46949e4d300213d414c103aa9dd2b940c4b0b Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Fri, 1 Aug 2025 14:06:27 +0100 Subject: [PATCH 6/7] Fix sim hash filtering to use correct iterator length --- crates/geo_filters/src/diff_count/sim_hash.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/crates/geo_filters/src/diff_count/sim_hash.rs b/crates/geo_filters/src/diff_count/sim_hash.rs index b1f3df2..ca4d8a3 100644 --- a/crates/geo_filters/src/diff_count/sim_hash.rs +++ b/crates/geo_filters/src/diff_count/sim_hash.rs @@ -104,12 +104,10 @@ impl> GeoDiffCount<'_, C> { expected_diff_size: usize, ) -> (impl Iterator + '_, usize) { let range = self.sim_hash_range(expected_diff_size); - let min_matches = range - .len() - .saturating_sub(expected_diff_size) - .max(SIM_BUCKETS / 2); - let filtered_iter = self - .sim_hashes() + let sim_hash_iter = self.sim_hashes(); + let n = range.len().min(sim_hash_iter.len()); + let min_matches = n.saturating_sub(expected_diff_size).max(SIM_BUCKETS / 2); + let filtered_iter = sim_hash_iter .skip_while(move |(bucket_id, _)| *bucket_id >= range.end) .take_while(move |(bucket_id, _)| *bucket_id >= range.start) .map(|(_, sim_hash)| sim_hash); From 728814cc7c7b2ffaee1220dde630eb6f5ab705ce Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Fri, 1 Aug 2025 14:36:24 +0100 Subject: [PATCH 7/7] remove needless test for geo_filters sim_hash.rs --- crates/geo_filters/src/diff_count/sim_hash.rs | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/crates/geo_filters/src/diff_count/sim_hash.rs b/crates/geo_filters/src/diff_count/sim_hash.rs index ca4d8a3..5c92aa6 100644 --- a/crates/geo_filters/src/diff_count/sim_hash.rs +++ b/crates/geo_filters/src/diff_count/sim_hash.rs @@ -220,28 +220,3 @@ impl BitVec<'_> { None } } - -#[cfg(test)] -mod tests { - use rand::Rng as _; - - use crate::{ - diff_count::{sim_hash::SIM_BUCKETS, GeoDiffCount7}, - test_rng::prng_test_harness, - }; - - #[test] - fn sim_hash_iter_min_matches() { - prng_test_harness(100, |rng| { - let i = rng.random_range(0..1000); - let filter = GeoDiffCount7::pseudorandom_filter(i); - let expected_diff = rng.random_range(0..i); - let (iter, min_matches) = filter.sim_hashes_search(expected_diff); - let actual_count = iter.count(); - let expected_min_matches = actual_count - .saturating_sub(expected_diff) - .max(SIM_BUCKETS / 2); - assert_eq!(min_matches, expected_min_matches) - }); - } -}