Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion crates/geo_filters/src/diff_count.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ mod sim_hash;

use bitvec::*;
pub use config::{GeoDiffConfig13, GeoDiffConfig7};
pub use sim_hash::{SimHash, SIM_BUCKETS, SIM_BUCKET_SIZE};
pub use sim_hash::SimHash;

/// Diff count filter with a relative error standard deviation of ~0.125.
pub type GeoDiffCount7<'a> = GeoDiffCount<'a, GeoDiffConfig7>;
Expand Down
61 changes: 53 additions & 8 deletions crates/geo_filters/src/diff_count/sim_hash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ use crate::Diff;
use super::BitVec;

// TODO migrate these const values to be defined in configuration
// The current values are only really appropriate for smaller
// configurations
// The current values are only really appropriate for the smaller
// diff configuration.

/// Number of bits covered by each SimHash bucket.
pub const SIM_BUCKET_SIZE: usize = 6;
const SIM_BUCKET_SIZE: usize = 6;
/// Number of consecutive SimHash buckets used for searching.
pub const SIM_BUCKETS: usize = 20;
const SIM_BUCKETS: usize = 20;

pub type BucketId = usize;

Expand Down Expand Up @@ -77,7 +77,7 @@ impl<C: GeoConfig<Diff>> GeoDiffCount<'_, C> {
/// The first argument in the tuple is the bucket id of the `SimHash` which can be used
/// to select a certain subset of `SimHashes`. SimHashes are returned in decreasing order
/// of bucket ids, since that's their natural construction order.
pub fn sim_hashes(&self) -> impl Iterator<Item = (BucketId, SimHash)> + '_ {
pub fn sim_hashes(&self) -> impl ExactSizeIterator<Item = (BucketId, SimHash)> + '_ {
SimHashIterator::new(self)
}

Expand All @@ -89,15 +89,29 @@ impl<C: GeoConfig<Diff>> GeoDiffCount<'_, C> {
.map(|(_, sim_hash)| sim_hash)
}

/// Get the `SimHash`es for this filter for the purpose of performing a search.
///
/// Returns an iterator of the `SimHash`es and a number representing the minimum number
/// of matches required to consider this filter a match to a given filter, given
/// the expected diff size.
///
/// The geo_filter can be used to do an "exact" search by setting expected_diff_size to zero.
/// In this case, all the buckets must match. Similarly, small differences can be found by
/// requiring (SIM_BUCKETS - expected_diff_size) many buckets to match. For larger differences
/// SIM_BUCKETS / 2 many buckets have to match.
pub fn sim_hashes_search(
&self,
expected_diff_size: usize,
) -> impl Iterator<Item = SimHash> + '_ {
) -> (impl Iterator<Item = SimHash> + '_, usize) {
let range = self.sim_hash_range(expected_diff_size);
self.sim_hashes()
let sim_hash_iter = self.sim_hashes();
let n = range.len().min(sim_hash_iter.len());
let min_matches = n.saturating_sub(expected_diff_size).max(SIM_BUCKETS / 2);
let filtered_iter = sim_hash_iter
Comment thread
itsibitzi marked this conversation as resolved.
.skip_while(move |(bucket_id, _)| *bucket_id >= range.end)
.take_while(move |(bucket_id, _)| *bucket_id >= range.start)
.map(|(_, sim_hash)| sim_hash)
.map(|(_, sim_hash)| sim_hash);
(filtered_iter, min_matches)
}
}

Expand Down Expand Up @@ -152,8 +166,14 @@ impl<C: GeoConfig<Diff>> Iterator for SimHashIterator<'_, C> {
SimHash::new(self.prev_bucket_id, self.sim_hash[bucket]),
))
}

fn size_hint(&self) -> (usize, Option<usize>) {
(self.prev_bucket_id, Some(self.prev_bucket_id))
}
}

impl<C: GeoConfig<Diff>> ExactSizeIterator for SimHashIterator<'_, C> {}

impl<C: GeoConfig<Diff>> GeoDiffCount<'_, C> {
/// n specifies the desired zero-based index of the most significant one.
/// The zero-based index of the desired one bit is returned.
Expand Down Expand Up @@ -200,3 +220,28 @@ impl BitVec<'_> {
None
}
}

#[cfg(test)]
mod tests {
use rand::Rng as _;

use crate::{
diff_count::{sim_hash::SIM_BUCKETS, GeoDiffCount7},
test_rng::prng_test_harness,
};

#[test]
fn sim_hash_iter_min_matches() {
prng_test_harness(100, |rng| {
let i = rng.random_range(0..1000);
let filter = GeoDiffCount7::pseudorandom_filter(i);
let expected_diff = rng.random_range(0..i);
let (iter, min_matches) = filter.sim_hashes_search(expected_diff);
let actual_count = iter.count();
let expected_min_matches = actual_count
.saturating_sub(expected_diff)
.max(SIM_BUCKETS / 2);
assert_eq!(min_matches, expected_min_matches)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

? This is exactly the same code as in the implementation.
I don't think that test is very useful then...

Copy link
Copy Markdown
Contributor Author

@itsibitzi itsibitzi Aug 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was using it when experimenting if the length of the iterator differs to the length of the range. I left it in because I thought it might catch regressions in changes to sim_hashes_search.

Will remove it for now.

});
}
}