-
Notifications
You must be signed in to change notification settings - Fork 16
SimHash encapsulation
#77
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
ebc338c
04e1d12
10591b0
04b84be
04160ac
0ae4694
728814c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,13 +12,13 @@ use crate::Diff; | |
| use super::BitVec; | ||
|
|
||
| // TODO migrate these const values to be defined in configuration | ||
| // The current values are only really appropriate for smaller | ||
| // configurations | ||
| // The current values are only really appropriate for the smaller | ||
| // diff configuration. | ||
|
|
||
| /// Number of bits covered by each SimHash bucket. | ||
| pub const SIM_BUCKET_SIZE: usize = 6; | ||
| const SIM_BUCKET_SIZE: usize = 6; | ||
| /// Number of consecutive SimHash buckets used for searching. | ||
| pub const SIM_BUCKETS: usize = 20; | ||
| const SIM_BUCKETS: usize = 20; | ||
|
|
||
| pub type BucketId = usize; | ||
|
|
||
|
|
@@ -77,7 +77,7 @@ impl<C: GeoConfig<Diff>> GeoDiffCount<'_, C> { | |
| /// The first argument in the tuple is the bucket id of the `SimHash` which can be used | ||
| /// to select a certain subset of `SimHashes`. SimHashes are returned in decreasing order | ||
| /// of bucket ids, since that's their natural construction order. | ||
| pub fn sim_hashes(&self) -> impl Iterator<Item = (BucketId, SimHash)> + '_ { | ||
| pub fn sim_hashes(&self) -> impl ExactSizeIterator<Item = (BucketId, SimHash)> + '_ { | ||
| SimHashIterator::new(self) | ||
| } | ||
|
|
||
|
|
@@ -89,15 +89,29 @@ impl<C: GeoConfig<Diff>> GeoDiffCount<'_, C> { | |
| .map(|(_, sim_hash)| sim_hash) | ||
| } | ||
|
|
||
| /// Get the `SimHash`es for this filter for the purpose of performing a search. | ||
| /// | ||
| /// Returns an iterator of the `SimHash`es and a number representing the minimum number | ||
| /// of matches required to consider this filter a match to a given filter, given | ||
| /// the expected diff size. | ||
| /// | ||
| /// The geo_filter can be used to do an "exact" search by setting expected_diff_size to zero. | ||
| /// In this case, all the buckets must match. Similarly, small differences can be found by | ||
| /// requiring (SIM_BUCKETS - expected_diff_size) many buckets to match. For larger differences | ||
| /// SIM_BUCKETS / 2 many buckets have to match. | ||
| pub fn sim_hashes_search( | ||
| &self, | ||
| expected_diff_size: usize, | ||
| ) -> impl Iterator<Item = SimHash> + '_ { | ||
| ) -> (impl Iterator<Item = SimHash> + '_, usize) { | ||
| let range = self.sim_hash_range(expected_diff_size); | ||
| self.sim_hashes() | ||
| let sim_hash_iter = self.sim_hashes(); | ||
| let n = range.len().min(sim_hash_iter.len()); | ||
| let min_matches = n.saturating_sub(expected_diff_size).max(SIM_BUCKETS / 2); | ||
| let filtered_iter = sim_hash_iter | ||
| .skip_while(move |(bucket_id, _)| *bucket_id >= range.end) | ||
| .take_while(move |(bucket_id, _)| *bucket_id >= range.start) | ||
| .map(|(_, sim_hash)| sim_hash) | ||
| .map(|(_, sim_hash)| sim_hash); | ||
| (filtered_iter, min_matches) | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -152,8 +166,14 @@ impl<C: GeoConfig<Diff>> Iterator for SimHashIterator<'_, C> { | |
| SimHash::new(self.prev_bucket_id, self.sim_hash[bucket]), | ||
| )) | ||
| } | ||
|
|
||
| fn size_hint(&self) -> (usize, Option<usize>) { | ||
| (self.prev_bucket_id, Some(self.prev_bucket_id)) | ||
| } | ||
| } | ||
|
|
||
| impl<C: GeoConfig<Diff>> ExactSizeIterator for SimHashIterator<'_, C> {} | ||
|
|
||
| impl<C: GeoConfig<Diff>> GeoDiffCount<'_, C> { | ||
| /// n specifies the desired zero-based index of the most significant one. | ||
| /// The zero-based index of the desired one bit is returned. | ||
|
|
@@ -200,3 +220,28 @@ impl BitVec<'_> { | |
| None | ||
| } | ||
| } | ||
|
|
||
| #[cfg(test)] | ||
| mod tests { | ||
| use rand::Rng as _; | ||
|
|
||
| use crate::{ | ||
| diff_count::{sim_hash::SIM_BUCKETS, GeoDiffCount7}, | ||
| test_rng::prng_test_harness, | ||
| }; | ||
|
|
||
| #[test] | ||
| fn sim_hash_iter_min_matches() { | ||
| prng_test_harness(100, |rng| { | ||
| let i = rng.random_range(0..1000); | ||
| let filter = GeoDiffCount7::pseudorandom_filter(i); | ||
| let expected_diff = rng.random_range(0..i); | ||
| let (iter, min_matches) = filter.sim_hashes_search(expected_diff); | ||
| let actual_count = iter.count(); | ||
| let expected_min_matches = actual_count | ||
| .saturating_sub(expected_diff) | ||
| .max(SIM_BUCKETS / 2); | ||
| assert_eq!(min_matches, expected_min_matches) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ? This is exactly the same code as in the implementation.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was using it when experimenting if the length of the iterator differs to the length of the range. I left it in because I thought it might catch regressions in changes to Will remove it for now. |
||
| }); | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.