Skip to content
4 changes: 4 additions & 0 deletions crates/geo_filters/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ bench = false

[features]
default = []
test-support = ["dep:rand", "dep:rand_chacha"]
serde = ["dep:serde"]
evaluation = [
"dep:clap",
"dep:hyperloglogplus",
Expand All @@ -31,6 +33,8 @@ once_cell = "1.18"
rand = { version = "0.9", optional = true }
rayon = { version = "1.7", optional = true }
regex = { version = "1", optional = true }
# The `derive` feature is required: `SimHash` uses
# `#[derive(serde::Deserialize, serde::Serialize)]` when the `serde` feature is on,
# and those derive macros are only re-exported by serde with `derive` enabled.
serde = { version = "1.0", default-features = false, features = ["derive"], optional = true }
rand_chacha = { version = "0.9", optional = true }

[dev-dependencies]
criterion = "0.7"
Expand Down
4 changes: 3 additions & 1 deletion crates/geo_filters/evaluation/performance.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use std::hint::black_box;

use criterion::{criterion_group, criterion_main, Criterion};
use geo_filters::build_hasher::UnstableDefaultBuildHasher;
use geo_filters::config::VariableConfig;
use geo_filters::diff_count::{GeoDiffCount, GeoDiffCount13};
Expand Down
5 changes: 3 additions & 2 deletions crates/geo_filters/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -353,13 +353,14 @@ pub(crate) fn take_ref<I: Iterator>(iter: &mut I, n: usize) -> impl Iterator<Ite

#[cfg(test)]
pub(crate) mod tests {
use rand::{rngs::StdRng, RngCore};
use rand::RngCore;
use rand_chacha::ChaCha12Rng;

use crate::{Count, Method};

/// Runs estimation trials and returns the average precision and variance.
pub(crate) fn test_estimate<M: Method, C: Count<M>>(
rnd: &mut StdRng,
rnd: &mut ChaCha12Rng,
f: impl Fn() -> C,
) -> (f32, f32) {
let cnt = 10000usize;
Expand Down
5 changes: 3 additions & 2 deletions crates/geo_filters/src/config/lookup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ impl HashToBucketLookup {

#[cfg(test)]
mod tests {
use rand::{rngs::StdRng, RngCore};
use rand::RngCore;
use rand_chacha::ChaCha12Rng;

use crate::{
config::{hash_to_bucket, phi_f64},
Expand All @@ -70,7 +71,7 @@ mod tests {
});
}

fn lookup_random_hashes_variance<const B: usize>(rnd: &mut StdRng, n: u64) -> f64 {
fn lookup_random_hashes_variance<const B: usize>(rnd: &mut ChaCha12Rng, n: u64) -> f64 {
let phi = phi_f64(B);
let buckets = HashToBucketLookup::new(B);

Expand Down
93 changes: 65 additions & 28 deletions crates/geo_filters/src/diff_count.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ mod sim_hash;

use bitvec::*;
pub use config::{GeoDiffConfig13, GeoDiffConfig7};
pub use sim_hash::{SimHash, SIM_BUCKETS, SIM_BUCKET_SIZE};

/// Diff count filter with a relative error standard deviation of ~0.125.
pub type GeoDiffCount7<'a> = GeoDiffCount<'a, GeoDiffConfig7>;
Expand Down Expand Up @@ -302,7 +303,7 @@ impl<'a, C: GeoConfig<Diff>> GeoDiffCount<'a, C> {

/// Create a new [`GeoDiffCount`] from a slice of bytes
#[cfg(target_endian = "little")]
pub fn from_bytes(c: C, buf: &'a [u8]) -> Self {
pub fn from_bytes_with_config(c: C, buf: &'a [u8]) -> Self {
if buf.is_empty() {
return Self::new(c);
}
Expand Down Expand Up @@ -338,6 +339,53 @@ impl<'a, C: GeoConfig<Diff>> GeoDiffCount<'a, C> {
bytes_written += self.lsb.write(writer)?;
Ok(bytes_written)
}

/// Builds a filter from an explicit sequence of bucket indices.
///
/// Starts from an empty filter created with `config` and applies `xor_bit`
/// for every element of `ones`, in iteration order.
///
/// Only compiled for tests or when the `test-support` feature is enabled.
#[cfg(any(test, feature = "test-support"))]
pub fn from_ones_with_config(config: C, ones: impl IntoIterator<Item = C::BucketType>) -> Self {
    let mut filter = Self::new(config);
    ones.into_iter().for_each(|bucket| {
        filter.xor_bit(bucket);
    });
    filter
}

/// Returns an iterator over the positions of all set bits, each converted
/// to `C::BucketType` via `from_usize`.
///
/// Only compiled for tests or when the `test-support` feature is enabled.
#[cfg(any(test, feature = "test-support"))]
pub fn iter_ones(&self) -> impl Iterator<Item = C::BucketType> + '_ {
    // The unqualified `iter_ones` below is the free helper operating on bit
    // chunks, not a recursive call to this method.
    let chunks = self.bit_chunks().peekable();
    iter_ones(chunks).map(C::BucketType::from_usize)
}

/// Builds a deterministic pseudo-random filter containing `items` hashes.
///
/// A `ChaCha12Rng` is seeded with `items` itself, so calling this twice with
/// the same item count pushes the same sequence of hashes and should yield
/// the same filter.
///
/// Only compiled for tests or when the `test-support` feature is enabled.
#[cfg(any(test, feature = "test-support"))]
pub fn pseudorandom_filter_with_config(config: C, items: usize) -> Self {
    use rand::RngCore;
    use rand_chacha::rand_core::SeedableRng;

    let mut filter = Self::new(config);
    // The item count doubles as the seed, which is what makes the output reproducible.
    let mut hash_source = rand_chacha::ChaCha12Rng::seed_from_u64(items as u64);
    (0..items).for_each(|_| {
        filter.push_hash(hash_source.next_u64());
    });
    filter
}
}

/// Convenience constructors for configurations that implement [`Default`].
///
/// Every method here forwards to its `*_with_config` counterpart, passing
/// `C::default()` as the configuration.
impl<'a, C> GeoDiffCount<'a, C>
where
    C: GeoConfig<Diff> + Default,
{
    /// Creates a filter from a slice of bytes using the default configuration.
    ///
    /// Only available on little-endian targets.
    #[cfg(target_endian = "little")]
    pub fn from_bytes(buf: &'a [u8]) -> Self {
        let config = C::default();
        Self::from_bytes_with_config(config, buf)
    }

    /// Creates a filter from the given bucket indices using the default
    /// configuration.
    ///
    /// Only compiled for tests or when the `test-support` feature is enabled.
    #[cfg(any(test, feature = "test-support"))]
    pub fn from_ones(ones: impl IntoIterator<Item = C::BucketType>) -> Self {
        let config = C::default();
        Self::from_ones_with_config(config, ones)
    }

    /// Creates a deterministic pseudo-random filter with `items` hashes using
    /// the default configuration.
    ///
    /// Only compiled for tests or when the `test-support` feature is enabled.
    #[cfg(any(test, feature = "test-support"))]
    pub fn pseudorandom_filter(items: usize) -> Self {
        let config = C::default();
        Self::pseudorandom_filter_with_config(config, items)
    }
}

/// Applies a repeated bit mask to the underlying filter.
Expand Down Expand Up @@ -419,11 +467,12 @@ mod tests {
use std::io::Write;

use itertools::Itertools;
use rand::{rngs::StdRng, seq::IteratorRandom, RngCore};
use rand::{seq::IteratorRandom, RngCore};
use rand_chacha::ChaCha12Rng;

use crate::{
build_hasher::UnstableDefaultBuildHasher,
config::{iter_ones, tests::test_estimate, FixedConfig},
config::{tests::test_estimate, FixedConfig},
test_rng::prng_test_harness,
};

Expand Down Expand Up @@ -458,8 +507,8 @@ mod tests {

#[test]
fn test_xor() {
let a = GeoDiffCount7::from_ones(Default::default(), 0..1000);
let b = GeoDiffCount7::from_ones(Default::default(), 10..1010);
let a = GeoDiffCount7::from_ones(0..1000);
let b = GeoDiffCount7::from_ones(10..1010);
let c = xor(&a, &b);
let d = xor(&a, &b);
assert_eq!(a.iter_ones().count(), 1000);
Expand All @@ -479,7 +528,7 @@ mod tests {
m.xor_bit(10);
assert!(m.iter_ones().collect_vec().is_empty());

let mut m = GeoDiffCount7::from_ones(Default::default(), 0..100);
let mut m = GeoDiffCount7::from_ones(0..100);
assert_eq!(m.iter_ones().count(), 100);
m.xor_bit(10);
assert_eq!(m.iter_ones().count(), 99);
Expand Down Expand Up @@ -561,20 +610,19 @@ mod tests {
// masked bitset : 010000 100100 000000
// after compression : 01 0 10 1 00 0
// bitset of the returned filter : 010 101000
let m = GeoDiffCount7::from_ones(Default::default(), [16, 15, 13, 11, 9, 8, 6, 3, 1]);
let m = GeoDiffCount7::from_ones([16, 15, 13, 11, 9, 8, 6, 3, 1]);
let n = masked(&m, 0b110100, 6);
assert_eq!(n.iter_ones().collect_vec(), vec![16, 11, 8]);

for i in 0..100 {
let m = GeoDiffCount7::from_ones(Default::default(), (0..i).collect_vec());
let m = GeoDiffCount7::from_ones((0..i).collect_vec());
let n = masked(&m, 0b111, 3);
assert_eq!(m, n);
}

for i in 0..300 {
let m = GeoDiffCount7::from_ones(Default::default(), (0..i).collect_vec());
let slow =
GeoDiffCount::from_ones(Default::default(), masked(&m, 0b110, 3).iter_ones());
let m = GeoDiffCount7::from_ones((0..i).collect_vec());
let slow = GeoDiffCount::from_ones(masked(&m, 0b110, 3).iter_ones());
let n = masked(&m, 0b110, 3);
assert_eq!(slow, n, "in iteration: {i}");
}
Expand Down Expand Up @@ -626,20 +674,6 @@ mod tests {
assert_eq!(vec![17, 11, 7], a.msb.iter().copied().collect_vec());
}

// Test-local helpers for building a filter from explicit bit positions and for
// reading the set positions back out.
impl<C: GeoConfig<Diff>> GeoDiffCount<'_, C> {
/// Creates an empty filter from `config` and applies `xor_bit` for every
/// element of `ones`.
fn from_ones(config: C, ones: impl IntoIterator<Item = C::BucketType>) -> Self {
let mut result = Self::new(config);
for one in ones {
result.xor_bit(one);
}
result
}

/// Iterates over the positions of all set bits, converted to `C::BucketType`.
fn iter_ones(&self) -> impl Iterator<Item = C::BucketType> + '_ {
// The unqualified `iter_ones` is the free helper, not this method.
iter_ones(self.bit_chunks().peekable()).map(C::BucketType::from_usize)
}
}

#[test]
fn test_serialization_empty() {
let before = GeoDiffCount7::default();
Expand All @@ -649,15 +683,15 @@ mod tests {

assert_eq!(writer.len(), 0);

let after = GeoDiffCount7::from_bytes(before.config.clone(), &writer);
let after = GeoDiffCount7::from_bytes_with_config(before.config.clone(), &writer);

assert_eq!(before, after);
}

// This helper exists in order to easily test serializing types with different
// bucket types in the MSB sparse bit field representation. See tests below.
#[cfg(target_endian = "little")]
fn serialization_round_trip<C: GeoConfig<Diff> + Default>(rnd: &mut StdRng) {
fn serialization_round_trip<C: GeoConfig<Diff> + Default>(rnd: &mut ChaCha12Rng) {
// Run 100 simulations of random values being put into
// a diff counter. "Serializing" to a vector to emulate
// writing to a disk, and then deserializing and asserting
Expand All @@ -676,7 +710,10 @@ mod tests {
let pad_amount = (0..8).choose(rnd).unwrap();
writer.write_all(&padding[..pad_amount]).unwrap();
before.write(&mut writer).unwrap();
let after = GeoDiffCount::<'_, C>::from_bytes(before.config.clone(), &writer[pad_amount..]);
let after = GeoDiffCount::<'_, C>::from_bytes_with_config(
before.config.clone(),
&writer[pad_amount..],
);
assert_eq!(before, after);
}

Expand Down
10 changes: 8 additions & 2 deletions crates/geo_filters/src/diff_count/sim_hash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,22 @@ use crate::Diff;

use super::BitVec;

// TODO: migrate these constants so that they are defined in the configuration.
// The current values are only really appropriate for smaller
// configurations.

/// Number of bits covered by each SimHash bucket.
pub(crate) const SIM_BUCKET_SIZE: usize = 6;
pub const SIM_BUCKET_SIZE: usize = 6;
Comment thread
itsibitzi marked this conversation as resolved.
/// Number of consecutive SimHash buckets used for searching.
pub(crate) const SIM_BUCKETS: usize = 20;
pub const SIM_BUCKETS: usize = 20;

pub type BucketId = usize;

/// SimHash is a hash computed over a contiguous range of bits from a GeoDiffCount.
/// It is used to quickly find similar sets with a reverse index.
///
/// When the `serde` feature is enabled, the type derives `Serialize` and
/// `Deserialize` with `#[serde(transparent)]`, i.e. it is represented exactly
/// like its inner `u64`.
#[derive(Copy, Clone, Default, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
#[cfg_attr(feature = "serde", serde(transparent))]
pub struct SimHash(pub u64);

impl SimHash {
Expand Down
9 changes: 5 additions & 4 deletions crates/geo_filters/src/test_rng.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::panic::{catch_unwind, resume_unwind, AssertUnwindSafe};

use rand::{rngs::StdRng, SeedableRng as _};
use rand::SeedableRng as _;
use rand_chacha::ChaCha12Rng;

/// Provides a seeded random number generator to tests which require some
/// degree of randomization. If the test panics the harness will print the
Expand All @@ -12,7 +13,7 @@ use rand::{rngs::StdRng, SeedableRng as _};
/// is only ran once with this seed.
pub fn prng_test_harness<F>(iterations: usize, mut test_fn: F)
where
F: FnMut(&mut StdRng),
F: FnMut(&mut ChaCha12Rng),
{
let maybe_manual_seed = std::env::var("TEST_SEED")
.map(|s| s.parse::<u64>().expect("Parse TEST_SEED to u64"))
Expand All @@ -21,12 +22,12 @@ where
let maybe_panic = catch_unwind(AssertUnwindSafe(|| {
if let Some(manual_seed) = maybe_manual_seed {
seed = manual_seed;
let mut rng = StdRng::seed_from_u64(seed);
let mut rng = ChaCha12Rng::seed_from_u64(seed);
test_fn(&mut rng);
} else {
for _ in 0..iterations {
seed = rand::random();
let mut rng = StdRng::seed_from_u64(seed);
let mut rng = ChaCha12Rng::seed_from_u64(seed);
test_fn(&mut rng);
}
}
Expand Down