diff --git a/Cargo.toml b/Cargo.toml index e0d77cb..3c2d864 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,3 +16,19 @@ repository = "https://github.com/rapidfuzz/strsim-rs" documentation = "https://docs.rs/strsim/" exclude = ["/.github", "/dev"] categories = ["text-processing"] + +[dev-dependencies] +criterion = "0.8" + +[[bench]] +name = "benches" +harness = false +path = "benches/benches.rs" + +[profile.bench] +opt-level = 3 +debug = false +lto = "fat" +codegen-units = 1 +incremental = false +rpath = false \ No newline at end of file diff --git a/benches/benches.rs b/benches/benches.rs index 15c7041..eb785d2 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -1,95 +1,230 @@ -//! Benchmarks for strsim. - -#![feature(test)] +//! Benchmarks for strsim using Criterion. +use criterion::{criterion_group, criterion_main, Criterion}; +use std::time::Duration; extern crate strsim; -extern crate test; -use self::test::Bencher; -#[bench] -fn bench_hamming(bencher: &mut Bencher) { +use std::hint::black_box; + +fn run_benchmark(c: &mut Criterion, name: &str, f: F) +where + F: Fn() + 'static, +{ + c.bench_function(name, |b| b.iter(|| black_box(f()))); +} + +fn run_benchmark_setup(c: &mut Criterion, name: &str, mut setup: S, mut f: F) +where + S: FnMut() -> I, + F: FnMut(I) -> O, +{ + c.bench_function(name, |b| { + b.iter_batched( + || setup(), + |input| black_box(f(input)), + criterion::BatchSize::SmallInput, + ) + }); +} + +/* -------------------------------------------------------------------------- */ +/* Hamming distance */ +/* -------------------------------------------------------------------------- */ +fn bench_hamming(c: &mut Criterion) { let a = "ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGG"; let b = "CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGC"; - bencher.iter(|| { + + run_benchmark(c, "hamming", || { + // `unwrap` is kept because the original benchmark did it. strsim::hamming(a, b).unwrap(); - }) + }); } -#[bench] -fn bench_jaro(bencher: &mut Bencher) { +/* -------------------------------------------------------------------------- */ +/* Jaro */ +/* -------------------------------------------------------------------------- */ +fn bench_jaro(c: &mut Criterion) { let a = "Philosopher Friedrich Nietzsche"; let b = "Philosopher Jean-Paul Sartre"; - bencher.iter(|| { + + run_benchmark(c, "jaro", || { strsim::jaro(a, b); - }) + }); } -#[bench] -fn bench_jaro_winkler(bencher: &mut Bencher) { +/* -------------------------------------------------------------------------- */ +/* Jaro‑Winkler */ +/* -------------------------------------------------------------------------- */ +fn bench_jaro_winkler(c: &mut Criterion) { let a = "Philosopher Friedrich Nietzsche"; let b = "Philosopher Jean-Paul Sartre"; - bencher.iter(|| { + + run_benchmark(c, "jaro_winkler", || { strsim::jaro_winkler(a, b); - }) + }); +} + +fn bench_jaro_longstring(c: &mut Criterion) { + let a = "abcd".repeat(3000); + let b = "abce".repeat(3000); + run_benchmark_setup( + c, + "jaro_longstring", + || (a.clone(), b.clone()), + |(a, b)| { + strsim::jaro(&a, &b); + }, + ); } -#[bench] -fn bench_levenshtein(bencher: &mut Bencher) { +/* -------------------------------------------------------------------------- */ +/* Levenshtein */ +/* -------------------------------------------------------------------------- */ +fn bench_levenshtein(c: &mut Criterion) { let a = "Philosopher Friedrich Nietzsche"; let b = "Philosopher Jean-Paul Sartre"; - bencher.iter(|| { + + run_benchmark(c, "levenshtein", || { strsim::levenshtein(a, b); - }) + }); } -#[bench] -fn bench_levenshtein_on_u8(bencher: &mut Bencher) { - bencher.iter(|| { +/* -------------------------------------------------------------------------- */ +/* Levenshtein on `u8` slices */ +/* -------------------------------------------------------------------------- */ +fn bench_levenshtein_on_u8(c: &mut Criterion) { + run_benchmark(c, "levenshtein_u8", || { strsim::generic_levenshtein(&vec![0u8; 30], &vec![7u8; 31]); - }) + }); } -#[bench] -fn bench_normalized_levenshtein(bencher: &mut Bencher) { +/* -------------------------------------------------------------------------- */ +/* Normalized Levenshtein */ +/* -------------------------------------------------------------------------- */ +fn bench_normalized_levenshtein(c: &mut Criterion) { let a = "Philosopher Friedrich Nietzsche"; let b = "Philosopher Jean-Paul Sartre"; - bencher.iter(|| { + + run_benchmark(c, "normalized_levenshtein", || { strsim::normalized_levenshtein(a, b); - }) + }); } -#[bench] -fn bench_osa_distance(bencher: &mut Bencher) { +/* -------------------------------------------------------------------------- */ +/* OSA distance */ +/* -------------------------------------------------------------------------- */ +fn bench_osa_distance(c: &mut Criterion) { let a = "Philosopher Friedrich Nietzsche"; let b = "Philosopher Jean-Paul Sartre"; - bencher.iter(|| { + + run_benchmark(c, "osa_distance", || { strsim::osa_distance(a, b); - }) + }); } -#[bench] -fn bench_damerau_levenshtein(bencher: &mut Bencher) { +/* -------------------------------------------------------------------------- */ +/* Damerau‑Levenshtein */ +/* -------------------------------------------------------------------------- */ +fn bench_damerau_levenshtein(c: &mut Criterion) { let a = "Philosopher Friedrich Nietzsche"; let b = "Philosopher Jean-Paul Sartre"; - bencher.iter(|| { + + run_benchmark(c, "damerau_levenshtein", || { strsim::damerau_levenshtein(a, b); - }) + }); } -#[bench] -fn bench_normalized_damerau_levenshtein(bencher: &mut Bencher) { +/* -------------------------------------------------------------------------- */ +/* Normalized Damerau‑Levenshtein */ +/* -------------------------------------------------------------------------- */ +fn bench_normalized_damerau_levenshtein(c: &mut Criterion) { let a = "Philosopher Friedrich Nietzsche"; let b = "Philosopher Jean-Paul Sartre"; - bencher.iter(|| { + + run_benchmark(c, "normalized_damerau_levenshtein", || { strsim::normalized_damerau_levenshtein(a, b); - }) + }); } -#[bench] -fn bench_sorensen_dice(bencher: &mut Bencher) { +/* -------------------------------------------------------------------------- */ +/* Sørensen‑Dice */ +/* -------------------------------------------------------------------------- */ +fn bench_sorensen_dice(c: &mut Criterion) { let a = "Philosopher Friedrich Nietzsche"; let b = "Philosopher Jean-Paul Sartre"; - bencher.iter(|| { + + run_benchmark(c, "sorensen_dice", || { strsim::sorensen_dice(a, b); - }) + }); +} + +/* -------------------------------------------------------------------------- */ +/* Long Sørensen‑Dice (multiple inputs, larger data) */ +/* -------------------------------------------------------------------------- */ +fn bench_sorensen_dice_long(c: &mut Criterion) { + // A collection of string pairs with varying lengths and characteristics. + let pairs = [ + // Short, similar strings + ("night", "nacht"), + // Medium, partially overlapping + ("rust programming language", "rust language programming"), + // Long, realistic sentences + ( + "The quick brown fox jumps over the lazy dog while the sun sets behind the hills", + "A swift auburn fox leaped over a sleepy canine as dusk fell beyond the mountains", + ), + // Persian thing + ( + "در گذر زمان خواهی آموخت هر کسی ارزش جنگیدن ندارد", + "در گذر زمان خواهی فهمید هر جایی ارزش ماندن ندارد", + ), + // Very long repetitive patterns + (&"abcde".repeat(2000), &"abfde".repeat(2000)), + // Unicode strings with diacritics + ("café au lait", "cafe au lait"), + // Strings with emojis + ("😀😃😄😁😆", "😀😃😄😁😅"), + ]; + + // Benchmark each pair individually to capture variance. + for (i, (a, b)) in pairs.iter().enumerate() { + let name = format!("sorensen_dice_long_{}", i); + // Clone the original `&str` values into owned `String`s once (setup phase). + let a_owned = a.to_string(); + let b_owned = b.to_string(); + run_benchmark_setup( + c, + &name, + || { + // Setup phase: prepare owned strings. + let _a = a_owned.clone(); + let _b = b_owned.clone(); + (a, b) + }, + |(a, b)| { + strsim::sorensen_dice(&a, &b); + }, + ); + } +} + +criterion_group! { + name = benches; + config = Criterion::default() + .warm_up_time(Duration::from_millis(500)) + .measurement_time(Duration::from_secs(3)); + targets = + bench_hamming, + bench_jaro, + bench_jaro_winkler, + bench_jaro_longstring, + bench_levenshtein, + bench_levenshtein_on_u8, + bench_normalized_levenshtein, + bench_osa_distance, + bench_damerau_levenshtein, + bench_normalized_damerau_levenshtein, + bench_sorensen_dice, + bench_sorensen_dice_long } +criterion_main!(benches); diff --git a/src/lib.rs b/src/lib.rs index 309c065..8744555 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,9 +26,134 @@ use std::convert::TryFrom; use std::error::Error; use std::fmt::{self, Display, Formatter}; use std::hash::Hash; +use std::iter::FusedIterator; use std::mem; use std::str::Chars; +/// A simple bit vector implementation that uses a slice of `usize` as backing +struct BitSlice<'a> { + data: &'a mut [usize], + len: usize, +} + +impl<'a> BitSlice<'a> { + fn new(data: &'a mut [usize], len: usize) -> Self { + assert!( + len <= data.len() * usize::BITS as usize, + "BitVec length {} exceeds capacity {} bits (data slice length {} usize words)", + len, + data.len() * usize::BITS as usize, + data.len() + ); + BitSlice { data, len } + } + + fn set(&mut self, index: usize) { + let word_index = index / (usize::BITS as usize); + let bit_index = index % (usize::BITS as usize); + self.data[word_index] |= 1 << bit_index; + } + + #[allow(unused)] + fn get(&self, index: usize) -> bool { + let word_index = index / (usize::BITS as usize); + let bit_index = index % (usize::BITS as usize); + (self.data[word_index] & (1 << bit_index)) != 0 + } + + fn set_if_not_set(&mut self, index: usize) -> bool { + let word_index = index / (usize::BITS as usize); + let bit_index = index % (usize::BITS as usize); + let mask = 1 << bit_index; + let was_set = (self.data[word_index] & mask) != 0; + if !was_set { + self.data[word_index] |= mask; + } + was_set + } + + pub fn iter(&'a self) -> BitIterator<'a> { + BitIterator { + bitvec: self, + index: 0, + register: 0, + } + } +} + +struct BitIterator<'a> { + bitvec: &'a BitSlice<'a>, + index: usize, + register: usize, +} + +impl<'a> Iterator for BitIterator<'a> { + type Item = bool; + fn next(&mut self) -> Option { + if self.index == self.bitvec.len { + return None; + } + + let bit_index = self.index % (usize::BITS as usize); + + if bit_index == 0 { + let word_index = self.index / (usize::BITS as usize); + self.register = self.bitvec.data[word_index]; + } + + let result = (self.register & (1 << bit_index)) != 0; + self.index += 1; + Some(result) + } + + fn size_hint(&self) -> (usize, Option) { + let remaining = self.bitvec.len - self.index; + (remaining, Some(remaining)) + } +} + +// Exact size iterator since we can always calculate the remaining length +impl<'a> ExactSizeIterator for BitIterator<'a> { + fn len(&self) -> usize { + self.bitvec.len - self.index + } +} + +// Optimization, we will return None once we are exhausted +impl<'a> FusedIterator for BitIterator<'a> {} + +const MAX_STACK_BYTES: usize = 2048; + +enum HybridBuffer { + Stack([T; MAX_STACK_BYTES / std::mem::size_of::()]), + Heap(Vec), +} + +impl HybridBuffer { + fn new(len: usize) -> Self { + const MAX_ELEMENTS: usize = MAX_STACK_BYTES / std::mem::size_of::(); + if len <= MAX_ELEMENTS { + HybridBuffer::Stack([T::default(); MAX_ELEMENTS]) + } else { + HybridBuffer::Heap(vec![T::default(); len]) + } + } + + #[inline] + fn as_mut(&mut self) -> &mut [T] { + match self { + HybridBuffer::Stack(arr) => &mut arr[..], + HybridBuffer::Heap(vec) => vec.as_mut_slice(), + } + } + + #[inline] + fn split_at_mut(&mut self, mid: usize) -> (&mut [T], &mut [T]) { + let slice = self.as_mut(); + slice.split_at_mut(mid) + } +} + #[derive(Debug, PartialEq)] pub enum StrSimError { DifferentLengthArgs, @@ -106,21 +231,26 @@ where search_range = search_range.saturating_sub(1); // combine memory allocations to reduce runtime - let mut flags_memory = vec![false; a_len + b_len]; - let (a_flags, b_flags) = flags_memory.split_at_mut(a_len); + let size_in_usize = ((a_len + b_len) + usize::BITS as usize - 1) / usize::BITS as usize + 2; + let mut flags_memory = HybridBuffer::new(size_in_usize); + let split_index = (b_len + usize::BITS as usize - 1) / usize::BITS as usize; + // Split the pre‑allocated buffer into the two slices. + let (a_slice, b_slice) = flags_memory.split_at_mut(split_index); + + // Initialise the BitVecs with the correctly sized slices. + let mut a_flags = BitSlice::new(a_slice, a_len); + let mut b_flags = BitSlice::new(b_slice, b_len); let mut matches = 0_usize; for (i, a_elem) in a.into_iter().enumerate() { // prevent integer wrapping let min_bound = i.saturating_sub(search_range); - let max_bound = min(b_len, i + search_range + 1); for (j, b_elem) in b.into_iter().enumerate().take(max_bound) { - if min_bound <= j && a_elem == b_elem && !b_flags[j] { - a_flags[i] = true; - b_flags[j] = true; + if min_bound <= j && a_elem == b_elem && !b_flags.set_if_not_set(j) { + a_flags.set(i); matches += 1; break; } @@ -131,10 +261,10 @@ where if matches != 0 { let mut b_iter = b_flags.iter().zip(b); for (a_flag, ch1) in a_flags.iter().zip(a) { - if *a_flag { + if a_flag { loop { if let Some((b_flag, ch2)) = b_iter.next() { - if !*b_flag { + if !b_flag { continue; } @@ -234,7 +364,12 @@ where { let b_len = b.into_iter().count(); - let mut cache: Vec = (1..b_len + 1).collect(); + let mut buffer = HybridBuffer::new(b_len); + + let (cache, _) = buffer.split_at_mut(b_len); + for (j, _) in b.into_iter().enumerate() { + cache[j] = j + 1; + } let mut result = b_len; @@ -297,9 +432,20 @@ pub fn osa_distance(a: &str, b: &str) -> usize { let b_len = b.chars().count(); // 0..=b_len behaves like 0..b_len.saturating_add(1) which could be a different size // this leads to significantly worse code gen when swapping the vectors below - let mut prev_two_distances: Vec = (0..b_len + 1).collect(); - let mut prev_distances: Vec = (0..b_len + 1).collect(); - let mut curr_distances: Vec = vec![0; b_len + 1]; + let mut buffer = HybridBuffer::new((b_len + 1) * 3); + let (mut prev_two_distances, rem) = buffer.split_at_mut(b_len + 1); + let (mut prev_distances, rem) = rem.split_at_mut(b_len + 1); + let mut curr_distances = rem.split_at_mut(b_len + 1).0; + // Initialise the three distance slices so that each element contains its index. + for i in 0..prev_two_distances.len() { + prev_two_distances[i] = i; + } + for i in 0..prev_distances.len() { + prev_distances[i] = i; + } + for i in 0..curr_distances.len() { + curr_distances[i] = i; + } let mut prev_a_char = char::MAX; let mut prev_b_char = char::MAX; @@ -361,7 +507,8 @@ where } let width = a_len + 2; - let mut distances = vec![0; (a_len + 2) * (b_len + 2)]; + let mut buffer = HybridBuffer::new((a_len + 2) * (b_len + 2)); + let distances = buffer.split_at_mut((a_len + 2) * (b_len + 2)).0; let max_distance = a_len + b_len; distances[0] = max_distance; @@ -617,11 +764,19 @@ where let mut last_row_id = HybridGrowingHashmapChar::::default(); let size = len2 + 2; - let mut fr = vec![max_val; size]; - let mut r1 = vec![max_val; size]; - let mut r: Vec = (max_val..max_val + 1) - .chain(0..(size - 1) as isize) - .collect(); + let mut buffer: HybridBuffer = HybridBuffer::new(size * 3); + + let (fr, rem) = buffer.split_at_mut(size); + let (mut r1, mut r) = rem.split_at_mut(size); + r = &mut r[..size]; + + fr.iter_mut().for_each(|x| *x = max_val); + r1.iter_mut().for_each(|x| *x = max_val); + + r[0] = max_val; + for i in 1..size { + r[i] = (i - 1) as isize; + } for (i, ch1) in s1.enumerate().map(|(i, ch1)| (i + 1, ch1)) { mem::swap(&mut r, &mut r1);