Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
0d83453
core implementation
aneubeck Aug 11, 2025
89f8ad4
Update README.md
aneubeck Aug 12, 2025
b03f0b7
Apply suggestion from @Copilot
aneubeck Aug 12, 2025
0dbf1c9
finish proof
aneubeck Aug 13, 2025
953d9bd
Merge branch 'aneubeck/sampling' of https://github.com/github/rust-ge…
aneubeck Aug 13, 2025
a5eb91e
Update README.md
aneubeck Aug 13, 2025
220624d
Update README.md
aneubeck Aug 13, 2025
fc69a9e
Update README.md
aneubeck Aug 13, 2025
90259e9
Update README.md
aneubeck Aug 13, 2025
8480ea3
Replace key with hasher traits
aneubeck Aug 13, 2025
0baaafc
Update lib.rs
aneubeck Aug 13, 2025
0dcb137
Update README.md
aneubeck Aug 13, 2025
d4b8410
Update crates/consistent-hashing/README.md
aneubeck Aug 15, 2025
0935ea0
Update crates/consistent-hashing/README.md
aneubeck Aug 15, 2025
99c69f3
Update crates/consistent-hashing/README.md
aneubeck Aug 15, 2025
496f539
Update crates/consistent-hashing/README.md
aneubeck Aug 15, 2025
23f3080
add benchmark
aneubeck Aug 15, 2025
5d52237
remove second vector
aneubeck Aug 15, 2025
f6e29f7
Update README.md
aneubeck Aug 15, 2025
d20f9b6
Update performance.rs
aneubeck Aug 15, 2025
1dde97c
make linter happy
aneubeck Aug 15, 2025
9171444
some more docu + better choose_k implementation
aneubeck May 18, 2026
20048d8
fix some tests
aneubeck May 18, 2026
e9fe3bd
fix remaining tests
aneubeck May 18, 2026
bec3c8f
clippy + fmt
aneubeck May 18, 2026
da37dd7
Update bounded_load.rs
aneubeck May 19, 2026
48e3b0b
Update bounded_load.rs
aneubeck May 19, 2026
25a42f8
Update bounded_load.rs
aneubeck May 19, 2026
04da223
Update README.md
aneubeck May 19, 2026
df712d9
Merge branch 'main' into aneubeck/sampling
aneubeck May 19, 2026
f3bbcd9
add a proper consistentnodemap implementation + documentation
aneubeck May 20, 2026
7deb027
Update README.md
aneubeck May 20, 2026
e54eaa2
Update README.md
aneubeck May 20, 2026
cad9115
Update README.md
aneubeck May 20, 2026
4e3dc65
clippy
aneubeck May 20, 2026
2dda8bd
Update README.md
aneubeck May 20, 2026
83174e2
rename crate
aneubeck May 20, 2026
bd4083f
upgrade dependencies
aneubeck May 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ members = [
"crates/*",
"crates/bpe/benchmarks",
"crates/bpe/tests",
"crates/consistent-choose-k/benchmarks",
]
resolver = "2"

Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ A collection of useful algorithms written in Rust. Currently contains:
- [`geo_filters`](crates/geo_filters): probabilistic data structures that solve the [Distinct Count Problem](https://en.wikipedia.org/wiki/Count-distinct_problem) using geometric filters.
- [`bpe`](crates/bpe): fast, correct, and novel algorithms for the [Byte Pair Encoding Algorithm](https://en.wikipedia.org/wiki/Large_language_model#BPE) which are particularly useful for chunking of documents.
- [`bpe-openai`](crates/bpe-openai): Fast tokenizers for OpenAI token sets based on the `bpe` crate.
- [`consistent-choose-k`](crates/consistent-choose-k): constant-time consistent hashing algorithms with support for replication and bounded load.
- [`sparse-ngrams`](crates/sparse-ngrams): fast sparse n-gram extraction from byte slices. Selects variable-length n-grams (2–8 bytes) deterministically using bigram frequency priorities, suitable for substring search indexes.
- [`string-offsets`](crates/string-offsets): converts string positions between bytes, chars, UTF-16 code units, and line numbers. Useful when sending string indices across language boundaries.

Expand Down
2 changes: 1 addition & 1 deletion crates/bpe-openai/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ unicode-normalization = "0.1"

[dev-dependencies]
bpe = { version = "0.2", path = "../bpe", features = ["rand"] }
tiktoken-rs = "0.9"
tiktoken-rs = "0.11"

[build-dependencies]
base64 = "0.22"
Expand Down
2 changes: 1 addition & 1 deletion crates/bpe/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ serde = { version = "1", features = ["derive"] }

[dev-dependencies]
bpe = { path = "." }
tiktoken-rs = "0.9"
tiktoken-rs = "0.11"

[package.metadata.docs.rs]
all-features = true
4 changes: 2 additions & 2 deletions crates/bpe/benchmarks/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,5 @@ bpe = { path = "../../bpe", features = ["rand", "tiktoken"] }
bpe-openai = { path = "../../bpe-openai" }
criterion = "0.8"
rand = "0.10"
tiktoken-rs = "0.9"
tokenizers = { version = "0.22", features = ["http"] }
tiktoken-rs = "0.11"
tokenizers = { version = "0.23", features = ["http"] }
2 changes: 1 addition & 1 deletion crates/bpe/tests/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ bpe = { path = "../../bpe", features = ["rand"] }
bpe-openai = { path = "../../bpe-openai" }
itertools = "0.14"
rand = "0.10"
tiktoken-rs = "0.9"
tiktoken-rs = "0.11"
20 changes: 20 additions & 0 deletions crates/consistent-choose-k/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
[package]
name = "consistent-choose-k"
version = "0.1.0"
edition = "2021"
description = "Stateless consistent choose-k hashing for replication, failover, and bounded-load placement."
repository = "https://github.com/github/rust-gems"
homepage = "https://github.com/github/rust-gems/tree/main/crates/consistent-choose-k"
documentation = "https://docs.rs/consistent-choose-k"
readme = "README.md"
license = "MIT"
keywords = ["consistent", "hashing", "replication", "choose-k", "sampling"]
categories = ["algorithms", "data-structures", "mathematics", "science"]

[lib]
crate-type = ["lib", "staticlib"]
bench = false

[dependencies]

[dev-dependencies]
197 changes: 197 additions & 0 deletions crates/consistent-choose-k/README.md

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions crates/consistent-choose-k/benchmarks/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[package]
name = "consistent-choose-k-benchmarks"
edition = "2021"

[[bench]]
name = "performance"
path = "performance.rs"
harness = false
test = false

[dependencies]
consistent-choose-k = { path = "../" }

criterion = { version = "0.8", features = ["csv_output"] }
rand = "0.10"
18 changes: 18 additions & 0 deletions crates/consistent-choose-k/benchmarks/criterion.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# save report in this directory, even if a custom target directory is set
criterion_home = "./target/criterion"

# The colors table allows users to configure the colors used by the charts
# cargo-criterion generates.
[colors]
# Color-blind friendly color scheme from https://personal.sron.nl/~pault/.
comparison_colors = [
{r = 51, g = 34, b = 136 }, # indigo
{r = 136, g = 204, b = 238 }, # cyan
{r = 68, g = 170, b = 153 }, # teal
{r = 17, g = 119, b = 51 }, # green
{r = 153, g = 153, b = 51 }, # olive
{r = 221, g = 204, b = 119 }, # sand
{r = 204, g = 102, b = 119 }, # rose
{r = 136, g = 34, b = 85 }, # wine
{r = 170, g = 68, b = 153 }, # purple
]
87 changes: 87 additions & 0 deletions crates/consistent-choose-k/benchmarks/performance.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
use std::{
hash::{DefaultHasher, Hash},
hint::black_box,
time::Duration,
};

use consistent_choose_k::{ConsistentChooseKHasher, ConsistentHasher};
use criterion::{
criterion_group, criterion_main, AxisScale, BenchmarkId, Criterion, PlotConfiguration,
Throughput,
};
use rand::{rng, RngExt};

/// Benchmarks hashing a fixed batch of 1000 random `u64` keys for a range of
/// node counts `n`, reported in the `"choose"` group with a logarithmic
/// summary axis.
///
/// For every `n` two kinds of series are recorded:
/// * `"1"`: `ConsistentHasher::new(hasher).prev(n + 1)` per key.
/// * `"k_{k}"`: `ConsistentChooseKHasher::new_with_k(hasher, n + k, k)` per key,
///   for each `k` in `[1, 2, 3, 10, 100]`.
///
/// Throughput is declared as the number of keys processed per iteration.
fn throughput_benchmark(c: &mut Criterion) {
    /// Creates a fresh `DefaultHasher` and feeds `key` into it.
    fn seeded_hasher(key: &u64) -> DefaultHasher {
        let mut hasher = DefaultHasher::default();
        key.hash(&mut hasher);
        hasher
    }

    let node_counts = [1usize, 10, 100, 1000, 10000];
    let choose_counts = [1, 2, 3, 10, 100];
    let keys: Vec<u64> = rng().random_iter().take(1000).collect();

    let mut group = c.benchmark_group("choose");
    let log_scale = PlotConfiguration::default().summary_scale(AxisScale::Logarithmic);
    group.plot_config(log_scale);

    for n in node_counts {
        // Each measured iteration walks the whole key batch.
        group.throughput(Throughput::Elements(keys.len() as u64));

        let single_id = BenchmarkId::new("1", n);
        group.bench_with_input(single_id, &n, |bencher, node_count| {
            bencher.iter_batched(
                || &keys,
                |batch| {
                    for key in batch {
                        let hasher = seeded_hasher(key);
                        black_box(ConsistentHasher::new(hasher).prev(*node_count + 1));
                    }
                },
                criterion::BatchSize::SmallInput,
            )
        });

        for k in choose_counts {
            let choose_id = BenchmarkId::new(format!("k_{k}"), n);
            group.bench_with_input(choose_id, &n, |bencher, node_count| {
                bencher.iter_batched(
                    || &keys,
                    |batch| {
                        for key in batch {
                            let hasher = seeded_hasher(key);
                            black_box(ConsistentChooseKHasher::new_with_k(
                                hasher,
                                *node_count + k,
                                k,
                            ));
                        }
                    },
                    criterion::BatchSize::SmallInput,
                )
            });
        }
    }

    group.finish();
}

/// Compares two ways of obtaining a choose-`k` hasher, for every combination of
/// `n` in `[10, 100, 1000, 10000]` and `k` in `[2, 3, 10, 100]`:
///
/// * `"new_with_k/k_{k}"`: a single `ConsistentChooseKHasher::new_with_k(h, n + k, k)` call.
/// * `"append/k_{k}"`: `ConsistentChooseKHasher::new(h, n + k)` followed by `k`
///   calls to `grow_k()`.
///
/// Both variants start from an unkeyed `DefaultHasher::default()`. Results go to
/// the `"append_vs_new_with_k"` group with a logarithmic summary axis.
fn append_vs_new_with_k(c: &mut Criterion) {
    let mut group = c.benchmark_group("append_vs_new_with_k");
    let log_scale = PlotConfiguration::default().summary_scale(AxisScale::Logarithmic);
    group.plot_config(log_scale);

    for n in [10usize, 100, 1000, 10000] {
        for k in [2, 3, 10, 100] {
            let direct_id = BenchmarkId::new(format!("new_with_k/k_{k}"), n);
            group.bench_function(direct_id, |bencher| {
                bencher.iter(|| {
                    let hasher = DefaultHasher::default();
                    black_box(ConsistentChooseKHasher::new_with_k(hasher, n + k, k));
                })
            });

            let append_id = BenchmarkId::new(format!("append/k_{k}"), n);
            group.bench_function(append_id, |bencher| {
                bencher.iter(|| {
                    let hasher = DefaultHasher::default();
                    let mut chooser = ConsistentChooseKHasher::new(hasher, n + k);
                    // Grow one step at a time, keeping every intermediate result observable.
                    (0..k).for_each(|_| {
                        black_box(chooser.grow_k());
                    });
                })
            });
        }
    }

    group.finish();
}

// Registers the benchmark group `benches` with a custom Criterion configuration:
// a 500 ms warm-up, a 4000 ms measurement window, and 1000 resamples.
// The group runs `throughput_benchmark` and `append_vs_new_with_k`.
criterion_group!(
name = benches;
config = Criterion::default()
.warm_up_time(Duration::from_millis(500))
.measurement_time(Duration::from_millis(4000))
.nresamples(1000);

targets = throughput_benchmark, append_vs_new_with_k,
);
// Hands the `benches` group to Criterion's `criterion_main!` macro, which drives
// the benchmarks in place of the default test harness.
criterion_main!(benches);
Loading