core implementation

aneubeck · aneubeck · commit 0d8345310f9b · 2025-08-11T14:50:01.000+02:00
diff --git a/crates/consistent-hashing/Cargo.toml b/crates/consistent-hashing/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "consistent-hashing"
+version = "0.1.0"
+edition = "2021"
+description = "Constant time consistent hashing algorithms."
+repository = "https://github.com/github/rust-gems"
+license = "MIT"
+keywords = ["probabilistic", "algorithm", "consistent-hashing", "jump-hashing", "rendezvous-hashing"]
+categories = ["algorithms", "data-structures", "mathematics", "science"]
+
+[lib]
+crate-type = ["lib", "staticlib"]
+bench = false
+
+[dependencies]
+
+[dev-dependencies]
diff --git a/crates/consistent-hashing/README.md b/crates/consistent-hashing/README.md
@@ -0,0 +1,60 @@
+# Consistent Hashing
+
+Consistent hashing maps keys to a changing set of nodes (shards, servers) so that when nodes join or leave, only a small fraction of keys move. It is used in distributed caches, databases, object stores, and load balancers to achieve scalability and high availability with minimal data reshuffling.
+
+Common algorithms
+- [Consistent hashing](https://en.wikipedia.org/wiki/Consistent_hashing) (hash ring with virtual nodes)
+- [Rendezvous hashing](https://en.wikipedia.org/wiki/Rendezvous_hashing)
+- [Jump consistent hash](https://en.wikipedia.org/wiki/Jump_consistent_hash)
+- [Maglev hashing](https://research.google/pubs/pub44824) 
+- [AnchorHash: A Scalable Consistent Hash](https://arxiv.org/abs/1812.09674)
+- [DXHash](https://arxiv.org/abs/2107.07930)
+- [JumpBackHash](https://arxiv.org/abs/2403.18682)
+
+## Complexity summary
+
+In the table below, `N` is the number of nodes and `R` is the number of replicas.
+
+| Algorithm               | Lookup per key       | Node add/remove                        | Memory                    | Replication support                              |
+|-------------------------|----------------------|----------------------------------------|---------------------------|--------------------------------------------------|
+| Hash ring (with vnodes) | O(log N) binary search over N points; O(1) with specialized structures | O(log N) to insert/remove points         | O(N) points               | Yes: take next R distinct successors; O(log N + R) |
+| Rendezvous              | O(N) score per node; top-1 | O(1) (no state to rebalance)     | O(N) node list            | Yes: pick top R scores; O(N log R) |
+| Jump consistent hash    | O(log(N))            | O(1)                                   | O(1)                      | Not native               |
+| AnchorHash              | O(1) expected        | O(1) expected/amortized                | O(N)                      | Not native               |
+| DXHash                  | O(1) expected        | O(1) expected                          | O(N)                      | Not native               |
+| JumpBackHash            | O(1)                 | O(1) expected                          | O(1)                      | Not native               |
+
+Replication of keys
+- Hash ring: replicate by walking clockwise to the next R distinct nodes. Virtual nodes help spread replicas evenly and avoid hotspots.
+- Rendezvous hashing: replicate by selecting the top R nodes by score for the key. This naturally yields R distinct owners and supports weights.
+- Jump consistent hash: the base function returns one bucket. Replication can be achieved by hashing (key, replica_index) and collecting R distinct buckets; this is simple but lacks the single-pass global ranking that rendezvous hashing (also known as highest random weight, HRW) provides.
+
+Why replication matters
+- Tolerates node failures and maintenance without data unavailability.
+- Distributes read/write load across multiple owners, reducing hotspots.
+- Enables fast recovery and higher tail-latency resilience.
+
+## N-Choose-R replication
+
+We define consistent `n-choose-k` replication (`k` plays the role of the replica count `R` above) as follows:
+
+1. for a given number `n` of nodes, choose `k` distinct nodes `S`.
+2. for a given `key` the chosen set of nodes must be uniformly chosen from all possible sets of size `k`.
+3. when `n` increases by one, the chosen set changes with probability `k/(n+1)`, and in that case exactly one of its nodes is replaced.
+
+For simplicity, nodes are represented by integers `0..n`.
+Given `k` independent consistent hash functions `h_i(n)` for a given key, the following algorithm will have the desired properties:
+
+```
+fn consistent_choose_k<Key>(key: Key, k: usize, n: usize) -> Vec<usize> { // nodes come out largest first
+    (0..k).rev().scan(n, |n, i| { *n = consistent_choose_next(key, i, *n); Some(*n) }).collect()
+}
+// Largest node below `n` selected by the hash functions `0..=i`; function `j` only sees `n - j` nodes.
+fn consistent_choose_next<Key>(key: Key, i: usize, n: usize) -> usize {
+    (0..=i).map(|j| consistent_hash(key, j, n - j) + j).max().unwrap()
+}
+
+fn consistent_hash<Key>(key: Key, k: usize, n: usize) -> usize {
+    // compute the k-th independent consistent hash for `key` and `n` nodes.
+}
+```
diff --git a/crates/consistent-hashing/src/lib.rs b/crates/consistent-hashing/src/lib.rs
@@ -0,0 +1,292 @@
+use std::hash::{DefaultHasher, Hash, Hasher};
+
+/// Building block of the consistent hashing algorithm: an iterator which enumerates, in
+/// decreasing order, the consistent hashes of one key that fall below `n` within one bucket.
+/// A bucket covers the range `bit..2 * bit`; the default value (`bit == 0`) yields nothing.
+#[derive(Default)]
+struct BucketIterator {
+    hasher: DefaultHasher, // seeded with the key and `bit`; advanced before every further draw
+    n: usize,              // exclusive upper bound; lowered to each value that is returned
+    is_first: bool,        // cleared once the initial candidate no longer fits below `n`
+    bit: u64,              // lower end of the bucket (callers pass a power of two); 0 = empty
+}
+
+impl BucketIterator {
+    fn new(key: u64, n: usize, bit: u64) -> Self { // hashes of `key` in bucket `bit` that are < `n`
+        let mut hasher = DefaultHasher::new();
+        key.hash(&mut hasher); // the hash sequence depends on the key ...
+        bit.hash(&mut hasher); // ... and on the bucket, so every bucket gets its own sequence
+        Self {
+            hasher,
+            n,
+            is_first: true,
+            bit,
+        }
+    }
+}
+
+impl Iterator for BucketIterator {
+    type Item = usize;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.bit == 0 { // the `Default` instance has no bucket and yields nothing
+            return None;
+        }
+        if self.is_first {
+            let res = self.hasher.finish() % self.bit + self.bit; // candidate in `bit..2 * bit`
+            if res < self.n as u64 {
+                self.n = res as usize; // later values must be strictly smaller
+                return Some(self.n);
+            }
+            self.is_first = false; // also reached on the call after the candidate was returned
+        }
+        loop {
+            478392.hash(&mut self.hasher); // advance the hasher state to get a fresh value
+            let res = self.hasher.finish() % (self.bit * 2); // value in `0..2 * bit`
+            if res & self.bit == 0 { // bucket bit not set: the sequence of this bucket ends
+                return None;
+            }
+            if res < self.n as u64 { // otherwise retry until the value is below the bound
+                self.n = res as usize;
+                return Some(self.n);
+            }
+        }
+    }
+}
+
+/// An iterator which enumerates all the consistent hashes for a given key
+/// from largest to smallest in the range `0..n`.
+pub struct ConsistentHashRevIterator {
+    bits: u64,             // buckets still to visit, one bit per bucket
+    key: u64,
+    n: usize,              // exclusive upper bound; set to 0 once the iterator is finished
+    inner: BucketIterator, // iterator over the bucket currently being visited
+}
+
+impl ConsistentHashRevIterator {
+    pub fn new(key: u64, n: usize) -> Self { // hashes of `key` in `0..n`, largest first
+        let mut hasher = DefaultHasher::new();
+        key.hash(&mut hasher);
+        let bits = hasher.finish() % n.next_power_of_two() as u64; // one bit per candidate bucket
+        let inner = BucketIterator::default(); // empty until the first bucket is opened
+        Self {
+            bits,
+            key,
+            n,
+            inner,
+        }
+    }
+}
+
+impl Iterator for ConsistentHashRevIterator {
+    type Item = usize;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.n == 0 { // empty range, or the final hash 0 was already returned
+            return None;
+        }
+        if let Some(res) = self.inner.next() { // continue within the current bucket
+            return Some(res);
+        }
+        while self.bits > 0 {
+            let bit = 1 << self.bits.ilog2(); // highest remaining bucket
+            self.bits ^= bit; // mark it as visited
+            self.inner = BucketIterator::new(self.key, self.n, bit);
+            if let Some(res) = self.inner.next() {
+                return Some(res);
+            }
+        }
+        self.n = 0; // all buckets are exhausted: emit 0 once and finish
+        Some(self.n)
+    }
+}
+
+/// Same as `ConsistentHashRevIterator`, but iterates from smallest to largest
+/// for the range `n..`.
+pub struct ConsistentHashIterator {
+    bits: u64,         // buckets still to visit, one bit per bucket
+    key: u64,
+    n: usize,          // inclusive lower bound of the returned hashes
+    stack: Vec<usize>, // hashes of the current bucket, smallest on top
+}
+
+impl ConsistentHashIterator {
+    pub fn new(key: u64, n: usize) -> Self { // iterates the hashes of `key` in `n..`, ascending
+        let mut hasher = DefaultHasher::new();
+        key.hash(&mut hasher);
+        // Drop buckets entirely below `n`, and bucket `1 << 63` whose end `1 << 64` would overflow.
+        let skipped = ((n + 1).next_power_of_two() as u64 / 2).saturating_sub(1) | (1 << 63);
+        let stack = if n == 0 { vec![0] } else { vec![] }; // 0 is a hash of every key
+        Self {
+            bits: hasher.finish() & !skipped,
+            key,
+            n,
+            stack,
+        }
+    }
+}
+
+impl Iterator for ConsistentHashIterator {
+    type Item = usize;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if let Some(res) = self.stack.pop() { // pending hashes of the current bucket
+            return Some(res);
+        }
+        while self.bits > 0 {
+            let bit = self.bits & !(self.bits - 1); // lowest remaining bucket
+            self.bits &= self.bits - 1; // mark it as visited
+            let inner = BucketIterator::new(self.key, bit as usize * 2, bit); // needs bit < 1 << 63
+            self.stack = inner.take_while(|x| *x >= self.n).collect(); // decreasing, cut below `n`
+            if let Some(res) = self.stack.pop() {
+                return Some(res);
+            }
+        }
+        None
+    }
+}
+
+/// Wrapper around `ConsistentHashIterator` and `ConsistentHashRevIterator` to compute
+/// the next or previous consistent hash for a given key for a given number of nodes `n`.
+pub struct ConsistentHasher {
+    key: u64, // key whose hash sequence is queried
+}
+
+impl ConsistentHasher {
+    pub fn new(key: u64) -> Self {
+        Self { key }
+    }
+
+    pub fn prev(&self, n: usize) -> usize { // largest hash in `0..n`; panics if `n == 0`
+        let mut sampler = ConsistentHashRevIterator::new(self.key, n);
+        sampler.next().expect("n must be > 0!")
+    }
+
+    pub fn next(&self, n: usize) -> usize { // first hash the forward iterator yields for `n`
+        let mut sampler = ConsistentHashIterator::new(self.key, n);
+        sampler.next().expect("Exceeded iterator bounds :(")
+    }
+}
+
+/// Implementation of a consistent choose k hashing algorithm.
+/// It returns k distinct consistent hashes in the range `0..n`.
+/// The hashes are consistent when `n` changes and when `k` changes!
+/// I.e. on average exactly `1/(n+1)` (resp. `1/(k+1)`) many hashes will change
+/// when `n` (resp. `k`) increases by one. Additionally, the returned `k` tuple
+/// is guaranteed to be uniformly chosen from all possible `n-choose-k` tuples.
+pub struct ConsistentChooseKHasher {
+    key: u64, // key for which the nodes are chosen
+    k: usize, // number of distinct nodes to choose
+}
+
+impl ConsistentChooseKHasher {
+    pub fn new(key: u64, k: usize) -> Self { // `k` = number of distinct nodes chosen per key
+        Self { key, k }
+    }
+
+    /// Returns the `k` nodes chosen out of `0..n`, ascending. Panics if `n < k`. TODO: iterator!
+    pub fn prev(&self, mut n: usize) -> Vec<usize> {
+        assert!(n >= self.k, "cannot choose k distinct nodes out of n < k nodes");
+        let mut samples = Vec::with_capacity(self.k);
+        let mut samplers: Vec<_> = (0..self.k) // one independent hash sequence per slot
+            .map(|i| (self.key.wrapping_add(43987492u64.wrapping_mul(i as u64)), n - i)) // no overflow
+            .map(|(key, n)| ConsistentHashRevIterator::new(key, n).peekable())
+            .collect();
+        for i in (0..self.k).rev() {
+            let mut max = 0;
+            for (j, sampler) in samplers[..=i].iter_mut().enumerate() {
+                while sampler.next_if(|&hash| hash >= n - j).is_some() {} // skip hashes >= `n - j`
+                max = max.max(sampler.peek().expect("sequence ends with 0") + j);
+            }
+            samples.push(max);
+            n = max; // the remaining nodes must be strictly smaller
+        }
+        samples.sort_unstable();
+        samples
+    }
+}
+
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_uniform_1() {
+        for k in 0..100 {
+            let sampler = ConsistentHasher::new(k);
+            for n in 0..1000 {
+                assert!(sampler.prev(n + 1) <= sampler.prev(n + 2)); // hash never decreases with n
+                let next = sampler.next(n);
+                assert_eq!(next, sampler.prev(next + 1)); // `next` and `prev` agree on the hashes
+            }
+            let mut iter_rev: Vec<_> = ConsistentHashIterator::new(k, 0)
+                .take_while(|x| *x < 1000)
+                .collect();
+            iter_rev.reverse();
+            let iter: Vec<_> = ConsistentHashRevIterator::new(k, 1000).collect();
+            assert_eq!(iter, iter_rev); // both iterators enumerate the same hashes
+        }
+        let mut stats = vec![0; 13];
+        for i in 0..100000 {
+            let sampler = ConsistentHasher::new(i);
+            let x = sampler.prev(stats.len());
+            stats[x] += 1;
+        }
+        println!("{stats:?}"); // distribution over the 13 nodes; printed only, not asserted
+    }
+
+    #[test]
+    fn test_uniform_k() {
+        const K: usize = 3;
+        for k in 0..100 {
+            let sampler = ConsistentChooseKHasher::new(k, K);
+            for n in K..1000 {
+                let samples = sampler.prev(n + 1);
+                assert!(samples.len() == K);
+                for i in 0..K - 1 {
+                    assert!(samples[i] < samples[i + 1]); // sorted and distinct
+                }
+                let next = sampler.prev(n + 2);
+                for i in 0..K {
+                    assert!(samples[i] <= next[i]); // nodes never decrease when n grows
+                }
+                let mut merged = samples.clone();
+                merged.extend(next.clone());
+                merged.sort();
+                merged.dedup();
+                assert!(
+                    merged.len() == K || merged.len() == K + 1, // at most one node was replaced
+                    "Unexpected {samples:?} vs. {next:?}"
+                );
+            }
+        }
+        let mut stats = vec![0; 8];
+        for i in 0..32 {
+            let sampler = ConsistentChooseKHasher::new(i + 32783, 2);
+            let samples = sampler.prev(stats.len());
+            for s in samples {
+                stats[s] += 1;
+            }
+        }
+        println!("{stats:?}"); // distribution over the 8 nodes; printed only, not asserted
+        // Test consistency when increasing k!
+        for k in 1..10 {
+            for n in k + 1..20 {
+                for key in 0..1000 {
+                    let sampler1 = ConsistentChooseKHasher::new(key, k);
+                    let sampler2 = ConsistentChooseKHasher::new(key, k + 1);
+                    let set1 = sampler1.prev(n);
+                    let set2 = sampler2.prev(n);
+                    assert_eq!(set1.len(), k);
+                    assert_eq!(set2.len(), k + 1);
+                    let mut merged = set1.clone();
+                    merged.extend(set2);
+                    merged.sort();
+                    merged.dedup();
+                    assert_eq!(merged.len(), k + 1); // the k-set is contained in the (k + 1)-set
+                }
+            }
+        }
+    }
+}