github · aneubeck · Mar 6, 2026 · Feb 25, 2026 · Feb 25, 2026 · Feb 25, 2026
@@ -203,7 +203,7 @@ We benchmarked the following scenarios:
   The data structure we built specifically for this purpose can answer those interval counting requests in typically constant times after the initial linear preprocessing of the text.
   This mode is not available in tiktoken, which only supports counting/encoding a complete text.
 
-All benchmarks were run single-threaded on a MacBook Pro M1.
+All benchmarks were run single-threaded on a MacBook Air M4.
 
 ### Encoding
 
@@ -219,6 +219,7 @@ Two additional encoders are included that are faster but deviate from the origin
 
 - The greedy encoder picks the left-longest token.
 - The minimal encoder computes an encoding with the minimal number of tokens.
+- The minimal_dropout encoder implements the [BPE-Dropout algorithm](https://arxiv.org/abs/1910.13267), randomly ignoring some multi-byte tokens at runtime.
 
 The benchmark measured the runtime of encoding of slices of lengths 10, 100, 1000, and 10000 from a random 20000 token original text using the o200k token set.
 (All encodings were computed from scratch for each slice.)

@@ -9,8 +9,17 @@ use bpe_benchmarks::*;
 use criterion::{
     criterion_group, criterion_main, AxisScale, BenchmarkId, Criterion, PlotConfiguration,
 };
+use rand::rngs::StdRng;
+use rand::SeedableRng;
 use rand::{rng, Rng};
 
+fn get_rng(seed: u64) -> StdRng {
+    // Zero-pad the little-endian bytes of `seed` to the 32-byte seed taken by `StdRng::from_seed`.
+    let mut padded = [0u8; 32];
+    padded.iter_mut().zip(seed.to_le_bytes()).for_each(|(dst, src)| *dst = src);
+    StdRng::from_seed(padded)
+}
+
 fn counting_benchmark(c: &mut Criterion) {
     for (name, bpe, _, _) in TOKENIZERS.iter() {
         let input = create_test_string(&bpe.bpe, 80_000);
@@ -92,6 +101,20 @@ fn encoding_benchmark(c: &mut Criterion) {
                     criterion::BatchSize::SmallInput,
                 )
             });
+            group.bench_with_input(
+                BenchmarkId::new("minimal_dropout", bytes),
+                &bytes,
+                |b, bytes| {
+                    b.iter_batched(
+                        || select_test_string(&text, *bytes),
+                        |text| {
+                            bpe.bpe
+                                .encode_minimal_dropout(text.as_bytes(), 0.1, get_rng(0))
+                        },
+                        criterion::BatchSize::SmallInput,
+                    )
+                },
+            );
             group.bench_with_input(
                 BenchmarkId::new("huggingface", bytes),
                 &bytes,

@@ -526,9 +526,9 @@ impl BytePairEncoding {
     /// tokenization produced by the original BPE algorithm.
     pub fn encode_minimal(&self, text: &[u8]) -> Vec<u32> {
         let mut last_token: Vec<(u32, u32)> = Vec::with_capacity(text.len());
-        let mut state = self.overlapping_searcher.start_state();
-        for (pos, c) in text.iter().enumerate() {
-            let (s, iter) = self.overlapping_searcher.consume(state, pos + 1, *c);
+        let mut state = self.overlapping_searcher_rev.start_state();
+        for (pos, c) in text.iter().rev().enumerate() {
+            let (s, iter) = self.overlapping_searcher_rev.consume(state, pos + 1, *c);
             state = s;
             let mut best = (0, u32::MAX);
             for m in iter {
@@ -548,7 +548,62 @@ impl BytePairEncoding {
             encoded.push(token);
             pos -= self.token_len(token);
         }
-        encoded.reverse();
+        encoded
+    }
+
+    /// Computes an encoding of `text` in which every candidate multi-byte token is rejected with probability `dropout`.
+    /// The result is non-deterministic unless a deterministically seeded `rng` is passed. Panics unless `0.0 <= dropout <= 1.0`.
+    /// The implementation loosely follows the original BPE-Dropout paper: <https://arxiv.org/abs/1910.13267>
+    ///
+    /// In more detail: the tokenization uses dynamic programming, i.e. it models the tokenization as a graph,
+    /// where every position between text bytes is a node and two nodes are connected when the text slice between those two nodes matches a token.
+    /// It then tries to find the shortest possible path from the beginning of the text till the end, i.e. it finds the shortest possible encoding.
+    /// For this, it walks over the reversed text, visits at each node all edges (token matches) reported by the searcher, and picks the edge which results in the shortest path.
+    /// The length of the shortest path is stored as second value, the edge (or rather token) is stored as first value.
+    ///
+    /// For the dropout (when dropout > 0.0), we uniformly drop edges from the graph, but always keep the one-byte tokens such that the graph stays connected.
+    /// Note: this is very different from how BPE works and cannot produce the same output as the algorithm
+    /// in the [paper's repository](https://github.com/VProv/BPE-Dropout/blob/master/bpe.py#L98), for three main reasons:
+    ///   - `encode_minimal` already doesn't follow the original heap-based BPE procedure
+    ///   - randomness source in dropout works differently in rust and python
+    ///   - BPE-dropout authors discard all multi-byte tokens for each word separately, while this implementation does not split the "sentence" into words first
+    ///     and hence may include previously discarded token later down the byte stream. At the sentence level though we don't expect it to make much difference.
+    #[cfg(feature = "rand")]
+    pub fn encode_minimal_dropout<R: rand::Rng>(
+        &self,
+        text: &[u8],
+        dropout: f32,
+        mut rng: R,
+    ) -> Vec<u32> {
+        assert!(0.0 <= dropout);
+        assert!(dropout <= 1.0);
+
+        let mut last_token: Vec<(u32, u32)> = Vec::with_capacity(text.len());
+        let mut state = self.overlapping_searcher_rev.start_state();
+        for (pos, c) in text.iter().rev().enumerate() {
+            let (s, iter) = self.overlapping_searcher_rev.consume(state, pos + 1, *c);
+            state = s;
+            let mut best = (0, u32::MAX);
+            for m in iter {
+                if m.end() > m.start() + 1 && rng.random::<f32>() < dropout { // strict `<`: a sampled 0.0 must not drop a token when dropout == 0.0
+                    continue;
+                }
+                if m.start() == 0 {
+                    best = (m.value(), 1);
+                    break;
+                } else if last_token[m.start() - 1].1 + 1 < best.1 {
+                    best = (m.value(), last_token[m.start() - 1].1 + 1);
+                }
+            }
+            last_token.push(best);
+        }
+        let mut encoded = Vec::with_capacity(last_token.last().map(|l| l.1 as usize).unwrap_or(0));
+        let mut pos = text.len();
+        while pos > 0 {
+            let token = last_token[pos - 1].0;
+            encoded.push(token);
+            pos -= self.token_len(token);
+        }
         encoded
     }
 }

diff --git a/crates/bpe/tests/Cargo.toml b/crates/bpe/tests/Cargo.toml
@@ -8,3 +8,6 @@ bpe-openai = { path = "../../bpe-openai" }
 itertools = "0.14"
 rand = "0.9"
 tiktoken-rs = "0.9"
+
+[dev-dependencies]
+rand_chacha = { version = "0.9" }
@@ -1,5 +1,7 @@
 #[cfg(test)]
 mod tests {
+    use std::time;
+
     use itertools::Itertools;
     use rand::{rng, Rng};
     use tiktoken_rs::cl100k_base_singleton;
@@ -141,4 +143,45 @@ mod tests {
             assert_eq!(enc.token_count(), bpe.count(&input[i..]));
         }
     }
+
+    #[test]
+    fn test_bpe_dropout() {
+        use rand::rngs::StdRng;
+        use rand::SeedableRng;
+
+        // Deterministic RNG: the u64 seed is zero-padded (little endian) to the 32-byte StdRng seed.
+        fn get_rng(seed: u64) -> StdRng {
+            let mut padded = [0u8; 32];
+            padded[..8].copy_from_slice(&seed.to_le_bytes());
+            StdRng::from_seed(padded)
+        }
+
+        let bpe = &cl100k_base().bpe;
+        for bytes in [10000, 20000] {
+            for _ in 0..8 {
+                let input = create_test_bytes(bpe, bytes);
+                // Baseline without dropout, then increasing dropout rates, each with its own fixed seed.
+                let minimal = bpe.encode_minimal(&input);
+                let light = bpe.encode_minimal_dropout(&input, 0.2, get_rng(0));
+                let heavy = bpe.encode_minimal_dropout(&input, 0.9, get_rng(1));
+                let full = bpe.encode_minimal_dropout(&input, 1.0, get_rng(2));
+                println!("Input length: {}, Encoded length: {}, Encoded with dropout length: {}-{}, max {}",
+                    input.len(), minimal.len(), light.len(), heavy.len(), full.len());
+                // Every encoding must decode back to the original bytes.
+                for tokens in [&minimal, &light, &heavy, &full] {
+                    assert_eq!(input, bpe.decode_tokens(tokens));
+                }
+
+                // With dropout 1.0 only one-byte tokens survive, i.e. one token per input byte.
+                assert_eq!(input.len(), full.len());
+                // Dropping tokens can never yield fewer tokens than the minimal encoding.
+                assert!(light.len() >= minimal.len());
+                assert!(heavy.len() > minimal.len());
+
+                assert_ne!(minimal, light);
+                assert_ne!(minimal, heavy);
+                assert_ne!(heavy, full);
+            }
+        }
+    }
 }