Add voyage3 model

aneubeck · aneubeck · commit 6566c1cee4a8 · 2025-05-05T14:04:34.000+02:00
diff --git a/crates/bpe-openai/Cargo.toml b/crates/bpe-openai/Cargo.toml
@@ -17,6 +17,7 @@ bpe = { version = "0.2.0", path = "../bpe" }
 either = "1.13"
 regex-automata = "0.4"
 rmp-serde = "1"
+unicode-normalization = "0.1.20"
 
 [dev-dependencies]
 bpe = { version = "0.2.0", path = "../bpe", features = ["rand"] }
diff --git a/crates/bpe-openai/build.rs b/crates/bpe-openai/build.rs
@@ -17,6 +17,11 @@ fn main() {
         include_bytes!("data/o200k_base.tiktoken.gz"),
         17846336922010275747,
     );
+    serialize_tiktoken_bpe(
+        "voyage3_base",
+        include_bytes!("data/voyage3_base.tiktoken.gz"),
+        17846336922010275747,
+    );
     println!("cargo::rerun-if-changed=build.rs");
 }
 
diff --git a/crates/bpe-openai/data/voyage3_base.tiktoken.gz b/crates/bpe-openai/data/voyage3_base.tiktoken.gz
diff --git a/crates/bpe-openai/src/lib.rs b/crates/bpe-openai/src/lib.rs
@@ -8,6 +8,11 @@ use regex_automata::{
     Anchored, Input,
 };
 
+pub mod normalizer;
+
+pub use bpe::*;
+pub use normalizer::{Normalizable, NormalizedString};
+
 // Note: Below we rewrite the negative look-ahead with a positive pseudo look-ahead.
 // The look-ahead character is dropped from the match by the Pretokenizer iterator.
 // Note: The negative look-ahead `\\s+(?!\\S)` requires `\\s+\\s` but also `\\s+$` to handle end of file without dropping a character!
@@ -18,7 +23,7 @@ static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
-    Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)])
+    Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)], false)
         .expect("valid regex")
 });
 
@@ -35,11 +40,19 @@ static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     ].join("|");
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
-    Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)])
+    Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)], false)
         .expect("valid regex")
 });
 
-pub use bpe::*;
+static BPE_VOYAGE3_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
+    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_voyage3_base.dict"));
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
+    let pat2 = "\\s+\\s"; // pseudo look-ahead: the trailing character is dropped from the match
+    let pat3 = "\\s+";
+    Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)], true)
+        .expect("valid regex")
+});
 
 /// A byte-pair encoding tokenizer that supports a pre-tokenization regex.
 /// The direct methods on this type pre-tokenize the input text and should
@@ -52,6 +65,8 @@ pub struct Tokenizer {
     pub bpe: BytePairEncoding,
     /// The pattern regex used to split the input.
     pub pre: Option<Pretokenizer>,
+    /// Indicates whether the input should be normalized with NFC.
+    nfc: bool,
 }
 
 pub struct Pretokenizer {
@@ -64,9 +79,9 @@ pub struct Pretokenizer {
 impl Tokenizer {
     /// Build a tokenizer with an optional pretokenization regex pattern.
     #[allow(clippy::result_large_err)]
-    pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> Result<Self, BuildError> {
+    pub fn new(bpe: BytePairEncoding, pat: Option<&str>, nfc: bool) -> Result<Self, BuildError> {
         let pre = pat.map(Pretokenizer::new).transpose()?;
-        Ok(Self { bpe, pre })
+        Ok(Self { nfc, bpe, pre })
     }
 
     /// Build a tokenizer with pretokenization regex patterns. If the boolean for a pattern is true,
@@ -75,34 +90,41 @@ impl Tokenizer {
     pub fn new_lookahead(
         bpe: BytePairEncoding,
         patterns: &[(&str, bool)],
+        nfc: bool,
     ) -> Result<Self, BuildError> {
         let pre = Some(Pretokenizer::new_lookahead(patterns)?);
-        Ok(Self { bpe, pre })
+        Ok(Self { nfc, bpe, pre })
     }
 
     /// Count the number of tokens produced when encoding the text. Applies pre-tokenization
     /// before counting.
-    pub fn count(&self, text: &str) -> usize {
-        self.split(text)
+    pub fn count<'a, I: Normalizable<'a>>(&self, text: I) -> usize {
+        let text = self.normalize(text);
+        self.split(text.as_str())
             .map(|piece| self.bpe.count(piece.as_bytes()))
             .sum()
     }
 
     /// Returns the token count iff the total token count stays below the specified token_limit.
     /// Otherwise, it returns none. This function can be faster than [`Self::count`]` when the
     /// token limit is much smaller than the provided text. Applies pre-tokenization before counting.
-    pub fn count_till_limit(&self, text: &str, token_limit: usize) -> Option<usize> {
-        self.split(text).try_fold(0, |consumed, piece| {
+    ///
+    /// Note: This function assumes that the text is already normalized, so that this function can run
+    /// in roughly O(token_limit) time.
+    pub fn count_till_limit(&self, text: &NormalizedString, token_limit: usize) -> Option<usize> {
+        let res: Option<usize> = self.split(text.as_str()).try_fold(0, |consumed, piece| {
             self.bpe
                 .count_till_limit(piece.as_bytes(), token_limit - consumed)
                 .map(|piece_count| consumed + piece_count)
-        })
+        });
+        res
     }
 
     /// Returns the tokens for the encoding of the given text. Applies pre-tokenization before
     /// encoding.
-    pub fn encode(&self, text: &str) -> Vec<u32> {
-        self.split(text)
+    pub fn encode<'a, I: Normalizable<'a>>(&self, text: I) -> Vec<u32> {
+        let text: NormalizedString<'_> = self.normalize(text);
+        self.split(text.as_str())
             .flat_map(|piece| self.bpe.encode_via_backtracking(piece.as_bytes()))
             .collect()
     }
@@ -114,12 +136,18 @@ impl Tokenizer {
 
     /// Returns an iterator with the text pieces resulting from pre-tokenization. If this
     /// tokenizer does not have pre-tokenization, the iterator returns the full text.
-    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> + 'a {
+    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> {
         match &self.pre {
             Some(pre) => Either::Left(pre.split(text)),
             None => Either::Right(std::iter::once(text)),
         }
     }
+
+    /// Returns the normalized text if the tokenizer requires normalization.
+    /// If the input was already normalized, this function is a noop.
+    pub fn normalize<'a, I: Normalizable<'a>>(&self, text: I) -> NormalizedString<'a> {
+        text.normalize(self.nfc)
+    }
 }
 
 impl Pretokenizer {
@@ -143,7 +171,7 @@ impl Pretokenizer {
     }
 
     /// Returns an iterator with the text pieces after splitting with the regular expression.
-    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> + 'a {
+    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> {
         Splits {
             pat: &self.pat,
             lookahead: &self.lookahead,
@@ -201,6 +229,10 @@ pub fn o200k_base() -> &'static Tokenizer {
     &BPE_O200K_BASE
 }
 
+pub fn voyage3_base() -> &'static Tokenizer {
+    &BPE_VOYAGE3_BASE
+}
+
 #[cfg(test)]
 mod tests {
     use bpe::byte_pair_encoding::{create_test_string, select_test_string};
@@ -233,9 +265,21 @@ mod tests {
 
     #[test]
     fn test_count_till_limit() {
-        assert_eq!(cl100k_base().count_till_limit("abc", 3), Some(1));
-        assert_eq!(cl100k_base().count_till_limit("abcabc", 3), Some(2));
-        assert_eq!(cl100k_base().count_till_limit("abcabcabc", 3), Some(3));
-        assert_eq!(cl100k_base().count_till_limit("abcabcabcabc", 3), None);
+        assert_eq!(
+            cl100k_base().count_till_limit(&cl100k_base().normalize("abc"), 3),
+            Some(1)
+        );
+        assert_eq!(
+            cl100k_base().count_till_limit(&cl100k_base().normalize("abcabc"), 3),
+            Some(2)
+        );
+        assert_eq!(
+            cl100k_base().count_till_limit(&cl100k_base().normalize("abcabcabc"), 3),
+            Some(3)
+        );
+        assert_eq!(
+            cl100k_base().count_till_limit(&cl100k_base().normalize("abcabcabcabc"), 3),
+            None
+        );
     }
 }
diff --git a/crates/bpe-openai/src/normalizer.rs b/crates/bpe-openai/src/normalizer.rs
@@ -0,0 +1,56 @@
+use std::borrow::Cow;
+
+use unicode_normalization::UnicodeNormalization;
+
+/// Type which represents a normalized string.
+/// This is to avoid calling normalize multiple times or forgetting to call normalization!
+///
+/// TODO: Annotate the type with the normalization type, once there is more than one.
+pub struct NormalizedString<'a>(Cow<'a, str>);
+
+impl<'a> NormalizedString<'a> {
+    /// Returns the normalized inner str buffer.
+    pub fn as_str(&self) -> &str {
+        &self.0
+    }
+
+    /// This function is unsafe, since the caller must ensure that the correct normalization
+    /// was used. The normalization may vary by tokenizer. This is mostly a backdoor which
+    /// might be handy for certain optimizations or for testing.
+    pub unsafe fn from_str(s: &'a str) -> NormalizedString<'a> {
+        // NOTE: No memory-unsafe operation happens here; `unsafe` only flags the caller's
+        // obligation above. The input is wrapped as-is without being checked or normalized.
+        NormalizedString(Cow::Borrowed(s))
+    }
+}
+
+/// Helper trait which converts string types into NormalizedString.
+/// Calling normalize on a NormalizedString is a no-op.
+pub trait Normalizable<'a> {
+    fn normalize(self, nfc: bool) -> NormalizedString<'a>;
+}
+
+impl<'a> Normalizable<'a> for &'a str {
+    fn normalize(self, nfc: bool) -> NormalizedString<'a> {
+        if nfc {
+            NormalizedString(self.nfc().collect())
+        } else {
+            NormalizedString(Cow::Borrowed(self))
+        }
+    }
+}
+
+impl<'a, T> Normalizable<'a> for &'a T
+where
+    T: AsRef<str>,
+{
+    fn normalize(self, nfc: bool) -> NormalizedString<'a> {
+        self.as_ref().normalize(nfc)
+    }
+}
+
+impl<'a> Normalizable<'a> for NormalizedString<'a> {
+    fn normalize(self, _: bool) -> NormalizedString<'a> {
+        self
+    }
+}
diff --git a/crates/bpe/benchmarks/Cargo.toml b/crates/bpe/benchmarks/Cargo.toml
@@ -18,7 +18,7 @@ path = "equivalence.rs"
 test = true
 
 [dependencies]
-bpe = { path = "../../bpe" }
+bpe = { path = "../../bpe", features = ["rand", "tiktoken"] }
 bpe-openai = { path = "../../bpe-openai" }
 criterion = "0.5"
 rand = "0.9"
diff --git a/crates/bpe/benchmarks/equivalence.rs b/crates/bpe/benchmarks/equivalence.rs
@@ -1,19 +1,72 @@
+use core::panic;
+use std::collections::HashSet;
+
 use bpe::byte_pair_encoding::{create_test_string, select_test_string};
 use bpe_benchmarks::*;
 
-#[cfg(test)]
-const N: usize = 32;
+/// Converts a unicode character back to the byte it encodes (inverse of the bytes-to-unicode map).
+/// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L9
+/// Hugging face uses the same mapping to work with unicode instead of byte characters.
+fn char_to_byte(c: char) -> u8 {
+    match c as u32 {
+        0x21..0x7f => c as u8,  // 94 characters which map to themselves
+        0xa1..=0xac => c as u8, // 12 characters which map to themselves
+        0xae..=0xff => c as u8, // 82 characters which map to themselves
+        0x7f..0xa1 => c as u8 - 0x7f + 221, // NOTE(review): yields 221..=254, overlapping the identity range above — confirm intended
+        0x100..0x121 => (c as u32 - 0x100) as u8, // yields bytes 0x00..=0x20
+        0x121..0x143 => (c as u32 - 0x121) as u8 + 0x7f, // yields bytes 0x7f..=0xa0
+        0x143..0x144 => 0xad,
+        _ => panic!("Invalid character: {c} {}", c as u32), // character has no arm above
+    }
+}
 
 #[test]
-fn test_huggingface_encoding_equivalence_without_pretokenization() {
-    for (_, bpe, _, huggingface) in TOKENIZERS.iter() {
+fn test_compare_dictionary() {
+    for (name, bpe, _, huggingface) in TOKENIZERS.iter() {
         let huggingface = without_pretokenizer(huggingface);
-        let text = create_test_string(&bpe.bpe, 80_000);
-        let texts = (0..N)
-            .map(|_| select_test_string(&text, 100))
+        let mut hugging_tokens = huggingface.get_vocab(false);
+        // HACK: There are incorrect vocabularies in huggingface which have the added tokens stored together with the base tokens.
+        // This is a workaround to remove them.
+        for (added_token, _) in huggingface.get_added_vocabulary().get_vocab() {
+            hugging_tokens.remove(added_token);
+        }
+        let mut hugging_tokens: Vec<_> = hugging_tokens.into_iter().collect();
+        hugging_tokens.sort_by(|(_, a), (_, b)| a.cmp(b));
+        let hugging_tokens: Vec<_> = hugging_tokens
+            .into_iter()
+            .map(|(token, _)| token.chars().map(char_to_byte).collect())
+            .collect();
+        let bpe_tokens: Vec<_> = (0..bpe.bpe.num_tokens())
+            .map(|id| bpe.bpe.token_bytes(id as u32).to_vec())
+            .collect();
+        let hugging_set: HashSet<_> = hugging_tokens.iter().cloned().collect();
+        let bpe_set: HashSet<_> = bpe_tokens.iter().cloned().collect();
+        let diff: Vec<_> = hugging_set.symmetric_difference(&bpe_set).collect();
+        assert!(diff.is_empty(), "{name}: Token sets differ");
+        // Uncomment the following lines to write the tokens to a file in tiktoken format
+        /*
+        let mut file =
+            std::fs::File::create(std::path::Path::new(name)).expect("can create output file");
+        std::io::Write::write_all(
+            &mut file,
+            bpe::byte_pair_encoding::write_tiktoken(hugging_tokens).as_bytes(),
+        )
+        .expect("can write output to file");
+        */
+    }
+}
+
+#[test]
+fn test_huggingface_encoding_equivalence_without_pretokenization() {
+    for (name, bpe, _, huggingface) in TOKENIZERS.iter() {
+        let text: String = create_test_string(&bpe.bpe, 200_000);
+        let text = bpe.normalize(&text);
+        let texts = (0..300)
+            .map(|_| select_test_string(text.as_str(), 100))
             .chain(std::iter::once(
                 "You should see the Greek word 'kosme':       \"κόσμε\"",
             ));
+        let huggingface = without_pretokenizer(huggingface);
         for text in texts {
             let out = bpe.bpe.encode_via_backtracking(text.as_bytes());
             let huggingface_out = huggingface
@@ -26,14 +79,10 @@ fn test_huggingface_encoding_equivalence_without_pretokenization() {
                 let huggingface_text = huggingface.decode(&huggingface_out, true).unwrap();
                 if huggingface_text != text {
                     panic!(
-                        "huggingface tokens and text differ: {:?} != {:?}",
-                        text, huggingface_text
+                        "{name}: huggingface tokens and text differ: {text:?} != {huggingface_text:?}",
                     );
                 } else {
-                    panic!(
-                        "huggingface tokens differ: {:?} != {:?}",
-                        out, huggingface_out
-                    );
+                    panic!("{name}: huggingface tokens differ: {out:?} != {huggingface_out:?}");
                 }
             }
         }
@@ -42,9 +91,9 @@ fn test_huggingface_encoding_equivalence_without_pretokenization() {
 
 #[test]
 fn test_huggingface_encoding_equivalence_with_pretokenization() {
-    for (_, bpe, _, huggingface) in TOKENIZERS.iter() {
-        let text = create_test_string(&bpe.bpe, 80_000);
-        let texts = (0..N)
+    for (name, bpe, _, huggingface) in TOKENIZERS.iter() {
+        let text = create_test_string(&bpe.bpe, 200_000);
+        let texts = (0..300)
             .map(|_| select_test_string(&text, 100))
             .chain(std::iter::once(
                 "You should see the Greek word 'kosme':       \"κόσμε\"   ",
@@ -62,14 +111,10 @@ fn test_huggingface_encoding_equivalence_with_pretokenization() {
                 let huggingface_text = huggingface.decode(&huggingface_out, true).unwrap();
                 if huggingface_text != text {
                     panic!(
-                        "huggingface tokens and text differ: {:?} != {:?}",
-                        text, huggingface_text
+                        "{name}: huggingface tokens and text differ: {text:?} != {huggingface_text:?}",
                     );
                 } else {
-                    panic!(
-                        "huggingface tokens differ: {:?} != {:?}",
-                        out, huggingface_out
-                    );
+                    panic!("{name}: huggingface tokens differ: {out:?} != {huggingface_out:?}");
                 }
             }
         }
diff --git a/crates/bpe/benchmarks/lib.rs b/crates/bpe/benchmarks/lib.rs
diff --git a/crates/bpe/benchmarks/performance.rs b/crates/bpe/benchmarks/performance.rs