Skip to content

Commit 6566c1c

Browse files
committed
Add voyage3 model
1 parent 0adc995 commit 6566c1c

9 files changed

Lines changed: 221 additions & 60 deletions

File tree

crates/bpe-openai/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ bpe = { version = "0.2.0", path = "../bpe" }
1717
either = "1.13"
1818
regex-automata = "0.4"
1919
rmp-serde = "1"
20+
unicode-normalization = "0.1.20"
2021

2122
[dev-dependencies]
2223
bpe = { version = "0.2.0", path = "../bpe", features = ["rand"] }

crates/bpe-openai/build.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@ fn main() {
1717
include_bytes!("data/o200k_base.tiktoken.gz"),
1818
17846336922010275747,
1919
);
20+
serialize_tiktoken_bpe(
21+
"voyage3_base",
22+
include_bytes!("data/voyage3_base.tiktoken.gz"),
23+
17846336922010275747,
24+
);
2025
println!("cargo::rerun-if-changed=build.rs");
2126
}
2227

1.08 MB
Binary file not shown.

crates/bpe-openai/src/lib.rs

Lines changed: 63 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@ use regex_automata::{
88
Anchored, Input,
99
};
1010

11+
pub mod normalizer;
12+
13+
pub use bpe::*;
14+
pub use normalizer::{Normalizable, NormalizedString};
15+
1116
// Note: Below we rewrite the negative look-ahead with a positive pseudo look-ahead.
1217
// The look-ahead character is dropped from the match by the Pretokenizer iterator.
1318
// Note: The negative look-ahead `\\s+(?!\\S)` requires `\\s+\\s` but also `\\s+$` to handle end of file without dropping a character!
@@ -18,7 +23,7 @@ static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
1823
let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
1924
let pat2 = "\\s+\\s";
2025
let pat3 = "\\s+";
21-
Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)])
26+
Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)], false)
2227
.expect("valid regex")
2328
});
2429

@@ -35,11 +40,19 @@ static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
3540
].join("|");
3641
let pat2 = "\\s+\\s";
3742
let pat3 = "\\s+";
38-
Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)])
43+
Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)], false)
3944
.expect("valid regex")
4045
});
4146

42-
pub use bpe::*;
47+
static BPE_VOYAGE3_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
48+
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_voyage3_base.dict"));
49+
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
50+
let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
51+
let pat2 = "\\s+\\s";
52+
let pat3 = "\\s+";
53+
Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)], true)
54+
.expect("valid regex")
55+
});
4356

4457
/// A byte-pair encoding tokenizer that supports a pre-tokenization regex.
4558
/// The direct methods on this type pre-tokenize the input text and should
@@ -52,6 +65,8 @@ pub struct Tokenizer {
5265
pub bpe: BytePairEncoding,
5366
/// The pattern regex used to split the input.
5467
pub pre: Option<Pretokenizer>,
68+
/// Indicates whether the input should be normalized with NFC.
69+
nfc: bool,
5570
}
5671

5772
pub struct Pretokenizer {
@@ -64,9 +79,9 @@ pub struct Pretokenizer {
6479
impl Tokenizer {
6580
/// Build a tokenizer with an optional pretokenization regex pattern.
6681
#[allow(clippy::result_large_err)]
67-
pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> Result<Self, BuildError> {
82+
pub fn new(bpe: BytePairEncoding, pat: Option<&str>, nfc: bool) -> Result<Self, BuildError> {
6883
let pre = pat.map(Pretokenizer::new).transpose()?;
69-
Ok(Self { bpe, pre })
84+
Ok(Self { nfc, bpe, pre })
7085
}
7186

7287
/// Build a tokenizer with pretokenization regex patterns. If the boolean for a pattern is true,
@@ -75,34 +90,41 @@ impl Tokenizer {
7590
pub fn new_lookahead(
7691
bpe: BytePairEncoding,
7792
patterns: &[(&str, bool)],
93+
nfc: bool,
7894
) -> Result<Self, BuildError> {
7995
let pre = Some(Pretokenizer::new_lookahead(patterns)?);
80-
Ok(Self { bpe, pre })
96+
Ok(Self { nfc, bpe, pre })
8197
}
8298

8399
/// Count the number of tokens produced when encoding the text. Applies pre-tokenization
84100
/// before counting.
85-
pub fn count(&self, text: &str) -> usize {
86-
self.split(text)
101+
pub fn count<'a, I: Normalizable<'a>>(&self, text: I) -> usize {
102+
let text = self.normalize(text);
103+
self.split(text.as_str())
87104
.map(|piece| self.bpe.count(piece.as_bytes()))
88105
.sum()
89106
}
90107

91108
/// Returns the token count iff the total token count stays below the specified token_limit.
92109
/// Otherwise, it returns `None`. This function can be faster than [`Self::count`] when the
93110
/// token limit is much smaller than the provided text. Applies pre-tokenization before counting.
94-
pub fn count_till_limit(&self, text: &str, token_limit: usize) -> Option<usize> {
95-
self.split(text).try_fold(0, |consumed, piece| {
111+
///
112+
/// Note: This function assumes that the text is already normalized, so that this function can run
113+
/// in roughly O(token_limit) time.
114+
pub fn count_till_limit(&self, text: &NormalizedString, token_limit: usize) -> Option<usize> {
115+
let res: Option<usize> = self.split(text.as_str()).try_fold(0, |consumed, piece| {
96116
self.bpe
97117
.count_till_limit(piece.as_bytes(), token_limit - consumed)
98118
.map(|piece_count| consumed + piece_count)
99-
})
119+
});
120+
res
100121
}
101122

102123
/// Returns the tokens for the encoding of the given text. Applies pre-tokenization before
103124
/// encoding.
104-
pub fn encode(&self, text: &str) -> Vec<u32> {
105-
self.split(text)
125+
pub fn encode<'a, I: Normalizable<'a>>(&self, text: I) -> Vec<u32> {
126+
let text: NormalizedString<'_> = self.normalize(text);
127+
self.split(text.as_str())
106128
.flat_map(|piece| self.bpe.encode_via_backtracking(piece.as_bytes()))
107129
.collect()
108130
}
@@ -114,12 +136,18 @@ impl Tokenizer {
114136

115137
/// Returns an iterator with the text pieces resulting from pre-tokenization. If this
116138
/// tokenizer does not have pre-tokenization, the iterator returns the full text.
117-
pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> + 'a {
139+
pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> {
118140
match &self.pre {
119141
Some(pre) => Either::Left(pre.split(text)),
120142
None => Either::Right(std::iter::once(text)),
121143
}
122144
}
145+
146+
/// Returns the normalized text if the tokenizer requires normalization.
147+
/// If the input was already normalized, this function is a noop.
148+
pub fn normalize<'a, I: Normalizable<'a>>(&self, text: I) -> NormalizedString<'a> {
149+
text.normalize(self.nfc)
150+
}
123151
}
124152

125153
impl Pretokenizer {
@@ -143,7 +171,7 @@ impl Pretokenizer {
143171
}
144172

145173
/// Returns an iterator with the text pieces after splitting with the regular expression.
146-
pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> + 'a {
174+
pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> {
147175
Splits {
148176
pat: &self.pat,
149177
lookahead: &self.lookahead,
@@ -201,6 +229,10 @@ pub fn o200k_base() -> &'static Tokenizer {
201229
&BPE_O200K_BASE
202230
}
203231

232+
pub fn voyage3_base() -> &'static Tokenizer {
233+
&BPE_VOYAGE3_BASE
234+
}
235+
204236
#[cfg(test)]
205237
mod tests {
206238
use bpe::byte_pair_encoding::{create_test_string, select_test_string};
@@ -233,9 +265,21 @@ mod tests {
233265

234266
#[test]
235267
fn test_count_till_limit() {
236-
assert_eq!(cl100k_base().count_till_limit("abc", 3), Some(1));
237-
assert_eq!(cl100k_base().count_till_limit("abcabc", 3), Some(2));
238-
assert_eq!(cl100k_base().count_till_limit("abcabcabc", 3), Some(3));
239-
assert_eq!(cl100k_base().count_till_limit("abcabcabcabc", 3), None);
268+
assert_eq!(
269+
cl100k_base().count_till_limit(&cl100k_base().normalize("abc"), 3),
270+
Some(1)
271+
);
272+
assert_eq!(
273+
cl100k_base().count_till_limit(&cl100k_base().normalize("abcabc"), 3),
274+
Some(2)
275+
);
276+
assert_eq!(
277+
cl100k_base().count_till_limit(&cl100k_base().normalize("abcabcabc"), 3),
278+
Some(3)
279+
);
280+
assert_eq!(
281+
cl100k_base().count_till_limit(&cl100k_base().normalize("abcabcabcabc"), 3),
282+
None
283+
);
240284
}
241285
}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
use std::borrow::Cow;
2+
3+
use unicode_normalization::UnicodeNormalization;
4+
5+
/// Type which represents a normalized string.
6+
/// This is to avoid calling normalize multiple times or forgetting to call normalization!
7+
///
8+
/// TODO: Annotate the type with the normalization type, once there is more than one.
9+
pub struct NormalizedString<'a>(Cow<'a, str>);
10+
11+
impl<'a> NormalizedString<'a> {
12+
/// Returns the normalized inner str buffer.
13+
pub fn as_str(&self) -> &str {
14+
&self.0
15+
}
16+
17+
/// This function is unsafe, since the caller must ensure that the correct normalization
18+
/// was used. The normalization may vary by tokenizer. This is mostly a backdoor which might
19+
/// be handy for certain optimizations or for testing.
20+
pub unsafe fn from_str(s: &'a str) -> NormalizedString<'a> {
21+
// SAFETY: Wrapping a &str is memory-safe (it is guaranteed to be valid UTF-8); the `unsafe`
22+
// marker only signals that the caller vouches for the text being correctly normalized.
23+
NormalizedString(Cow::Borrowed(s))
24+
}
25+
}
26+
27+
/// Helper trait which converts string types into NormalizedString.
28+
/// Calling normalize on a NormalizedString is a no-op.
29+
pub trait Normalizable<'a> {
30+
fn normalize(self, nfc: bool) -> NormalizedString<'a>;
31+
}
32+
33+
impl<'a> Normalizable<'a> for &'a str {
34+
fn normalize(self, nfc: bool) -> NormalizedString<'a> {
35+
if nfc {
36+
NormalizedString(self.nfc().collect())
37+
} else {
38+
NormalizedString(Cow::Borrowed(self))
39+
}
40+
}
41+
}
42+
43+
impl<'a, T> Normalizable<'a> for &'a T
44+
where
45+
T: AsRef<str>,
46+
{
47+
fn normalize(self, nfc: bool) -> NormalizedString<'a> {
48+
self.as_ref().normalize(nfc)
49+
}
50+
}
51+
52+
impl<'a> Normalizable<'a> for NormalizedString<'a> {
53+
fn normalize(self, _: bool) -> NormalizedString<'a> {
54+
self
55+
}
56+
}

crates/bpe/benchmarks/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ path = "equivalence.rs"
1818
test = true
1919

2020
[dependencies]
21-
bpe = { path = "../../bpe" }
21+
bpe = { path = "../../bpe", features = ["rand", "tiktoken"] }
2222
bpe-openai = { path = "../../bpe-openai" }
2323
criterion = "0.5"
2424
rand = "0.9"

crates/bpe/benchmarks/equivalence.rs

Lines changed: 67 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,72 @@
1+
use core::panic;
2+
use std::collections::HashSet;
3+
14
use bpe::byte_pair_encoding::{create_test_string, select_test_string};
25
use bpe_benchmarks::*;
36

4-
#[cfg(test)]
5-
const N: usize = 32;
7+
/// Converts a unicode character back to the byte it stands for (inverse of the bytes-to-unicode mapping).
8+
/// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L9
9+
/// Hugging Face uses the same mapping to work with unicode characters instead of raw bytes.
10+
fn char_to_byte(c: char) -> u8 {
11+
match c as u32 {
12+
0x21..0x7f => c as u8, // 94
13+
0xa1..=0xac => c as u8, // 12
14+
0xae..=0xff => c as u8, // 82
15+
0x7f..0xa1 => c as u8 - 0x7f + 221,
16+
0x100..0x121 => (c as u32 - 0x100) as u8,
17+
0x121..0x143 => (c as u32 - 0x121) as u8 + 0x7f,
18+
0x143..0x144 => 0xad,
19+
_ => panic!("Invalid character: {c} {}", c as u32),
20+
}
21+
}
622

723
#[test]
8-
fn test_huggingface_encoding_equivalence_without_pretokenization() {
9-
for (_, bpe, _, huggingface) in TOKENIZERS.iter() {
24+
fn test_compare_dictionary() {
25+
for (name, bpe, _, huggingface) in TOKENIZERS.iter() {
1026
let huggingface = without_pretokenizer(huggingface);
11-
let text = create_test_string(&bpe.bpe, 80_000);
12-
let texts = (0..N)
13-
.map(|_| select_test_string(&text, 100))
27+
let mut hugging_tokens = huggingface.get_vocab(false);
28+
// HACK: There are incorrect vocabularies in huggingface which have the added tokens stored together with the base tokens.
29+
// This is a workaround to remove them.
30+
for (added_token, _) in huggingface.get_added_vocabulary().get_vocab() {
31+
hugging_tokens.remove(added_token);
32+
}
33+
let mut hugging_tokens: Vec<_> = hugging_tokens.into_iter().collect();
34+
hugging_tokens.sort_by(|(_, a), (_, b)| a.cmp(b));
35+
let hugging_tokens: Vec<_> = hugging_tokens
36+
.into_iter()
37+
.map(|(token, _)| token.chars().map(char_to_byte).collect())
38+
.collect();
39+
let bpe_tokens: Vec<_> = (0..bpe.bpe.num_tokens())
40+
.map(|id| bpe.bpe.token_bytes(id as u32).to_vec())
41+
.collect();
42+
let hugging_set: HashSet<_> = hugging_tokens.iter().cloned().collect();
43+
let bpe_set: HashSet<_> = bpe_tokens.iter().cloned().collect();
44+
let diff: Vec<_> = hugging_set.symmetric_difference(&bpe_set).collect();
45+
assert!(diff.is_empty(), "{name}: Token sets differ");
46+
// Uncomment the following lines to write the tokens to a file in tiktoken format
47+
/*
48+
let mut file =
49+
std::fs::File::create(std::path::Path::new(name)).expect("can create output file");
50+
std::io::Write::write_all(
51+
&mut file,
52+
bpe::byte_pair_encoding::write_tiktoken(hugging_tokens).as_bytes(),
53+
)
54+
.expect("can write output to file");
55+
*/
56+
}
57+
}
58+
59+
#[test]
60+
fn test_huggingface_encoding_equivalence_without_pretokenization() {
61+
for (name, bpe, _, huggingface) in TOKENIZERS.iter() {
62+
let text: String = create_test_string(&bpe.bpe, 200_000);
63+
let text = bpe.normalize(&text);
64+
let texts = (0..300)
65+
.map(|_| select_test_string(text.as_str(), 100))
1466
.chain(std::iter::once(
1567
"You should see the Greek word 'kosme': \"κόσμε\"",
1668
));
69+
let huggingface = without_pretokenizer(huggingface);
1770
for text in texts {
1871
let out = bpe.bpe.encode_via_backtracking(text.as_bytes());
1972
let huggingface_out = huggingface
@@ -26,14 +79,10 @@ fn test_huggingface_encoding_equivalence_without_pretokenization() {
2679
let huggingface_text = huggingface.decode(&huggingface_out, true).unwrap();
2780
if huggingface_text != text {
2881
panic!(
29-
"huggingface tokens and text differ: {:?} != {:?}",
30-
text, huggingface_text
82+
"{name}: huggingface tokens and text differ: {text:?} != {huggingface_text:?}",
3183
);
3284
} else {
33-
panic!(
34-
"huggingface tokens differ: {:?} != {:?}",
35-
out, huggingface_out
36-
);
85+
panic!("{name}: huggingface tokens differ: {out:?} != {huggingface_out:?}");
3786
}
3887
}
3988
}
@@ -42,9 +91,9 @@ fn test_huggingface_encoding_equivalence_without_pretokenization() {
4291

4392
#[test]
4493
fn test_huggingface_encoding_equivalence_with_pretokenization() {
45-
for (_, bpe, _, huggingface) in TOKENIZERS.iter() {
46-
let text = create_test_string(&bpe.bpe, 80_000);
47-
let texts = (0..N)
94+
for (name, bpe, _, huggingface) in TOKENIZERS.iter() {
95+
let text = create_test_string(&bpe.bpe, 200_000);
96+
let texts = (0..300)
4897
.map(|_| select_test_string(&text, 100))
4998
.chain(std::iter::once(
5099
"You should see the Greek word 'kosme': \"κόσμε\" ",
@@ -62,14 +111,10 @@ fn test_huggingface_encoding_equivalence_with_pretokenization() {
62111
let huggingface_text = huggingface.decode(&huggingface_out, true).unwrap();
63112
if huggingface_text != text {
64113
panic!(
65-
"huggingface tokens and text differ: {:?} != {:?}",
66-
text, huggingface_text
114+
"{name}: huggingface tokens and text differ: {text:?} != {huggingface_text:?}",
67115
);
68116
} else {
69-
panic!(
70-
"huggingface tokens differ: {:?} != {:?}",
71-
out, huggingface_out
72-
);
117+
panic!("{name}: huggingface tokens differ: {out:?} != {huggingface_out:?}");
73118
}
74119
}
75120
}

0 commit comments

Comments
 (0)