Skip to content

Commit 31eca42

Browse files
authored
chore: Update tokenizers (#823)
## Description This PR adds new tokenizer headers and bumps executorch binaries to reflect the changes. ### Introduces a breaking change? - [ ] Yes - [x] No ### Type of change - [ ] Bug fix (change which fixes an issue) - [ ] New feature (change which adds functionality) - [ ] Documentation update (improves or adds clarity to existing documentation) - [x] Other (chores, tests, code style improvements etc.) ### Tested on - [ ] iOS - [ ] Android ### Testing instructions Run apps for LLM, T2I, S2T, Embedding
1 parent e51547e commit 31eca42

33 files changed

+4201
-5
lines changed
1.35 KB
Binary file not shown.
Binary file not shown.
Binary file not shown.

packages/react-native-executorch/third-party/include/executorch/extension/llm/tokenizers/include/pytorch/tokenizers/hf_tokenizer.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,4 +80,4 @@ class HFTokenizer : public Tokenizer {
8080
Model::Ptr _model;
8181
};
8282

83-
} // namespace tokenizers
83+
} // namespace tokenizers

packages/react-native-executorch/third-party/include/executorch/extension/llm/tokenizers/include/pytorch/tokenizers/model.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,4 +148,4 @@ class ModelConfig {
148148
Model::Ptr create() const;
149149
};
150150

151-
} // namespace tokenizers
151+
} // namespace tokenizers

packages/react-native-executorch/third-party/include/executorch/extension/llm/tokenizers/include/pytorch/tokenizers/pre_tokenizer.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,11 @@ class PreTokenizerConfig {
104104
*/
105105
PRETOKENIZER_CONFIG_MEMBER(bool, add_prefix_space)
106106

107+
/**
108+
* Used by: ByteLevelPreTokenizer
109+
*/
110+
PRETOKENIZER_CONFIG_MEMBER(bool, use_regex)
111+
107112
/**
108113
* Used by RegexPreTokenizer
109114
*/
@@ -225,11 +230,13 @@ class ByteLevelPreTokenizer : public PreTokenizer {
225230
* @param add_prefix_space: Whether to add a leading space to the first word
226231
* @param pattern: A user-supplied regex to use for token splitting. If not
227232
* provided, it uses the standard GPT2 pattern.
233+
* @param use_regex: Whether to use regex for splitting. If false, only apply
234+
* byte encoding without splitting.
228235
*/
229236
ByteLevelPreTokenizer(bool add_prefix_space = true,
230-
const std::string &pattern = "");
237+
const std::string &pattern = "", bool use_regex = true);
231238
explicit ByteLevelPreTokenizer(const std::string &pattern)
232-
: ByteLevelPreTokenizer(true, pattern) {}
239+
: ByteLevelPreTokenizer(true, pattern, true) {}
233240

234241
/** Perform pre-tokenization */
235242
std::vector<std::string>
@@ -238,6 +245,7 @@ class ByteLevelPreTokenizer : public PreTokenizer {
238245
private:
239246
const std::string pattern_;
240247
const bool add_prefix_space_;
248+
const bool use_regex_;
241249

242250
}; // end class ByteLevelPreTokenizer
243251

packages/react-native-executorch/third-party/include/executorch/extension/llm/tokenizers/include/pytorch/tokenizers/truncation.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,4 +89,4 @@ class TruncationConfig {
8989
TruncationParams params;
9090
};
9191

92-
} // namespace tokenizers
92+
} // namespace tokenizers

packages/react-native-executorch/third-party/include/executorch/extension/llm/tokenizers/include/pytorch/tokenizers/unicode-nfc-data.h

Lines changed: 4140 additions & 0 deletions
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/*
2+
* Unicode NFC/NFD normalization implementation
3+
* This implementation fixes the broken llama.cpp-unicode normalization
4+
* by using proper multi-codepoint decomposition sequences.
5+
*/
6+
7+
#pragma once
8+
9+
#include <cstdint>
10+
#include <string>
11+
#include <vector>
12+
13+
namespace tokenizers {
14+
15+
/**
16+
* Normalize a sequence of codepoints to NFD (Canonical Decomposition)
17+
*
18+
* @param cpts Input codepoint sequence
19+
* @return NFD-normalized codepoint sequence
20+
*/
21+
std::vector<uint32_t> unicode_normalize_nfd(const std::vector<uint32_t> &cpts);
22+
23+
/**
24+
* Normalize a sequence of codepoints to NFC (Canonical Decomposition +
25+
* Composition)
26+
*
27+
* @param cpts Input codepoint sequence
28+
* @return NFC-normalized codepoint sequence
29+
*/
30+
std::vector<uint32_t> unicode_normalize_nfc(const std::vector<uint32_t> &cpts);
31+
32+
/**
33+
* Normalize a UTF-8 string to NFD
34+
*
35+
* @param utf8 Input UTF-8 string
36+
* @return NFD-normalized UTF-8 string
37+
*/
38+
std::string unicode_normalize_nfd_utf8(const std::string &utf8);
39+
40+
/**
41+
* Normalize a UTF-8 string to NFC
42+
*
43+
* @param utf8 Input UTF-8 string
44+
* @return NFC-normalized UTF-8 string
45+
*/
46+
std::string unicode_normalize_nfc_utf8(const std::string &utf8);
47+
48+
} // namespace tokenizers

0 commit comments

Comments
 (0)