Skip to content

Commit e937c36

Browse files
authored
build: Extend tokenizer capabilities (#1114)
## Description This PR introduces rebuilt binaries that contain new, updated tokenizers. This iteration features support for more tokenisation models (e.g. unigram, wordlevel) as well as a bunch of previously unsupported pre-tokenisers, decoders, post-processors. ### Introduces a breaking change? - [ ] Yes - [x] No ### Type of change - [ ] Bug fix (change which fixes an issue) - [ ] New feature (change which adds functionality) - [ ] Documentation update (improves or adds clarity to existing documentation) - [x] Other (chores, tests, code style improvements etc.) ### Tested on - [x] iOS - [x] Android ### Testing instructions Before merging, test all demo applications. See if all models that proved problematic during bumps in the past are working (e.g. kokoro, multi-method models). Check all LLM models, see if output is working. - [x] LLM app on iOS - [x] LLM app on Android - [x] Speech app on iOS - [x] Speech app on Android - [x] Text Embeddings on iOS - [x] Text Embeddings on Android ### Screenshots <!-- Add screenshots here, if applicable --> ### Related issues ### Checklist - [x] I have performed a self-review of my code - [x] I have commented my code, particularly in hard-to-understand areas - [x] I have updated the documentation accordingly - [x] My changes generate no new warnings ### Additional notes
1 parent d97781d commit e937c36

59 files changed

Lines changed: 5046 additions & 5214 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Binary file not shown.

packages/react-native-executorch/third-party/include/executorch/extension/llm/tokenizers/include/pytorch/tokenizers/base64.h

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ namespace base64 {
3636
using tokenizers::Error;
3737
using tokenizers::Result;
3838

39-
Result<std::string> decode(const std::string_view &input);
39+
Result<std::string> decode(const std::string_view& input);
4040

4141
namespace detail {
4242

@@ -68,9 +68,12 @@ inline Error validate(uint32_t v) {
6868
return Error::Ok;
6969
}
7070

71-
inline Error decode(const std::string_view &input, std::string &output) {
72-
TK_CHECK_OR_RETURN_ERROR(input.size() == 4, Base64DecodeFailure,
73-
"input length must be 4, got %zu", input.size());
71+
inline Error decode(const std::string_view& input, std::string& output) {
72+
TK_CHECK_OR_RETURN_ERROR(
73+
input.size() == 4,
74+
Base64DecodeFailure,
75+
"input length must be 4, got %zu",
76+
input.size());
7477

7578
uint32_t val = 0;
7679

@@ -100,10 +103,14 @@ inline Error decode(const std::string_view &input, std::string &output) {
100103
return Error::Ok;
101104
}
102105

103-
inline Error decode_1_padding(const std::string_view &input,
104-
std::string &output) {
105-
TK_CHECK_OR_RETURN_ERROR(input.size() == 3, Base64DecodeFailure,
106-
"input length must be 3, got %zu", input.size());
106+
inline Error decode_1_padding(
107+
const std::string_view& input,
108+
std::string& output) {
109+
TK_CHECK_OR_RETURN_ERROR(
110+
input.size() == 3,
111+
Base64DecodeFailure,
112+
"input length must be 3, got %zu",
113+
input.size());
107114

108115
uint32_t val = 0;
109116

@@ -127,10 +134,14 @@ inline Error decode_1_padding(const std::string_view &input,
127134
return Error::Ok;
128135
}
129136

130-
inline Error decode_2_padding(const std::string_view &input,
131-
std::string &output) {
132-
TK_CHECK_OR_RETURN_ERROR(input.size() == 2, Base64DecodeFailure,
133-
"input length must be 2, got %zu", input.size());
137+
inline Error decode_2_padding(
138+
const std::string_view& input,
139+
std::string& output) {
140+
TK_CHECK_OR_RETURN_ERROR(
141+
input.size() == 2,
142+
Base64DecodeFailure,
143+
"input length must be 2, got %zu",
144+
input.size());
134145

135146
uint32_t val = 0;
136147

@@ -150,12 +161,13 @@ inline Error decode_2_padding(const std::string_view &input,
150161

151162
} // namespace detail
152163

153-
inline tokenizers::Result<std::string> decode(const std::string_view &input) {
164+
inline tokenizers::Result<std::string> decode(const std::string_view& input) {
154165
TK_CHECK_OR_RETURN_ERROR(!input.empty(), Base64DecodeFailure, "empty input");
155166

156167
// Faster than `input.size() % 4`.
157168
TK_CHECK_OR_RETURN_ERROR(
158-
(input.size() & 3) == 0 && input.size() >= 4, Base64DecodeFailure,
169+
(input.size() & 3) == 0 && input.size() >= 4,
170+
Base64DecodeFailure,
159171
"input length must be larger than 4 and is multiple of 4, got %zu",
160172
input.size());
161173

packages/react-native-executorch/third-party/include/executorch/extension/llm/tokenizers/include/pytorch/tokenizers/bpe_model.h

Lines changed: 47 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <memory>
1414
#include <optional>
1515
#include <string>
16+
#include <unordered_set>
1617
#include <vector>
1718

1819
#include <pytorch/tokenizers/map_utils.h>
@@ -24,47 +25,66 @@
2425
namespace tokenizers {
2526

2627
class BPEModel : public Model {
27-
public:
28-
explicit BPEModel(detail::TokenMap token_map,
29-
detail::TokenMap special_token_map,
30-
std::optional<detail::TokenMap> merge_ranks,
31-
std::unique_ptr<IRegex> special_token_regex,
32-
bool byte_fallback, std::optional<uint64_t> unk_token_id,
33-
std::optional<uint64_t> bos_token_id,
34-
std::optional<uint64_t> eos_token_id);
28+
public:
29+
explicit BPEModel(
30+
detail::TokenMap token_map,
31+
detail::TokenMap special_token_map,
32+
std::optional<detail::TokenMap> merge_ranks,
33+
std::unique_ptr<IRegex> special_token_regex,
34+
bool byte_fallback,
35+
std::optional<uint64_t> unk_token_id,
36+
std::optional<uint64_t> bos_token_id,
37+
std::optional<uint64_t> eos_token_id,
38+
std::unordered_set<std::string> rstrip_tokens = {},
39+
std::unordered_set<std::string> lstrip_tokens = {});
3540

3641
~BPEModel() override = default;
3742

38-
Result<std::vector<uint64_t>>
39-
tokenize(const std::string &piece) const override;
43+
Result<std::vector<uint64_t>> tokenize(
44+
const std::string& piece) const override;
4045

4146
Result<std::string> id_to_piece(uint64_t token) const override;
42-
Result<uint64_t> piece_to_id(const std::string &token) const override;
47+
Result<uint64_t> piece_to_id(const std::string& token) const override;
4348

44-
int32_t vocab_size() const override { return vocab_size_; }
49+
int32_t vocab_size() const override {
50+
return vocab_size_;
51+
}
4552

4653
bool is_special_token(uint64_t token) const override;
4754

48-
bool is_loaded() const override { return initialized_; }
55+
bool is_loaded() const override {
56+
return initialized_;
57+
}
4958

5059
std::pair<std::optional<std::string>, std::string>
51-
split_with_allowed_special_token(const std::string &input,
52-
size_t offset) const override;
60+
split_with_allowed_special_token(const std::string& input, size_t offset)
61+
const override;
5362

54-
uint64_t bos_token_id() const override { return bos_token_id_.value_or(0); }
63+
bool special_token_has_rstrip(const std::string& token) const override {
64+
return rstrip_tokens_.count(token) > 0;
65+
}
66+
bool special_token_has_lstrip(const std::string& token) const override {
67+
return lstrip_tokens_.count(token) > 0;
68+
}
5569

56-
uint64_t eos_token_id() const override { return eos_token_id_.value_or(0); }
70+
uint64_t bos_token_id() const override {
71+
return bos_token_id_.value_or(0);
72+
}
5773

58-
private:
59-
Result<std::pair<std::vector<uint64_t>, uint64_t>>
60-
encode_with_special_token(const std::string &text) const;
74+
uint64_t eos_token_id() const override {
75+
return eos_token_id_.value_or(0);
76+
}
6177

62-
Result<std::vector<uint64_t>>
63-
byte_pair_encode(const std::string &piece) const;
78+
private:
79+
Result<std::pair<std::vector<uint64_t>, uint64_t>> encode_with_special_token(
80+
const std::string& text) const;
6481

65-
std::vector<uint64_t>
66-
byte_pair_merge(const std::string &piece, const detail::TokenMap &ranks,
67-
std::function<uint64_t(uint64_t, uint64_t)> func) const;
82+
Result<std::vector<uint64_t>> byte_pair_encode(const std::string& piece) const;
83+
84+
std::vector<uint64_t> byte_pair_merge(
85+
const std::string& piece,
86+
const detail::TokenMap& ranks,
87+
std::function<uint64_t(uint64_t, uint64_t)> func) const;
6888

6989
// Real state
7090
detail::TokenMap token_map_;
@@ -76,6 +96,8 @@ class BPEModel : public Model {
7696
std::optional<uint64_t> unk_token_id_;
7797
std::optional<uint64_t> bos_token_id_;
7898
std::optional<uint64_t> eos_token_id_;
99+
std::unordered_set<std::string> rstrip_tokens_;
100+
std::unordered_set<std::string> lstrip_tokens_;
79101

80102
bool initialized_ = false;
81103
int32_t vocab_size_ = 0;

packages/react-native-executorch/third-party/include/executorch/extension/llm/tokenizers/include/pytorch/tokenizers/bpe_tokenizer_base.h

Lines changed: 32 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -32,53 +32,62 @@ namespace tokenizers {
3232
namespace detail {
3333

3434
class BPETokenizerBase : public Tokenizer {
35-
public:
36-
Result<std::vector<uint64_t>> encode(const std::string &input, int8_t bos,
37-
int8_t eos) const override;
35+
public:
36+
Result<std::vector<uint64_t>>
37+
encode(const std::string& input, int8_t bos, int8_t eos) const override;
3838

3939
Result<std::string> id_to_piece(uint64_t token) const override;
40-
Result<uint64_t> piece_to_id(const std::string &text) const override;
40+
Result<uint64_t> piece_to_id(const std::string& text) const override;
4141

42-
Result<std::string> decode(uint64_t prev_token, uint64_t token,
43-
bool skip_special_tokens = false) const override;
42+
Result<std::string> decode(
43+
uint64_t prev_token,
44+
uint64_t token,
45+
bool skip_special_tokens = false) const override;
4446

45-
protected:
47+
protected:
4648
explicit BPETokenizerBase() {}
4749
virtual ~BPETokenizerBase() override {}
4850

4951
std::pair<std::optional<std::string>, std::string>
50-
split_with_allowed_special_token_(const std::string &input,
51-
const TokenMap &allowed_special) const;
52+
split_with_allowed_special_token_(
53+
const std::string& input,
54+
const TokenMap& allowed_special) const;
5255

5356
std::pair<std::optional<std::string>, std::string>
54-
split_with_allowed_special_token_(const std::string &input, size_t offset,
55-
const TokenMap &allowed_special) const;
57+
split_with_allowed_special_token_(
58+
const std::string& input,
59+
size_t offset,
60+
const TokenMap& allowed_special) const;
5661

57-
Result<std::pair<std::vector<uint64_t>, uint64_t>>
58-
encode_with_special_token_(const std::string &text,
59-
const TokenMap &allowed_special) const;
62+
Result<std::pair<std::vector<uint64_t>, uint64_t>> encode_with_special_token_(
63+
const std::string& text,
64+
const TokenMap& allowed_special) const;
6065

61-
virtual Result<std::vector<uint64_t>>
62-
byte_pair_encode_(const std::string &piece, const TokenMap &encoder) const;
66+
virtual Result<std::vector<uint64_t>> byte_pair_encode_(
67+
const std::string& piece,
68+
const TokenMap& encoder) const;
6369

6470
// Virtual method for BPE merging - can be overridden by derived classes
6571
// The passed in `ranks` param for the base impl is just a regular token map
6672
// and that the actual ranks are derived implicitly from the regular token
6773
// map. This is the same implementation as Tiktoken.
68-
virtual std::vector<uint64_t>
69-
_byte_pair_merge(const std::string &piece, const TokenMap &ranks,
70-
std::function<uint64_t(uint64_t, uint64_t)> func) const;
74+
virtual std::vector<uint64_t> _byte_pair_merge(
75+
const std::string& piece,
76+
const TokenMap& ranks,
77+
std::function<uint64_t(uint64_t, uint64_t)> func) const;
7178

7279
// Protected members that can be overloaded by other BPE tokenizers
7380
std::unique_ptr<IRegex> special_token_regex_;
7481
std::optional<TokenMap> token_map_;
7582
std::optional<TokenMap> special_token_map_;
7683

77-
private:
78-
virtual Error _encode(const std::string &input, std::vector<uint64_t> &ret,
79-
uint64_t &last_piece_token_len) const = 0;
84+
private:
85+
virtual Error _encode(
86+
const std::string& input,
87+
std::vector<uint64_t>& ret,
88+
uint64_t& last_piece_token_len) const = 0;
8089

81-
virtual void _decode(const std::string &input, std::string &ret) const = 0;
90+
virtual void _decode(const std::string& input, std::string& ret) const = 0;
8291
};
8392

8493
} // namespace detail

packages/react-native-executorch/third-party/include/executorch/extension/llm/tokenizers/include/pytorch/tokenizers/error.h

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -73,23 +73,23 @@ enum class Error : error_code_t {
7373
* @param[in] message__ Format string for the log error message.
7474
* @param[in] ... Optional additional arguments for the format string.
7575
*/
76-
#define TK_CHECK_OR_RETURN_ERROR(cond__, error__, message__, ...) \
77-
{ \
78-
if (!(cond__)) { \
79-
TK_LOG(Error, message__, ##__VA_ARGS__); \
80-
return ::tokenizers::Error::error__; \
81-
} \
76+
#define TK_CHECK_OR_RETURN_ERROR(cond__, error__, message__, ...) \
77+
{ \
78+
if (!(cond__)) { \
79+
TK_LOG(Error, message__, ##__VA_ARGS__); \
80+
return ::tokenizers::Error::error__; \
81+
} \
8282
}
8383

8484
/**
8585
* If error__ is not Error::Ok, return the specified Error
8686
* @param[in] error__ Error enum value to return without the `Error::` prefix,
8787
* like `Base64DecodeFailure`.
8888
*/
89-
#define TK_CHECK_OK_OR_RETURN_ERROR(error__) \
90-
do { \
91-
const auto et_error__ = (error__); \
92-
if (et_error__ != ::tokenizers::Error::Ok) { \
93-
return et_error__; \
94-
} \
89+
#define TK_CHECK_OK_OR_RETURN_ERROR(error__) \
90+
do { \
91+
const auto et_error__ = (error__); \
92+
if (et_error__ != ::tokenizers::Error::Ok) { \
93+
return et_error__; \
94+
} \
9595
} while (0)

0 commit comments

Comments
 (0)