Skip to content

Commit 3ced9e2

Browse files
committed
feat: add static libs & include for tokenizers-cpp
1 parent 604b90f commit 3ced9e2

4 files changed

Lines changed: 179 additions & 0 deletions

File tree

Binary file not shown.
Binary file not shown.
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
/*!
 * Copyright (c) 2023 by Contributors
 * \file tokenizers_c.h
 * \brief C binding to tokenizers rust library
 */
#ifndef TOKENIZERS_C_H_
#define TOKENIZERS_C_H_

// Standard headers are included before entering C linkage so the C++
// standard library sees them with its normal (C++) linkage.
#include <stddef.h>
#include <stdint.h>

// The C API
#ifdef __cplusplus
extern "C" {
#endif

/*! \brief Opaque handle to a tokenizer created by one of the factories below. */
typedef void *TokenizerHandle;

/*!
 * \brief Result of encoding one sequence of text.
 * \note token_ids is allocated by the library; release it with
 *       tokenizers_free_encode_results.
 */
typedef struct {
  int *token_ids;
  size_t len;
} TokenizerEncodeResult;

/*!
 * \brief Create a tokenizer from an in-memory HF tokenizer json blob.
 * \param json The json blob (need not be NUL-terminated).
 * \param len Length of the blob in bytes.
 * \return A new handle; destroy with tokenizers_free.
 */
TokenizerHandle tokenizers_new_from_str(const char *json, size_t len);

/*!
 * \brief Create a byte-level BPE tokenizer from vocab, merges and
 *        added-tokens blobs. Each blob is a (pointer, byte length) pair.
 * \return A new handle; destroy with tokenizers_free.
 */
TokenizerHandle byte_level_bpe_tokenizers_new_from_str(
    const char *vocab, size_t vocab_len, const char *merges, size_t merges_len,
    const char *added_tokens, size_t added_tokens_len);

/*!
 * \brief Encode a single text into token ids.
 * \param add_special_token Nonzero to include special tokens in the output.
 * \param result Output; free its buffer with tokenizers_free_encode_results.
 */
void tokenizers_encode(TokenizerHandle handle, const char *data, size_t len,
                       int add_special_token, TokenizerEncodeResult *result);

/*!
 * \brief Encode num_seqs texts in one call.
 * \param data Array of num_seqs text pointers; len holds their byte lengths.
 * \param results Output array the caller sized to num_seqs entries.
 */
void tokenizers_encode_batch(TokenizerHandle handle, const char **data,
                             size_t *len, size_t num_seqs,
                             int add_special_token,
                             TokenizerEncodeResult *results);

/*! \brief Release the token_ids buffers of num_seqs encode results. */
void tokenizers_free_encode_results(TokenizerEncodeResult *results,
                                    size_t num_seqs);

/*!
 * \brief Decode token ids into text.
 * \note No output parameter here: the decoded string appears to be stored
 *       inside the handle and read back via tokenizers_get_decode_str —
 *       confirm against the rust implementation.
 */
void tokenizers_decode(TokenizerHandle handle, const uint32_t *data, size_t len,
                       int skip_special_token);

/*!
 * \brief Fetch the last decoded string as a borrowed (data, len) view.
 * \note The view is owned by the handle; presumably valid only until the
 *       next decode call — verify before holding onto it.
 */
void tokenizers_get_decode_str(TokenizerHandle handle, const char **data,
                               size_t *len);

/*! \brief Store the vocabulary size into *size. */
void tokenizers_get_vocab_size(TokenizerHandle handle, size_t *size);

/*!
 * \brief Look up the token string for an id; *data/*len describe a view
 *        owned by the handle.
 */
void tokenizers_id_to_token(TokenizerHandle handle, uint32_t id,
                            const char **data, size_t *len);

// tokenizers_token_to_id stores -1 to *id if the token is not in the vocab
void tokenizers_token_to_id(TokenizerHandle handle, const char *token,
                            size_t len, int32_t *id);

/*! \brief Destroy the tokenizer and release all memory owned by the handle. */
void tokenizers_free(TokenizerHandle handle);

#ifdef __cplusplus
}
#endif
#endif  // TOKENIZERS_C_H_
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
/*!
 * Copyright (c) 2023 by Contributors
 * \file tokenizers_cpp.h
 * \brief A C++ binding to common set of tokenizers
 */
#ifndef TOKENIZERS_CPP_H_
#define TOKENIZERS_CPP_H_

#include <cstddef>  // size_t
#include <cstdint>  // int32_t — previously relied on a transitive include
#include <memory>
#include <string>
#include <vector>

namespace tokenizers {

/*!
 * \brief a universal tokenizer that loads
 * either HF's tokenizer or sentence piece,
 * depending on the constructor
 */
class Tokenizer {
 public:
  /*! \brief virtual destructor */
  virtual ~Tokenizer() = default;

  /*!
   * \brief Encode text into ids.
   * \param text The input text.
   * \returns The encoded token ids.
   */
  virtual std::vector<int32_t> Encode(const std::string &text) = 0;

  /*!
   * \brief Encode a batch of texts into ids.
   * \param texts The input texts.
   * \returns The encoded token ids, one vector per input text.
   */
  virtual std::vector<std::vector<int32_t>>
  EncodeBatch(const std::vector<std::string> &texts) {
    // Fall back when the derived class does not implement this function.
    std::vector<std::vector<int32_t>> ret;
    ret.reserve(texts.size());
    for (const auto &text : texts) {
      ret.push_back(Encode(text));
    }
    return ret;
  }

  /*!
   * \brief Decode token ids into text.
   * \param ids The token ids.
   * \returns The decoded text.
   */
  virtual std::string Decode(const std::vector<int32_t> &ids) = 0;

  /*!
   * \brief Decode token ids into text.
   * \param ids The token ids.
   * \param skip_special_tokens Whether special tokens are omitted from the
   *        output.
   * \returns The decoded text.
   */
  virtual std::string Decode(const std::vector<int32_t> &ids,
                             bool skip_special_tokens) = 0;

  /*!
   * \brief Returns the vocabulary size. Special tokens are considered.
   */
  virtual size_t GetVocabSize() = 0;

  /*!
   * \brief Convert the given id to its corresponding token if it exists. If
   * not, return an empty string.
   */
  virtual std::string IdToToken(int32_t token_id) = 0;

  /*!
   * \brief Convert the given token to its corresponding id if it exists. If
   * not, return -1.
   */
  virtual int32_t TokenToId(const std::string &token) = 0;

  //---------------------------------------------------
  // Factory functions from byte-blobs
  // These factory functions take in-memory blobs
  // so the library can be independent from filesystem
  //---------------------------------------------------
  /*!
   * \brief Create HF tokenizer from a single in-memory json blob.
   *
   * \param json_blob The json blob.
   * \return The created tokenizer.
   */
  static std::unique_ptr<Tokenizer> FromBlobJSON(const std::string &json_blob);
  /*!
   * \brief Create BPE tokenizer
   *
   * \param vocab_blob The blob that contains vocabs.
   * \param merges_blob The blob that contains the merges.
   * \param added_tokens The added tokens.
   * \return The created tokenizer.
   */
  static std::unique_ptr<Tokenizer>
  FromBlobByteLevelBPE(const std::string &vocab_blob,
                       const std::string &merges_blob,
                       const std::string &added_tokens = "");
  /*!
   * \brief Create SentencePiece.
   *
   * \param model_blob The blob that contains vocabs.
   * \return The created tokenizer.
   */
  static std::unique_ptr<Tokenizer>
  FromBlobSentencePiece(const std::string &model_blob);
  /*!
   * \brief Create RWKVWorldTokenizer.
   *
   * \param model_blob The blob that contains vocabs.
   * \return The created tokenizer.
   */
  static std::unique_ptr<Tokenizer>
  FromBlobRWKVWorld(const std::string &model_blob);
};

}  // namespace tokenizers
#endif  // TOKENIZERS_CPP_H_

0 commit comments

Comments
 (0)