|
| 1 | +/*! |
| 2 | + * Copyright (c) 2023 by Contributors |
| 3 | + * \file tokenizers_cpp.h |
| 4 | + * \brief A C++ binding to common set of tokenizers |
| 5 | + */ |
| 6 | +#ifndef TOKENIZERS_CPP_H_ |
| 7 | +#define TOKENIZERS_CPP_H_ |
| 8 | + |
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
| 12 | + |
| 13 | +namespace tokenizers { |
| 14 | + |
/*!
 * \brief Abstract interface shared by all tokenizer backends.
 *
 * Concrete instances are obtained through the static FromBlob* factory
 * functions declared at the bottom of the class (HF json, byte-level BPE,
 * SentencePiece, RWKV World); callers only ever hold a
 * std::unique_ptr<Tokenizer>.
 */
class Tokenizer {
 public:
  /*! \brief virtual destructor so instances can be deleted through the base */
  virtual ~Tokenizer() = default;

  /*!
   * \brief Encode text into ids.
   * \param text The input text.
   * \return The encoded token ids.
   */
  virtual std::vector<int32_t> Encode(const std::string &text) = 0;

  /*!
   * \brief Encode a batch of texts into ids.
   *
   * The default implementation calls Encode() once per input, in order.
   * Derived classes may override it with a dedicated batch implementation.
   *
   * \param texts The input texts.
   * \return One vector of token ids per input text, in input order.
   */
  virtual std::vector<std::vector<int32_t>>
  EncodeBatch(const std::vector<std::string> &texts) {
    // Fall back when the derived class does not implement this function.
    std::vector<std::vector<int32_t>> ret;
    ret.reserve(texts.size());
    for (const auto &text : texts) {
      ret.push_back(Encode(text));
    }
    return ret;
  }

  /*!
   * \brief Decode token ids into text.
   * \param ids The token ids.
   * \return The decoded text.
   */
  virtual std::string Decode(const std::vector<int32_t> &ids) = 0;

  /*!
   * \brief Decode token ids into text with explicit special-token handling.
   * \param ids The token ids.
   * \param skip_special_tokens Flag forwarded to the concrete tokenizer;
   *        judging by its name it requests that special tokens be left out
   *        of the decoded text (confirm against the implementation).
   * \return The decoded text.
   */
  virtual std::string Decode(const std::vector<int32_t> &ids,
                             bool skip_special_tokens) = 0;

  /*!
   * \brief Returns the vocabulary size. Special tokens are considered.
   */
  virtual size_t GetVocabSize() = 0;

  /*!
   * \brief Convert the given id to its corresponding token if it exists. If
   * not, return an empty string.
   * \param token_id The token id to look up.
   */
  virtual std::string IdToToken(int32_t token_id) = 0;

  /*!
   * \brief Convert the given token to its corresponding id if it exists. If
   * not, return -1.
   * \param token The token string to look up.
   */
  virtual int32_t TokenToId(const std::string &token) = 0;

  //---------------------------------------------------
  // Factory functions from byte-blobs
  // These factory functions take in-memory blobs
  // so the library can be independent from the filesystem
  //---------------------------------------------------
  /*!
   * \brief Create HF tokenizer from a single in-memory json blob.
   *
   * \param json_blob The json blob.
   * \return The created tokenizer.
   */
  static std::unique_ptr<Tokenizer> FromBlobJSON(const std::string &json_blob);

  /*!
   * \brief Create BPE tokenizer
   *
   * \param vocab_blob The blob that contains vocabs.
   * \param merges_blob The blob that contains the merges.
   * \param added_tokens The added tokens (defaults to an empty string).
   * \return The created tokenizer.
   */
  static std::unique_ptr<Tokenizer>
  FromBlobByteLevelBPE(const std::string &vocab_blob,
                       const std::string &merges_blob,
                       const std::string &added_tokens = "");

  /*!
   * \brief Create SentencePiece.
   *
   * \param model_blob The blob that contains vocabs.
   * \return The created tokenizer.
   */
  static std::unique_ptr<Tokenizer>
  FromBlobSentencePiece(const std::string &model_blob);

  /*!
   * \brief Create RWKVWorldTokenizer.
   *
   * \param model_blob The blob that contains vocabs.
   * \return The created tokenizer.
   */
  static std::unique_ptr<Tokenizer>
  FromBlobRWKVWorld(const std::string &model_blob);
};
| 116 | + |
| 117 | +} // namespace tokenizers |
| 118 | +#endif // TOKENIZERS_CPP_H_ |
0 commit comments