|
12 | 12 | #include <cstring> |
13 | 13 | #include <iostream> |
14 | 14 | #include <ranges> |
| 15 | +#include <sstream> |
15 | 16 | #include <stdexcept> |
16 | 17 | #include <unordered_map> |
17 | 18 |
|
@@ -747,6 +748,9 @@ static const char* evo_token_name(uint16_t token) |
747 | 748 | } |
748 | 749 |
|
749 | 750 | static bool direct_legacy_token_for_evo(uint16_t evoToken, uint16_t& legacyToken); |
| 751 | +static bool direct_evo_token_for_legacy(uint16_t legacyToken, uint16_t& evoToken); |
| 752 | +static void append_evo_token(data_t& out, uint16_t evoToken); |
| 753 | +static bool legacy_token_to_evo_ucs2(uint16_t legacyToken, uint16_t& evoToken); |
750 | 754 |
|
751 | 755 | static std::string evo_token_to_string(uint16_t token) |
752 | 756 | { |
@@ -795,6 +799,81 @@ std::string detokenize_evo_token_words(const data_t& data) |
795 | 799 | return out; |
796 | 800 | } |
797 | 801 |
|
// Tokenizes 84+CE ("legacy") source text into an Evo token stream.
// The result is a data_t of 16-bit tokens (written via append_evo_token)
// terminated by a trailing 0 token.
//
// Recognized options:
//   - "deindent"       : when present and == 1, leading whitespace is stripped
//                        from every line before tokenizing.
//   - "detect_strings" : string detection is enabled unless this is present
//                        and == 0 (i.e. it defaults to on).
data_t tokenize_evo_token_words(const std::string& source, const options_t& options)
{
    const bool deindent = options.contains("deindent") && options.at("deindent") == 1;
    // Defaults to true when the option is absent.
    const bool detectStrings = !options.contains("detect_strings") || options.at("detect_strings") != 0;

    std::string normalizedSource;
    if (deindent)
    {
        // Rebuild the source line by line with leading whitespace removed.
        std::istringstream lines{source};
        std::string line;
        while (std::getline(lines, line))
        {
            normalizedSource += ltrim(line) + "\n";
        }
        if (!normalizedSource.empty())
        {
            // Drop the extra "\n" appended after the final line above.
            normalizedSource.pop_back();
        }
    }
    else
    {
        normalizedSource = source;
    }

    // Legacy token value for the double-quote character (string delimiter).
    static constexpr uint16_t legacyQuote = 0x2A;

    data_t evo;
    // Worst case: one 16-bit Evo token per input byte, plus the 0 terminator.
    evo.reserve((normalizedSource.size() + 1) * 2);
    bool isWithinString = false;

    for (const auto& item : TypeHandlers::TH_Tokenized::scanSourceTokens(normalizedSource, detectStrings))
    {
        if (item.matched)
        {
            // NOTE: direct_evo_token_for_legacy and legacy_token_to_evo_ucs2
            // write their result through the evoToken out-parameter as a side
            // effect of being evaluated inside the conditions below. The branch
            // ordering and short-circuit evaluation are load-bearing: do not
            // reorder these tests.
            uint16_t evoToken = 0;
            if (item.text.rfind("\\u", 0) == 0 && is_displayable_ucs2_scalar(item.token) && !direct_evo_token_for_legacy(item.token, evoToken))
            {
                // A "\uXXXX" escape with no dedicated Evo mapping: keep the raw
                // UCS-2 scalar value as the token itself.
                evoToken = item.token;
            }
            else if (isWithinString && item.token != legacyQuote && legacy_token_to_evo_ucs2(item.token, evoToken))
            {
                // Text inside Evo strings is stored as UCS-2 code units.
            }
            else if (!direct_evo_token_for_legacy(item.token, evoToken))
            {
                // No Evo equivalent exists; substitute the '?' replacement token
                // (0xE41B) and warn, rather than dropping the token silently.
                std::cerr << "[Warning] Cannot convert 84+CE token "
                          << TypeHandlers::TH_Tokenized::oneTokenBytesToString(item.token)
                          << " to an Evo token; replacing with ?" << std::endl;
                evoToken = 0xE41B;
            }

            append_evo_token(evo, evoToken);
            if (item.token == legacyQuote)
            {
                // Track quote pairs so string interiors get UCS-2 treatment.
                isWithinString = !isWithinString;
            }
            continue;
        }

        // Unmatched source text: emit it directly when it is a single
        // displayable UCS-2 code point; otherwise warn and skip it.
        uint16_t codepoint = 0;
        if (utf8_to_single_codepoint(item.text, codepoint) && is_displayable_ucs2_scalar(codepoint))
        {
            append_evo_token(evo, codepoint);
        }
        else if (!item.text.empty())
        {
            std::cerr << "[Warning] Cannot encode source text \"" << item.text
                      << "\" as an Evo token; skipping it." << std::endl;
        }
    }

    // Terminate the Evo token stream with a 0 token.
    append_evo_token(evo, 0);
    return evo;
}
| 876 | + |
798 | 877 | bool is_evo_tokenized_entry(const TIVarFile::var_entry_t& entry) |
799 | 878 | { |
800 | 879 | const EvoTypeID evoTypeID = entry.evoTypeID; |
|
0 commit comments