|
12 | 12 | #include <cstring> |
13 | 13 | #include <iostream> |
14 | 14 | #include <ranges> |
| 15 | +#include <sstream> |
15 | 16 | #include <stdexcept> |
16 | 17 | #include <unordered_map> |
17 | 18 |
|
18 | 19 | #include "TIVarTypes.h" |
19 | 20 | #include "TypeHandlers/TypeHandlers.h" |
| 21 | +#include "json.hpp" |
20 | 22 | #include "tivarslib_utils.h" |
21 | 23 |
|
| 24 | +using json = nlohmann::ordered_json; |
| 25 | + |
22 | 26 | namespace tivars::EvoFormat |
23 | 27 | { |
24 | 28 | namespace |
@@ -112,6 +116,7 @@ namespace tivars::EvoFormat |
112 | 116 | codepoint = static_cast<uint16_t>(value); |
113 | 117 | return true; |
114 | 118 | } |
| 119 | + |
115 | 120 | } |
116 | 121 |
|
117 | 122 | uint16_t evo_checksum(const data_t& body) |
@@ -747,6 +752,9 @@ static const char* evo_token_name(uint16_t token) |
747 | 752 | } |
748 | 753 |
|
// Forward declarations of file-local token helpers (their definitions are not part of this excerpt).
// As used by tokenize_evo_token_words(): the bool-returning helpers write their result through
// the uint16_t& parameter and a false return is treated as "no mapping available";
// append_evo_token() appends one Evo token word to the output buffer.
749 | 754 | static bool direct_legacy_token_for_evo(uint16_t evoToken, uint16_t& legacyToken); |
| 755 | +static bool direct_evo_token_for_legacy(uint16_t legacyToken, uint16_t& evoToken); |
| 756 | +static void append_evo_token(data_t& out, uint16_t evoToken); |
| 757 | +static bool legacy_token_to_evo_ucs2(uint16_t legacyToken, uint16_t& evoToken); |
750 | 758 |
|
751 | 759 | static std::string evo_token_to_string(uint16_t token) |
752 | 760 | { |
@@ -795,6 +803,105 @@ std::string detokenize_evo_token_words(const data_t& data) |
795 | 803 | return out; |
796 | 804 | } |
797 | 805 |
|
// Builds an Evo token-word buffer from `source`.
//
// Input handling:
//   - If the trimmed input starts with '{' it is first tried as JSON. A "rawDataHex" member
//     short-circuits everything: its string value is handed to hex_string_to_bytes() and that
//     result is returned directly. Otherwise a "code" member, if present, replaces the text
//     that gets tokenized. Any json::exception (parse failure, wrong member type) is swallowed
//     and the unmodified input is tokenized instead.
//   - options["deindent"] == 1 passes every line through ltrim() before tokenizing.
//   - options["detect_strings"] is forwarded to the shared scanner; it is enabled when the key
//     is absent and only an explicit 0 disables it.
//
// Returns the token words written by append_evo_token(), followed by a terminating 0 token.
//
// NOTE(review): only json::exception is caught below; anything else thrown inside the try
// block (e.g. whatever hex_string_to_bytes() raises on malformed hex - not visible here)
// propagates to the caller.
| 806 | +data_t tokenize_evo_token_words(const std::string& source, const options_t& options) |
| 807 | +{ |
| 808 | + const bool deindent = options.contains("deindent") && options.at("deindent") == 1; |
| 809 | + const bool detectStrings = !options.contains("detect_strings") || options.at("detect_strings") != 0; |
| 810 | + |
// sourceText starts as a copy of the input and is replaced only when a JSON "code" member is found.
| 811 | + std::string sourceText = source; |
| 812 | + const std::string trimmed = trim(sourceText); |
| 813 | + if (!trimmed.empty() && trimmed.front() == '{') |
| 814 | + { |
| 815 | + try |
| 816 | + { |
| 817 | + const json j = json::parse(trimmed); |
// "rawDataHex" takes precedence over "code": when present, no tokenization happens at all.
| 818 | + if (j.contains("rawDataHex")) |
| 819 | + { |
| 820 | + return hex_string_to_bytes(j.at("rawDataHex").get<std::string>(), "rawDataHex"); |
| 821 | + } |
| 822 | + if (j.contains("code")) |
| 823 | + { |
| 824 | + sourceText = j.at("code").get<std::string>(); |
| 825 | + } |
| 826 | + } |
| 827 | + catch (const json::exception&) |
| 828 | + { |
| 829 | + // Ignore non-JSON input and fall back to regular Evo tokenized parsing. |
| 830 | + } |
| 831 | + } |
| 832 | + |
| 833 | + std::string normalizedSource; |
| 834 | + if (deindent) |
| 835 | + { |
// Lines are split on '\n' by std::getline and re-joined with '\n'.
| 836 | + std::istringstream lines{sourceText}; |
| 837 | + std::string line; |
| 838 | + while (std::getline(lines, line)) |
| 839 | + { |
| 840 | + normalizedSource += ltrim(line) + "\n"; |
| 841 | + } |
// Remove the '\n' that was appended after the last line. As a consequence, a single trailing
// newline in the input is not preserved in this branch (it is in the non-deindent branch).
| 842 | + if (!normalizedSource.empty()) |
| 843 | + { |
| 844 | + normalizedSource.pop_back(); |
| 845 | + } |
| 846 | + } |
| 847 | + else |
| 848 | + { |
| 849 | + normalizedSource = sourceText; |
| 850 | + } |
| 851 | + |
// Scanner token value that toggles the string state tracked below.
| 852 | + static constexpr uint16_t legacyQuote = 0x2A; |
| 853 | + |
| 854 | + data_t evo; |
// Capacity hint only; presumably two bytes per token word plus the terminator - TODO confirm
// against append_evo_token().
| 855 | + evo.reserve((normalizedSource.size() + 1) * 2); |
| 856 | + bool isWithinString = false; |
| 857 | + |
| 858 | + for (const auto& [text, token, matched] : TypeHandlers::TH_Tokenized::scanSourceTokens(normalizedSource, detectStrings)) |
| 859 | + { |
// matched == true: the scanner resolved `text` to a token (an "84+CE token" per the warning below).
| 860 | + if (matched) |
| 861 | + { |
| 862 | + uint16_t evoToken = 0; |
// Branch 1: `text` begins with a literal backslash-u (rfind(..., 0) == 0 is a prefix test), the
// value passes is_displayable_ucs2_scalar(), and no direct Evo mapping exists: the value is
// emitted unchanged. When a direct mapping does exist, control falls through to the branches
// below, where direct_evo_token_for_legacy() may be called a second time.
| 863 | + if (text.rfind("\\u", 0) == 0 && is_displayable_ucs2_scalar(token) && !direct_evo_token_for_legacy(token, evoToken)) |
| 864 | + { |
| 865 | + evoToken = token; |
| 866 | + } |
// Branch 2: inside a string and not the quote itself: on success legacy_token_to_evo_ucs2()
// has already filled evoToken, so the body is intentionally empty.
| 867 | + else if (isWithinString && token != legacyQuote && legacy_token_to_evo_ucs2(token, evoToken)) |
| 868 | + { |
| 869 | + // Text inside Evo strings is stored as UCS-2 code units. |
| 870 | + } |
// Branch 3: everything else (including both opening and closing quotes) uses the direct
// mapping; if there is none, warn on stderr and substitute 0xE41B (described as "?" in the message).
| 871 | + else if (!direct_evo_token_for_legacy(token, evoToken)) |
| 872 | + { |
| 873 | + std::cerr << "[Warning] Cannot convert 84+CE token " |
| 874 | + << TypeHandlers::TH_Tokenized::oneTokenBytesToString(token) |
| 875 | + << " to an Evo token; replacing with ?" << std::endl; |
| 876 | + evoToken = 0xE41B; |
| 877 | + } |
| 878 | + |
| 879 | + append_evo_token(evo, evoToken); |
// The string state flips only after the quote token itself has been converted and appended.
// NOTE(review): nothing in this function resets isWithinString at a line break or any other
// terminator, so an unterminated string carries over into the following lines - confirm this
// matches the scanner's and the Evo format's string rules.
| 880 | + if (token == legacyQuote) |
| 881 | + { |
| 882 | + isWithinString = !isWithinString; |
| 883 | + } |
| 884 | + continue; |
| 885 | + } |
| 886 | + |
| 887 | + // The shared scanner leaves unknown-but-valid UTF-8 source text here; |
| 888 | + // Evo can store displayable UCS-2 characters directly as token words. |
| 889 | + uint16_t codepoint = 0; |
| 890 | + if (utf8_to_single_codepoint(text, codepoint) && is_displayable_ucs2_scalar(codepoint)) |
| 891 | + { |
| 892 | + append_evo_token(evo, codepoint); |
| 893 | + } |
// Non-empty text that cannot be encoded is reported and dropped; empty text is ignored silently.
| 894 | + else if (!text.empty()) |
| 895 | + { |
| 896 | + std::cerr << "[Warning] Cannot encode source text \"" << text |
| 897 | + << "\" as an Evo token; skipping it." << std::endl; |
| 898 | + } |
| 899 | + } |
| 900 | + |
// A 0 token is always appended last, even when the input produced no tokens.
| 901 | + append_evo_token(evo, 0); |
| 902 | + return evo; |
| 903 | +} |
| 904 | + |
798 | 905 | bool is_evo_tokenized_entry(const TIVarFile::var_entry_t& entry) |
799 | 906 | { |
800 | 907 | const EvoTypeID evoTypeID = entry.evoTypeID; |
|
0 commit comments