Evo: make tokenization a 1st class citizen feature.

adriweb · adriweb · commit f09996afbe34 · 2026-05-14T16:53:27.000+02:00
diff --git a/src/EvoFormat.cpp b/src/EvoFormat.cpp
@@ -12,13 +12,17 @@
 #include <cstring>
 #include <iostream>
 #include <ranges>
+#include <sstream>
 #include <stdexcept>
 #include <unordered_map>
 
 #include "TIVarTypes.h"
 #include "TypeHandlers/TypeHandlers.h"
+#include "json.hpp"
 #include "tivarslib_utils.h"
 
+using json = nlohmann::ordered_json;
+
 namespace tivars::EvoFormat
 {
     namespace
@@ -112,6 +116,7 @@ namespace tivars::EvoFormat
             codepoint = static_cast<uint16_t>(value);
             return true;
         }
+
     }
 
 uint16_t evo_checksum(const data_t& body)
@@ -747,6 +752,9 @@ static const char* evo_token_name(uint16_t token)
 }
 
 static bool direct_legacy_token_for_evo(uint16_t evoToken, uint16_t& legacyToken);
+static bool direct_evo_token_for_legacy(uint16_t legacyToken, uint16_t& evoToken);
+static void append_evo_token(data_t& out, uint16_t evoToken);
+static bool legacy_token_to_evo_ucs2(uint16_t legacyToken, uint16_t& evoToken);
 
 static std::string evo_token_to_string(uint16_t token)
 {
@@ -795,6 +803,105 @@ std::string detokenize_evo_token_words(const data_t& data)
     return out;
 }
 
+data_t tokenize_evo_token_words(const std::string& source, const options_t& options) // Converts source text (or a JSON wrapper around it) into Evo token-word bytes.
+{
+    const bool deindent = options.contains("deindent") && options.at("deindent") == 1; // opt-in: enabled only by an explicit 1
+    const bool detectStrings = !options.contains("detect_strings") || options.at("detect_strings") != 0; // on by default: disabled only by an explicit 0
+
+    std::string sourceText = source;
+    const std::string trimmed = trim(sourceText);
+    if (!trimmed.empty() && trimmed.front() == '{') // input might be a JSON object instead of plain source
+    {
+        try
+        {
+            const json j = json::parse(trimmed);
+            if (j.contains("rawDataHex"))
+            {
+                return hex_string_to_bytes(j.at("rawDataHex").get<std::string>(), "rawDataHex"); // takes precedence over "code"; returned as-is. NOTE(review): std::invalid_argument from bad hex is not caught by the handler below.
+            }
+            if (j.contains("code"))
+            {
+                sourceText = j.at("code").get<std::string>(); // tokenize the embedded code rather than the JSON wrapper
+            }
+        }
+        catch (const json::exception&)
+        {
+            // Input is not usable as JSON: sourceText still holds the original
+            // input, which is then tokenized below as plain source text.
+        }
+    }
+
+    std::string normalizedSource;
+    if (deindent)
+    {
+        std::istringstream lines{sourceText};
+        std::string line;
+        while (std::getline(lines, line))
+        {
+            normalizedSource += ltrim(line) + "\n"; // presumably strips leading whitespace from each line - confirm against ltrim
+        }
+        if (!normalizedSource.empty())
+        {
+            normalizedSource.pop_back(); // remove the newline added after the last line (a trailing newline in the input is therefore not kept)
+        }
+    }
+    else
+    {
+        normalizedSource = sourceText;
+    }
+
+    static constexpr uint16_t legacyQuote = 0x2A; // legacy token that toggles isWithinString below
+
+    data_t evo;
+    evo.reserve((normalizedSource.size() + 1) * 2); // capacity hint only; presumably two bytes per emitted token word - confirm against append_evo_token
+    bool isWithinString = false;
+
+    for (const auto& [text, token, matched] : TypeHandlers::TH_Tokenized::scanSourceTokens(normalizedSource, detectStrings))
+    {
+        if (matched)
+        {
+            uint16_t evoToken = 0;
+            if (text.rfind("\\u", 0) == 0 && is_displayable_ucs2_scalar(token) && !direct_evo_token_for_legacy(token, evoToken)) // text begins with "\u" and there is no direct Evo mapping
+            {
+                evoToken = token; // store the value as-is
+            }
+            else if (isWithinString && token != legacyQuote && legacy_token_to_evo_ucs2(token, evoToken))
+            {
+                // Text inside Evo strings is stored as UCS-2 code units;
+                // evoToken was already filled in by the call in the condition.
+            }
+            else if (!direct_evo_token_for_legacy(token, evoToken))
+            {
+                std::cerr << "[Warning] Cannot convert 84+CE token "
+                          << TypeHandlers::TH_Tokenized::oneTokenBytesToString(token)
+                          << " to an Evo token; replacing with ?" << std::endl;
+                evoToken = 0xE41B; // substitute token, described as "?" by the warning above
+            }
+
+            append_evo_token(evo, evoToken);
+            if (token == legacyQuote)
+            {
+                isWithinString = !isWithinString; // a quote token opens or closes a string
+            }
+            continue;
+        }
+
+        // The shared scanner leaves unknown-but-valid UTF-8 source text here;
+        // Evo can store displayable UCS-2 characters directly as token words.
+        uint16_t codepoint = 0;
+        if (utf8_to_single_codepoint(text, codepoint) && is_displayable_ucs2_scalar(codepoint))
+        {
+            append_evo_token(evo, codepoint);
+        }
+        else if (!text.empty()) // anything else is dropped from the output, with a warning
+        {
+            std::cerr << "[Warning] Cannot encode source text \"" << text
+                      << "\" as an Evo token; skipping it." << std::endl;
+        }
+    }
+
+    append_evo_token(evo, 0); // the output always ends with a zero token
+    return evo;
+}
+
 bool is_evo_tokenized_entry(const TIVarFile::var_entry_t& entry)
 {
     const EvoTypeID evoTypeID = entry.evoTypeID;
diff --git a/src/EvoFormat.h b/src/EvoFormat.h
@@ -68,6 +68,7 @@ namespace tivars::EvoFormat
     EvoPythonScriptInfo parse_evo_python_script_payload(const data_t& data);
 
     std::string detokenize_evo_token_words(const data_t& data);
+    data_t tokenize_evo_token_words(const std::string& source, const options_t& options = options_t());
     bool is_evo_tokenized_entry(const TIVarFile::var_entry_t& entry);
     data_t evo_tokenized_data_to_legacy(const data_t& evoData);
     data_t legacy_tokenized_data_to_evo(const data_t& legacyData, bool smartConversion = false);
diff --git a/src/TIVarFile.cpp b/src/TIVarFile.cpp
@@ -991,12 +991,17 @@ namespace tivars
     void TIVarFile::setContentFromString(const std::string& str, const options_t& options, uint16_t entryIdx)
     {
         auto& entry = this->entries[entryIdx];
-        data_t data = std::get<0>(entry._type.getHandlers())(str, options, this);
+        data_t data;
         if (this->evoFormat && is_evo_tokenized_entry(entry))
         {
-            data = legacy_tokenized_data_to_evo(data);
+            data = tokenize_evo_token_words(str, options);
         }
-        else if (this->evoFormat && is_legacy_numeric_entry(entry))
+        else
+        {
+            data = std::get<0>(entry._type.getHandlers())(str, options, this);
+        }
+
+        if (this->evoFormat && is_legacy_numeric_entry(entry))
         {
             const bool exactFraction = legacy_value_is_exact_fraction(data);
             data = legacy_value_to_evo_expression(data);
diff --git a/src/TypeHandlers/TH_Tokenized.cpp b/src/TypeHandlers/TH_Tokenized.cpp
@@ -550,7 +550,10 @@ namespace tivars::TypeHandlers
                     continue;
                 }
 
-                const std::string currChar = str.substr(strCursorPos, 1);
+                // Unmatched text is reported as one UTF-8 codepoint so callers can
+                // handle non-CE characters without receiving split byte fragments.
+                const size_t currCharLen = std::max<size_t>(utf8_codepoint_len_at(str, strCursorPos), 1);
+                const std::string currChar = str.substr(strCursorPos, currCharLen);
 
                 if (currChar == backslashStr)
                 {
@@ -653,6 +656,8 @@ namespace tivars::TypeHandlers
                 if (!matched)
                 {
                     onSkipped(currChar);
+                    // The for-loop will add the final byte advance.
+                    strCursorPos += currCharLen - 1;
                 }
             }
         }
@@ -1412,6 +1417,24 @@ namespace tivars::TypeHandlers
         return posinfo;
     }
 
+    std::vector<TH_Tokenized::token_scan_item> TH_Tokenized::scanSourceTokens(const std::string& sourceStr, bool detectStrings) // Runs the source scanner and returns its callbacks' results as one list, in callback order.
+    {
+        ensure_tokens_initialized();
+
+        std::vector<token_scan_item> items;
+        items.reserve(sourceStr.size()); // capacity hint; presumably at most one item per source byte - TODO confirm
+        scan_source_tokens(sourceStr, detectStrings,
+                           [&](const std::string& tokenStr, uint16_t tokenValue)
+                           {
+                               items.push_back({ tokenStr, tokenValue, true }); // matched: text plus its token value
+                           },
+                           [&](const std::string& skippedStr)
+                           {
+                               items.push_back({ skippedStr, 0, false }); // unmatched text: token is 0 and matched is false
+                           });
+        return items;
+    }
+
     TH_Tokenized::token_posinfo TH_Tokenized::getPosInfoAtOffsetFromHexStr(const std::string& hexBytesStr, uint16_t byteOffset)
     {
         const size_t strLen = hexBytesStr.length();
diff --git a/src/TypeHandlers/TypeHandlers.h b/src/TypeHandlers/TypeHandlers.h
@@ -244,10 +244,12 @@ namespace tivars::TypeHandlers
         enum typelang { PRGMLANG_BASIC = 0, PRGMLANG_AXE, PRGMLANG_ICE };
         enum indentchar : char { INDENT_CHAR_SPACE = ' ', INDENT_CHAR_TAB = '\t' };
         struct token_posinfo { uint16_t line; uint16_t column; uint8_t len; };
+        struct token_scan_item { std::string text; uint16_t token; bool matched; };
         static std::string reindentCodeString(const std::string& str_orig, const options_t& options = options_t());
         static token_posinfo getPosInfoAtOffset(const data_t& data, uint16_t byteOffset, const options_t& options = options_t());
         static token_posinfo getPosInfoAtOffsetFromHexStr(const std::string& hexBytesStr, uint16_t byteOffset);
         static token_posinfo getPosInfoAtOffsetInSourceString(const std::string& sourceStr, uint16_t byteOffset);
+        static std::vector<token_scan_item> scanSourceTokens(const std::string& sourceStr, bool detectStrings = true);
         static std::string tokenToString(const data_t& data, int *incr, const options_t& options);
         static std::string oneTokenBytesToString(uint16_t tokenBytes);
     };
diff --git a/src/tivarslib_utils.cpp b/src/tivarslib_utils.cpp
@@ -6,12 +6,14 @@
  */
 
 #include "tivarslib_utils.h"
+#include <cctype>
 #include <cstdlib>
 #include <iomanip>
 #include <sstream>
 #include <cmath>
 #include <cstring>
 #include <limits>
+#include <stdexcept>
 
 using namespace std::string_literals;
 
@@ -86,6 +88,27 @@ unsigned char hexdec(const std::string& str)
     return (unsigned char) stoul(str, nullptr, 16);
 }
 
+data_t hex_string_to_bytes(const std::string& hex, const char* fieldName) // Decodes a string of hex digit pairs into bytes; fieldName only prefixes the error messages.
+{
+    if ((hex.size() % 2) != 0) // every byte needs exactly two digits
+    {
+        throw std::invalid_argument(std::string(fieldName) + " must contain an even number of hex digits");
+    }
+
+    data_t out;
+    out.reserve(hex.size() / 2);
+    for (size_t i = 0; i < hex.size(); i += 2) // i + 1 is in range because the length is even
+    {
+        if (!std::isxdigit(static_cast<unsigned char>(hex[i])) || // cast first: <cctype> functions are undefined for negative char values
+            !std::isxdigit(static_cast<unsigned char>(hex[i + 1])))
+        {
+            throw std::invalid_argument(std::string(fieldName) + " must be valid hexadecimal");
+        }
+        out.push_back(hexdec(hex.substr(i, 2))); // one validated digit pair becomes one byte
+    }
+    return out; // an empty input yields an empty result
+}
+
 std::string dechex(unsigned char i, bool zeropad)
 {
     std::string str = "00";
diff --git a/src/tivarslib_utils.h b/src/tivarslib_utils.h
@@ -34,6 +34,8 @@ void vector_append(std::vector<T>& vec, const std::vector<T>& other)
 
 unsigned char hexdec(const std::string& str);
 
+data_t hex_string_to_bytes(const std::string& hex, const char* fieldName = "hex string");
+
 std::string dechex(unsigned char i, bool zeropad = true);
 
 std::string strtoupper(const std::string& str);
diff --git a/tests.cpp b/tests.cpp
@@ -1694,6 +1694,20 @@ int main(int argc, char** argv)
         assert(code.find("\\u0178") != std::string::npos);
     }
 
+    {
+        assert(EvoFormat::tokenize_evo_token_words("ŷ") == data_t({0x77, 0x01, 0x00, 0x00}));
+
+        TIVarFile directYHatProgram = TIVarFile::createNew("Program", "YHAT2", "84Evo");
+        directYHatProgram.setContentFromString("Disp \"ŷ\"");
+        const json readable = json::parse(directYHatProgram.getReadableContent());
+        assert(readable["code"] == "Disp \"ŷ\"");
+        assert(readable["rawDataHex"].get<std::string>().find("7701") != std::string::npos);
+
+        TIVarFile recreatedYHatProgram = TIVarFile::createNew("Program", "YHAT3", "84Evo");
+        recreatedYHatProgram.setContentFromString(directYHatProgram.getReadableContent());
+        assert(recreatedYHatProgram.getRawContent() == directYHatProgram.getRawContent());
+    }
+
     {
         TIVarFile stringVarProgram = TIVarFile::createNew("Program", "MYPRGM", "84+CE");
         stringVarProgram.setContentFromString(R"(Ans→N
@@ -1747,6 +1761,10 @@ Disp Str1
         evalProgram.convertToModel(TIModel{"84+CE"});
         assert(!evalProgram.isEvoFormat());
         assert(evalProgram.getRawContent() == originalLegacyData);
+
+        TIVarFile directEvoEvalProgram = TIVarFile::createNew("Program", "EVALTOK2", "84Evo");
+        directEvoEvalProgram.setContentFromString("eval(\"1+1\"");
+        assert(json::parse(directEvoEvalProgram.getReadableContent())["rawDataHex"] == "E4E416E431002B00310016E40000");
     }
 
     {

Original file line number	Diff line number	Diff line change
`@@ -991,12 +991,17 @@ namespace tivars`
`991`	`991`	`void TIVarFile::setContentFromString(const std::string& str, const options_t& options, uint16_t entryIdx)`
`992`	`992`	`{`
`993`	`993`	`auto& entry = this->entries[entryIdx];`
`994`		`- data_t data = std::get<0>(entry._type.getHandlers())(str, options, this);`
	`994`	`+ data_t data;`
`995`	`995`	`if (this->evoFormat && is_evo_tokenized_entry(entry))`
`996`	`996`	`{`
`997`		`- data = legacy_tokenized_data_to_evo(data);`
	`997`	`+ data = tokenize_evo_token_words(str, options);`
`998`	`998`	`}`
`999`		`- else if (this->evoFormat && is_legacy_numeric_entry(entry))`
	`999`	`+ else`
	`1000`	`+ {`
	`1001`	`+ data = std::get<0>(entry._type.getHandlers())(str, options, this);`
	`1002`	`+ }`
	`1003`	`+`
	`1004`	`+ if (this->evoFormat && is_legacy_numeric_entry(entry))`
`1000`	`1005`	`{`
`1001`	`1006`	`const bool exactFraction = legacy_value_is_exact_fraction(data);`
`1002`	`1007`	`data = legacy_value_to_evo_expression(data);`
Original file line number	Diff line number	Diff line change
`@@ -550,7 +550,10 @@ namespace tivars::TypeHandlers`
`550`	`550`	`continue;`
`551`	`551`	`}`
`552`	`552`
`553`		`- const std::string currChar = str.substr(strCursorPos, 1);`
	`553`	`+ // Unmatched text is reported as one UTF-8 codepoint so callers can`
	`554`	`+ // handle non-CE characters without receiving split byte fragments.`
	`555`	`+ const size_t currCharLen = std::max<size_t>(utf8_codepoint_len_at(str, strCursorPos), 1);`
	`556`	`+ const std::string currChar = str.substr(strCursorPos, currCharLen);`
`554`	`557`
`555`	`558`	`if (currChar == backslashStr)`
`556`	`559`	`{`
`@@ -653,6 +656,8 @@ namespace tivars::TypeHandlers`
`653`	`656`	`if (!matched)`
`654`	`657`	`{`
`655`	`658`	`onSkipped(currChar);`
	`659`	`+ // The for-loop will add the final byte advance.`
	`660`	`+ strCursorPos += currCharLen - 1;`
`656`	`661`	`}`
`657`	`662`	`}`
`658`	`663`	`}`
`@@ -1412,6 +1417,24 @@ namespace tivars::TypeHandlers`
`1412`	`1417`	`return posinfo;`
`1413`	`1418`	`}`
`1414`	`1419`
	`1420`	`+ std::vector<TH_Tokenized::token_scan_item> TH_Tokenized::scanSourceTokens(const std::string& sourceStr, bool detectStrings)`
	`1421`	`+ {`
	`1422`	`+ ensure_tokens_initialized();`
	`1423`	`+`
	`1424`	`+ std::vector<token_scan_item> items;`
	`1425`	`+ items.reserve(sourceStr.size());`
	`1426`	`+ scan_source_tokens(sourceStr, detectStrings,`
	`1427`	`+ [&](const std::string& tokenStr, uint16_t tokenValue)`
	`1428`	`+ {`
	`1429`	`+ items.push_back({ tokenStr, tokenValue, true });`
	`1430`	`+ },`
	`1431`	`+ [&](const std::string& skippedStr)`
	`1432`	`+ {`
	`1433`	`+ items.push_back({ skippedStr, 0, false });`
	`1434`	`+ });`
	`1435`	`+ return items;`
	`1436`	`+ }`
	`1437`	`+`
`1415`	`1438`	`TH_Tokenized::token_posinfo TH_Tokenized::getPosInfoAtOffsetFromHexStr(const std::string& hexBytesStr, uint16_t byteOffset)`
`1416`	`1439`	`{`
`1417`	`1440`	`const size_t strLen = hexBytesStr.length();`