|
12 | 12 | #include <cstring> |
13 | 13 | #include <iostream> |
14 | 14 | #include <ranges> |
| 15 | +#include <sstream> |
15 | 16 | #include <stdexcept> |
16 | 17 | #include <unordered_map> |
17 | 18 |
|
@@ -747,6 +748,9 @@ static const char* evo_token_name(uint16_t token) |
747 | 748 | } |
748 | 749 |
|
749 | 750 | static bool direct_legacy_token_for_evo(uint16_t evoToken, uint16_t& legacyToken); |
| 751 | +static bool direct_evo_token_for_legacy(uint16_t legacyToken, uint16_t& evoToken); |
| 752 | +static void append_evo_token(data_t& out, uint16_t evoToken); |
| 753 | +static bool legacy_token_to_evo_ucs2(uint16_t legacyToken, uint16_t& evoToken); |
750 | 754 |
|
751 | 755 | static std::string evo_token_to_string(uint16_t token) |
752 | 756 | { |
@@ -795,6 +799,81 @@ std::string detokenize_evo_token_words(const data_t& data) |
795 | 799 | return out; |
796 | 800 | } |
797 | 801 |
|
// Tokenizes 84+CE ("legacy") source text into an Evo token stream.
// The result is a data_t of 16-bit tokens (written via append_evo_token)
// terminated by a trailing 0 token.
//
// Recognized options:
//   - "deindent"       : when present and == 1, leading whitespace is stripped
//                        from every line before tokenizing.
//   - "detect_strings" : string detection is enabled unless this is present
//                        and == 0 (i.e. it defaults to on).
data_t tokenize_evo_token_words(const std::string& source, const options_t& options)
{
    const bool deindent = options.contains("deindent") && options.at("deindent") == 1;
    // Defaults to true when the option is absent.
    const bool detectStrings = !options.contains("detect_strings") || options.at("detect_strings") != 0;

    std::string normalizedSource;
    if (deindent)
    {
        // Rebuild the source line by line with leading whitespace removed.
        std::istringstream lines{source};
        std::string line;
        while (std::getline(lines, line))
        {
            normalizedSource += ltrim(line) + "\n";
        }
        if (!normalizedSource.empty())
        {
            // Drop the extra "\n" appended after the final line above.
            normalizedSource.pop_back();
        }
    }
    else
    {
        normalizedSource = source;
    }

    // Legacy token value for the double-quote character (string delimiter).
    static constexpr uint16_t legacyQuote = 0x2A;

    data_t evo;
    // Worst case: one 16-bit Evo token per input byte, plus the 0 terminator.
    evo.reserve((normalizedSource.size() + 1) * 2);
    bool isWithinString = false;

    for (const auto& item : TypeHandlers::TH_Tokenized::scanSourceTokens(normalizedSource, detectStrings))
    {
        if (item.matched)
        {
            // NOTE: direct_evo_token_for_legacy and legacy_token_to_evo_ucs2
            // write their result through the evoToken out-parameter as a side
            // effect of being evaluated inside the conditions below. The branch
            // ordering and short-circuit evaluation are load-bearing: do not
            // reorder these tests.
            uint16_t evoToken = 0;
            if (item.text.rfind("\\u", 0) == 0 && is_displayable_ucs2_scalar(item.token) && !direct_evo_token_for_legacy(item.token, evoToken))
            {
                // A "\uXXXX" escape with no dedicated Evo mapping: keep the raw
                // UCS-2 scalar value as the token itself.
                evoToken = item.token;
            }
            else if (isWithinString && item.token != legacyQuote && legacy_token_to_evo_ucs2(item.token, evoToken))
            {
                // Text inside Evo strings is stored as UCS-2 code units.
            }
            else if (!direct_evo_token_for_legacy(item.token, evoToken))
            {
                // No Evo equivalent exists; substitute the '?' replacement token
                // (0xE41B) and warn, rather than dropping the token silently.
                std::cerr << "[Warning] Cannot convert 84+CE token "
                          << TypeHandlers::TH_Tokenized::oneTokenBytesToString(item.token)
                          << " to an Evo token; replacing with ?" << std::endl;
                evoToken = 0xE41B;
            }

            append_evo_token(evo, evoToken);
            if (item.token == legacyQuote)
            {
                // Track quote pairs so string interiors get UCS-2 treatment.
                isWithinString = !isWithinString;
            }
            continue;
        }

        // Unmatched source text: emit it directly when it is a single
        // displayable UCS-2 code point; otherwise warn and skip it.
        uint16_t codepoint = 0;
        if (utf8_to_single_codepoint(item.text, codepoint) && is_displayable_ucs2_scalar(codepoint))
        {
            append_evo_token(evo, codepoint);
        }
        else if (!item.text.empty())
        {
            std::cerr << "[Warning] Cannot encode source text \"" << item.text
                      << "\" as an Evo token; skipping it." << std::endl;
        }
    }

    // Terminate the Evo token stream with a 0 token.
    append_evo_token(evo, 0);
    return evo;
}
| 876 | + |
798 | 877 | bool is_evo_tokenized_entry(const TIVarFile::var_entry_t& entry) |
799 | 878 | { |
800 | 879 | const EvoTypeID evoTypeID = entry.evoTypeID; |
|
0 commit comments