Skip to content

Commit fc10afc

Browse files
committed
Evo: make tokenization a 1st class citizen feature.
1 parent 0185075 commit fc10afc

6 files changed

Lines changed: 193 additions & 4 deletions

File tree

src/EvoFormat.cpp

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,17 @@
1212
#include <cstring>
1313
#include <iostream>
1414
#include <ranges>
15+
#include <sstream>
1516
#include <stdexcept>
1617
#include <unordered_map>
1718

1819
#include "TIVarTypes.h"
1920
#include "TypeHandlers/TypeHandlers.h"
21+
#include "json.hpp"
2022
#include "tivarslib_utils.h"
2123

24+
using json = nlohmann::ordered_json;
25+
2226
namespace tivars::EvoFormat
2327
{
2428
namespace
@@ -112,6 +116,7 @@ namespace tivars::EvoFormat
112116
codepoint = static_cast<uint16_t>(value);
113117
return true;
114118
}
119+
115120
}
116121

117122
uint16_t evo_checksum(const data_t& body)
@@ -747,6 +752,9 @@ static const char* evo_token_name(uint16_t token)
747752
}
748753

749754
static bool direct_legacy_token_for_evo(uint16_t evoToken, uint16_t& legacyToken);
755+
static bool direct_evo_token_for_legacy(uint16_t legacyToken, uint16_t& evoToken);
756+
static void append_evo_token(data_t& out, uint16_t evoToken);
757+
static bool legacy_token_to_evo_ucs2(uint16_t legacyToken, uint16_t& evoToken);
750758

751759
static std::string evo_token_to_string(uint16_t token)
752760
{
@@ -795,6 +803,112 @@ std::string detokenize_evo_token_words(const data_t& data)
795803
return out;
796804
}
797805

806+
/**
 * Tokenizes program source text directly into Evo token words.
 *
 * If the trimmed input starts with '{', it is first tried as JSON:
 *  - a "rawDataHex" member is decoded and returned as-is;
 *  - otherwise a "code" member, when present, replaces the text to tokenize.
 * Any JSON error makes the function fall back to tokenizing the input verbatim.
 *
 * Recognized options:
 *  - "deindent" (== 1): strip leading whitespace from every source line;
 *  - "detect_strings" (absent or != 0): forwarded to the shared source scanner.
 *
 * @param source  Source text, or a JSON document as described above.
 * @param options Tokenization options.
 * @return The Evo token words followed by a terminating 0 token.
 */
data_t tokenize_evo_token_words(const std::string& source, const options_t& options)
{
    const auto deindentOpt = options.find("deindent");
    const bool stripIndent = deindentOpt != options.end() && deindentOpt->second == 1;

    const auto detectOpt = options.find("detect_strings");
    const bool scanStrings = detectOpt == options.end() || detectOpt->second != 0;

    std::string program = source;
    const std::string stripped = trim(program);
    if (!stripped.empty() && stripped.front() == '{')
    {
        try
        {
            const json doc = json::parse(stripped);
            if (doc.contains("rawDataHex"))
            {
                // Already-tokenized payload: return its bytes untouched.
                return hex_string_to_bytes(doc.at("rawDataHex").get<std::string>(), "rawDataHex");
            }
            if (doc.contains("code"))
            {
                program = doc.at("code").get<std::string>();
            }
        }
        catch (const json::exception&)
        {
            // Ignore non-JSON input and fall back to regular Evo tokenized parsing.
        }
    }

    std::string prepared;
    if (!stripIndent)
    {
        prepared = program;
    }
    else
    {
        // Re-join the left-trimmed lines with '\n' separators (no trailing newline).
        std::istringstream lineStream{program};
        bool isFirstLine = true;
        for (std::string line; std::getline(lineStream, line); )
        {
            if (!isFirstLine)
            {
                prepared += "\n";
            }
            prepared += ltrim(line);
            isFirstLine = false;
        }
    }

    static constexpr uint16_t legacyStoreTok = 0x04;
    static constexpr uint16_t legacyQuoteTok = 0x2A;
    static constexpr uint16_t legacyNewLineTok = 0x3F;

    data_t words;
    words.reserve((prepared.size() + 1) * 2);
    bool insideString = false;

    for (const auto& item : TypeHandlers::TH_Tokenized::scanSourceTokens(prepared, scanStrings))
    {
        const std::string& text = item.text;
        const uint16_t token = item.token;

        if (!item.matched)
        {
            // The shared scanner leaves unknown-but-valid UTF-8 source text here;
            // Evo can store displayable UCS-2 characters directly as token words.
            uint16_t codepoint = 0;
            if (utf8_to_single_codepoint(text, codepoint) && is_displayable_ucs2_scalar(codepoint))
            {
                append_evo_token(words, codepoint);
            }
            else if (!text.empty())
            {
                std::cerr << "[Warning] Cannot encode source text \"" << text
                          << "\" as an Evo token; skipping it." << std::endl;
            }
            continue;
        }

        // A store arrow or a newline always terminates an open string.
        if (token == legacyStoreTok || token == legacyNewLineTok)
        {
            insideString = false;
        }

        const bool isUnicodeEscape = text.compare(0, 2, "\\u") == 0 && is_displayable_ucs2_scalar(token);
        const bool isStringContent = insideString && token != legacyQuoteTok;

        // The order of these checks matters: the lookups write to evoWord as they go.
        uint16_t evoWord = 0;
        if (isUnicodeEscape && !direct_evo_token_for_legacy(token, evoWord))
        {
            // Explicit \uXXXX escape with no dedicated Evo token: store the code unit itself.
            evoWord = token;
        }
        else if (isStringContent && legacy_token_to_evo_ucs2(token, evoWord))
        {
            // Text inside Evo strings is stored as UCS-2 code units.
        }
        else if (!direct_evo_token_for_legacy(token, evoWord))
        {
            std::cerr << "[Warning] Cannot convert 84+CE token "
                      << TypeHandlers::TH_Tokenized::oneTokenBytesToString(token)
                      << " to an Evo token; replacing with ?" << std::endl;
            evoWord = 0xE41B;
        }

        append_evo_token(words, evoWord);
        if (token == legacyQuoteTok)
        {
            insideString = !insideString;
        }
    }

    // Terminating null token word.
    append_evo_token(words, 0);
    return words;
}
911+
798912
bool is_evo_tokenized_entry(const TIVarFile::var_entry_t& entry)
799913
{
800914
const EvoTypeID evoTypeID = entry.evoTypeID;
@@ -1894,6 +2008,7 @@ data_t legacy_tokenized_data_to_evo(const data_t& legacyData, bool smartConversi
18942008
throw std::invalid_argument("Invalid tokenized legacy data");
18952009
}
18962010

2011+
static constexpr uint16_t legacyStore = 0x04;
18972012
static constexpr uint16_t legacyQuote = 0x2A;
18982013
static constexpr uint16_t legacyColon = 0x3E;
18992014
static constexpr uint16_t legacyNewLine = 0x3F;
@@ -1925,6 +2040,11 @@ data_t legacy_tokenized_data_to_evo(const data_t& legacyData, bool smartConversi
19252040
}
19262041

19272042
uint16_t evoToken = 0;
2043+
if (legacyToken == legacyStore || legacyToken == legacyNewLine)
2044+
{
2045+
isWithinString = false;
2046+
}
2047+
19282048
if (isWithinString && legacyToken != legacyQuote && legacy_token_to_evo_ucs2(legacyToken, evoToken))
19292049
{
19302050
// Text inside Evo strings is stored as UCS-2 code units.

src/EvoFormat.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ namespace tivars::EvoFormat
6868
EvoPythonScriptInfo parse_evo_python_script_payload(const data_t& data);
6969

7070
std::string detokenize_evo_token_words(const data_t& data);
71+
data_t tokenize_evo_token_words(const std::string& source, const options_t& options = options_t());
7172
bool is_evo_tokenized_entry(const TIVarFile::var_entry_t& entry);
7273
data_t evo_tokenized_data_to_legacy(const data_t& evoData);
7374
data_t legacy_tokenized_data_to_evo(const data_t& legacyData, bool smartConversion = false);

src/TIVarFile.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -991,12 +991,17 @@ namespace tivars
991991
void TIVarFile::setContentFromString(const std::string& str, const options_t& options, uint16_t entryIdx)
992992
{
993993
auto& entry = this->entries[entryIdx];
994-
data_t data = std::get<0>(entry._type.getHandlers())(str, options, this);
994+
data_t data;
995995
if (this->evoFormat && is_evo_tokenized_entry(entry))
996996
{
997-
data = legacy_tokenized_data_to_evo(data);
997+
data = tokenize_evo_token_words(str, options);
998998
}
999-
else if (this->evoFormat && is_legacy_numeric_entry(entry))
999+
else
1000+
{
1001+
data = std::get<0>(entry._type.getHandlers())(str, options, this);
1002+
}
1003+
1004+
if (this->evoFormat && is_legacy_numeric_entry(entry))
10001005
{
10011006
const bool exactFraction = legacy_value_is_exact_fraction(data);
10021007
data = legacy_value_to_evo_expression(data);

src/TypeHandlers/TH_Tokenized.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -550,7 +550,10 @@ namespace tivars::TypeHandlers
550550
continue;
551551
}
552552

553-
const std::string currChar = str.substr(strCursorPos, 1);
553+
// Unmatched text is reported as one UTF-8 codepoint so callers can
554+
// handle non-CE characters without receiving split byte fragments.
555+
const size_t currCharLen = std::max<size_t>(utf8_codepoint_len_at(str, strCursorPos), 1);
556+
const std::string currChar = str.substr(strCursorPos, currCharLen);
554557

555558
if (currChar == backslashStr)
556559
{
@@ -653,6 +656,8 @@ namespace tivars::TypeHandlers
653656
if (!matched)
654657
{
655658
onSkipped(currChar);
659+
// The for-loop will add the final byte advance.
660+
strCursorPos += currCharLen - 1;
656661
}
657662
}
658663
}
@@ -1412,6 +1417,24 @@ namespace tivars::TypeHandlers
14121417
return posinfo;
14131418
}
14141419

1420+
std::vector<TH_Tokenized::token_scan_item> TH_Tokenized::scanSourceTokens(const std::string& sourceStr, bool detectStrings)
1421+
{
1422+
ensure_tokens_initialized();
1423+
1424+
std::vector<token_scan_item> items;
1425+
items.reserve(sourceStr.size());
1426+
scan_source_tokens(sourceStr, detectStrings,
1427+
[&](const std::string& tokenStr, uint16_t tokenValue)
1428+
{
1429+
items.push_back({ tokenStr, tokenValue, true });
1430+
},
1431+
[&](const std::string& skippedStr)
1432+
{
1433+
items.push_back({ skippedStr, 0, false });
1434+
});
1435+
return items;
1436+
}
1437+
14151438
TH_Tokenized::token_posinfo TH_Tokenized::getPosInfoAtOffsetFromHexStr(const std::string& hexBytesStr, uint16_t byteOffset)
14161439
{
14171440
const size_t strLen = hexBytesStr.length();

src/TypeHandlers/TypeHandlers.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,10 +244,12 @@ namespace tivars::TypeHandlers
244244
enum typelang { PRGMLANG_BASIC = 0, PRGMLANG_AXE, PRGMLANG_ICE };
245245
enum indentchar : char { INDENT_CHAR_SPACE = ' ', INDENT_CHAR_TAB = '\t' };
246246
struct token_posinfo { uint16_t line; uint16_t column; uint8_t len; };
247+
// One entry reported by scanSourceTokens(): either a recognized token or a
// piece of source text the scanner could not match.
struct token_scan_item
{
    std::string text;    // source text of the token, or the skipped text when unmatched
    uint16_t token;      // token value for matched entries; 0 for unmatched ones
    bool matched;        // true when `text` was recognized as a token
};
247248
static std::string reindentCodeString(const std::string& str_orig, const options_t& options = options_t());
248249
static token_posinfo getPosInfoAtOffset(const data_t& data, uint16_t byteOffset, const options_t& options = options_t());
249250
static token_posinfo getPosInfoAtOffsetFromHexStr(const std::string& hexBytesStr, uint16_t byteOffset);
250251
static token_posinfo getPosInfoAtOffsetInSourceString(const std::string& sourceStr, uint16_t byteOffset);
252+
static std::vector<token_scan_item> scanSourceTokens(const std::string& sourceStr, bool detectStrings = true);
251253
static std::string tokenToString(const data_t& data, int *incr, const options_t& options);
252254
static std::string oneTokenBytesToString(uint16_t tokenBytes);
253255
};

tests.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1666,6 +1666,26 @@ int main(int argc, char** argv)
16661666
assert(lowerStringProgram.getReadableContent() == "Disp \"Hello World");
16671667
}
16681668

1669+
{
1670+
const std::string source = "Pause \"PRESS ENTER\nDelVar S\n1→G\nFor(Y,0,250,250";
1671+
1672+
TIVarFile directEvoProgram = TIVarFile::createNew("Program", "LINETST", "84Evo");
1673+
directEvoProgram.setContentFromString(source);
1674+
const std::string directEvoHex = json::parse(directEvoProgram.getReadableContent())["rawDataHex"];
1675+
assert(directEvoHex.find("CAE416E450005200450053005300200045004E00540045005200") != std::string::npos);
1676+
assert(directEvoHex.find("E0E412E8") != std::string::npos);
1677+
assert(directEvoHex.find("E402E41DE406E8") != std::string::npos);
1678+
assert(directEvoHex.find("C5E418E817E401E417E403E406E401E417E403E406E401E4") != std::string::npos);
1679+
assert(directEvoHex.find("E0E45300") == std::string::npos);
1680+
1681+
TIVarFile legacyProgram = TIVarFile::createNew("Program", "LINETST", "84+CE");
1682+
legacyProgram.setContentFromString(source);
1683+
legacyProgram.convertToModel(TIModel{"84Evo"});
1684+
const std::string convertedEvoHex = json::parse(legacyProgram.getReadableContent())["rawDataHex"];
1685+
assert(convertedEvoHex.find("E0E412E8") != std::string::npos);
1686+
assert(convertedEvoHex.find("E0E45300") == std::string::npos);
1687+
}
1688+
16691689
{
16701690
TIVarFile rawUcs2Program = TIVarFile::createNew("Program", "UCS2", "84Evo");
16711691
rawUcs2Program.setContentFromData({0x48, 0x00, 0xE9, 0x00, 0x00, 0x00});
@@ -1694,6 +1714,20 @@ int main(int argc, char** argv)
16941714
assert(code.find("\\u0178") != std::string::npos);
16951715
}
16961716

1717+
{
1718+
assert(EvoFormat::tokenize_evo_token_words("ŷ") == data_t({0x77, 0x01, 0x00, 0x00}));
1719+
1720+
TIVarFile directYHatProgram = TIVarFile::createNew("Program", "YHAT2", "84Evo");
1721+
directYHatProgram.setContentFromString("Disp \"ŷ\"");
1722+
const json readable = json::parse(directYHatProgram.getReadableContent());
1723+
assert(readable["code"] == "Disp \"ŷ\"");
1724+
assert(readable["rawDataHex"].get<std::string>().find("7701") != std::string::npos);
1725+
1726+
TIVarFile recreatedYHatProgram = TIVarFile::createNew("Program", "YHAT3", "84Evo");
1727+
recreatedYHatProgram.setContentFromString(directYHatProgram.getReadableContent());
1728+
assert(recreatedYHatProgram.getRawContent() == directYHatProgram.getRawContent());
1729+
}
1730+
16971731
{
16981732
TIVarFile stringVarProgram = TIVarFile::createNew("Program", "MYPRGM", "84+CE");
16991733
stringVarProgram.setContentFromString(R"(Ans→N
@@ -1747,6 +1781,10 @@ Disp Str1
17471781
evalProgram.convertToModel(TIModel{"84+CE"});
17481782
assert(!evalProgram.isEvoFormat());
17491783
assert(evalProgram.getRawContent() == originalLegacyData);
1784+
1785+
TIVarFile directEvoEvalProgram = TIVarFile::createNew("Program", "EVALTOK2", "84Evo");
1786+
directEvoEvalProgram.setContentFromString("eval(\"1+1\"");
1787+
assert(json::parse(directEvoEvalProgram.getReadableContent())["rawDataHex"] == "E4E416E431002B00310016E40000");
17501788
}
17511789

17521790
{

0 commit comments

Comments
 (0)