Skip to content

Commit ebb56d6

Browse files
committed
Evo: make tokenization a first-class citizen feature.
1 parent 7afa84f commit ebb56d6

6 files changed

Lines changed: 125 additions & 4 deletions

File tree

src/EvoFormat.cpp

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <cstring>
1313
#include <iostream>
1414
#include <ranges>
15+
#include <sstream>
1516
#include <stdexcept>
1617
#include <unordered_map>
1718

@@ -747,6 +748,9 @@ static const char* evo_token_name(uint16_t token)
747748
}
748749

749750
static bool direct_legacy_token_for_evo(uint16_t evoToken, uint16_t& legacyToken);
751+
static bool direct_evo_token_for_legacy(uint16_t legacyToken, uint16_t& evoToken);
752+
static void append_evo_token(data_t& out, uint16_t evoToken);
753+
static bool legacy_token_to_evo_ucs2(uint16_t legacyToken, uint16_t& evoToken);
750754

751755
static std::string evo_token_to_string(uint16_t token)
752756
{
@@ -795,6 +799,81 @@ std::string detokenize_evo_token_words(const data_t& data)
795799
return out;
796800
}
797801

802+
/// Tokenizes a source-code string into the Evo token-word byte format.
///
/// @param source   The program source text (UTF-8).
/// @param options  Tokenization options:
///                 - "deindent" == 1: strip leading whitespace from every line first.
///                 - "detect_strings" == 0: disable string detection in the scanner
///                   (enabled by default when the option is absent or nonzero).
/// @return The Evo-format token data, terminated by a zero token.
data_t tokenize_evo_token_words(const std::string& source, const options_t& options)
{
    const bool deindent = options.contains("deindent") && options.at("deindent") == 1;
    // String detection defaults to ON: only an explicit detect_strings == 0 disables it.
    const bool detectStrings = !options.contains("detect_strings") || options.at("detect_strings") != 0;

    std::string normalizedSource;
    if (deindent)
    {
        // Rebuild the source line by line with leading whitespace removed.
        std::istringstream lines{source};
        std::string line;
        while (std::getline(lines, line))
        {
            normalizedSource += ltrim(line) + "\n";
        }
        if (!normalizedSource.empty())
        {
            // Drop the trailing "\n" added after the last line above.
            normalizedSource.pop_back();
        }
    }
    else
    {
        normalizedSource = source;
    }

    // Legacy (84+CE) token value for the double-quote character; used below to
    // track whether the scanner is currently inside a string literal.
    static constexpr uint16_t legacyQuote = 0x2A;

    data_t evo;
    // Each emitted Evo token is 2 bytes; +1 accounts for the zero terminator.
    evo.reserve((normalizedSource.size() + 1) * 2);
    bool isWithinString = false;

    for (const auto& item : TypeHandlers::TH_Tokenized::scanSourceTokens(normalizedSource, detectStrings))
    {
        if (item.matched)
        {
            // Scanner matched a legacy token; convert it to an Evo token.
            // NOTE: direct_evo_token_for_legacy writes evoToken on success even when
            // the surrounding branch condition ends up false — the chain below relies
            // on that, so the evaluation order of these conditions must not change.
            uint16_t evoToken = 0;
            if (item.text.rfind("\\u", 0) == 0 && is_displayable_ucs2_scalar(item.token) && !direct_evo_token_for_legacy(item.token, evoToken))
            {
                // A "\uXXXX" escape with no direct Evo mapping: keep the raw scalar.
                evoToken = item.token;
            }
            else if (isWithinString && item.token != legacyQuote && legacy_token_to_evo_ucs2(item.token, evoToken))
            {
                // Text inside Evo strings is stored as UCS-2 code units
                // (evoToken was filled in by legacy_token_to_evo_ucs2).
            }
            else if (!direct_evo_token_for_legacy(item.token, evoToken))
            {
                // No Evo equivalent exists: warn and substitute the Evo "?" token.
                std::cerr << "[Warning] Cannot convert 84+CE token "
                          << TypeHandlers::TH_Tokenized::oneTokenBytesToString(item.token)
                          << " to an Evo token; replacing with ?" << std::endl;
                evoToken = 0xE41B;
            }

            append_evo_token(evo, evoToken);
            if (item.token == legacyQuote)
            {
                // A quote toggles the in-string state for subsequent tokens.
                isWithinString = !isWithinString;
            }
            continue;
        }

        // Unmatched source text: try to encode it as a single displayable
        // UCS-2 code point; otherwise warn and drop it.
        uint16_t codepoint = 0;
        if (utf8_to_single_codepoint(item.text, codepoint) && is_displayable_ucs2_scalar(codepoint))
        {
            append_evo_token(evo, codepoint);
        }
        else if (!item.text.empty())
        {
            std::cerr << "[Warning] Cannot encode source text \"" << item.text
                      << "\" as an Evo token; skipping it." << std::endl;
        }
    }

    // Zero token terminates the Evo token stream.
    append_evo_token(evo, 0);
    return evo;
}
876+
798877
bool is_evo_tokenized_entry(const TIVarFile::var_entry_t& entry)
799878
{
800879
const EvoTypeID evoTypeID = entry.evoTypeID;

src/EvoFormat.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ namespace tivars::EvoFormat
6868
EvoPythonScriptInfo parse_evo_python_script_payload(const data_t& data);
6969

7070
std::string detokenize_evo_token_words(const data_t& data);
71+
data_t tokenize_evo_token_words(const std::string& source, const options_t& options = options_t());
7172
bool is_evo_tokenized_entry(const TIVarFile::var_entry_t& entry);
7273
data_t evo_tokenized_data_to_legacy(const data_t& evoData);
7374
data_t legacy_tokenized_data_to_evo(const data_t& legacyData, bool smartConversion = false);

src/TIVarFile.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -991,12 +991,17 @@ namespace tivars
991991
void TIVarFile::setContentFromString(const std::string& str, const options_t& options, uint16_t entryIdx)
992992
{
993993
auto& entry = this->entries[entryIdx];
994-
data_t data = std::get<0>(entry._type.getHandlers())(str, options, this);
994+
data_t data;
995995
if (this->evoFormat && is_evo_tokenized_entry(entry))
996996
{
997-
data = legacy_tokenized_data_to_evo(data);
997+
data = tokenize_evo_token_words(str, options);
998998
}
999-
else if (this->evoFormat && is_legacy_numeric_entry(entry))
999+
else
1000+
{
1001+
data = std::get<0>(entry._type.getHandlers())(str, options, this);
1002+
}
1003+
1004+
if (this->evoFormat && is_legacy_numeric_entry(entry))
10001005
{
10011006
const bool exactFraction = legacy_value_is_exact_fraction(data);
10021007
data = legacy_value_to_evo_expression(data);

src/TypeHandlers/TH_Tokenized.cpp

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -573,7 +573,8 @@ namespace tivars::TypeHandlers
573573
continue;
574574
}
575575

576-
const std::string currChar = str.substr(strCursorPos, 1);
576+
const size_t currCharLen = std::max<size_t>(utf8_codepoint_len_at(str, strCursorPos), 1);
577+
const std::string currChar = str.substr(strCursorPos, currCharLen);
577578

578579
if (currChar == backslashStr)
579580
{
@@ -676,6 +677,7 @@ namespace tivars::TypeHandlers
676677
if (!matched)
677678
{
678679
onSkipped(currChar);
680+
strCursorPos += currCharLen - 1;
679681
}
680682
}
681683
}
@@ -1435,6 +1437,24 @@ namespace tivars::TypeHandlers
14351437
return posinfo;
14361438
}
14371439

1440+
std::vector<TH_Tokenized::token_scan_item> TH_Tokenized::scanSourceTokens(const std::string& sourceStr, bool detectStrings)
1441+
{
1442+
ensure_tokens_initialized();
1443+
1444+
std::vector<token_scan_item> items;
1445+
items.reserve(sourceStr.size());
1446+
scan_source_tokens(sourceStr, detectStrings,
1447+
[&](const std::string& tokenStr, uint16_t tokenValue)
1448+
{
1449+
items.push_back({ tokenStr, tokenValue, true });
1450+
},
1451+
[&](const std::string& skippedStr)
1452+
{
1453+
items.push_back({ skippedStr, 0, false });
1454+
});
1455+
return items;
1456+
}
1457+
14381458
TH_Tokenized::token_posinfo TH_Tokenized::getPosInfoAtOffsetFromHexStr(const std::string& hexBytesStr, uint16_t byteOffset)
14391459
{
14401460
const size_t strLen = hexBytesStr.length();

src/TypeHandlers/TypeHandlers.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,10 +244,12 @@ namespace tivars::TypeHandlers
244244
enum typelang { PRGMLANG_BASIC = 0, PRGMLANG_AXE, PRGMLANG_ICE };
245245
enum indentchar : char { INDENT_CHAR_SPACE = ' ', INDENT_CHAR_TAB = '\t' };
246246
struct token_posinfo { uint16_t line; uint16_t column; uint8_t len; };
247+
struct token_scan_item { std::string text; uint16_t token; bool matched; };
247248
static std::string reindentCodeString(const std::string& str_orig, const options_t& options = options_t());
248249
static token_posinfo getPosInfoAtOffset(const data_t& data, uint16_t byteOffset, const options_t& options = options_t());
249250
static token_posinfo getPosInfoAtOffsetFromHexStr(const std::string& hexBytesStr, uint16_t byteOffset);
250251
static token_posinfo getPosInfoAtOffsetInSourceString(const std::string& sourceStr, uint16_t byteOffset);
252+
static std::vector<token_scan_item> scanSourceTokens(const std::string& sourceStr, bool detectStrings = true);
251253
static std::string tokenToString(const data_t& data, int *incr, const options_t& options);
252254
static std::string oneTokenBytesToString(uint16_t tokenBytes);
253255
};

tests.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1696,6 +1696,16 @@ int main(int argc, char** argv)
16961696
assert(code.find("\\u0178") != std::string::npos);
16971697
}
16981698

1699+
{
1700+
assert(EvoFormat::tokenize_evo_token_words("ŷ") == data_t({0x77, 0x01, 0x00, 0x00}));
1701+
1702+
TIVarFile directYHatProgram = TIVarFile::createNew("Program", "YHAT2", "84Evo");
1703+
directYHatProgram.setContentFromString("Disp \"ŷ\"");
1704+
const json readable = json::parse(directYHatProgram.getReadableContent());
1705+
assert(readable["code"] == "Disp \"ŷ\"");
1706+
assert(readable["rawDataHex"].get<std::string>().find("7701") != std::string::npos);
1707+
}
1708+
16991709
{
17001710
TIVarFile stringVarProgram = TIVarFile::createNew("Program", "MYPRGM", "84+CE");
17011711
stringVarProgram.setContentFromString(R"(Ans→N
@@ -1749,6 +1759,10 @@ Disp Str1
17491759
evalProgram.convertToModel(TIModel{"84+CE"});
17501760
assert(!evalProgram.isEvoFormat());
17511761
assert(evalProgram.getRawContent() == originalLegacyData);
1762+
1763+
TIVarFile directEvoEvalProgram = TIVarFile::createNew("Program", "EVALTOK2", "84Evo");
1764+
directEvoEvalProgram.setContentFromString("eval(\"1+1\"");
1765+
assert(json::parse(directEvoEvalProgram.getReadableContent())["rawDataHex"] == "E4E416E431002B00310016E40000");
17521766
}
17531767

17541768
{

0 commit comments

Comments
 (0)