Skip to content

Commit f09996a

Browse files
committed
Evo: make tokenization a 1st class citizen feature.
1 parent 535ba8d commit f09996a

8 files changed

Lines changed: 185 additions & 4 deletions

File tree

src/EvoFormat.cpp

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,17 @@
1212
#include <cstring>
1313
#include <iostream>
1414
#include <ranges>
15+
#include <sstream>
1516
#include <stdexcept>
1617
#include <unordered_map>
1718

1819
#include "TIVarTypes.h"
1920
#include "TypeHandlers/TypeHandlers.h"
21+
#include "json.hpp"
2022
#include "tivarslib_utils.h"
2123

24+
using json = nlohmann::ordered_json;
25+
2226
namespace tivars::EvoFormat
2327
{
2428
namespace
@@ -112,6 +116,7 @@ namespace tivars::EvoFormat
112116
codepoint = static_cast<uint16_t>(value);
113117
return true;
114118
}
119+
115120
}
116121

117122
uint16_t evo_checksum(const data_t& body)
@@ -747,6 +752,9 @@ static const char* evo_token_name(uint16_t token)
747752
}
748753

749754
static bool direct_legacy_token_for_evo(uint16_t evoToken, uint16_t& legacyToken);
755+
static bool direct_evo_token_for_legacy(uint16_t legacyToken, uint16_t& evoToken);
756+
static void append_evo_token(data_t& out, uint16_t evoToken);
757+
static bool legacy_token_to_evo_ucs2(uint16_t legacyToken, uint16_t& evoToken);
750758

751759
static std::string evo_token_to_string(uint16_t token)
752760
{
@@ -795,6 +803,105 @@ std::string detokenize_evo_token_words(const data_t& data)
795803
return out;
796804
}
797805

806+
/**
 * Tokenizes program source text directly into Evo token data.
 *
 * Input handling:
 *  - When the trimmed input begins with '{' it is tentatively parsed as JSON.
 *    A "rawDataHex" member is decoded with hex_string_to_bytes() and returned
 *    as-is; otherwise a "code" member, when present, replaces the text that
 *    gets tokenized. JSON errors are swallowed and the input is then treated
 *    as plain source text.
 *  - Option "deindent" (active only when equal to 1) runs ltrim() on every line.
 *  - Option "detect_strings" is forwarded to the shared scanner and counts as
 *    enabled unless it is explicitly 0.
 *
 * @param source   Program source text, or a JSON object as described above.
 * @param options  Tokenizer options ("deindent", "detect_strings").
 * @return The Evo token data. On the regular tokenizing path token 0 is always
 *         appended last; on the "rawDataHex" path the decoded bytes are
 *         returned untouched.
 *
 * Note: std::invalid_argument thrown by hex_string_to_bytes() is not caught
 * by the JSON handler below and therefore reaches the caller.
 */
data_t tokenize_evo_token_words(const std::string& source, const options_t& options)
{
    // True when `key` is present in the options and holds exactly `expected`.
    const auto optionIs = [&options](const char* key, int expected)
    {
        return options.contains(key) && options.at(key) == expected;
    };
    const bool stripIndentation = optionIs("deindent", 1);
    const bool scanWithStringDetection = !optionIs("detect_strings", 0);

    std::string programText = source;
    const std::string stripped = trim(programText);
    const bool looksLikeJson = !stripped.empty() && stripped.front() == '{';
    if (looksLikeJson)
    {
        try
        {
            const json parsed = json::parse(stripped);
            if (parsed.contains("rawDataHex"))
            {
                const std::string rawHex = parsed.at("rawDataHex").get<std::string>();
                return hex_string_to_bytes(rawHex, "rawDataHex");
            }
            if (parsed.contains("code"))
            {
                programText = parsed.at("code").get<std::string>();
            }
        }
        catch (const json::exception&)
        {
            // Not usable as JSON: keep going and tokenize the input as regular source.
        }
    }

    // Build the text handed to the scanner, optionally with each line left-trimmed.
    // Lines are re-joined with '\n' and no trailing newline is emitted.
    std::string scanInput;
    if (!stripIndentation)
    {
        scanInput = programText;
    }
    else
    {
        std::istringstream lineReader{programText};
        bool firstLine = true;
        for (std::string currentLine; std::getline(lineReader, currentLine); )
        {
            if (!firstLine)
            {
                scanInput += "\n";
            }
            scanInput += ltrim(currentLine);
            firstLine = false;
        }
    }

    constexpr uint16_t legacyQuoteToken = 0x2A;
    constexpr uint16_t evoFallbackToken = 0xE41B; // emitted when a token cannot be converted ("?" per the warning)

    data_t tokenWords;
    tokenWords.reserve((scanInput.size() + 1) * 2);

    // NOTE(review): this flag only flips on the quote token and is never reset
    // at a line break - confirm that this matches the scanner's string detection.
    bool insideString = false;

    const auto scanned = TypeHandlers::TH_Tokenized::scanSourceTokens(scanInput, scanWithStringDetection);
    for (const auto& item : scanned)
    {
        if (!item.matched)
        {
            // Text the scanner did not recognize: when it is a single displayable
            // UCS-2 character it is stored directly as a token word.
            uint16_t codepoint = 0;
            const bool storable = utf8_to_single_codepoint(item.text, codepoint)
                               && is_displayable_ucs2_scalar(codepoint);
            if (storable)
            {
                append_evo_token(tokenWords, codepoint);
            }
            else if (!item.text.empty())
            {
                std::cerr << "[Warning] Cannot encode source text \"" << item.text
                          << "\" as an Evo token; skipping it." << std::endl;
            }
            continue;
        }

        const uint16_t legacyToken = item.token;
        const bool isUnicodeEscape = item.text.rfind("\\u", 0) == 0;

        // The order of these checks matters: each helper may write to evoToken.
        uint16_t evoToken = 0;
        if (isUnicodeEscape && is_displayable_ucs2_scalar(legacyToken) && !direct_evo_token_for_legacy(legacyToken, evoToken))
        {
            // A \u escape without a direct Evo equivalent is stored as the code unit itself.
            evoToken = legacyToken;
        }
        else if (insideString && legacyToken != legacyQuoteToken && legacy_token_to_evo_ucs2(legacyToken, evoToken))
        {
            // Inside a string the token was converted to a UCS-2 code unit; nothing more to do.
        }
        else if (!direct_evo_token_for_legacy(legacyToken, evoToken))
        {
            std::cerr << "[Warning] Cannot convert 84+CE token "
                      << TypeHandlers::TH_Tokenized::oneTokenBytesToString(legacyToken)
                      << " to an Evo token; replacing with ?" << std::endl;
            evoToken = evoFallbackToken;
        }

        append_evo_token(tokenWords, evoToken);
        if (legacyToken == legacyQuoteToken)
        {
            insideString = !insideString;
        }
    }

    // Token 0 closes the data.
    append_evo_token(tokenWords, 0);
    return tokenWords;
}
904+
798905
bool is_evo_tokenized_entry(const TIVarFile::var_entry_t& entry)
799906
{
800907
const EvoTypeID evoTypeID = entry.evoTypeID;

src/EvoFormat.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ namespace tivars::EvoFormat
6868
EvoPythonScriptInfo parse_evo_python_script_payload(const data_t& data);
6969

7070
std::string detokenize_evo_token_words(const data_t& data);
71+
data_t tokenize_evo_token_words(const std::string& source, const options_t& options = options_t());
7172
bool is_evo_tokenized_entry(const TIVarFile::var_entry_t& entry);
7273
data_t evo_tokenized_data_to_legacy(const data_t& evoData);
7374
data_t legacy_tokenized_data_to_evo(const data_t& legacyData, bool smartConversion = false);

src/TIVarFile.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -991,12 +991,17 @@ namespace tivars
991991
void TIVarFile::setContentFromString(const std::string& str, const options_t& options, uint16_t entryIdx)
992992
{
993993
auto& entry = this->entries[entryIdx];
994-
data_t data = std::get<0>(entry._type.getHandlers())(str, options, this);
994+
data_t data;
995995
if (this->evoFormat && is_evo_tokenized_entry(entry))
996996
{
997-
data = legacy_tokenized_data_to_evo(data);
997+
data = tokenize_evo_token_words(str, options);
998998
}
999-
else if (this->evoFormat && is_legacy_numeric_entry(entry))
999+
else
1000+
{
1001+
data = std::get<0>(entry._type.getHandlers())(str, options, this);
1002+
}
1003+
1004+
if (this->evoFormat && is_legacy_numeric_entry(entry))
10001005
{
10011006
const bool exactFraction = legacy_value_is_exact_fraction(data);
10021007
data = legacy_value_to_evo_expression(data);

src/TypeHandlers/TH_Tokenized.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -550,7 +550,10 @@ namespace tivars::TypeHandlers
550550
continue;
551551
}
552552

553-
const std::string currChar = str.substr(strCursorPos, 1);
553+
// Unmatched text is reported as one UTF-8 codepoint so callers can
554+
// handle non-CE characters without receiving split byte fragments.
555+
const size_t currCharLen = std::max<size_t>(utf8_codepoint_len_at(str, strCursorPos), 1);
556+
const std::string currChar = str.substr(strCursorPos, currCharLen);
554557

555558
if (currChar == backslashStr)
556559
{
@@ -653,6 +656,8 @@ namespace tivars::TypeHandlers
653656
if (!matched)
654657
{
655658
onSkipped(currChar);
659+
// The for-loop will add the final byte advance.
660+
strCursorPos += currCharLen - 1;
656661
}
657662
}
658663
}
@@ -1412,6 +1417,24 @@ namespace tivars::TypeHandlers
14121417
return posinfo;
14131418
}
14141419

1420+
std::vector<TH_Tokenized::token_scan_item> TH_Tokenized::scanSourceTokens(const std::string& sourceStr, bool detectStrings)
1421+
{
1422+
ensure_tokens_initialized();
1423+
1424+
std::vector<token_scan_item> items;
1425+
items.reserve(sourceStr.size());
1426+
scan_source_tokens(sourceStr, detectStrings,
1427+
[&](const std::string& tokenStr, uint16_t tokenValue)
1428+
{
1429+
items.push_back({ tokenStr, tokenValue, true });
1430+
},
1431+
[&](const std::string& skippedStr)
1432+
{
1433+
items.push_back({ skippedStr, 0, false });
1434+
});
1435+
return items;
1436+
}
1437+
14151438
TH_Tokenized::token_posinfo TH_Tokenized::getPosInfoAtOffsetFromHexStr(const std::string& hexBytesStr, uint16_t byteOffset)
14161439
{
14171440
const size_t strLen = hexBytesStr.length();

src/TypeHandlers/TypeHandlers.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,10 +244,12 @@ namespace tivars::TypeHandlers
244244
enum typelang { PRGMLANG_BASIC = 0, PRGMLANG_AXE, PRGMLANG_ICE };
245245
enum indentchar : char { INDENT_CHAR_SPACE = ' ', INDENT_CHAR_TAB = '\t' };
246246
struct token_posinfo { uint16_t line; uint16_t column; uint8_t len; };
247+
// One item returned by scanSourceTokens(): `text` is the source text of the item,
// `token` is the token value (scanSourceTokens() stores 0 for skipped text), and
// `matched` tells whether the scanner reported `text` as a recognized token.
struct token_scan_item { std::string text; uint16_t token; bool matched; };
247248
static std::string reindentCodeString(const std::string& str_orig, const options_t& options = options_t());
248249
static token_posinfo getPosInfoAtOffset(const data_t& data, uint16_t byteOffset, const options_t& options = options_t());
249250
static token_posinfo getPosInfoAtOffsetFromHexStr(const std::string& hexBytesStr, uint16_t byteOffset);
250251
static token_posinfo getPosInfoAtOffsetInSourceString(const std::string& sourceStr, uint16_t byteOffset);
252+
static std::vector<token_scan_item> scanSourceTokens(const std::string& sourceStr, bool detectStrings = true);
251253
static std::string tokenToString(const data_t& data, int *incr, const options_t& options);
252254
static std::string oneTokenBytesToString(uint16_t tokenBytes);
253255
};

src/tivarslib_utils.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,14 @@
66
*/
77

88
#include "tivarslib_utils.h"
9+
#include <cctype>
910
#include <cstdlib>
1011
#include <iomanip>
1112
#include <sstream>
1213
#include <cmath>
1314
#include <cstring>
1415
#include <limits>
16+
#include <stdexcept>
1517

1618
using namespace std::string_literals;
1719

@@ -86,6 +88,27 @@ unsigned char hexdec(const std::string& str)
8688
return (unsigned char) stoul(str, nullptr, 16);
8789
}
8890

91+
/**
 * Decodes a string made of hexadecimal digit pairs into bytes.
 *
 * @param hex        Hex digits, two per output byte (upper or lower case).
 * @param fieldName  Name used as the prefix of the error messages.
 * @return One byte per digit pair, in input order.
 * @throws std::invalid_argument when `hex` has an odd length or contains a
 *         character that is not a hexadecimal digit.
 */
data_t hex_string_to_bytes(const std::string& hex, const char* fieldName)
{
    const size_t digitCount = hex.size();
    if (digitCount % 2 != 0)
    {
        throw std::invalid_argument(std::string(fieldName) + " must contain an even number of hex digits");
    }

    // std::isxdigit requires a value representable as unsigned char.
    const auto isHexDigit = [](char c)
    {
        return std::isxdigit(static_cast<unsigned char>(c)) != 0;
    };

    data_t bytes;
    bytes.reserve(digitCount / 2);
    for (size_t pos = 0; pos < digitCount; pos += 2)
    {
        // pos + 1 is always in range because the length was checked to be even.
        const bool pairIsValid = isHexDigit(hex[pos]) && isHexDigit(hex[pos + 1]);
        if (!pairIsValid)
        {
            throw std::invalid_argument(std::string(fieldName) + " must be valid hexadecimal");
        }
        bytes.push_back(hexdec(hex.substr(pos, 2)));
    }
    return bytes;
}
111+
89112
std::string dechex(unsigned char i, bool zeropad)
90113
{
91114
std::string str = "00";

src/tivarslib_utils.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ void vector_append(std::vector<T>& vec, const std::vector<T>& other)
3434

3535
unsigned char hexdec(const std::string& str);
3636

37+
data_t hex_string_to_bytes(const std::string& hex, const char* fieldName = "hex string");
38+
3739
std::string dechex(unsigned char i, bool zeropad = true);
3840

3941
std::string strtoupper(const std::string& str);

tests.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1694,6 +1694,20 @@ int main(int argc, char** argv)
16941694
assert(code.find("\\u0178") != std::string::npos);
16951695
}
16961696

1697+
{
1698+
assert(EvoFormat::tokenize_evo_token_words("ŷ") == data_t({0x77, 0x01, 0x00, 0x00}));
1699+
1700+
TIVarFile directYHatProgram = TIVarFile::createNew("Program", "YHAT2", "84Evo");
1701+
directYHatProgram.setContentFromString("Disp \"ŷ\"");
1702+
const json readable = json::parse(directYHatProgram.getReadableContent());
1703+
assert(readable["code"] == "Disp \"ŷ\"");
1704+
assert(readable["rawDataHex"].get<std::string>().find("7701") != std::string::npos);
1705+
1706+
TIVarFile recreatedYHatProgram = TIVarFile::createNew("Program", "YHAT3", "84Evo");
1707+
recreatedYHatProgram.setContentFromString(directYHatProgram.getReadableContent());
1708+
assert(recreatedYHatProgram.getRawContent() == directYHatProgram.getRawContent());
1709+
}
1710+
16971711
{
16981712
TIVarFile stringVarProgram = TIVarFile::createNew("Program", "MYPRGM", "84+CE");
16991713
stringVarProgram.setContentFromString(R"(Ans→N
@@ -1747,6 +1761,10 @@ Disp Str1
17471761
evalProgram.convertToModel(TIModel{"84+CE"});
17481762
assert(!evalProgram.isEvoFormat());
17491763
assert(evalProgram.getRawContent() == originalLegacyData);
1764+
1765+
TIVarFile directEvoEvalProgram = TIVarFile::createNew("Program", "EVALTOK2", "84Evo");
1766+
directEvoEvalProgram.setContentFromString("eval(\"1+1\"");
1767+
assert(json::parse(directEvoEvalProgram.getReadableContent())["rawDataHex"] == "E4E416E431002B00310016E40000");
17501768
}
17511769

17521770
{

0 commit comments

Comments
 (0)