Skip to content

Commit 535ba8d

Browse files
committed
Evo: correctly map the handled UCS-2 tokens.
1 parent 7b77386 commit 535ba8d

2 files changed

Lines changed: 139 additions & 10 deletions

File tree

src/EvoFormat.cpp

Lines changed: 108 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,67 @@ namespace tivars::EvoFormat
5151
static_cast<char>(0x80 | (codepoint & 0x3F)),
5252
};
5353
}
54+
55+
// Tells whether a UCS-2 code unit is one of the values handled as plain
// displayable text: 0x0020-0x007E, 0x00A0-0x00FF, or the single value 0x0177.
// Every other code unit yields false.
bool is_displayable_ucs2_scalar(uint16_t codepoint)
{
    // 0x0020..0x007E (space through tilde).
    if (codepoint >= 0x0020 && codepoint <= 0x007E)
    {
        return true;
    }

    // 0x00A0..0x00FF (upper half of the 8-bit range, skipping 0x007F-0x009F).
    if (codepoint >= 0x00A0 && codepoint <= 0x00FF)
    {
        return true;
    }

    // The only accepted value above 0x00FF.
    return codepoint == 0x0177;
}
61+
62+
// Decodes `text` as exactly one UTF-8 encoded code point that fits in 16 bits.
//
// text:      candidate byte sequence; it must consist of one complete 1-, 2-
//            or 3-byte UTF-8 sequence and nothing else.
// codepoint: receives the decoded value on success; left untouched on failure.
//
// Returns false for empty input, a lead byte that is a continuation byte or
// starts a 4-byte sequence, a length mismatch (truncated or trailing bytes),
// a malformed continuation byte, an overlong encoding, or a UTF-16 surrogate
// (0xD800-0xDFFF). Returns true otherwise.
bool utf8_to_single_codepoint(const std::string& text, uint16_t& codepoint)
{
    if (text.empty())
    {
        return false;
    }

    const uint8_t first = static_cast<uint8_t>(text[0]);
    size_t length = 0;
    uint32_t value = 0;
    uint32_t minimum = 0; // smallest value that genuinely needs `length` bytes
    if ((first & 0x80) == 0)
    {
        length = 1;
        value = first;
        minimum = 0x0000;
    }
    else if ((first & 0xE0) == 0xC0)
    {
        length = 2;
        value = first & 0x1F;
        minimum = 0x0080;
    }
    else if ((first & 0xF0) == 0xE0)
    {
        length = 3;
        value = first & 0x0F;
        minimum = 0x0800;
    }
    else
    {
        // Stray continuation byte, or a 4-byte lead whose value cannot fit in 16 bits.
        return false;
    }

    if (text.size() != length)
    {
        return false;
    }

    for (size_t i = 1; i < length; i++)
    {
        const uint8_t byte = static_cast<uint8_t>(text[i]);
        if ((byte & 0xC0) != 0x80)
        {
            return false;
        }
        value = (value << 6) | (byte & 0x3F);
    }

    // Overlong form: the value would have fit in a shorter sequence, so this
    // is not the canonical encoding (e.g. C1 81 must not be accepted as 'A').
    if (value < minimum)
    {
        return false;
    }

    // Surrogate code units are not valid scalar values in UTF-8.
    if (value >= 0xD800 && value <= 0xDFFF)
    {
        return false;
    }

    // A 3-byte sequence carries at most 4 + 6 + 6 = 16 payload bits, so the
    // value always fits in uint16_t here.
    codepoint = static_cast<uint16_t>(value);
    return true;
}
54115
}
55116

56117
uint16_t evo_checksum(const data_t& body)
@@ -690,19 +751,14 @@ static bool direct_legacy_token_for_evo(uint16_t evoToken, uint16_t& legacyToken
690751
static std::string evo_token_to_string(uint16_t token)
691752
{
692753
if (token == 0x0000) return "";
754+
if (is_displayable_ucs2_scalar(token)) return utf8_from_codepoint(token);
693755
if (token >= 0xE800 && token <= 0xE819) return std::string(1, static_cast<char>('A' + (token - 0xE800)));
694756
if (token == 0xE81A) return "θ";
695757
if (token >= 0xE401 && token <= 0xE40A) return std::string(1, static_cast<char>('0' + (token - 0xE401)));
696758
if (token >= 0xE830 && token <= 0xE835) return "L" + std::to_string(token - 0xE830 + 1);
697759
if (token >= 0xE820 && token <= 0xE829) return "[" + std::string(1, static_cast<char>('A' + (token - 0xE820))) + "]";
698-
if (token >= 0x20 && token <= 0x7E && !(token >= 'A' && token <= 'Z')) return std::string(1, static_cast<char>(token));
699760
if (token >= 0xE840 && token <= 0xE849) return "Y" + std::to_string(token == 0xE849 ? 0 : token - 0xE840 + 1);
700761
if (token >= 0xE8A0 && token <= 0xE8A9) return "Str" + std::to_string(token == 0xE8A9 ? 0 : token - 0xE8A0 + 1);
701-
if ((token >= 0x00A1 && token <= 0x00FF) || (token >= 0x0391 && token <= 0x03C9)) return utf8_from_codepoint(token);
702-
if ((token >= 0x2070 && token <= 0x209F) || token == 0x02E3 || token == 0x029F || token == 0x1D1B) return utf8_from_codepoint(token);
703-
if (token == 0x2026 || token == 0x2191 || token == 0x2193 || token == 0x221A || token == 0x2220 || token == 0x222B) return utf8_from_codepoint(token);
704-
if (token == 0x2338 || token == 0x25A1 || token == 0x25BA || token == 0x25C4 || token == 0xFE62) return utf8_from_codepoint(token);
705-
if (token == 0x007C || token == 0x0060) return utf8_from_codepoint(token);
706762
if (token == 0xE41A) return "'";
707763
if (token == 0xE424) return "";
708764
if (token == 0xE589) return "Grad";
@@ -760,6 +816,42 @@ static void append_evo_token(data_t& out, uint16_t evoToken)
760816
out.push_back(static_cast<uint8_t>((evoToken >> 8) & 0xFF));
761817
}
762818

819+
static bool legacy_payload_for_evo_ucs2(uint16_t evoToken, data_t& payload)
820+
{
821+
if (!is_displayable_ucs2_scalar(evoToken))
822+
{
823+
return false;
824+
}
825+
826+
try
827+
{
828+
const data_t legacy = TypeHandlers::TH_Tokenized::makeDataFromString(utf8_from_codepoint(evoToken));
829+
if (legacy.size() < 3 || legacy.size() != static_cast<size_t>(2 + legacy[0] + (legacy[1] << 8)))
830+
{
831+
return false;
832+
}
833+
payload.assign(legacy.begin() + 2, legacy.end());
834+
return !payload.empty();
835+
}
836+
catch (...)
837+
{
838+
return false;
839+
}
840+
}
841+
842+
static bool legacy_token_to_evo_ucs2(uint16_t legacyToken, uint16_t& evoToken)
843+
{
844+
const std::string text = TypeHandlers::TH_Tokenized::oneTokenBytesToString(legacyToken);
845+
uint16_t codepoint = 0;
846+
if (!utf8_to_single_codepoint(text, codepoint) || !is_displayable_ucs2_scalar(codepoint))
847+
{
848+
return false;
849+
}
850+
851+
evoToken = codepoint;
852+
return true;
853+
}
854+
763855
static bool direct_legacy_token_for_evo(uint16_t evoToken, uint16_t& legacyToken)
764856
{
765857
if (evoToken >= 0xE401 && evoToken <= 0xE40A)
@@ -1749,6 +1841,11 @@ static bool direct_evo_token_for_legacy(uint16_t legacyToken, uint16_t& evoToken
17491841

17501842
static bool tokenized_legacy_payload_for_evo(uint16_t evoToken, data_t& payload)
17511843
{
1844+
if (legacy_payload_for_evo_ucs2(evoToken, payload))
1845+
{
1846+
return true;
1847+
}
1848+
17521849
uint16_t legacyToken = 0;
17531850
if (!direct_legacy_token_for_evo(evoToken, legacyToken))
17541851
{
@@ -1828,7 +1925,11 @@ data_t legacy_tokenized_data_to_evo(const data_t& legacyData, bool smartConversi
18281925
}
18291926

18301927
uint16_t evoToken = 0;
1831-
if (!direct_evo_token_for_legacy(legacyToken, evoToken))
1928+
if (isWithinString && legacyToken != legacyQuote && legacy_token_to_evo_ucs2(legacyToken, evoToken))
1929+
{
1930+
// Text inside Evo strings is stored as UCS-2 code units.
1931+
}
1932+
else if (!direct_evo_token_for_legacy(legacyToken, evoToken))
18321933
{
18331934
std::cerr << "[Warning] Cannot convert 84+CE token "
18341935
<< TypeHandlers::TH_Tokenized::oneTokenBytesToString(legacyToken)

tests.cpp

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1656,14 +1656,42 @@ int main(int argc, char** argv)
16561656

16571657
{
16581658
TIVarFile lowerStringProgram = TIVarFile::loadFromFile("testData/ProtectedProgram.8xp");
1659-
const data_t originalLegacyData = lowerStringProgram.getRawContent();
16601659
lowerStringProgram.convertToModel(TIModel{"84Evo"});
16611660
assert(lowerStringProgram.isEvoFormat());
16621661
assert(json::parse(lowerStringProgram.getReadableContent())["code"] == "Disp \"Hello World");
1662+
assert(lowerStringProgram.getRawContentHexStr().find("480065006c006c006f00200057006f0072006c006400") != std::string::npos);
16631663

16641664
lowerStringProgram.convertToModel(TIModel{"84+CE"});
16651665
assert(!lowerStringProgram.isEvoFormat());
1666-
assert(lowerStringProgram.getRawContent() == originalLegacyData);
1666+
assert(lowerStringProgram.getReadableContent() == "Disp \"Hello World");
1667+
}
1668+
1669+
{
1670+
TIVarFile rawUcs2Program = TIVarFile::createNew("Program", "UCS2", "84Evo");
1671+
rawUcs2Program.setContentFromData({0x48, 0x00, 0xE9, 0x00, 0x00, 0x00});
1672+
assert(json::parse(rawUcs2Program.getReadableContent())["code"] == "");
1673+
1674+
rawUcs2Program.convertToModel(TIModel{"84+CE"});
1675+
assert(!rawUcs2Program.isEvoFormat());
1676+
assert(rawUcs2Program.getReadableContent() == "");
1677+
}
1678+
1679+
{
1680+
TIVarFile rawControlUcs2Program = TIVarFile::createNew("Program", "CTL", "84Evo");
1681+
rawControlUcs2Program.setContentFromData({0x1F, 0x00, 0x7F, 0x00, 0x9F, 0x00, 0x00, 0x00});
1682+
const std::string code = json::parse(rawControlUcs2Program.getReadableContent())["code"];
1683+
assert(code.find("\\u001F") != std::string::npos);
1684+
assert(code.find("\\u007F") != std::string::npos);
1685+
assert(code.find("\\u009F") != std::string::npos);
1686+
}
1687+
1688+
{
1689+
TIVarFile rawYHatProgram = TIVarFile::createNew("Program", "YHAT", "84Evo");
1690+
rawYHatProgram.setContentFromData({0x76, 0x01, 0x77, 0x01, 0x78, 0x01, 0x00, 0x00});
1691+
const std::string code = json::parse(rawYHatProgram.getReadableContent())["code"];
1692+
assert(code.find("\\u0176") != std::string::npos);
1693+
assert(code.find("ŷ") != std::string::npos);
1694+
assert(code.find("\\u0178") != std::string::npos);
16671695
}
16681696

16691697
{
@@ -1714,7 +1742,7 @@ Disp Str1
17141742

17151743
evalProgram.convertToModel(TIModel{"84Evo"});
17161744
assert(evalProgram.isEvoFormat());
1717-
assert(json::parse(evalProgram.getReadableContent())["dataHex"] == "E4E416E402E428E402E416E40000");
1745+
assert(json::parse(evalProgram.getReadableContent())["rawDataHex"] == "E4E416E431002B00310016E40000");
17181746

17191747
evalProgram.convertToModel(TIModel{"84+CE"});
17201748
assert(!evalProgram.isEvoFormat());

0 commit comments

Comments
 (0)