Skip to content

Commit 6d5834d

Browse files
committed
Evo: handle even more UCS-2 ranges and direct token mappings.
Credit goes mostly to Zeroko for painstakingly testing the ranges.
1 parent fb1a740 commit 6d5834d

2 files changed

Lines changed: 162 additions & 17 deletions

File tree

src/EvoFormat.cpp

Lines changed: 52 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <sstream>
1616
#include <stdexcept>
1717
#include <unordered_map>
18+
#include <utility>
1819

1920
#include "TIVarTypes.h"
2021
#include "TypeHandlers/TypeHandlers.h"
@@ -58,9 +59,24 @@ namespace tivars::EvoFormat
5859

5960
// Reports whether a UCS-2 code unit is on the allow-list of code points this
// module treats as displayable.
//
// @param codepoint  UCS-2 code unit to test.
// @return true when `codepoint` falls inside one of the inclusive ranges of
//         the table below, false otherwise.
bool is_displayable_ucs2_scalar(uint16_t codepoint)
{
    // Inclusive [first, last] ranges; a single accepted code point is written {x, x}.
    static constexpr std::pair<uint16_t, uint16_t> acceptedRanges[] = {
        {0x0020, 0x007E}, {0x00A0, 0x00FF}, {0x0177, 0x0177}, {0x0394, 0x0394},
        {0x03A3, 0x03A3}, {0x03A9, 0x03A9}, {0x03B1, 0x03B5}, {0x03B8, 0x03B8},
        {0x03BB, 0x03BC}, {0x03C0, 0x03C1}, {0x03C3, 0x03C4}, {0x03C6, 0x03C7},
        {0x2010, 0x2010}, {0x2026, 0x2026}, {0x2070, 0x2070}, {0x2074, 0x2079},
        {0x2080, 0x2089}, {0x2122, 0x2122}, {0x2190, 0x2193}, {0x221A, 0x221A},
        {0x2220, 0x2220}, {0x222B, 0x222B}, {0x2260, 0x2260}, {0x2264, 0x2265},
        {0x238C, 0x238C}, {0x25A0, 0x25A0}, {0x25AB, 0x25AB}, {0x25B2, 0x25B2},
        {0x25B6, 0x25B6}, {0x25B8, 0x25B8}, {0x25BC, 0x25BC}, {0x25C0, 0x25C0},
        {0x25C2, 0x25C2}, {0xF000, 0xF032}, {0xF038, 0xF03A}, {0xF041, 0xF04D},
        {0xF04F, 0xF058}, {0xF05B, 0xF061},
    };

    // The table is small, so a linear scan is enough; a plain range-for keeps
    // this function dependent on <utility> only.
    for (const auto& [first, last] : acceptedRanges)
    {
        if (codepoint >= first && codepoint <= last)
        {
            return true;
        }
    }
    return false;
}
6581

6682
bool utf8_to_single_codepoint(const std::string& text, uint16_t& codepoint)
@@ -752,6 +768,7 @@ static const char* evo_token_name(uint16_t token)
752768
}
753769

754770
static bool direct_legacy_token_for_evo(uint16_t evoToken, uint16_t& legacyToken);
771+
static bool direct_legacy_payload_for_evo(uint16_t evoToken, data_t& payload);
755772
static bool direct_evo_token_for_legacy(uint16_t legacyToken, uint16_t& evoToken);
756773
static void append_evo_token(data_t& out, uint16_t evoToken);
757774
static bool legacy_token_to_evo_ucs2(uint16_t legacyToken, uint16_t& evoToken);
@@ -770,6 +787,9 @@ static std::string evo_token_to_string(uint16_t token)
770787
if (token == 0xE41A) return "'";
771788
if (token == 0xE424) return "";
772789
if (token == 0xE589) return "Grad";
790+
if (token == 0xE9D6) return "►ʳ";
791+
if (token == 0xE9D7) return "►ᵍ";
792+
if (token == 0xE9D8) return "►º";
773793
if (token >= 0xE850 && token <= 0xE85B)
774794
{
775795
const uint16_t idx = static_cast<uint16_t>((token - 0xE850) / 2 + 1);
@@ -946,6 +966,28 @@ static bool legacy_payload_for_evo_ucs2(uint16_t evoToken, data_t& payload)
946966
}
947967
}
948968

969+
static bool direct_legacy_payload_for_evo(uint16_t evoToken, data_t& payload)
970+
{
971+
payload.clear();
972+
973+
// ►{angle} conv token
974+
if (evoToken == 0xE9D6 || evoToken == 0xE9D7 || evoToken == 0xE9D8)
975+
{
976+
append_legacy_token(payload, 0xBBEC);
977+
append_legacy_token(payload, evoToken == 0xE9D6 ? 0x0A : evoToken == 0xE9D7 ? 0xAF : 0x0B);
978+
return true;
979+
}
980+
981+
uint16_t legacyToken = 0;
982+
if (!direct_legacy_token_for_evo(evoToken, legacyToken))
983+
{
984+
return false;
985+
}
986+
987+
append_legacy_token(payload, legacyToken);
988+
return true;
989+
}
990+
949991
static bool legacy_token_to_evo_ucs2(uint16_t legacyToken, uint16_t& evoToken)
950992
{
951993
const std::string text = TypeHandlers::TH_Tokenized::oneTokenBytesToString(legacyToken);
@@ -1105,6 +1147,7 @@ static bool direct_legacy_token_for_evo(uint16_t evoToken, uint16_t& legacyToken
11051147
{0xE5BD, 0x7F}, {0xE5BE, 0x80}, {0xE5BF, 0x81}, {0xE5C0, 0xEF73},
11061148
{0xE5C1, 0xEF74}, {0xE5C2, 0xEF75},
11071149
{0xE4F9, 0xBB57}, {0xE593, 0xBB64}, {0xE6C6, 0xE8}, {0xE6C7, 0xE7},
1150+
{0xE6AE, 0xEF79},
11081151
{0xE900, 0x6201}, {0xE901, 0x6202}, {0xE902, 0x6203}, {0xE903, 0x6204},
11091152
{0xE904, 0x6205}, {0xE905, 0x6206}, {0xE906, 0x6207}, {0xE907, 0x6208},
11101153
{0xE908, 0x6209}, {0xE909, 0x620A}, {0xE90A, 0x620B}, {0xE90B, 0x620C},
@@ -1121,6 +1164,9 @@ static bool direct_legacy_token_for_evo(uint16_t evoToken, uint16_t& legacyToken
11211164
{0xE932, 0x6234}, {0xE933, 0x6235}, {0xE934, 0x6236}, {0xE935, 0x6237},
11221165
{0xE936, 0x6238}, {0xE937, 0x6239}, {0xE938, 0x623A}, {0xE939, 0x623B},
11231166
{0xE93A, 0x623C}, {0xE980, 0x6304}, {0xE981, 0x6305}, {0xE982, 0x6332},
1167+
{0xE93B, 0x6203}, {0xE93C, 0x622B}, {0xE93D, 0x622E}, {0xE93E, 0xBBA6},
1168+
{0xE941, 0x622D}, {0xE942, 0x6230}, {0xE943, 0x6206}, {0xE944, 0x622C},
1169+
{0xE945, 0x622F}, {0xE946, 0xBBCB}, {0xE95C, 0x6227},
11241170
{0xE983, 0x6306}, {0xE984, 0x6307}, {0xE985, 0x6308}, {0xE986, 0x6309},
11251171
{0xE987, 0x6333}, {0xE98F, 0x630A}, {0xE990, 0x630B}, {0xE991, 0x6302},
11261172
{0xE992, 0x6336}, {0xE993, 0x630C}, {0xE994, 0x630D}, {0xE995, 0x6303},
@@ -1600,6 +1646,7 @@ static bool direct_evo_token_for_legacy(uint16_t legacyToken, uint16_t& evoToken
16001646
{0x7F, 0xE5BD}, {0x80, 0xE5BE}, {0x81, 0xE5BF}, {0xEF73, 0xE5C0},
16011647
{0xEF74, 0xE5C1}, {0xEF75, 0xE5C2},
16021648
{0xBB57, 0xE4F9}, {0xBB64, 0xE593}, {0xE8, 0xE6C6}, {0xE7, 0xE6C7},
1649+
{0xEF79, 0xE6AE},
16031650
{0x6201, 0xE900}, {0x6202, 0xE901}, {0x6203, 0xE902}, {0x6204, 0xE903},
16041651
{0x6205, 0xE904}, {0x6206, 0xE905}, {0x6207, 0xE906}, {0x6208, 0xE907},
16051652
{0x6209, 0xE908}, {0x620A, 0xE909}, {0x620B, 0xE90A}, {0x620C, 0xE90B},
@@ -1948,20 +1995,8 @@ static bool direct_evo_token_for_legacy(uint16_t legacyToken, uint16_t& evoToken
19481995

19491996
static bool tokenized_legacy_payload_for_evo(uint16_t evoToken, data_t& payload)
19501997
{
1951-
if (legacy_payload_for_evo_ucs2(evoToken, payload))
1952-
{
1953-
return true;
1954-
}
1955-
1956-
uint16_t legacyToken = 0;
1957-
if (!direct_legacy_token_for_evo(evoToken, legacyToken))
1958-
{
1959-
return false;
1960-
}
1961-
1962-
payload.clear();
1963-
append_legacy_token(payload, legacyToken);
1964-
return true;
1998+
return direct_legacy_payload_for_evo(evoToken, payload)
1999+
|| legacy_payload_for_evo_ucs2(evoToken, payload);
19652000
}
19662001

19672002
data_t evo_tokenized_data_to_legacy(const data_t& evoData)

tests.cpp

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <array>
1717
#include <vector>
1818
#include <tuple>
19+
#include <utility>
1920

2021
#ifndef _WIN32
2122
#include <sys/stat.h>
@@ -64,6 +65,26 @@ struct ScopedStderrCapture
6465
}
6566
};
6667

68+
// Test helper: encodes one 16-bit (BMP) code point as UTF-8.
//
// Produces 1 byte below 0x80, 2 bytes below 0x800 and 3 bytes otherwise.
// Surrogate values (0xD800-0xDFFF) are not special-cased; they go through the
// 3-byte form like any other value.
//
// @param codepoint  Code point to encode.
// @return the UTF-8 byte sequence as a std::string.
static std::string utf8_from_bmp_codepoint(uint16_t codepoint)
{
    std::string utf8;
    if (codepoint < 0x80)
    {
        // 0xxxxxxx
        utf8.push_back(static_cast<char>(codepoint));
    }
    else if (codepoint < 0x800)
    {
        // 110xxxxx 10xxxxxx
        utf8.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
        utf8.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
    }
    else
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        utf8.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
        utf8.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
        utf8.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
    }
    return utf8;
}
87+
6788
static void assert_roundtrip_from_readable(TIVarFile& original, const options_t& readableOptions = options_t{})
6889
{
6990
assert(original.hasMultipleEntries() == false);
@@ -1614,6 +1635,48 @@ int main(int argc, char** argv)
16141635
assert(checkedAliasTokenCount == 544);
16151636
}
16161637

1638+
{
1639+
const auto evo_token_data = [](uint16_t token) {
1640+
return data_t{
1641+
static_cast<uint8_t>(token & 0xFF),
1642+
static_cast<uint8_t>((token >> 8) & 0xFF),
1643+
0x00,
1644+
0x00
1645+
};
1646+
};
1647+
1648+
const std::vector<std::pair<uint16_t, data_t>> evoToLegacyAliases = {
1649+
{0xE6AE, {0x02, 0x00, 0xEF, 0x79}},
1650+
{0xE93B, {0x02, 0x00, 0x62, 0x03}},
1651+
{0xE93C, {0x02, 0x00, 0x62, 0x2B}},
1652+
{0xE93D, {0x02, 0x00, 0x62, 0x2E}},
1653+
{0xE93E, {0x02, 0x00, 0xBB, 0xA6}},
1654+
{0xE941, {0x02, 0x00, 0x62, 0x2D}},
1655+
{0xE942, {0x02, 0x00, 0x62, 0x30}},
1656+
{0xE943, {0x02, 0x00, 0x62, 0x06}},
1657+
{0xE944, {0x02, 0x00, 0x62, 0x2C}},
1658+
{0xE945, {0x02, 0x00, 0x62, 0x2F}},
1659+
{0xE946, {0x02, 0x00, 0xBB, 0xCB}},
1660+
{0xE95C, {0x02, 0x00, 0x62, 0x27}},
1661+
{0xE9D6, {0x03, 0x00, 0xBB, 0xEC, 0x0A}},
1662+
{0xE9D7, {0x03, 0x00, 0xBB, 0xEC, 0xAF}},
1663+
{0xE9D8, {0x03, 0x00, 0xBB, 0xEC, 0x0B}},
1664+
};
1665+
1666+
for (const auto& [evoToken, expectedLegacyData] : evoToLegacyAliases)
1667+
{
1668+
ScopedStderrCapture stderrCapture;
1669+
assert(EvoFormat::evo_tokenized_data_to_legacy(evo_token_data(evoToken)) == expectedLegacyData);
1670+
assert(stderrCapture.str().find("Cannot convert Evo token") == std::string::npos);
1671+
}
1672+
1673+
assert(EvoFormat::detokenize_evo_token_words(evo_token_data(0xE9D6)) == "►ʳ");
1674+
assert(EvoFormat::detokenize_evo_token_words(evo_token_data(0xE9D7)) == "►ᵍ");
1675+
assert(EvoFormat::detokenize_evo_token_words(evo_token_data(0xE9D8)) == "►º");
1676+
1677+
assert(EvoFormat::legacy_tokenized_data_to_evo({0x02, 0x00, 0xEF, 0x79}) == evo_token_data(0xE6AE));
1678+
}
1679+
16171680
{
16181681
const std::string fromGradPath = "/tmp/tivars_evo_from_grad.8xp2";
16191682
TIVarFile fromGradProgram = TIVarFile::createNew("Program", "GRAD", "84Evo");
@@ -1694,6 +1757,53 @@ int main(int argc, char** argv)
16941757
assert(code.find("\\u0178") != std::string::npos);
16951758
}
16961759

1760+
{
1761+
const std::vector<std::pair<uint16_t, uint16_t>> acceptedUcs2Ranges = {
1762+
{0x0394, 0x0394}, {0x03A3, 0x03A3}, {0x03A9, 0x03A9},
1763+
{0x03B1, 0x03B5}, {0x03B8, 0x03B8}, {0x03BB, 0x03BC},
1764+
{0x03C0, 0x03C1}, {0x03C3, 0x03C4}, {0x03C6, 0x03C7},
1765+
{0x2010, 0x2010}, {0x2026, 0x2026}, {0x2070, 0x2070},
1766+
{0x2074, 0x2079}, {0x2080, 0x2089}, {0x2122, 0x2122},
1767+
{0x2190, 0x2193}, {0x221A, 0x221A}, {0x2220, 0x2220},
1768+
{0x222B, 0x222B}, {0x2260, 0x2260}, {0x2264, 0x2265},
1769+
{0x238C, 0x238C}, {0x25A0, 0x25A0}, {0x25AB, 0x25AB},
1770+
{0x25B2, 0x25B2}, {0x25B6, 0x25B6}, {0x25B8, 0x25B8},
1771+
{0x25BC, 0x25BC}, {0x25C0, 0x25C0}, {0x25C2, 0x25C2},
1772+
{0xF000, 0xF032}, {0xF038, 0xF03A}, {0xF041, 0xF04D},
1773+
{0xF04F, 0xF058}, {0xF05B, 0xF061},
1774+
};
1775+
1776+
data_t rawData;
1777+
std::string expectedCode;
1778+
for (const auto& [first, last] : acceptedUcs2Ranges)
1779+
{
1780+
for (uint16_t codepoint = first; codepoint <= last; ++codepoint)
1781+
{
1782+
rawData.push_back(static_cast<uint8_t>(codepoint & 0xFF));
1783+
rawData.push_back(static_cast<uint8_t>((codepoint >> 8) & 0xFF));
1784+
expectedCode += utf8_from_bmp_codepoint(codepoint);
1785+
}
1786+
}
1787+
rawData.push_back(0x00);
1788+
rawData.push_back(0x00);
1789+
1790+
TIVarFile rawAcceptedUcs2Program = TIVarFile::createNew("Program", "UCS2OK", "84Evo");
1791+
rawAcceptedUcs2Program.setContentFromData(rawData);
1792+
assert(json::parse(rawAcceptedUcs2Program.getReadableContent())["code"] == expectedCode);
1793+
1794+
assert(EvoFormat::tokenize_evo_token_words(utf8_from_bmp_codepoint(0x25C2)) == data_t({0xC2, 0x25, 0x00, 0x00}));
1795+
assert(EvoFormat::tokenize_evo_token_words(utf8_from_bmp_codepoint(0xF061)) == data_t({0x61, 0xF0, 0x00, 0x00}));
1796+
assert(EvoFormat::tokenize_evo_token_words("\\u03C0") == data_t({0xC0, 0x03, 0x00, 0x00}));
1797+
assert(EvoFormat::tokenize_evo_token_words("\\uF000") == data_t({0x00, 0xF0, 0x00, 0x00}));
1798+
assert(EvoFormat::evo_tokenized_data_to_legacy({0xC0, 0x03, 0x00, 0x00}) == data_t({0x02, 0x00, 0xBB, 0xA7}));
1799+
1800+
TIVarFile rawRejectedUcs2Program = TIVarFile::createNew("Program", "UCS2BAD", "84Evo");
1801+
rawRejectedUcs2Program.setContentFromData({0xFF, 0x02, 0x10, 0x20, 0x00, 0x00});
1802+
const std::string rejectedCode = json::parse(rawRejectedUcs2Program.getReadableContent())["code"];
1803+
assert(rejectedCode.find("\\u02FF") != std::string::npos);
1804+
assert(rejectedCode.find("\\u2010") == std::string::npos);
1805+
}
1806+
16971807
{
16981808
assert(EvoFormat::tokenize_evo_token_words("ŷ") == data_t({0x77, 0x01, 0x00, 0x00}));
16991809

0 commit comments

Comments
 (0)