@@ -51,6 +51,67 @@ namespace tivars::EvoFormat
5151 static_cast <char >(0x80 | (codepoint & 0x3F )),
5252 };
5353 }
54+
/// Tells whether a UCS-2 code unit belongs to the set of characters this
/// module treats as directly displayable text.
///
/// Accepted values are:
///   - 0x0020..0x007E (printable ASCII),
///   - 0x00A0..0x00FF (upper half of Latin-1),
///   - 0x0177 (the single extra code point U+0177).
///
/// @param codepoint  UCS-2 code unit to classify.
/// @return true when the code unit falls in one of the ranges above.
bool is_displayable_ucs2_scalar(uint16_t codepoint)
{
    // Printable ASCII: space through tilde.
    if (codepoint >= 0x0020 && codepoint <= 0x007E)
    {
        return true;
    }

    // Latin-1 upper half: no-break space through 0xFF.
    if (codepoint >= 0x00A0 && codepoint <= 0x00FF)
    {
        return true;
    }

    // The only accepted code point outside the two ranges above.
    return codepoint == 0x0177;
}
61+
/// Decodes a string that must consist of exactly one well-formed UTF-8
/// sequence encoding a code point in the Basic Multilingual Plane.
///
/// Only 1-, 2- and 3-byte sequences are accepted; a 4-byte lead byte (or any
/// other invalid lead byte) is rejected because the result could not fit in a
/// single 16-bit code unit. Ill-formed input is rejected rather than decoded:
///   - truncated sequences or trailing extra bytes,
///   - continuation bytes that are not of the form 10xxxxxx,
///   - overlong encodings (a value encoded with more bytes than necessary),
///   - UTF-16 surrogate values 0xD800..0xDFFF, which are not Unicode scalars.
///
/// @param text       Bytes to decode; must be exactly one UTF-8 sequence.
/// @param codepoint  Receives the decoded value on success; left untouched on
///                   failure.
/// @return true when `text` was a single well-formed sequence, false otherwise.
bool utf8_to_single_codepoint(const std::string& text, uint16_t& codepoint)
{
    if (text.empty())
    {
        return false;
    }

    const uint8_t lead = static_cast<uint8_t>(text[0]);
    size_t length = 0;
    uint32_t value = 0;
    uint32_t minimum = 0; // Smallest value that legitimately needs `length` bytes.
    if ((lead & 0x80) == 0)
    {
        length = 1;
        value = lead;
        minimum = 0x0000;
    }
    else if ((lead & 0xE0) == 0xC0)
    {
        length = 2;
        value = lead & 0x1F;
        minimum = 0x0080;
    }
    else if ((lead & 0xF0) == 0xE0)
    {
        length = 3;
        value = lead & 0x0F;
        minimum = 0x0800;
    }
    else
    {
        // Stray continuation byte, 4-byte lead, or invalid lead byte.
        return false;
    }

    // The whole string must be this one sequence: no truncation, no extra bytes.
    if (text.size() != length)
    {
        return false;
    }

    for (size_t i = 1; i < length; ++i)
    {
        const uint8_t continuation = static_cast<uint8_t>(text[i]);
        if ((continuation & 0xC0) != 0x80)
        {
            return false;
        }
        value = (value << 6) | (continuation & 0x3F);
    }

    // Reject overlong forms such as C1 81 ('A' in two bytes): accepting them
    // would let several byte strings alias the same character.
    if (value < minimum)
    {
        return false;
    }

    // Surrogates are reserved for UTF-16 pairs and are never valid in UTF-8.
    if (value >= 0xD800 && value <= 0xDFFF)
    {
        return false;
    }

    // A sequence of at most three bytes cannot exceed 0xFFFF, so the cast is lossless.
    codepoint = static_cast<uint16_t>(value);
    return true;
}
54115 }
55116
56117uint16_t evo_checksum (const data_t & body)
@@ -690,19 +751,14 @@ static bool direct_legacy_token_for_evo(uint16_t evoToken, uint16_t& legacyToken
690751static std::string evo_token_to_string (uint16_t token)
691752{
692753 if (token == 0x0000 ) return " " ;
754+ if (is_displayable_ucs2_scalar (token)) return utf8_from_codepoint (token);
693755 if (token >= 0xE800 && token <= 0xE819 ) return std::string (1 , static_cast <char >(' A' + (token - 0xE800 )));
694756 if (token == 0xE81A ) return " θ" ;
695757 if (token >= 0xE401 && token <= 0xE40A ) return std::string (1 , static_cast <char >(' 0' + (token - 0xE401 )));
696758 if (token >= 0xE830 && token <= 0xE835 ) return " L" + std::to_string (token - 0xE830 + 1 );
697759 if (token >= 0xE820 && token <= 0xE829 ) return " [" + std::string (1 , static_cast <char >(' A' + (token - 0xE820 ))) + " ]" ;
698- if (token >= 0x20 && token <= 0x7E && !(token >= ' A' && token <= ' Z' )) return std::string (1 , static_cast <char >(token));
699760 if (token >= 0xE840 && token <= 0xE849 ) return " Y" + std::to_string (token == 0xE849 ? 0 : token - 0xE840 + 1 );
700761 if (token >= 0xE8A0 && token <= 0xE8A9 ) return " Str" + std::to_string (token == 0xE8A9 ? 0 : token - 0xE8A0 + 1 );
701- if ((token >= 0x00A1 && token <= 0x00FF ) || (token >= 0x0391 && token <= 0x03C9 )) return utf8_from_codepoint (token);
702- if ((token >= 0x2070 && token <= 0x209F ) || token == 0x02E3 || token == 0x029F || token == 0x1D1B ) return utf8_from_codepoint (token);
703- if (token == 0x2026 || token == 0x2191 || token == 0x2193 || token == 0x221A || token == 0x2220 || token == 0x222B ) return utf8_from_codepoint (token);
704- if (token == 0x2338 || token == 0x25A1 || token == 0x25BA || token == 0x25C4 || token == 0xFE62 ) return utf8_from_codepoint (token);
705- if (token == 0x007C || token == 0x0060 ) return utf8_from_codepoint (token);
706762 if (token == 0xE41A ) return " '" ;
707763 if (token == 0xE424 ) return " ᵍ" ;
708764 if (token == 0xE589 ) return " Grad" ;
@@ -760,6 +816,42 @@ static void append_evo_token(data_t& out, uint16_t evoToken)
760816 out.push_back (static_cast <uint8_t >((evoToken >> 8 ) & 0xFF ));
761817}
762818
819+ static bool legacy_payload_for_evo_ucs2 (uint16_t evoToken, data_t & payload)
820+ {
821+ if (!is_displayable_ucs2_scalar (evoToken))
822+ {
823+ return false ;
824+ }
825+
826+ try
827+ {
828+ const data_t legacy = TypeHandlers::TH_Tokenized::makeDataFromString (utf8_from_codepoint (evoToken));
829+ if (legacy.size () < 3 || legacy.size () != static_cast <size_t >(2 + legacy[0 ] + (legacy[1 ] << 8 )))
830+ {
831+ return false ;
832+ }
833+ payload.assign (legacy.begin () + 2 , legacy.end ());
834+ return !payload.empty ();
835+ }
836+ catch (...)
837+ {
838+ return false ;
839+ }
840+ }
841+
842+ static bool legacy_token_to_evo_ucs2 (uint16_t legacyToken, uint16_t & evoToken)
843+ {
844+ const std::string text = TypeHandlers::TH_Tokenized::oneTokenBytesToString (legacyToken);
845+ uint16_t codepoint = 0 ;
846+ if (!utf8_to_single_codepoint (text, codepoint) || !is_displayable_ucs2_scalar (codepoint))
847+ {
848+ return false ;
849+ }
850+
851+ evoToken = codepoint;
852+ return true ;
853+ }
854+
763855static bool direct_legacy_token_for_evo (uint16_t evoToken, uint16_t & legacyToken)
764856{
765857 if (evoToken >= 0xE401 && evoToken <= 0xE40A )
@@ -1749,6 +1841,11 @@ static bool direct_evo_token_for_legacy(uint16_t legacyToken, uint16_t& evoToken
17491841
17501842static bool tokenized_legacy_payload_for_evo (uint16_t evoToken, data_t & payload)
17511843{
1844+ if (legacy_payload_for_evo_ucs2 (evoToken, payload))
1845+ {
1846+ return true ;
1847+ }
1848+
17521849 uint16_t legacyToken = 0 ;
17531850 if (!direct_legacy_token_for_evo (evoToken, legacyToken))
17541851 {
@@ -1828,7 +1925,11 @@ data_t legacy_tokenized_data_to_evo(const data_t& legacyData, bool smartConversi
18281925 }
18291926
18301927 uint16_t evoToken = 0 ;
1831- if (!direct_evo_token_for_legacy (legacyToken, evoToken))
1928+ if (isWithinString && legacyToken != legacyQuote && legacy_token_to_evo_ucs2 (legacyToken, evoToken))
1929+ {
1930+ // Text inside Evo strings is stored as UCS-2 code units.
1931+ }
1932+ else if (!direct_evo_token_for_legacy (legacyToken, evoToken))
18321933 {
18331934 std::cerr << " [Warning] Cannot convert 84+CE token "
18341935 << TypeHandlers::TH_Tokenized::oneTokenBytesToString (legacyToken)
0 commit comments