@@ -64,34 +64,83 @@ namespace pl::gen::fmt {
6464 if (pattern->getVisibility () == ptrn::Visibility::Hidden) return ;
6565 if (pattern->getVisibility () == ptrn::Visibility::TreeHidden) return ;
6666
67- const auto string = pattern->toString ();
67+ const auto rawBytes = pattern->toString ();
6868
6969 std::string result;
70- const auto decodedString = wolv::util::utf8ToUtf32 (string, true ).value ();
71- for (const auto &ch : decodedString) {
72- switch (ch) {
73- case U' "' : result += " \\\" " ; break ;
74- case U' \\ ' : result += " \\\\ " ; break ;
75- case U' \b ' : result += " \\ b" ; break ;
76- case U' \f ' : result += " \\ f" ; break ;
77- case U' \n ' : result += " \\ n" ; break ;
78- case U' \r ' : result += " \\ r" ; break ;
79- case U' \t ' : result += " \\ t" ; break ;
80- default :
81- if (ch < 0x20 ) {
82- result += ::fmt::format (" \\ u{:04x}" , static_cast <u32 >(ch));
83- } else if (ch <= 0xFFFF ) {
84- result += static_cast <char >(ch);
85- } else {
86- u32 code = static_cast <u32 >(ch) - 0x10000 ;
87- u16 highSurrogate = 0xD800 + ((code >> 10 ) & 0x3FF );
88- u16 lowSurrogate = 0xDC00 + (code & 0x3FF );
89- result += ::fmt::format (" \\ u{:04x}\\ u{:04x}" , highSurrogate, lowSurrogate);
90- }
91- break ;
70+ for (size_t i = 0 ; i < rawBytes.size (); ) {
71+ const u8 ch = static_cast <u8 >(rawBytes[i]);
72+
73+ // Determine UTF-8 sequence length
74+ size_t seqLen = 0 ;
75+ u32 codepoint = 0 ;
76+ if (ch < 0x80 ) {
77+ seqLen = 1 ;
78+ codepoint = ch;
79+ } else if ((ch & 0xE0 ) == 0xC0 && ch >= 0xC2 ) {
80+ seqLen = 2 ;
81+ codepoint = ch & 0x1F ;
82+ } else if ((ch & 0xF0 ) == 0xE0 ) {
83+ seqLen = 3 ;
84+ codepoint = ch & 0x0F ;
85+ } else if ((ch & 0xF8 ) == 0xF0 && ch <= 0xF4 ) {
86+ seqLen = 4 ;
87+ codepoint = ch & 0x07 ;
88+ }
89+
90+ // Validate continuation bytes
91+ bool valid = seqLen > 0 ;
92+ for (size_t j = 1 ; j < seqLen && valid; ++j) {
93+ const u8 cb = static_cast <u8 >(rawBytes[i + j]);
94+ if (i + j >= rawBytes.size () || (cb & 0xC0 ) != 0x80 ) {
95+ valid = false ;
96+ } else {
97+ codepoint = (codepoint << 6 ) | (cb & 0x3F );
9298 }
9399 }
94100
101+ // Reject overlong sequences and surrogates
102+ if (valid) {
103+ if (seqLen == 2 && codepoint < 0x80 ) valid = false ;
104+ if (seqLen == 3 && codepoint < 0x800 ) valid = false ;
105+ if (seqLen == 4 && codepoint < 0x10000 ) valid = false ;
106+ if (codepoint >= 0xD800 && codepoint <= 0xDFFF ) valid = false ;
107+ if (codepoint > 0x10FFFF ) valid = false ;
108+ }
109+
110+ if (!valid) {
111+ // Escape the single invalid byte
112+ result += ::fmt::format (" \\ u{:04x}" , static_cast <u32 >(ch));
113+ i += 1 ;
114+ continue ;
115+ }
116+
117+ // Emit the codepoint
118+ switch (codepoint) {
119+ case U' "' : result += " \\\" " ; break ;
120+ case U' \\ ' : result += " \\\\ " ; break ;
121+ case U' \b ' : result += " \\ b" ; break ;
122+ case U' \f ' : result += " \\ f" ; break ;
123+ case U' \n ' : result += " \\ n" ; break ;
124+ case U' \r ' : result += " \\ r" ; break ;
125+ case U' \t ' : result += " \\ t" ; break ;
126+ default :
127+ if (codepoint < 0x20 || codepoint == 0x7F ) {
128+ result += ::fmt::format (" \\ u{:04x}" , codepoint);
129+ } else if (codepoint <= 0xFFFF ) {
130+ // Append the original UTF-8 bytes directly
131+ for (size_t j = 0 ; j < seqLen; ++j)
132+ result += rawBytes[i + j];
133+ } else {
134+ u32 code = codepoint - 0x10000 ;
135+ u16 highSurrogate = 0xD800 + ((code >> 10 ) & 0x3FF );
136+ u16 lowSurrogate = 0xDC00 + (code & 0x3FF );
137+ result += ::fmt::format (" \\ u{:04x}\\ u{:04x}" , highSurrogate, lowSurrogate);
138+ }
139+ break ;
140+ }
141+ i += seqLen;
142+ }
143+
95144 addLine (pattern->getVariableName (), ::fmt::format (" \" {}\" ," , result));
96145 }
97146
0 commit comments