Skip to content

Commit a349a57

Browse files
poire-zvirxkane
authored and committed
Supports book text encoded in WTF-8
WTF-8 (https://en.wikipedia.org/wiki/UTF-8#WTF-8) is a superset of UTF-8 that also allows UTF-16 surrogates encoded as UTF-8 byte sequences (forbidden in well-formed UTF-8). We may get such text from bad producers or converters. (cherry picked from commit 7145b86)
1 parent 6d0699a commit a349a57

File tree

1 file changed

+62
-12
lines changed

1 file changed

+62
-12
lines changed

crengine/src/lvstring.cpp

Lines changed: 62 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2915,44 +2915,94 @@ static void DecodeUtf8(const char * s, lChar16 * p, int len)
29152915
}
29162916
}
29172917

2918+
// Top two bits are 10, i.e. original & 11000000(2) == 10000000(2)
2919+
// Evaluates to true when the byte at offset `index` from `s` has the bit
// pattern 10xxxxxx. The macro reads a pointer/array named `s` from the scope
// where it is expanded; it does not check that `index` is within bounds.
#define IS_FOLLOWING(index) (0x80 == (s[(index)] & 0xC0))
2920+
29182921
void Utf8ToUnicode(const lUInt8 * src, int &srclen, lChar16 * dst, int &dstlen)
29192922
{
29202923
const lUInt8 * s = src;
29212924
const lUInt8 * ends = s + srclen;
29222925
lChar16 * p = dst;
29232926
lChar16 * endp = p + dstlen;
29242927
lUInt32 ch;
2928+
bool matched;
29252929
while (p < endp && s < ends) {
29262930
ch = *s;
2931+
matched = false;
29272932
if ( (ch & 0x80) == 0 ) {
2933+
matched = true;
29282934
*p++ = (char)ch;
29292935
s++;
29302936
} else if ( (ch & 0xE0) == 0xC0 ) {
29312937
if (s + 2 > ends)
29322938
break;
2933-
*p++ = ((ch & 0x1F) << 6)
2934-
| CONT_BYTE(1,0);
2935-
s += 2;
2939+
if (IS_FOLLOWING(1)) {
2940+
matched = true;
2941+
*p++ = ((ch & 0x1F) << 6)
2942+
| CONT_BYTE(1,0);
2943+
s += 2;
2944+
}
29362945
} else if ( (ch & 0xF0) == 0xE0 ) {
29372946
if (s + 3 > ends)
29382947
break;
2939-
*p++ = ((ch & 0x0F) << 12)
2940-
| CONT_BYTE(1,6)
2941-
| CONT_BYTE(2,0);
2942-
s += 3;
2948+
if (IS_FOLLOWING(1) && IS_FOLLOWING(2)) {
2949+
matched = true;
2950+
*p++ = ((ch & 0x0F) << 12)
2951+
| CONT_BYTE(1,6)
2952+
| CONT_BYTE(2,0);
2953+
s += 3;
2954+
// Supports WTF-8 : https://en.wikipedia.org/wiki/UTF-8#WTF-8
2955+
// a superset of UTF-8, that includes UTF-16 surrogates
2956+
// in UTF-8 bytes (forbidden in well-formed UTF-8).
2957+
// We may get that from bad producers or converters.
2958+
// As these shouldn't be there in UTF-8, if we find
2959+
// these surrogates in the right sequence, we might as well
2960+
// convert the char they represent to the right Unicode
2961+
// codepoint and display it instead of a '?'.
2962+
// Surrogates are code points from two special ranges of
2963+
// Unicode values, reserved for use as the leading, and
2964+
// trailing values of paired code units in UTF-16. Leading,
2965+
// also called high, surrogates are from D800 to DBFF, and
2966+
// trailing, or low, surrogates are from DC00 to DFFF. They
2967+
// are called surrogates, since they do not represent
2968+
// characters directly, but only as a pair.
2969+
// (Note that lChar16 (wchar_t) is 4-bytes, and can store
2970+
// unicode codepoint > 0xFFFF like 0x10123)
2971+
if (*(p-1) >= 0xD800 && *(p-1) <= 0xDBFF && s+2 < ends) { // what we wrote is a high surrogate,
2972+
lUInt32 next = *s; // and there's room next for a low surrogate
2973+
if ( (next & 0xF0) == 0xE0 && IS_FOLLOWING(1) && IS_FOLLOWING(2)) { // is a valid 3-bytes sequence
2974+
next = ((next & 0x0F) << 12) | CONT_BYTE(1,6) | CONT_BYTE(2,0);
2975+
if (next >= 0xDC00 && next <= 0xDFFF) { // is a low surrogate: valid surrogates sequence
2976+
ch = 0x10000 + ((*(p-1) & 0x3FF)<<10) + (next & 0x3FF);
2977+
p--; // rewind to override what we wrote
2978+
*p++ = ch;
2979+
s += 3;
2980+
}
2981+
}
2982+
}
2983+
}
29432984
} else if ( (ch & 0xF8) == 0xF0 ) {
29442985
if (s + 4 > ends)
29452986
break;
2946-
*p++ = ((ch & 0x07) << 18)
2947-
| CONT_BYTE(1,12)
2948-
| CONT_BYTE(2,6)
2949-
| CONT_BYTE(3,0);
2950-
s += 4;
2987+
if (IS_FOLLOWING(1) && IS_FOLLOWING(2) && IS_FOLLOWING(3)) {
2988+
matched = true;
2989+
*p++ = ((ch & 0x07) << 18)
2990+
| CONT_BYTE(1,12)
2991+
| CONT_BYTE(2,6)
2992+
| CONT_BYTE(3,0);
2993+
s += 4;
2994+
}
29512995
} else {
29522996
// Invalid first byte in UTF-8 sequence
29532997
// Pass with mask 0x7F, to resolve exception around env->NewStringUTF()
29542998
*p++ = (char) (ch & 0x7F);
29552999
s++;
3000+
matched = true; // just to avoid next if
3001+
}
3002+
// unexpected character
3003+
if (!matched) {
3004+
*p++ = '?';
3005+
s++;
29563006
}
29573007
}
29583008
srclen = (int)(s - src);

0 commit comments

Comments
 (0)