@@ -2915,44 +2915,94 @@ static void DecodeUtf8(const char * s, lChar16 * p, int len)
29152915 }
29162916}
29172917
// UTF-8 continuation-byte test: true when the byte at s[index] has its top
// two bits equal to 10, i.e. (byte & 0xC0) == 0x80.
// The macro reads a byte buffer named `s` from the scope where it is expanded;
// the caller must make sure that s[index] lies inside that buffer.
// There must be no whitespace between the macro name and '(' in the definition,
// otherwise the preprocessor treats it as an object-like macro.
#define IS_FOLLOWING(index) ((s[(index)] & 0xC0) == 0x80)
2920+
29182921void Utf8ToUnicode (const lUInt8 * src, int &srclen, lChar16 * dst, int &dstlen)
29192922{
29202923 const lUInt8 * s = src;
29212924 const lUInt8 * ends = s + srclen;
29222925 lChar16 * p = dst;
29232926 lChar16 * endp = p + dstlen;
29242927 lUInt32 ch;
2928+ bool matched;
29252929 while (p < endp && s < ends) {
29262930 ch = *s;
2931+ matched = false ;
29272932 if ( (ch & 0x80 ) == 0 ) {
2933+ matched = true ;
29282934 *p++ = (char )ch;
29292935 s++;
29302936 } else if ( (ch & 0xE0 ) == 0xC0 ) {
29312937 if (s + 2 > ends)
29322938 break ;
2933- *p++ = ((ch & 0x1F ) << 6 )
2934- | CONT_BYTE (1 ,0 );
2935- s += 2 ;
2939+ if (IS_FOLLOWING (1 )) {
2940+ matched = true ;
2941+ *p++ = ((ch & 0x1F ) << 6 )
2942+ | CONT_BYTE (1 ,0 );
2943+ s += 2 ;
2944+ }
29362945 } else if ( (ch & 0xF0 ) == 0xE0 ) {
29372946 if (s + 3 > ends)
29382947 break ;
2939- *p++ = ((ch & 0x0F ) << 12 )
2940- | CONT_BYTE (1 ,6 )
2941- | CONT_BYTE (2 ,0 );
2942- s += 3 ;
2948+ if (IS_FOLLOWING (1 ) && IS_FOLLOWING (2 )) {
2949+ matched = true ;
2950+ *p++ = ((ch & 0x0F ) << 12 )
2951+ | CONT_BYTE (1 ,6 )
2952+ | CONT_BYTE (2 ,0 );
2953+ s += 3 ;
2954+ // Supports WTF-8 : https://en.wikipedia.org/wiki/UTF-8#WTF-8
2955+ // a superset of UTF-8, that includes UTF-16 surrogates
2956+ // in UTF-8 bytes (forbidden in well-formed UTF-8).
2957+ // We may get that from bad producers or converters.
2958+ // As these shouldn't be there in UTF-8, if we find
2959+ // these surrogates in the right sequence, we might as well
2960+ // convert the char they represent to the right Unicode
2961+ // codepoint and display it instead of a '?'.
2962+ // Surrogates are code points from two special ranges of
2963+ // Unicode values, reserved for use as the leading, and
2964+ // trailing values of paired code units in UTF-16. Leading,
2965+ // also called high, surrogates are from D800 to DBFF, and
2966+ // trailing, or low, surrogates are from DC00 to DFFF. They
2967+ // are called surrogates, since they do not represent
2968+ // characters directly, but only as a pair.
2969+ // (Note that lChar16 (wchar_t) is 4-bytes, and can store
2970+ // unicode codepoint > 0xFFFF like 0x10123)
2971+ if (*(p-1 ) >= 0xD800 && *(p-1 ) <= 0xDBFF && s+2 < ends) { // what we wrote is a high surrogate,
2972+ lUInt32 next = *s; // and there's room next for a low surrogate
2973+ if ( (next & 0xF0 ) == 0xE0 && IS_FOLLOWING (1 ) && IS_FOLLOWING (2 )) { // is a valid 3-bytes sequence
2974+ next = ((next & 0x0F ) << 12 ) | CONT_BYTE (1 ,6 ) | CONT_BYTE (2 ,0 );
2975+ if (next >= 0xDC00 && next <= 0xDFFF ) { // is a low surrogate: valid surrogates sequence
2976+ ch = 0x10000 + ((*(p-1 ) & 0x3FF )<<10 ) + (next & 0x3FF );
2977+ p--; // rewind to override what we wrote
2978+ *p++ = ch;
2979+ s += 3 ;
2980+ }
2981+ }
2982+ }
2983+ }
29432984 } else if ( (ch & 0xF8 ) == 0xF0 ) {
29442985 if (s + 4 > ends)
29452986 break ;
2946- *p++ = ((ch & 0x07 ) << 18 )
2947- | CONT_BYTE (1 ,12 )
2948- | CONT_BYTE (2 ,6 )
2949- | CONT_BYTE (3 ,0 );
2950- s += 4 ;
2987+ if (IS_FOLLOWING (1 ) && IS_FOLLOWING (2 ) && IS_FOLLOWING (3 )) {
2988+ matched = true ;
2989+ *p++ = ((ch & 0x07 ) << 18 )
2990+ | CONT_BYTE (1 ,12 )
2991+ | CONT_BYTE (2 ,6 )
2992+ | CONT_BYTE (3 ,0 );
2993+ s += 4 ;
2994+ }
29512995 } else {
29522996 // Invalid first byte in UTF-8 sequence
29532997 // Pass with mask 0x7F, to resolve exception around env->NewStringUTF()
29542998 *p++ = (char ) (ch & 0x7F );
29552999 s++;
3000+ matched = true ; // just to avoid next if
3001+ }
3002+ // unexpected character
3003+ if (!matched) {
3004+ *p++ = ' ?' ;
3005+ s++;
29563006 }
29573007 }
29583008 srclen = (int )(s - src);
0 commit comments