@@ -475,7 +475,7 @@ function byteLength (string, encoding) {
475475 return len
476476 case 'utf8' :
477477 case 'utf-8' :
478- return utf8ToBytes ( string ) . length
478+ return utf8ByteLength ( string )
479479 case 'ucs2' :
480480 case 'ucs-2' :
481481 case 'utf16le' :
@@ -487,7 +487,7 @@ function byteLength (string, encoding) {
487487 return base64ToBytes ( string ) . length
488488 default :
489489 if ( loweredCase ) {
490- return mustMatch ? - 1 : utf8ToBytes ( string ) . length // assume utf8
490+ return mustMatch ? - 1 : utf8ByteLength ( string ) // assume utf8
491491 }
492492 encoding = ( '' + encoding ) . toLowerCase ( )
493493 loweredCase = true
@@ -865,7 +865,141 @@ function hexWrite (buf, string, offset, length) {
865865}
866866
/*
 * Encode `string` as UTF-8 directly into `buf`, starting at `offset` and
 * using at most `length` bytes.
 *
 * - Unpaired surrogates are written as U+FFFD (via writeInvalid).
 * - A character is only written when all of its bytes fit into the remaining
 *   budget, so no partial sequences are emitted.
 * - Returns the number of bytes written.
 *
 * `length` is used as the sole byte budget; the caller is expected to have
 * limited it to the space available in `buf`.
 */
function utf8Write (buf, string, offset, length) {
  let remaining = length
  let leadSurrogate = 0
  let pos = offset

  for (let i = 0; i < string.length; i++) {
    let codePoint = string.charCodeAt(i)

    // is surrogate component
    if (codePoint > 0xd7ff && codePoint < 0xe000) {
      if (!leadSurrogate) {
        // no lead pending
        if (codePoint > 0xdbff || i + 1 === string.length) {
          // unexpected trail, or a lead that is the last unit of the string
          if (remaining >= 3) pos = writeInvalid(buf, pos)
          remaining -= 3
          continue
        }

        // plausible lead: wait for the next unit before deciding
        leadSurrogate = codePoint
        continue
      }

      // 2 leads in a row: the first one is unpaired, the second stays pending
      if (codePoint < 0xdc00) {
        if (remaining >= 3) pos = writeInvalid(buf, pos)
        remaining -= 3
        leadSurrogate = codePoint
        continue
      }

      // valid surrogate pair -> supplementary code point
      codePoint = (((leadSurrogate - 0xd800) << 10) | (codePoint - 0xdc00)) + 0x10000
    } else if (leadSurrogate) {
      // valid bmp char, but the previous unit was an unpaired lead
      if (remaining >= 3) pos = writeInvalid(buf, pos)
      remaining -= 3
    }

    leadSurrogate = 0

    // encode utf8
    if (codePoint < 0x80) {
      if (remaining < 1) break
      buf[pos++] = codePoint
      remaining -= 1
    } else if (codePoint < 0x800) {
      if (remaining < 2) break
      buf[pos++] = (codePoint >> 6) | 0xc0
      buf[pos++] = (codePoint & 0x3f) | 0x80
      remaining -= 2
    } else if (codePoint < 0x10000) {
      if (remaining < 3) break
      buf[pos++] = (codePoint >> 12) | 0xe0
      buf[pos++] = ((codePoint >> 6) & 0x3f) | 0x80
      buf[pos++] = (codePoint & 0x3f) | 0x80
      remaining -= 3
    } else if (codePoint < 0x110000) {
      if (remaining < 4) break
      buf[pos++] = (codePoint >> 18) | 0xf0
      buf[pos++] = ((codePoint >> 12) & 0x3f) | 0x80
      buf[pos++] = ((codePoint >> 6) & 0x3f) | 0x80
      buf[pos++] = (codePoint & 0x3f) | 0x80
      remaining -= 4
    } else {
      throw new Error('Invalid code point')
    }
  }

  // A lead can still be pending here only when the string ended with a lead
  // that directly followed another lead. Every `break` above happens after
  // leadSurrogate was reset, so this never fires on an early exit.
  if (leadSurrogate && remaining >= 3) pos = writeInvalid(buf, pos)

  return pos - offset
}
948+
/*
 * Compute how many bytes `string` occupies when encoded as UTF-8, without
 * building the encoded bytes.
 *
 * Unpaired surrogates are counted as U+FFFD (3 bytes each); a valid
 * surrogate pair is counted as one 4-byte sequence.
 */
function utf8ByteLength (string) {
  let leadSurrogate = 0
  let size = 0

  for (let i = 0; i < string.length; i++) {
    let codePoint = string.charCodeAt(i)

    // is surrogate component
    if (codePoint > 0xd7ff && codePoint < 0xe000) {
      if (!leadSurrogate) {
        // no lead pending
        if (codePoint > 0xdbff || i + 1 === string.length) {
          // unexpected trail, or a lead that is the last unit of the string
          size += 3
          continue
        }

        // plausible lead: wait for the next unit before deciding
        leadSurrogate = codePoint
        continue
      }

      // 2 leads in a row: the first one is unpaired, the second stays pending
      if (codePoint < 0xdc00) {
        size += 3
        leadSurrogate = codePoint
        continue
      }

      // valid surrogate pair -> supplementary code point
      codePoint = (((leadSurrogate - 0xd800) << 10) | (codePoint - 0xdc00)) + 0x10000
    } else if (leadSurrogate) {
      // valid bmp char, but the previous unit was an unpaired lead
      size += 3
    }

    leadSurrogate = 0

    // 1 byte, plus one more for each UTF-8 length threshold crossed
    size += 1
    size += (codePoint >= 0x80) | 0
    size += (codePoint >= 0x800) | 0
    size += (codePoint >= 0x10000) | 0
  }

  // A lead is still pending only when the string ended with a lead that
  // directly followed another lead; it is unpaired and must be counted too.
  if (leadSurrogate) size += 3

  return size
}
8701004
8711005function asciiWrite ( buf , string , offset , length ) {
@@ -1991,90 +2125,18 @@ function base64clean (str) {
19912125 return str
19922126}
19932127
/*
 * Encode `string` as UTF-8 and return the bytes as a plain array.
 *
 * `units` is an optional byte budget (a falsy value means unlimited).
 * Encoding stops at the first well-formed character that does not fit.
 * Unpaired surrogates are emitted as U+FFFD (EF BF BD) when 3 bytes of
 * budget are left, and are charged against the budget either way.
 */
function utf8ToBytes (string, units) {
  units = units || Infinity
  const length = string.length
  const bytes = []
  let leadSurrogate = null
  let codePoint

  for (let i = 0; i < length; ++i) {
    codePoint = string.charCodeAt(i)

    // is surrogate component
    if (codePoint > 0xD7FF && codePoint < 0xE000) {
      if (!leadSurrogate) {
        // no lead pending
        if (codePoint > 0xDBFF || i + 1 === length) {
          // unexpected trail, or a lead that is the last unit of the string
          if ((units -= 3) > -1) bytes.push(0xEF, 0xBF, 0xBD)
          continue
        }

        // plausible lead: wait for the next unit before deciding
        leadSurrogate = codePoint
        continue
      }

      // 2 leads in a row: the first one is unpaired, the second stays pending
      if (codePoint < 0xDC00) {
        if ((units -= 3) > -1) bytes.push(0xEF, 0xBF, 0xBD)
        leadSurrogate = codePoint
        continue
      }

      // valid surrogate pair -> supplementary code point
      codePoint = (((leadSurrogate - 0xD800) << 10) | (codePoint - 0xDC00)) + 0x10000
    } else if (leadSurrogate) {
      // valid bmp char, but the previous unit was an unpaired lead
      if ((units -= 3) > -1) bytes.push(0xEF, 0xBF, 0xBD)
    }

    leadSurrogate = null

    // encode utf8
    if (codePoint < 0x80) {
      if ((units -= 1) < 0) break
      bytes.push(codePoint)
    } else if (codePoint < 0x800) {
      if ((units -= 2) < 0) break
      bytes.push(
        (codePoint >> 0x6) | 0xC0,
        (codePoint & 0x3F) | 0x80
      )
    } else if (codePoint < 0x10000) {
      if ((units -= 3) < 0) break
      bytes.push(
        (codePoint >> 0xC) | 0xE0,
        ((codePoint >> 0x6) & 0x3F) | 0x80,
        (codePoint & 0x3F) | 0x80
      )
    } else if (codePoint < 0x110000) {
      if ((units -= 4) < 0) break
      bytes.push(
        (codePoint >> 0x12) | 0xF0,
        ((codePoint >> 0xC) & 0x3F) | 0x80,
        ((codePoint >> 0x6) & 0x3F) | 0x80,
        (codePoint & 0x3F) | 0x80
      )
    } else {
      throw new Error('Invalid code point')
    }
  }

  // A lead is still pending only when the string ended with a lead that
  // directly followed another lead; emit its replacement character too.
  // Every `break` above happens after leadSurrogate was reset.
  if (leadSurrogate && (units -= 3) > -1) bytes.push(0xEF, 0xBF, 0xBD)

  return bytes
}
2073-
// Turn a base64 string into its bytes: the input is first passed through
// base64clean, and the result is decoded with base64.toByteArray.
function base64ToBytes (str) {
  const cleaned = base64clean(str)
  return base64.toByteArray(cleaned)
}
20772131
// Store the three UTF-8 bytes of U+FFFD (Replacement Character) into `buf`
// beginning at index `pos`, and return the index just past the last byte.
function writeInvalid (buf, pos) {
  buf[pos] = 0xef
  buf[pos + 1] = 0xbf
  buf[pos + 2] = 0xbd
  return pos + 3
}
2139+
20782140function blitBuffer ( src , dst , offset , length ) {
20792141 let i
20802142 for ( i = 0 ; i < length ; ++ i ) {
0 commit comments