Skip to content

Commit 633dc05

Browse files
committed
Optimize utf8Write
1 parent 04efaf9 commit 633dc05

1 file changed

Lines changed: 145 additions & 83 deletions

File tree

index.js

Lines changed: 145 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -475,7 +475,7 @@ function byteLength (string, encoding) {
475475
return len
476476
case 'utf8':
477477
case 'utf-8':
478-
return utf8ToBytes(string).length
478+
return utf8ByteLength(string)
479479
case 'ucs2':
480480
case 'ucs-2':
481481
case 'utf16le':
@@ -487,7 +487,7 @@ function byteLength (string, encoding) {
487487
return base64ToBytes(string).length
488488
default:
489489
if (loweredCase) {
490-
return mustMatch ? -1 : utf8ToBytes(string).length // assume utf8
490+
return mustMatch ? -1 : utf8ByteLength(string) // assume utf8
491491
}
492492
encoding = ('' + encoding).toLowerCase()
493493
loweredCase = true
@@ -865,7 +865,141 @@ function hexWrite (buf, string, offset, length) {
865865
}
866866

867867
/**
 * Encodes `string` as UTF-8 directly into `buf`, starting at `offset`.
 *
 * At most `length` bytes are written, and never more than the space left in
 * `buf` after `offset`. Encoding stops at the first character whose full
 * byte sequence no longer fits, so a multi-byte sequence is never truncated.
 * Every unpaired surrogate (a lead not immediately followed by a trail, or a
 * trail not preceded by a lead) is written as U+FFFD.
 *
 * @param {Uint8Array} buf - destination buffer
 * @param {string} string - text to encode
 * @param {number} offset - index in `buf` of the first byte to write
 * @param {number} length - maximum number of bytes to write
 * @returns {number} number of bytes written
 */
function utf8Write (buf, string, offset, length) {
  const end = string.length
  // `length` is a request; the space actually left in `buf` is the hard limit
  let remaining = Math.min(length, buf.length - offset)
  let pos = offset

  for (let i = 0; i < end; i++) {
    let codePoint = string.charCodeAt(i)

    // is surrogate component
    if (codePoint > 0xd7ff && codePoint < 0xe000) {
      // Only a lead that is immediately followed by a trail forms a pair.
      // Looking ahead (instead of remembering the lead for the next
      // iteration) guarantees a lead at the very end is never forgotten.
      const trail = codePoint < 0xdc00 && i + 1 < end
        ? string.charCodeAt(i + 1)
        : 0

      if (trail < 0xdc00 || trail > 0xdfff) {
        // unpaired lead or unexpected trail
        if (remaining < 3) break
        pos = writeInvalid(buf, pos)
        remaining -= 3
        continue
      }

      // valid surrogate pair: combine both halves and consume the trail
      codePoint = (((codePoint - 0xd800) << 10) | (trail - 0xdc00)) + 0x10000
      i++
    }

    // encode utf8
    if (codePoint < 0x80) {
      if (remaining < 1) break
      buf[pos++] = codePoint
      remaining -= 1
    } else if (codePoint < 0x800) {
      if (remaining < 2) break
      buf[pos++] = (codePoint >> 6) | 0xc0
      buf[pos++] = (codePoint & 0x3f) | 0x80
      remaining -= 2
    } else if (codePoint < 0x10000) {
      if (remaining < 3) break
      buf[pos++] = (codePoint >> 12) | 0xe0
      buf[pos++] = ((codePoint >> 6) & 0x3f) | 0x80
      buf[pos++] = (codePoint & 0x3f) | 0x80
      remaining -= 3
    } else {
      // a combined surrogate pair is always within 0x10000..0x10ffff
      if (remaining < 4) break
      buf[pos++] = (codePoint >> 18) | 0xf0
      buf[pos++] = ((codePoint >> 12) & 0x3f) | 0x80
      buf[pos++] = ((codePoint >> 6) & 0x3f) | 0x80
      buf[pos++] = (codePoint & 0x3f) | 0x80
      remaining -= 4
    }
  }

  return pos - offset
}
948+
949+
/**
 * Computes the number of bytes `string` occupies when encoded as UTF-8,
 * without materialising the encoded bytes.
 *
 * A lead surrogate immediately followed by a trail surrogate counts as one
 * 4-byte code point. Every other surrogate is unpaired and counts as U+FFFD
 * (3 bytes), including a lead surrogate at the very end of the string.
 *
 * @param {string} string - text to measure
 * @returns {number} UTF-8 byte length of `string`
 */
function utf8ByteLength (string) {
  const end = string.length
  let size = 0

  for (let i = 0; i < end; i++) {
    const unit = string.charCodeAt(i)

    if (unit < 0x80) {
      size += 1
    } else if (unit < 0x800) {
      size += 2
    } else if (unit > 0xd7ff && unit < 0xdc00 && i + 1 < end) {
      // lead surrogate with a successor: a pair only if the successor is a trail
      const next = string.charCodeAt(i + 1)
      if (next > 0xdbff && next < 0xe000) {
        // valid surrogate pair: one 4-byte code point, consume the trail
        size += 4
        i++
      } else {
        // unpaired lead; the successor is measured on its own next iteration
        size += 3
      }
    } else {
      // remaining BMP characters, unexpected trails and a lead at end of input
      size += 3
    }
  }

  return size
}
8701004

8711005
function asciiWrite (buf, string, offset, length) {
@@ -1991,90 +2125,18 @@ function base64clean (str) {
19912125
return str
19922126
}
19932127

1994-
/**
 * Encodes `string` as UTF-8 and returns the bytes as a plain array.
 *
 * Encoding stops at the first character whose full byte sequence would
 * exceed the `units` budget, so a multi-byte sequence is never truncated.
 * Every unpaired surrogate (a lead not immediately followed by a trail, or a
 * trail not preceded by a lead) is encoded as U+FFFD.
 *
 * @param {string} string - text to encode
 * @param {number} [units] - maximum number of bytes to produce; unlimited
 *   when omitted
 * @returns {number[]} the encoded bytes
 */
function utf8ToBytes (string, units) {
  // An explicit budget of 0 means "emit nothing"; only a missing (or
  // otherwise falsy) budget falls back to unlimited.
  let budget = units === 0 ? 0 : (units || Infinity)
  const length = string.length
  const bytes = []

  for (let i = 0; i < length; ++i) {
    let codePoint = string.charCodeAt(i)

    // is surrogate component
    if (codePoint > 0xD7FF && codePoint < 0xE000) {
      // Only a lead that is immediately followed by a trail forms a pair.
      // Looking ahead guarantees a lead at the very end is never forgotten.
      const trail = codePoint < 0xDC00 && i + 1 < length
        ? string.charCodeAt(i + 1)
        : 0

      if (trail > 0xDBFF && trail < 0xE000) {
        // valid surrogate pair: combine both halves and consume the trail
        codePoint = (codePoint - 0xD800 << 10 | trail - 0xDC00) + 0x10000
        ++i
      } else {
        // unpaired lead or unexpected trail: substitute U+FFFD, which the
        // 3-byte branch below encodes as EF BF BD
        codePoint = 0xFFFD
      }
    }

    // encode utf8
    if (codePoint < 0x80) {
      if ((budget -= 1) < 0) break
      bytes.push(codePoint)
    } else if (codePoint < 0x800) {
      if ((budget -= 2) < 0) break
      bytes.push(
        codePoint >> 0x6 | 0xC0,
        codePoint & 0x3F | 0x80
      )
    } else if (codePoint < 0x10000) {
      if ((budget -= 3) < 0) break
      bytes.push(
        codePoint >> 0xC | 0xE0,
        codePoint >> 0x6 & 0x3F | 0x80,
        codePoint & 0x3F | 0x80
      )
    } else {
      // a combined surrogate pair is always within 0x10000..0x10FFFF
      if ((budget -= 4) < 0) break
      bytes.push(
        codePoint >> 0x12 | 0xF0,
        codePoint >> 0xC & 0x3F | 0x80,
        codePoint >> 0x6 & 0x3F | 0x80,
        codePoint & 0x3F | 0x80
      )
    }
  }

  return bytes
}
2073-
20742128
// Decodes a base64 string into bytes: the input is first normalised by
// `base64clean`, then handed to `base64.toByteArray`.
function base64ToBytes (str) {
  const cleaned = base64clean(str)
  return base64.toByteArray(cleaned)
}
20772131

2132+
// Stores the UTF-8 encoding of U+FFFD (Replacement Character), the bytes
// EF BF BD, into `buf` at `pos` and returns the index just past them.
function writeInvalid (buf, pos) {
  buf[pos] = 0xef
  buf[pos + 1] = 0xbf
  buf[pos + 2] = 0xbd
  return pos + 3
}
2139+
20782140
function blitBuffer (src, dst, offset, length) {
20792141
let i
20802142
for (i = 0; i < length; ++i) {

0 commit comments

Comments
 (0)