Skip to content

Commit 9477323

Browse files
authored
perf: make encoding-browser 2x smaller (#60)
* perf: make encoding-browser 2x smaller
1 parent 39f38dd commit 9477323

7 files changed

Lines changed: 245 additions & 66 deletions

File tree

encoding-browser.browser.js

Lines changed: 43 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,4 @@
1-
import {
2-
fromSource,
3-
getBOMEncoding,
4-
normalizeEncoding,
5-
E_ENCODING,
6-
} from './fallback/encoding.api.js'
7-
import labels from './fallback/encoding.labels.js'
1+
import { getBOMEncoding } from './fallback/encoding.api.js'
82

93
// Lightweight version that re-exports the existing implementations in browsers,
104
// while still being aliased to the full impl in RN and Node.js
@@ -13,17 +7,49 @@ import labels from './fallback/encoding.labels.js'
137

148
const { TextDecoder, TextEncoder, TextDecoderStream, TextEncoderStream } = globalThis
159

16-
export { normalizeEncoding, getBOMEncoding, labelToName } from './fallback/encoding.api.js'
10+
export { getBOMEncoding } from './fallback/encoding.api.js'
1711
export { TextDecoder, TextEncoder, TextDecoderStream, TextEncoderStream }
1812

19-
// https://encoding.spec.whatwg.org/#decode
13+
export function normalizeEncoding(label) {
14+
if (label === 'utf-8' || label === 'utf8' || label === 'UTF-8' || label === 'UTF8') return 'utf-8'
15+
if (label === 'windows-1252' || label === 'ascii' || label === 'latin1') return 'windows-1252'
16+
if (/[^\w\t\n\f\r .:-]/i.test(label)) return null
17+
const l = `${label}`.trim().toLowerCase()
18+
try {
19+
return new TextDecoder(l).encoding
20+
} catch {}
21+
22+
if (l === 'x-user-defined') return l
23+
if (
24+
l === 'replacement' ||
25+
l === 'csiso2022kr' ||
26+
l === 'hz-gb-2312' ||
27+
l === 'iso-2022-cn' ||
28+
l === 'iso-2022-cn-ext' ||
29+
l === 'iso-2022-kr'
30+
) {
31+
return 'replacement'
32+
}
33+
34+
return null
35+
}
36+
2037
export function legacyHookDecode(input, fallbackEncoding = 'utf-8') {
21-
let u8 = fromSource(input)
22-
const bomEncoding = getBOMEncoding(u8)
23-
if (bomEncoding) u8 = u8.subarray(bomEncoding === 'utf-8' ? 3 : 2)
24-
const enc = bomEncoding ?? normalizeEncoding(fallbackEncoding) // "the byte order mark is more authoritative than anything else"
25-
if (enc === 'utf-8') return new TextDecoder('utf-8', { ignoreBOM: true }).decode(u8) // fast path
26-
if (enc === 'replacement') return u8.byteLength > 0 ? '\uFFFD' : ''
27-
if (!Object.hasOwn(labels, enc)) throw new RangeError(E_ENCODING)
28-
return new TextDecoder(enc, { ignoreBOM: true }).decode(u8)
38+
const enc = getBOMEncoding(input) ?? normalizeEncoding(fallbackEncoding)
39+
if (enc === 'replacement') return input.byteLength > 0 ? '\uFFFD' : ''
40+
return new TextDecoder(enc).decode(input)
41+
}
42+
43+
export function labelToName(label) {
44+
const enc = normalizeEncoding(label)
45+
if (enc === 'utf-8') return 'UTF-8'
46+
if (!enc) return enc
47+
const p = enc.slice(0, 3)
48+
if (p === 'utf' || p === 'iso' || p === 'koi' || p === 'euc' || p === 'ibm' || p === 'gbk') {
49+
return enc.toUpperCase()
50+
}
51+
52+
if (enc === 'big5') return 'Big5'
53+
if (enc === 'shift_jis') return 'Shift_JIS'
54+
return enc
2955
}

fallback/encoding.api.js

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,3 @@
1-
import labels from './encoding.labels.js'
2-
3-
let labelsMap
4-
5-
export const E_ENCODING = 'Unknown encoding'
6-
7-
// Warning: unlike whatwg-encoding, returns lowercased labels
8-
// Those are case-insensitive and that's how TextDecoder encoding getter normalizes them
9-
// https://encoding.spec.whatwg.org/#names-and-labels
10-
export function normalizeEncoding(label) {
11-
// fast path
12-
if (label === 'utf-8' || label === 'utf8' || label === 'UTF-8' || label === 'UTF8') return 'utf-8'
13-
if (label === 'windows-1252' || label === 'ascii' || label === 'latin1') return 'windows-1252'
14-
// full map
15-
if (/[^\w\t\n\f\r .:-]/i.test(label)) return null // must be ASCII (with ASCII whitespace)
16-
const low = `${label}`.trim().toLowerCase()
17-
if (Object.hasOwn(labels, low)) return low
18-
if (!labelsMap) {
19-
labelsMap = new Map()
20-
for (const [label, aliases] of Object.entries(labels)) {
21-
for (const alias of aliases) labelsMap.set(alias, label)
22-
}
23-
}
24-
25-
const mapped = labelsMap.get(low)
26-
if (mapped) return mapped
27-
return null
28-
}
29-
301
// TODO: make this more strict against Symbol.toStringTag
312
// Is not very significant though, anything faking Symbol.toStringTag could as well override
323
// prototypes, which is not something we protect against
@@ -65,17 +36,3 @@ export function getBOMEncoding(input) {
6536
if (u8[0] === 0xfe && u8[1] === 0xff) return 'utf-16be'
6637
return null
6738
}
68-
69-
const uppercasePrefixes = new Set(['utf', 'iso', 'koi', 'euc', 'ibm', 'gbk'])
70-
71-
// Unlike normalizeEncoding, case-sensitive
72-
// https://encoding.spec.whatwg.org/#names-and-labels
73-
export function labelToName(label) {
74-
const enc = normalizeEncoding(label)
75-
if (enc === 'utf-8') return 'UTF-8' // fast path
76-
if (!enc) return enc
77-
if (uppercasePrefixes.has(enc.slice(0, 3))) return enc.toUpperCase()
78-
if (enc === 'big5') return 'Big5'
79-
if (enc === 'shift_jis') return 'Shift_JIS'
80-
return enc
81-
}

fallback/encoding.js

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,56 @@ import { utf16toString, utf16toStringLoose } from '@exodus/bytes/utf16.js'
55
import { utf8fromStringLoose, utf8toString, utf8toStringLoose } from '@exodus/bytes/utf8.js'
66
import { createSinglebyteDecoder } from '@exodus/bytes/single-byte.js'
77
import labels from './encoding.labels.js'
8-
import { fromSource, getBOMEncoding, normalizeEncoding, E_ENCODING } from './encoding.api.js'
8+
import { fromSource, getBOMEncoding } from './encoding.api.js'
99
import { unfinishedBytes, mergePrefix } from './encoding.util.js'
1010

11-
export { labelToName, getBOMEncoding, normalizeEncoding } from './encoding.api.js'
11+
export { getBOMEncoding } from './encoding.api.js'
1212

13+
export const E_ENCODING = 'Unknown encoding'
1314
const E_MULTI = "import '@exodus/bytes/encoding.js' for legacy multi-byte encodings support"
1415
const E_OPTIONS = 'The "options" argument must be of type object'
1516
const replacementChar = '\uFFFD'
1617
const multibyteSet = new Set(['big5', 'euc-kr', 'euc-jp', 'iso-2022-jp', 'shift_jis', 'gbk', 'gb18030']) // prettier-ignore
1718
let createMultibyteDecoder, multibyteEncoder
1819

20+
let labelsMap
21+
// Warning: unlike whatwg-encoding, returns lowercased labels
22+
// Those are case-insensitive and that's how TextDecoder encoding getter normalizes them
23+
// https://encoding.spec.whatwg.org/#names-and-labels
24+
export function normalizeEncoding(label) {
25+
// fast path
26+
if (label === 'utf-8' || label === 'utf8' || label === 'UTF-8' || label === 'UTF8') return 'utf-8'
27+
if (label === 'windows-1252' || label === 'ascii' || label === 'latin1') return 'windows-1252'
28+
// full map
29+
if (/[^\w\t\n\f\r .:-]/i.test(label)) return null // must be ASCII (with ASCII whitespace)
30+
const low = `${label}`.trim().toLowerCase()
31+
if (Object.hasOwn(labels, low)) return low
32+
if (!labelsMap) {
33+
labelsMap = new Map()
34+
for (const [name, aliases] of Object.entries(labels)) {
35+
for (const alias of aliases) labelsMap.set(alias, name)
36+
}
37+
}
38+
39+
const mapped = labelsMap.get(low)
40+
if (mapped) return mapped
41+
return null
42+
}
43+
44+
const uppercasePrefixes = new Set(['utf', 'iso', 'koi', 'euc', 'ibm', 'gbk'])
45+
46+
// Unlike normalizeEncoding, returns the case-sensitive encoding name (e.g. 'UTF-8', 'Big5', 'Shift_JIS')
47+
// https://encoding.spec.whatwg.org/#names-and-labels
48+
export function labelToName(label) {
49+
const enc = normalizeEncoding(label)
50+
if (enc === 'utf-8') return 'UTF-8' // fast path
51+
if (!enc) return enc
52+
if (uppercasePrefixes.has(enc.slice(0, 3))) return enc.toUpperCase()
53+
if (enc === 'big5') return 'Big5'
54+
if (enc === 'shift_jis') return 'Shift_JIS'
55+
return enc
56+
}
57+
1958
export const isMultibyte = (enc) => multibyteSet.has(enc)
2059
export function setMultibyte(createDecoder, createEncoder) {
2160
createMultibyteDecoder = createDecoder

tests/encoding/browser.test.js

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
import {
2+
TextDecoder,
3+
TextEncoder,
4+
getBOMEncoding,
5+
legacyHookDecode,
6+
} from '@exodus/bytes/encoding-browser.js'
7+
import { fromHex } from '@exodus/bytes/hex.js'
8+
import { test, describe } from 'node:test'
9+
import unfinishedBytesFixtures from './fixtures/unfinishedBytes.js'
10+
11+
test('Unfinished bytes', (t) => {
12+
for (const [encoding, trail, u8] of unfinishedBytesFixtures) {
13+
const decoder = new TextDecoder(encoding)
14+
const a0 = decoder.decode(u8, { stream: true })
15+
const b0 = decoder.decode()
16+
const ab = new TextDecoder(encoding).decode(u8)
17+
const a1 = new TextDecoder(encoding).decode(u8.subarray(0, u8.length - trail))
18+
const b1 = new TextDecoder(encoding).decode(u8.subarray(u8.length - trail))
19+
t.assert.strictEqual(a0, a1)
20+
t.assert.strictEqual(b0, b1)
21+
t.assert.strictEqual(a0 + b0, ab)
22+
t.assert.strictEqual(decoder.decode(u8), ab) // reuse
23+
24+
if (trail === 0) {
25+
t.assert.strictEqual(a0, ab)
26+
t.assert.strictEqual(b0, '')
27+
}
28+
29+
if (trail === u8.length) {
30+
t.assert.strictEqual(a0, '')
31+
t.assert.strictEqual(b0, ab)
32+
}
33+
}
34+
})
35+
36+
test('String coercion', (t) => {
37+
const encoder = new TextEncoder()
38+
const map = [
39+
[{}, '[object Object]'],
40+
[null, 'null'],
41+
[undefined, 'undefined'],
42+
]
43+
44+
for (const [arg, string] of map) {
45+
const length = string.length
46+
const a = encoder.encode(string)
47+
t.assert.strictEqual(a.length, length)
48+
49+
const b = encoder.encode(arg)
50+
if (arg === undefined) {
51+
// undefined is special
52+
t.assert.strictEqual(b.length, 0)
53+
t.assert.deepStrictEqual(b, Uint8Array.of())
54+
} else {
55+
t.assert.strictEqual(b.length, length)
56+
t.assert.deepStrictEqual(b, a)
57+
}
58+
59+
const c = new Uint8Array(20)
60+
t.assert.deepStrictEqual(encoder.encodeInto(arg, c), { read: length, written: length })
61+
t.assert.deepStrictEqual(c.subarray(0, length), a)
62+
}
63+
})
64+
65+
// https://encoding.spec.whatwg.org/#x-user-defined-decoder
66+
test('x-user-defined encoding', (t) => {
67+
const decoder = new TextDecoder('x-user-defined')
68+
for (let byte = 0; byte < 256; byte++) {
69+
const codePoint = byte >= 128 ? 0xf7_80 + byte - 0x80 : byte
70+
t.assert.strictEqual(decoder.decode(Uint8Array.of(byte)), String.fromCodePoint(codePoint))
71+
}
72+
})
73+
74+
// iso-8859-1, iso-8859-9, iso-8859-11 differ in WHATWG Encoding spec from https://unicode.org/Public/MAPPINGS/ISO8859
75+
// and map to windows-1252, windows-1254, windows-874 instead
76+
test('not all ISO-8859 encodings are present in TextDecoder', (t) => {
77+
t.assert.strictEqual(new TextDecoder('iso-8859-1').encoding, 'windows-1252')
78+
t.assert.strictEqual(new TextDecoder('iso-8859-2').encoding, 'iso-8859-2') // present
79+
t.assert.strictEqual(new TextDecoder('iso-8859-9').encoding, 'windows-1254')
80+
t.assert.strictEqual(new TextDecoder('iso-8859-11').encoding, 'windows-874')
81+
t.assert.throws(() => new TextDecoder('iso-8859-12'))
82+
t.assert.strictEqual(new TextDecoder('iso-8859-13').encoding, 'iso-8859-13') // present
83+
})
84+
85+
describe('legacyHookDecode', () => {
86+
const fixtures = {
87+
replacement: [
88+
['', ''],
89+
['00', '\uFFFD'],
90+
['ff', '\uFFFD'],
91+
['20', '\uFFFD'],
92+
['2020', '\uFFFD'],
93+
// BOM takes precedence
94+
['efbbbf', ''],
95+
['efbbbf2a', '*'],
96+
['efbbbf202a', ' *'],
97+
['fffe', ''],
98+
['fffe2a20', '\u202A'],
99+
['fffe2a', '\uFFFD'],
100+
['fffe00d72a', '\uD700\uFFFD'],
101+
['fffe00d82a', '\uFFFD'],
102+
['fffe00dc2a', '\uFFFD\uFFFD'],
103+
['feff', ''],
104+
['feff202a', '\u202A'],
105+
['feff20', '\uFFFD'],
106+
['feffd70020', '\uD700\uFFFD'],
107+
['feffd80020', '\uFFFD'],
108+
['feffdc0020', '\uFFFD\uFFFD'],
109+
],
110+
// non-normalized names
111+
Utf8: [['c280', '\x80']],
112+
unicodefeff: [['c280', '\u80C2']],
113+
UnicodeFFFE: [['c280', '\uC280']],
114+
}
115+
116+
test('null encoding', (t) => {
117+
t.assert.throws(() => legacyHookDecode(Uint8Array.of(), null), RangeError)
118+
})
119+
120+
for (const [encoding, data] of Object.entries(fixtures)) {
121+
test(encoding, (t) => {
122+
for (const [hex, string] of data) {
123+
t.assert.strictEqual(legacyHookDecode(fromHex(hex), encoding), string, `${hex}`)
124+
}
125+
})
126+
}
127+
})
128+
129+
test('getBOMEncoding', (t) => {
130+
const fixtures = [
131+
[null, ''],
132+
[null, 'ff'],
133+
[null, 'fe'],
134+
[null, 'ef'],
135+
[null, 'efbb'],
136+
[null, 'efbb00'],
137+
[null, 'efbfbb'],
138+
[null, 'ffbbbf'],
139+
['utf-8', 'efbbbf'],
140+
['utf-8', 'efbbbf00'],
141+
['utf-16le', 'fffe'],
142+
['utf-16le', 'fffefffe'],
143+
['utf-16le', 'fffefffefffe'],
144+
['utf-16le', 'fffebb'],
145+
['utf-16le', 'fffebf'],
146+
['utf-16be', 'feff'],
147+
['utf-16be', 'fefffeff'],
148+
['utf-16be', 'fefffefffeff'],
149+
]
150+
151+
for (const [enc, hex] of fixtures) {
152+
t.assert.strictEqual(getBOMEncoding(fromHex(hex)), enc, `${hex} -> ${enc}`)
153+
}
154+
})

tests/encoding/generic.test.js

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ test('String coercion', (t) => {
5353
t.assert.strictEqual(b.length, 0)
5454
t.assert.deepStrictEqual(b, Uint8Array.of())
5555
} else {
56-
const b = encoder.encode(arg)
5756
t.assert.strictEqual(b.length, length)
5857
t.assert.deepStrictEqual(b, a)
5958
}

tests/vendor/whatwg-encoding/whatwg-encoding-mock.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import * as api from '@exodus/bytes/encoding.js'
1+
import * as api from '@exodus/bytes/encoding-browser.js'
22

33
// prettier-ignore
44
const supported = new Set([

whatwg.js

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
import { utf8fromStringLoose } from '@exodus/bytes/utf8.js'
22
import { createSinglebyteEncoder } from '@exodus/bytes/single-byte.js'
3-
import { isMultibyte, getMultibyteEncoder } from './fallback/encoding.js'
4-
import { normalizeEncoding, E_ENCODING } from './fallback/encoding.api.js'
3+
import {
4+
isMultibyte,
5+
getMultibyteEncoder,
6+
normalizeEncoding,
7+
E_ENCODING,
8+
} from './fallback/encoding.js'
59
import { percentEncoder } from './fallback/percent.js'
610
import { encodeMap } from './fallback/single-byte.js'
711
import { E_STRING } from './fallback/_utils.js'

0 commit comments

Comments
 (0)