Skip to content

Commit 40cacbb

Browse files
committed
perf: make encoding-browser 2x smaller
1 parent d4afaf3 commit 40cacbb

File tree

6 files changed

+259
-65
lines changed

6 files changed

+259
-65
lines changed

encoding-browser.browser.js

Lines changed: 40 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,4 @@
1-
import {
2-
fromSource,
3-
getBOMEncoding,
4-
normalizeEncoding,
5-
E_ENCODING,
6-
} from './fallback/encoding.api.js'
7-
import labels from './fallback/encoding.labels.js'
1+
import { getBOMEncoding } from './fallback/encoding.api.js'
82

93
// Lightweight version which re-exports existing implementations on browsers,
104
// while still being aliased to the full impl in RN and Node.js
@@ -13,17 +7,46 @@ import labels from './fallback/encoding.labels.js'
137

148
const { TextDecoder, TextEncoder, TextDecoderStream, TextEncoderStream } = globalThis
159

16-
export { normalizeEncoding, getBOMEncoding, labelToName } from './fallback/encoding.api.js'
10+
export { getBOMEncoding } from './fallback/encoding.api.js'
1711
export { TextDecoder, TextEncoder, TextDecoderStream, TextEncoderStream }
1812

19-
// https://encoding.spec.whatwg.org/#decode
13+
/**
 * Resolve an encoding label to its lowercase encoding name, delegating the
 * label table lookup to the platform TextDecoder.
 *
 * @param {string} label - Encoding label; matched case-insensitively, surrounding ASCII whitespace is ignored.
 * @returns {string|null} Lowercase encoding name, or `null` when the label is not recognized.
 */
export function normalizeEncoding(label) {
  // fast path for the most common labels
  if (label === 'utf-8' || label === 'utf8' || label === 'UTF-8' || label === 'UTF8') return 'utf-8'
  if (label === 'windows-1252' || label === 'ascii' || label === 'latin1') return 'windows-1252'

  // Labels must be ASCII (with ASCII whitespace). Rejecting everything else up front also keeps
  // toLowerCase() below from folding non-ASCII characters into ASCII letters.
  if (/[^\w\t\n\f\r .:-]/i.test(label)) return null
  const l = `${label}`.trim().toLowerCase()

  // The TextDecoder constructor throws for the replacement encoding, so its labels
  // (including 'replacement' itself) have to be recognized by hand.
  if (
    l === 'replacement' ||
    l === 'csiso2022kr' ||
    l === 'hz-gb-2312' ||
    l === 'iso-2022-cn' ||
    l === 'iso-2022-cn-ext' ||
    l === 'iso-2022-kr'
  ) {
    return 'replacement'
  }

  try {
    // Pass the stringified label: `new TextDecoder(undefined)` would silently default to utf-8.
    // The resolved name is exposed through the `encoding` getter (there is no `name` property).
    return new TextDecoder(l).encoding
  } catch {
    return null // not a label this platform knows
  }
}
33+
2034
export function legacyHookDecode(input, fallbackEncoding = 'utf-8') {
21-
let u8 = fromSource(input)
22-
const bomEncoding = getBOMEncoding(u8)
23-
if (bomEncoding) u8 = u8.subarray(bomEncoding === 'utf-8' ? 3 : 2)
24-
const enc = bomEncoding ?? normalizeEncoding(fallbackEncoding) // "the byte order mark is more authoritative than anything else"
25-
if (enc === 'utf-8') return new TextDecoder('utf-8', { ignoreBOM: true }).decode(u8) // fast path
26-
if (enc === 'replacement') return u8.byteLength > 0 ? '\uFFFD' : ''
27-
if (!Object.hasOwn(labels, enc)) throw new RangeError(E_ENCODING)
28-
return new TextDecoder(enc, { ignoreBOM: true }).decode(u8)
35+
const enc = getBOMEncoding(input) ?? normalizeEncoding(fallbackEncoding)
36+
if (enc === 'replacement') return input.byteLength > 0 ? '\uFFFD' : ''
37+
return new TextDecoder(enc).decode(input)
38+
}
39+
40+
/**
 * Map an encoding label to the case-sensitive spelling of its encoding name.
 *
 * @param {string} label - Encoding label, resolved through normalizeEncoding().
 * @returns {string|null} Cased encoding name; a falsy normalizeEncoding() result is returned unchanged.
 */
export function labelToName(label) {
  const name = normalizeEncoding(label)
  if (name === 'utf-8') return 'UTF-8' // fast path
  if (!name) return name

  // Names in these families are written fully uppercased
  switch (name.slice(0, 3)) {
    case 'utf':
    case 'iso':
    case 'koi':
    case 'euc':
    case 'ibm':
    case 'gbk':
      return name.toUpperCase()
  }

  // Two names use mixed case; everything else stays lowercase
  if (name === 'big5') return 'Big5'
  return name === 'shift_jis' ? 'Shift_JIS' : name
}

fallback/encoding.api.js

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,3 @@
1-
import labels from './encoding.labels.js'
2-
3-
let labelsMap
4-
5-
export const E_ENCODING = 'Unknown encoding'
6-
7-
// Warning: unlike whatwg-encoding, returns lowercased labels
8-
// Those are case-insensitive and that's how TextDecoder encoding getter normalizes them
9-
// https://encoding.spec.whatwg.org/#names-and-labels
10-
export function normalizeEncoding(label) {
11-
// fast path
12-
if (label === 'utf-8' || label === 'utf8' || label === 'UTF-8' || label === 'UTF8') return 'utf-8'
13-
if (label === 'windows-1252' || label === 'ascii' || label === 'latin1') return 'windows-1252'
14-
// full map
15-
if (/[^\w\t\n\f\r .:-]/i.test(label)) return null // must be ASCII (with ASCII whitespace)
16-
const low = `${label}`.trim().toLowerCase()
17-
if (Object.hasOwn(labels, low)) return low
18-
if (!labelsMap) {
19-
labelsMap = new Map()
20-
for (const [label, aliases] of Object.entries(labels)) {
21-
for (const alias of aliases) labelsMap.set(alias, label)
22-
}
23-
}
24-
25-
const mapped = labelsMap.get(low)
26-
if (mapped) return mapped
27-
return null
28-
}
29-
301
// TODO: make this more strict against Symbol.toStringTag
312
// Is not very significant though, anything faking Symbol.toStringTag could as well override
323
// prototypes, which is not something we protect against
@@ -65,17 +36,3 @@ export function getBOMEncoding(input) {
6536
if (u8[0] === 0xfe && u8[1] === 0xff) return 'utf-16be'
6637
return null
6738
}
68-
69-
const uppercasePrefixes = new Set(['utf', 'iso', 'koi', 'euc', 'ibm', 'gbk'])
70-
71-
// Unlike normalizeEncoding, case-sensitive
72-
// https://encoding.spec.whatwg.org/#names-and-labels
73-
export function labelToName(label) {
74-
const enc = normalizeEncoding(label)
75-
if (enc === 'utf-8') return 'UTF-8' // fast path
76-
if (!enc) return enc
77-
if (uppercasePrefixes.has(enc.slice(0, 3))) return enc.toUpperCase()
78-
if (enc === 'big5') return 'Big5'
79-
if (enc === 'shift_jis') return 'Shift_JIS'
80-
return enc
81-
}

fallback/encoding.js

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,56 @@ import { utf16toString, utf16toStringLoose } from '@exodus/bytes/utf16.js'
55
import { utf8fromStringLoose, utf8toString, utf8toStringLoose } from '@exodus/bytes/utf8.js'
66
import { createSinglebyteDecoder } from '@exodus/bytes/single-byte.js'
77
import labels from './encoding.labels.js'
8-
import { fromSource, getBOMEncoding, normalizeEncoding, E_ENCODING } from './encoding.api.js'
8+
import { fromSource, getBOMEncoding } from './encoding.api.js'
99
import { unfinishedBytes, mergePrefix } from './encoding.util.js'
1010

11-
export { labelToName, getBOMEncoding, normalizeEncoding } from './encoding.api.js'
11+
export { getBOMEncoding } from './encoding.api.js'
1212

13+
export const E_ENCODING = 'Unknown encoding'
1314
const E_MULTI = "import '@exodus/bytes/encoding.js' for legacy multi-byte encodings support"
1415
const E_OPTIONS = 'The "options" argument must be of type object'
1516
const replacementChar = '\uFFFD'
1617
const multibyteSet = new Set(['big5', 'euc-kr', 'euc-jp', 'iso-2022-jp', 'shift_jis', 'gbk', 'gb18030']) // prettier-ignore
1718
let createMultibyteDecoder, multibyteEncoder
1819

20+
let labelsMap
21+
// Warning: unlike whatwg-encoding, returns lowercased labels
22+
// Those are case-insensitive and that's how TextDecoder encoding getter normalizes them
23+
// https://encoding.spec.whatwg.org/#names-and-labels
24+
export function normalizeEncoding(label) {
  // Fast path: the most common spellings skip the label table entirely
  switch (label) {
    case 'utf-8':
    case 'utf8':
    case 'UTF-8':
    case 'UTF8':
      return 'utf-8'
    case 'windows-1252':
    case 'ascii':
    case 'latin1':
      return 'windows-1252'
  }

  // Full lookup: anything outside ASCII (plus ASCII whitespace) can not be a label
  const hasForbiddenChar = /[^\w\t\n\f\r .:-]/i.test(label)
  if (hasForbiddenChar) return null

  const key = `${label}`.trim().toLowerCase()
  if (Object.hasOwn(labels, key)) return key // already an encoding name

  // The alias -> name index is built on first use and cached at module level
  if (!labelsMap) {
    labelsMap = new Map()
    for (const [name, aliases] of Object.entries(labels)) {
      for (const alias of aliases) labelsMap.set(alias, name)
    }
  }

  return labelsMap.get(key) || null
}
43+
44+
const uppercasePrefixes = new Set(['utf', 'iso', 'koi', 'euc', 'ibm', 'gbk'])
45+
46+
// Unlike normalizeEncoding, case-sensitive
47+
// https://encoding.spec.whatwg.org/#names-and-labels
48+
export function labelToName(label) {
  const name = normalizeEncoding(label)
  if (!name) return name // unknown label: hand back the falsy result unchanged
  if (name === 'utf-8') return 'UTF-8' // fast path

  // Whole families of names are spelled in uppercase
  const family = name.slice(0, 3)
  if (uppercasePrefixes.has(family)) return name.toUpperCase()

  // Two names are mixed-case; the rest keep their lowercase form
  switch (name) {
    case 'big5':
      return 'Big5'
    case 'shift_jis':
      return 'Shift_JIS'
    default:
      return name
  }
}
57+
1958
export const isMultibyte = (enc) => multibyteSet.has(enc)
2059
export function setMultibyte(createDecoder, createEncoder) {
2160
createMultibyteDecoder = createDecoder

tests/encoding/browser.test.js

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
import {
2+
TextDecoder,
3+
TextEncoder,
4+
getBOMEncoding,
5+
legacyHookDecode,
6+
} from '@exodus/bytes/encoding-browser.js'
7+
import { fromHex } from '@exodus/bytes/hex.js'
8+
import { test, describe } from 'node:test'
9+
import { labels } from './fixtures/encodings.cjs'
10+
import unfinishedBytesFixtures from './fixtures/unfinishedBytes.js'
11+
12+
test('Unfinished bytes', (t) => {
13+
for (const [encoding, trail, u8] of unfinishedBytesFixtures) {
14+
const decoder = new TextDecoder(encoding)
15+
const a0 = decoder.decode(u8, { stream: true })
16+
const b0 = decoder.decode()
17+
const ab = new TextDecoder(encoding).decode(u8)
18+
const a1 = new TextDecoder(encoding).decode(u8.subarray(0, u8.length - trail))
19+
const b1 = new TextDecoder(encoding).decode(u8.subarray(u8.length - trail))
20+
t.assert.strictEqual(a0, a1)
21+
t.assert.strictEqual(b0, b1)
22+
t.assert.strictEqual(a0 + b0, ab)
23+
t.assert.strictEqual(decoder.decode(u8), ab) // reuse
24+
25+
if (trail === 0) {
26+
t.assert.strictEqual(a0, ab)
27+
t.assert.strictEqual(b0, '')
28+
}
29+
30+
if (trail === u8.length) {
31+
t.assert.strictEqual(a0, '')
32+
t.assert.strictEqual(b0, ab)
33+
}
34+
}
35+
})
36+
37+
test('String coercion', (t) => {
38+
const encoder = new TextEncoder()
39+
const map = [
40+
[{}, '[object Object]'],
41+
[null, 'null'],
42+
[undefined, 'undefined'],
43+
]
44+
45+
for (const [arg, string] of map) {
46+
const length = string.length
47+
const a = encoder.encode(string)
48+
t.assert.strictEqual(a.length, length)
49+
50+
const b = encoder.encode(arg)
51+
if (arg === undefined) {
52+
// undefined is special
53+
t.assert.strictEqual(b.length, 0)
54+
t.assert.deepStrictEqual(b, Uint8Array.of())
55+
} else {
56+
const b = encoder.encode(arg)
57+
t.assert.strictEqual(b.length, length)
58+
t.assert.deepStrictEqual(b, a)
59+
}
60+
61+
const c = new Uint8Array(20)
62+
t.assert.deepStrictEqual(encoder.encodeInto(arg, c), { read: length, written: length })
63+
t.assert.deepStrictEqual(c.subarray(0, length), a)
64+
}
65+
})
66+
67+
// https://encoding.spec.whatwg.org/#x-user-defined-decoder
68+
test('x-user-defined encoding', (t) => {
69+
const decoder = new TextDecoder('x-user-defined')
70+
for (let byte = 0; byte < 256; byte++) {
71+
const codePoint = byte >= 128 ? 0xf7_80 + byte - 0x80 : byte
72+
t.assert.strictEqual(decoder.decode(Uint8Array.of(byte)), String.fromCodePoint(codePoint))
73+
}
74+
})
75+
76+
// iso-8859-1, iso-8859-9, iso-8859-11 differ in WHATWG Encoding spec from https://unicode.org/Public/MAPPINGS/ISO8859
77+
// and map to windows-1252, windows-1254, windows-874 instead
78+
test('not all ISO-8859 encodings are present in TextDecoder', (t) => {
79+
t.assert.strictEqual(new TextDecoder('iso-8859-1').encoding, 'windows-1252')
80+
t.assert.strictEqual(new TextDecoder('iso-8859-2').encoding, 'iso-8859-2') // present
81+
t.assert.strictEqual(new TextDecoder('iso-8859-9').encoding, 'windows-1254')
82+
t.assert.strictEqual(new TextDecoder('iso-8859-11').encoding, 'windows-874')
83+
t.assert.throws(() => new TextDecoder('iso-8859-12'))
84+
t.assert.strictEqual(new TextDecoder('iso-8859-13').encoding, 'iso-8859-13') // present
85+
})
86+
87+
describe('encodings are ASCII supersets, except utf-16 and iso-2022-jp', () => {
88+
for (const label of labels) {
89+
if (label === 'replacement' || label === 'utf-16le' || label === 'utf-16be') continue
90+
test(label, (t) => {
91+
const loose = new TextDecoder(label)
92+
const fatal = new TextDecoder(label, { fatal: true })
93+
for (let i = 0; i < 128; i++) {
94+
if (label === 'iso-2022-jp' && [0x0e, 0x0f, 0x1b].includes(i)) continue
95+
t.assert.strictEqual(loose.decode(Uint8Array.of(i)), String.fromCodePoint(i))
96+
t.assert.strictEqual(fatal.decode(Uint8Array.of(i)), String.fromCodePoint(i))
97+
}
98+
})
99+
}
100+
})
101+
102+
describe('legacyHookDecode', () => {
103+
const fixtures = {
104+
replacement: [
105+
['', ''],
106+
['00', '\uFFFD'],
107+
['ff', '\uFFFD'],
108+
['20', '\uFFFD'],
109+
['2020', '\uFFFD'],
110+
// BOM takes preference
111+
['efbbbf', ''],
112+
['efbbbf2a', '*'],
113+
['efbbbf202a', ' *'],
114+
['fffe', ''],
115+
['fffe2a20', '\u202A'],
116+
['fffe2a', '\uFFFD'],
117+
['fffe00d72a', '\uD700\uFFFD'],
118+
['fffe00d82a', '\uFFFD'],
119+
['fffe00dc2a', '\uFFFD\uFFFD'],
120+
['feff', ''],
121+
['feff202a', '\u202A'],
122+
['feff20', '\uFFFD'],
123+
['feffd70020', '\uD700\uFFFD'],
124+
['feffd80020', '\uFFFD'],
125+
['feffdc0020', '\uFFFD\uFFFD'],
126+
],
127+
// non-normalized names
128+
Utf8: [['c280', '\x80']],
129+
unicodefeff: [['c280', '\u80C2']],
130+
UnicodeFFFE: [['c280', '\uC280']],
131+
}
132+
133+
test('null encoding', (t) => {
134+
t.assert.throws(() => legacyHookDecode(Uint8Array.of(), null), RangeError)
135+
})
136+
137+
for (const [encoding, data] of Object.entries(fixtures)) {
138+
test(encoding, (t) => {
139+
for (const [hex, string] of data) {
140+
t.assert.strictEqual(legacyHookDecode(fromHex(hex), encoding), string, `${hex}`)
141+
}
142+
})
143+
}
144+
})
145+
146+
test('getBOMEncoding', (t) => {
147+
const fixtures = [
148+
[null, ''],
149+
[null, 'ff'],
150+
[null, 'fe'],
151+
[null, 'ef'],
152+
[null, 'efbb'],
153+
[null, 'efbb00'],
154+
[null, 'efbfbb'],
155+
[null, 'ffbbbf'],
156+
['utf-8', 'efbbbf'],
157+
['utf-8', 'efbbbf00'],
158+
['utf-16le', 'fffe'],
159+
['utf-16le', 'fffefffe'],
160+
['utf-16le', 'fffefffefffe'],
161+
['utf-16le', 'fffebb'],
162+
['utf-16le', 'fffebf'],
163+
['utf-16be', 'feff'],
164+
['utf-16be', 'fefffeff'],
165+
['utf-16be', 'fefffefffeff'],
166+
]
167+
168+
for (const [enc, hex] of fixtures) {
169+
t.assert.strictEqual(getBOMEncoding(fromHex(hex)), enc, `${hex} -> ${enc}`)
170+
}
171+
})

tests/vendor/whatwg-encoding/whatwg-encoding-mock.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import * as api from '@exodus/bytes/encoding.js'
1+
import * as api from '@exodus/bytes/encoding-browser.js'
22

33
// prettier-ignore
44
const supported = new Set([

whatwg.js

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
import { utf8fromStringLoose } from '@exodus/bytes/utf8.js'
22
import { createSinglebyteEncoder } from '@exodus/bytes/single-byte.js'
3-
import { isMultibyte, getMultibyteEncoder } from './fallback/encoding.js'
4-
import { normalizeEncoding, E_ENCODING } from './fallback/encoding.api.js'
3+
import {
4+
isMultibyte,
5+
getMultibyteEncoder,
6+
normalizeEncoding,
7+
E_ENCODING,
8+
} from './fallback/encoding.js'
59
import { percentEncoder } from './fallback/percent.js'
610
import { encodeMap } from './fallback/single-byte.js'
711
import { E_STRING } from './fallback/_utils.js'

0 commit comments

Comments
 (0)