perf: make encoding-browser 2x smaller

ChALkeR · ChALkeR · commit 40cacbb76790 · 2026-02-09T16:39:13.000+04:00
diff --git a/encoding-browser.browser.js b/encoding-browser.browser.js
@@ -1,10 +1,4 @@
-import {
-  fromSource,
-  getBOMEncoding,
-  normalizeEncoding,
-  E_ENCODING,
-} from './fallback/encoding.api.js'
-import labels from './fallback/encoding.labels.js'
+import { getBOMEncoding } from './fallback/encoding.api.js'
 
 // Lite-weight version which re-exports existing implementations on browsers,
 // while still being aliased to the full impl in RN and Node.js
@@ -13,17 +7,46 @@ import labels from './fallback/encoding.labels.js'
 
 const { TextDecoder, TextEncoder, TextDecoderStream, TextEncoderStream } = globalThis
 
-export { normalizeEncoding, getBOMEncoding, labelToName } from './fallback/encoding.api.js'
+export { getBOMEncoding } from './fallback/encoding.api.js'
 export { TextDecoder, TextEncoder, TextDecoderStream, TextEncoderStream }
 
-// https://encoding.spec.whatwg.org/#decode
+// Lowercased WHATWG encoding name for a label, or null if unknown. Resolved through the native
+// TextDecoder so that the labels table does not have to be bundled
+// https://encoding.spec.whatwg.org/#names-and-labels
+export function normalizeEncoding(label) {
+  // fast path
+  if (label === 'utf-8' || label === 'utf8' || label === 'UTF-8' || label === 'UTF8') return 'utf-8'
+  if (label === 'windows-1252' || label === 'ascii' || label === 'latin1') return 'windows-1252'
+  if (/[^\w\t\n\f\r .:-]/i.test(label)) return null // must be ASCII (with ASCII whitespace)
+  const l = `${label}`.trim().toLowerCase()
+  // Spec requires TextDecoder to throw on labels of the replacement encoding, so match them by hand
+  const replacement = ['replacement', 'csiso2022kr', 'hz-gb-2312', 'iso-2022-cn', 'iso-2022-cn-ext', 'iso-2022-kr'] // prettier-ignore
+  if (replacement.includes(l)) return 'replacement'
+  try {
+    // Passing the string form keeps undefined from selecting the default utf-8 decoder.
+    // The resolved name is exposed as .encoding, TextDecoder has no .name property
+    return new TextDecoder(l).encoding
+  } catch {
+    return null // TextDecoder rejected the label, so it is not an encoding this platform knows
+  }
+}
+
 export function legacyHookDecode(input, fallbackEncoding = 'utf-8') {
-  let u8 = fromSource(input)
-  const bomEncoding = getBOMEncoding(u8)
-  if (bomEncoding) u8 = u8.subarray(bomEncoding === 'utf-8' ? 3 : 2)
-  const enc = bomEncoding ?? normalizeEncoding(fallbackEncoding) // "the byte order mark is more authoritative than anything else"
-  if (enc === 'utf-8') return new TextDecoder('utf-8', { ignoreBOM: true }).decode(u8) // fast path
-  if (enc === 'replacement') return u8.byteLength > 0 ? '\uFFFD' : ''
-  if (!Object.hasOwn(labels, enc)) throw new RangeError(E_ENCODING)
-  return new TextDecoder(enc, { ignoreBOM: true }).decode(u8)
+  const enc = getBOMEncoding(input) ?? normalizeEncoding(fallbackEncoding)
+  if (enc === 'replacement') return input.byteLength > 0 ? '\uFFFD' : ''
+  return new TextDecoder(enc).decode(input)
+}
+
+export function labelToName(label) {
+  const enc = normalizeEncoding(label)
+  if (enc === 'utf-8') return 'UTF-8' // fast path
+  if (!enc) return enc
+  const p = enc.slice(0, 3)
+  if (p === 'utf' || p === 'iso' || p === 'koi' || p === 'euc' || p === 'ibm' || p === 'gbk') {
+    return enc.toUpperCase()
+  }
+
+  if (enc === 'big5') return 'Big5'
+  if (enc === 'shift_jis') return 'Shift_JIS'
+  return enc
 }
diff --git a/fallback/encoding.api.js b/fallback/encoding.api.js
@@ -1,32 +1,3 @@
-import labels from './encoding.labels.js'
-
-let labelsMap
-
-export const E_ENCODING = 'Unknown encoding'
-
-// Warning: unlike whatwg-encoding, returns lowercased labels
-// Those are case-insensitive and that's how TextDecoder encoding getter normalizes them
-// https://encoding.spec.whatwg.org/#names-and-labels
-export function normalizeEncoding(label) {
-  // fast path
-  if (label === 'utf-8' || label === 'utf8' || label === 'UTF-8' || label === 'UTF8') return 'utf-8'
-  if (label === 'windows-1252' || label === 'ascii' || label === 'latin1') return 'windows-1252'
-  // full map
-  if (/[^\w\t\n\f\r .:-]/i.test(label)) return null // must be ASCII (with ASCII whitespace)
-  const low = `${label}`.trim().toLowerCase()
-  if (Object.hasOwn(labels, low)) return low
-  if (!labelsMap) {
-    labelsMap = new Map()
-    for (const [label, aliases] of Object.entries(labels)) {
-      for (const alias of aliases) labelsMap.set(alias, label)
-    }
-  }
-
-  const mapped = labelsMap.get(low)
-  if (mapped) return mapped
-  return null
-}
-
 // TODO: make this more strict against Symbol.toStringTag
 // Is not very significant though, anything faking Symbol.toStringTag could as well override
 // prototypes, which is not something we protect against
@@ -65,17 +36,3 @@ export function getBOMEncoding(input) {
   if (u8[0] === 0xfe && u8[1] === 0xff) return 'utf-16be'
   return null
 }
-
-const uppercasePrefixes = new Set(['utf', 'iso', 'koi', 'euc', 'ibm', 'gbk'])
-
-// Unlike normalizeEncoding, case-sensitive
-// https://encoding.spec.whatwg.org/#names-and-labels
-export function labelToName(label) {
-  const enc = normalizeEncoding(label)
-  if (enc === 'utf-8') return 'UTF-8' // fast path
-  if (!enc) return enc
-  if (uppercasePrefixes.has(enc.slice(0, 3))) return enc.toUpperCase()
-  if (enc === 'big5') return 'Big5'
-  if (enc === 'shift_jis') return 'Shift_JIS'
-  return enc
-}
diff --git a/fallback/encoding.js b/fallback/encoding.js
@@ -5,17 +5,56 @@ import { utf16toString, utf16toStringLoose } from '@exodus/bytes/utf16.js'
 import { utf8fromStringLoose, utf8toString, utf8toStringLoose } from '@exodus/bytes/utf8.js'
 import { createSinglebyteDecoder } from '@exodus/bytes/single-byte.js'
 import labels from './encoding.labels.js'
-import { fromSource, getBOMEncoding, normalizeEncoding, E_ENCODING } from './encoding.api.js'
+import { fromSource, getBOMEncoding } from './encoding.api.js'
 import { unfinishedBytes, mergePrefix } from './encoding.util.js'
 
-export { labelToName, getBOMEncoding, normalizeEncoding } from './encoding.api.js'
+export { getBOMEncoding } from './encoding.api.js'
 
+export const E_ENCODING = 'Unknown encoding'
 const E_MULTI = "import '@exodus/bytes/encoding.js' for legacy multi-byte encodings support"
 const E_OPTIONS = 'The "options" argument must be of type object'
 const replacementChar = '\uFFFD'
 const multibyteSet = new Set(['big5', 'euc-kr', 'euc-jp', 'iso-2022-jp', 'shift_jis', 'gbk', 'gb18030']) // prettier-ignore
 let createMultibyteDecoder, multibyteEncoder
 
+let labelsMap
+// Warning: unlike whatwg-encoding, returns lowercased labels
+// Those are case-insensitive and that's how TextDecoder encoding getter normalizes them
+// https://encoding.spec.whatwg.org/#names-and-labels
+export function normalizeEncoding(label) {
+  // fast path
+  if (label === 'utf-8' || label === 'utf8' || label === 'UTF-8' || label === 'UTF8') return 'utf-8'
+  if (label === 'windows-1252' || label === 'ascii' || label === 'latin1') return 'windows-1252'
+  // full map
+  if (/[^\w\t\n\f\r .:-]/i.test(label)) return null // must be ASCII (with ASCII whitespace)
+  const low = `${label}`.trim().toLowerCase()
+  if (Object.hasOwn(labels, low)) return low
+  if (!labelsMap) {
+    labelsMap = new Map()
+    for (const [label, aliases] of Object.entries(labels)) {
+      for (const alias of aliases) labelsMap.set(alias, label)
+    }
+  }
+
+  const mapped = labelsMap.get(low)
+  if (mapped) return mapped
+  return null
+}
+
+const uppercasePrefixes = new Set(['utf', 'iso', 'koi', 'euc', 'ibm', 'gbk'])
+
+// Unlike normalizeEncoding, case-sensitive
+// https://encoding.spec.whatwg.org/#names-and-labels
+export function labelToName(label) {
+  const enc = normalizeEncoding(label)
+  if (enc === 'utf-8') return 'UTF-8' // fast path
+  if (!enc) return enc
+  if (uppercasePrefixes.has(enc.slice(0, 3))) return enc.toUpperCase()
+  if (enc === 'big5') return 'Big5'
+  if (enc === 'shift_jis') return 'Shift_JIS'
+  return enc
+}
+
 export const isMultibyte = (enc) => multibyteSet.has(enc)
 export function setMultibyte(createDecoder, createEncoder) {
   createMultibyteDecoder = createDecoder
diff --git a/tests/encoding/browser.test.js b/tests/encoding/browser.test.js
@@ -0,0 +1,171 @@
+import {
+  TextDecoder,
+  TextEncoder,
+  getBOMEncoding,
+  legacyHookDecode,
+} from '@exodus/bytes/encoding-browser.js'
+import { fromHex } from '@exodus/bytes/hex.js'
+import { test, describe } from 'node:test'
+import { labels } from './fixtures/encodings.cjs'
+import unfinishedBytesFixtures from './fixtures/unfinishedBytes.js'
+
+test('Unfinished bytes', (t) => {
+  for (const [encoding, trail, u8] of unfinishedBytesFixtures) {
+    const decoder = new TextDecoder(encoding)
+    const a0 = decoder.decode(u8, { stream: true })
+    const b0 = decoder.decode()
+    const ab = new TextDecoder(encoding).decode(u8)
+    const a1 = new TextDecoder(encoding).decode(u8.subarray(0, u8.length - trail))
+    const b1 = new TextDecoder(encoding).decode(u8.subarray(u8.length - trail))
+    t.assert.strictEqual(a0, a1)
+    t.assert.strictEqual(b0, b1)
+    t.assert.strictEqual(a0 + b0, ab)
+    t.assert.strictEqual(decoder.decode(u8), ab) // reuse
+
+    if (trail === 0) {
+      t.assert.strictEqual(a0, ab)
+      t.assert.strictEqual(b0, '')
+    }
+
+    if (trail === u8.length) {
+      t.assert.strictEqual(a0, '')
+      t.assert.strictEqual(b0, ab)
+    }
+  }
+})
+
+test('String coercion', (t) => {
+  const encoder = new TextEncoder()
+  const map = [
+    [{}, '[object Object]'],
+    [null, 'null'],
+    [undefined, 'undefined'],
+  ]
+
+  for (const [arg, string] of map) {
+    const length = string.length
+    const a = encoder.encode(string)
+    t.assert.strictEqual(a.length, length)
+
+    const b = encoder.encode(arg)
+    if (arg === undefined) {
+      // undefined is special
+      t.assert.strictEqual(b.length, 0)
+      t.assert.deepStrictEqual(b, Uint8Array.of())
+    } else {
+      const b = encoder.encode(arg)
+      t.assert.strictEqual(b.length, length)
+      t.assert.deepStrictEqual(b, a)
+    }
+
+    const c = new Uint8Array(20)
+    t.assert.deepStrictEqual(encoder.encodeInto(arg, c), { read: length, written: length })
+    t.assert.deepStrictEqual(c.subarray(0, length), a)
+  }
+})
+
+// https://encoding.spec.whatwg.org/#x-user-defined-decoder
+test('x-user-defined encoding', (t) => {
+  const decoder = new TextDecoder('x-user-defined')
+  for (let byte = 0; byte < 256; byte++) {
+    const codePoint = byte >= 128 ? 0xf7_80 + byte - 0x80 : byte
+    t.assert.strictEqual(decoder.decode(Uint8Array.of(byte)), String.fromCodePoint(codePoint))
+  }
+})
+
+// iso-8859-1, iso-8859-9, iso-8859-11 differ in WHATWG Encoding spec from https://unicode.org/Public/MAPPINGS/ISO8859
+// and map to windows-1252, windows-1254, windows-874 instead
+test('not all ISO-8859 encodings are present in TextDecoder', (t) => {
+  t.assert.strictEqual(new TextDecoder('iso-8859-1').encoding, 'windows-1252')
+  t.assert.strictEqual(new TextDecoder('iso-8859-2').encoding, 'iso-8859-2') // present
+  t.assert.strictEqual(new TextDecoder('iso-8859-9').encoding, 'windows-1254')
+  t.assert.strictEqual(new TextDecoder('iso-8859-11').encoding, 'windows-874')
+  t.assert.throws(() => new TextDecoder('iso-8859-12'))
+  t.assert.strictEqual(new TextDecoder('iso-8859-13').encoding, 'iso-8859-13') // present
+})
+
+describe('encodings are ASCII supersets, except utf-16 and iso-2022-jp', () => {
+  for (const label of labels) {
+    if (label === 'replacement' || label === 'utf-16le' || label === 'utf-16be') continue
+    test(label, (t) => {
+      const loose = new TextDecoder(label)
+      const fatal = new TextDecoder(label, { fatal: true })
+      for (let i = 0; i < 128; i++) {
+        if (label === 'iso-2022-jp' && [0x0e, 0x0f, 0x1b].includes(i)) continue
+        t.assert.strictEqual(loose.decode(Uint8Array.of(i)), String.fromCodePoint(i))
+        t.assert.strictEqual(fatal.decode(Uint8Array.of(i)), String.fromCodePoint(i))
+      }
+    })
+  }
+})
+
+describe('legacyHookDecode', () => {
+  const fixtures = {
+    replacement: [
+      ['', ''],
+      ['00', '\uFFFD'],
+      ['ff', '\uFFFD'],
+      ['20', '\uFFFD'],
+      ['2020', '\uFFFD'],
+      // BOM takes preference
+      ['efbbbf', ''],
+      ['efbbbf2a', '*'],
+      ['efbbbf202a', ' *'],
+      ['fffe', ''],
+      ['fffe2a20', '\u202A'],
+      ['fffe2a', '\uFFFD'],
+      ['fffe00d72a', '\uD700\uFFFD'],
+      ['fffe00d82a', '\uFFFD'],
+      ['fffe00dc2a', '\uFFFD\uFFFD'],
+      ['feff', ''],
+      ['feff202a', '\u202A'],
+      ['feff20', '\uFFFD'],
+      ['feffd70020', '\uD700\uFFFD'],
+      ['feffd80020', '\uFFFD'],
+      ['feffdc0020', '\uFFFD\uFFFD'],
+    ],
+    // non-normalized names
+    Utf8: [['c280', '\x80']],
+    unicodefeff: [['c280', '\u80C2']],
+    UnicodeFFFE: [['c280', '\uC280']],
+  }
+
+  test('null encoding', (t) => {
+    t.assert.throws(() => legacyHookDecode(Uint8Array.of(), null), RangeError)
+  })
+
+  for (const [encoding, data] of Object.entries(fixtures)) {
+    test(encoding, (t) => {
+      for (const [hex, string] of data) {
+        t.assert.strictEqual(legacyHookDecode(fromHex(hex), encoding), string, `${hex}`)
+      }
+    })
+  }
+})
+
+test('getBOMEncoding', (t) => {
+  const fixtures = [
+    [null, ''],
+    [null, 'ff'],
+    [null, 'fe'],
+    [null, 'ef'],
+    [null, 'efbb'],
+    [null, 'efbb00'],
+    [null, 'efbfbb'],
+    [null, 'ffbbbf'],
+    ['utf-8', 'efbbbf'],
+    ['utf-8', 'efbbbf00'],
+    ['utf-16le', 'fffe'],
+    ['utf-16le', 'fffefffe'],
+    ['utf-16le', 'fffefffefffe'],
+    ['utf-16le', 'fffebb'],
+    ['utf-16le', 'fffebf'],
+    ['utf-16be', 'feff'],
+    ['utf-16be', 'fefffeff'],
+    ['utf-16be', 'fefffefffeff'],
+  ]
+
+  for (const [enc, hex] of fixtures) {
+    t.assert.strictEqual(getBOMEncoding(fromHex(hex)), enc, `${hex} -> ${enc}`)
+  }
+})
diff --git a/tests/vendor/whatwg-encoding/whatwg-encoding-mock.js b/tests/vendor/whatwg-encoding/whatwg-encoding-mock.js
@@ -1,4 +1,4 @@
-import * as api from '@exodus/bytes/encoding.js'
+import * as api from '@exodus/bytes/encoding-browser.js'
 
 // prettier-ignore
 const supported = new Set([
diff --git a/whatwg.js b/whatwg.js
@@ -1,7 +1,11 @@
 import { utf8fromStringLoose } from '@exodus/bytes/utf8.js'
 import { createSinglebyteEncoder } from '@exodus/bytes/single-byte.js'
-import { isMultibyte, getMultibyteEncoder } from './fallback/encoding.js'
-import { normalizeEncoding, E_ENCODING } from './fallback/encoding.api.js'
+import {
+  isMultibyte,
+  getMultibyteEncoder,
+  normalizeEncoding,
+  E_ENCODING,
+} from './fallback/encoding.js'
 import { percentEncoder } from './fallback/percent.js'
 import { encodeMap } from './fallback/single-byte.js'
 import { E_STRING } from './fallback/_utils.js'

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-import * as api from '@exodus/bytes/encoding.js'`
	`1`	`+import * as api from '@exodus/bytes/encoding-browser.js'`
`2`	`2`
`3`	`3`	`// prettier-ignore`
`4`	`4`	`const supported = new Set([`