feat: add percentEncodeAfterEncoding

ChALkeR · ChALkeR · commit eacc182bccc6 · 2026-01-24T01:57:10.000+04:00
diff --git a/encoding-lite.js b/encoding-lite.js
@@ -7,4 +7,5 @@ export {
   getBOMEncoding,
   labelToName,
   legacyHookDecode,
+  percentEncodeAfterEncoding,
 } from './fallback/encoding.js'
diff --git a/encoding.js b/encoding.js
@@ -1,7 +1,8 @@
-import { createMultibyteDecoder } from '@exodus/bytes/multi-byte.js' // eslint-disable-line @exodus/import/no-unresolved
-import { setMultibyteDecoder } from './fallback/encoding.js'
+import { createMultibyteDecoder } from '@exodus/bytes/multi-byte.js'
+import { multibyteEncoder } from './fallback/multi-byte.js'
+import { setMultibyte } from './fallback/encoding.js'
 
-setMultibyteDecoder(createMultibyteDecoder)
+setMultibyte(createMultibyteDecoder, multibyteEncoder)
 
 export {
   TextDecoder,
@@ -12,4 +13,5 @@ export {
   getBOMEncoding,
   labelToName,
   legacyHookDecode,
+  percentEncodeAfterEncoding,
 } from './fallback/encoding.js'
diff --git a/fallback/encoding.js b/fallback/encoding.js
@@ -3,10 +3,14 @@
 
 import { utf16toString, utf16toStringLoose } from '@exodus/bytes/utf16.js'
 import { utf8fromStringLoose, utf8toString, utf8toStringLoose } from '@exodus/bytes/utf8.js'
-import { createSinglebyteDecoder } from '@exodus/bytes/single-byte.js'
+import { createSinglebyteDecoder, createSinglebyteEncoder } from '@exodus/bytes/single-byte.js'
 import labels from './encoding.labels.js'
 import { fromSource, getBOMEncoding, normalizeEncoding, E_ENCODING } from './encoding.api.js'
 import { unfinishedBytes, mergePrefix } from './encoding.util.js'
+import { percentEncoder } from './percent.js'
+import { encodeMap } from './single-byte.js'
+import { E_STRICT_UNICODE } from './utf8.js'
+import { E_STRING } from './_utils.js'
 
 export { labelToName, getBOMEncoding, normalizeEncoding } from './encoding.api.js'
 
@@ -15,10 +19,11 @@ const E_MULTI =
   'Legacy multi-byte encodings are disabled in /encoding-lite.js, use /encoding.js for full encodings range support'
 const replacementChar = '\uFFFD'
 const multibyteSet = new Set(['big5', 'euc-kr', 'euc-jp', 'iso-2022-jp', 'shift_jis', 'gbk', 'gb18030']) // prettier-ignore
-let createMultibyteDecoder
+let createMultibyteDecoder, multibyteEncoder
 
-export function setMultibyteDecoder(createDecoder) {
+export function setMultibyte(createDecoder, createEncoder) { // called by /encoding.js to plug in multi-byte support
   createMultibyteDecoder = createDecoder
+  multibyteEncoder = createEncoder // a factory: used as multibyteEncoder(encoding) or multibyteEncoder(encoding, callback)
 }
 
 const define = (obj, key, value) => Object.defineProperty(obj, key, { value, writable: false })
@@ -311,3 +316,81 @@ export function legacyHookDecode(input, fallbackEncoding = 'utf-8') {
 
   return createSinglebyteDecoder(enc, true)(u8)
 }
+
+// https://url.spec.whatwg.org/#string-percent-encode-after-encoding
+// Codepoints below 0x20, 0x7F specifically, and above 0x7F (non-ASCII) are always encoded
+// > A C0 control is a code point in the range U+0000 NULL to U+001F INFORMATION SEPARATOR ONE, inclusive.
+// > The C0 control percent-encode set are the C0 controls and all code points greater than U+007E (~).
+// Throws on non-well-formed (non scalar-value) strings as they are not supposed to be used here (and would be inconsistent)
+// NOTE(review): the utf-8 path uses utf8fromStringLoose(), which presumably does not throw on lone surrogates - confirm
+//
+// encoding: encoding label or name, resolved with normalizeEncoding(); 'replacement' and UTF-16 throw a RangeError
+// input: the string to encode
+// percentEncodeSet: string of unique increasing codepoints in range 0x20 - 0x7e to escape in addition to the above
+// spaceAsPlus: boolean, emit byte 0x20 as '+' (overrides percentEncodeSet)
+// Returns a string; codepoints that the encoding can not represent are emitted as a percent-encoded `&#cp;`
+export function percentEncodeAfterEncoding(encoding, input, percentEncodeSet, spaceAsPlus = false) {
+  const e = normalizeEncoding(encoding)
+  // Ref: https://encoding.spec.whatwg.org/#get-an-encoder
+  // NOTE(review): WHATWG spells the UTF-16 names 'utf-16le' / 'utf-16be', so those are rejected in addition to the
+  // originally checked 'utf16-le' / 'utf16-be'; confirm which form normalizeEncoding() returns and drop the other
+  const utf16 = e === 'utf-16le' || e === 'utf-16be' || e === 'utf16-le' || e === 'utf16-be'
+  if (e === 'replacement' || utf16) throw new RangeError(E_ENCODING)
+
+  const percent = percentEncoder(percentEncodeSet, spaceAsPlus)
+  if (e === 'utf-8') return percent(utf8fromStringLoose(input))
+
+  const multi = multibyteSet.has(e)
+  // Without a prior setMultibyte() call (see E_MULTI) the next line would fail with an opaque "not a function" TypeError
+  if (multi && !multibyteEncoder) throw new Error(E_MULTI)
+  const fatal = multi ? multibyteEncoder(e) : createSinglebyteEncoder(e)
+  try {
+    return percent(fatal(input)) // fast path, taken only when the strict encoder accepts the whole input
+  } catch {} // deliberately ignored: the slow path below redoes the work, escaping unmappable codepoints
+
+  let res = ''
+  let last = 0 // start of the part of u that was not yet appended to res
+  if (multi) {
+    // The callback presumably receives the unmappable codepoint, the output bytes and the current output offset
+    const escaping = multibyteEncoder(e, (cp, u, i) => {
+      if (cp >= 0xd8_00 && cp < 0xe0_00) throw new SyntaxError(E_STRICT_UNICODE)
+      res += `${percent(u, last, i)}%26%23${cp}%3B` // &#cp;
+      last = i
+      return 0 // no bytes emitted
+    })
+
+    const u = escaping(input) // has side effects on res
+    res += percent(u, last)
+  } else {
+    if (typeof input !== 'string') throw new TypeError(E_STRING) // all other paths have their own validation
+    const m = encodeMap(e)
+    const len = input.length
+    const u = new Uint8Array(len) // one slot per UTF-16 code unit, slots of escaped code units are never flushed
+    for (let i = 0; i < len; i++) {
+      const x = input.charCodeAt(i)
+      const b = m[x]
+      if (!b && x) {
+        // No mapping (a zero byte is accepted only for U+0000): escape the whole codepoint as &#cp;
+        const start = i // first code unit of the codepoint, i itself moves on for surrogate pairs
+        let cp = x
+        if (x >= 0xd8_00 && x < 0xe0_00) {
+          if (x >= 0xdc_00 || i + 1 === len) throw new SyntaxError(E_STRICT_UNICODE)
+          const x1 = input.charCodeAt(i + 1)
+          if (x1 < 0xdc_00 || x1 >= 0xe0_00) throw new SyntaxError(E_STRICT_UNICODE)
+          cp = 0x1_00_00 + ((x1 & 0x3_ff) | ((x & 0x3_ff) << 10))
+          i++
+        }
+
+        // Flush up to start, not i: after a pair, i would include the unwritten (zero) high surrogate slot as %00
+        res += `${percent(u, last, start)}%26%23${cp}%3B` // &#cp;
+        last = i + 1 // skip current
+      }
+
+      u[i] = b
+    }
+
+    res += percent(u, last)
+  }
+
+  return res
+}
diff --git a/fallback/percent.js b/fallback/percent.js
@@ -0,0 +1,41 @@
+import { decodeAscii, encodeLatin1 } from './latin1.js'
+import { decode2string } from './_utils.js'
+
+const E_PERCENT_ENCODE_SET =
+  'percentEncodeSet must be a string of unique increasing codepoints in range 0x20 - 0x7e'
+
+const percentMap = new Map() // already built encoders, keyed by the set followed by the spaceAsPlus flag digit
+let hex, base // shared lookup tables, built on first use
+
+// Builds (or returns the cached) function that turns bytes into a percent-encoded string through decode2string()
+// set: string of unique increasing codepoints in range 0x20 - 0x7e to escape, bytes outside that range always are
+// spaceAsPlus: boolean, emit byte 0x20 as '+'
+// Throws a TypeError when either argument is invalid
+export function percentEncoder(set, spaceAsPlus = false) {
+  const printable = typeof set === 'string' && !/[^\x20-\x7E]/.test(set)
+  if (!printable) throw new TypeError(E_PERCENT_ENCODE_SET)
+  if (typeof spaceAsPlus !== 'boolean') throw new TypeError('spaceAsPlus must be boolean')
+
+  const key = `${set}${Number(spaceAsPlus)}`
+  const known = percentMap.get(key)
+  if (known) return known
+
+  // Deduplicating and sorting has to reproduce the input, otherwise it was not unique and increasing
+  const codes = new Uint8Array(new Set(encodeLatin1(set))).sort() // string checked above to be ascii
+  if (decodeAscii(codes) !== set) throw new TypeError(E_PERCENT_ENCODE_SET)
+
+  if (!base) {
+    hex = []
+    for (let i = 0; i < 256; i++) hex.push(`%${i.toString(16).padStart(2, '0').toUpperCase()}`)
+    base = hex.map((escaped, i) => (i >= 0x20 && i <= 0x7e ? String.fromCharCode(i) : escaped))
+  }
+
+  const table = [...base] // per-encoder copy, the shared base stays untouched
+  for (const code of codes) table[code] = hex[code]
+  if (spaceAsPlus) table[0x20] = '+' // overrides whatever percentEncodeSet thinks about it
+
+  // Input is not typechecked, for internal use only
+  const percentEncode = (u8, start = 0, end = u8.length) => decode2string(u8, start, end, table)
+  percentMap.set(key, percentEncode)
+  return percentEncode
+}
diff --git a/package.json b/package.json
@@ -70,6 +70,7 @@
     "/fallback/encoding.util.js",
     "/fallback/hex.js",
     "/fallback/latin1.js",
+    "/fallback/percent.js",
     "/fallback/multi-byte.encodings.cjs",
     "/fallback/multi-byte.encodings.json",
     "/fallback/multi-byte.js",
diff --git a/tests/wpt/mulibyte-encoder.test.js b/tests/wpt/mulibyte-encoder.test.js
@@ -1,11 +1,13 @@
+import { percentEncodeAfterEncoding } from '@exodus/bytes/encoding.js'
 import { createMultibyteEncoder } from '@exodus/bytes/multi-byte.js'
 import { multibyteEncoder } from '../../fallback/multi-byte.js'
 import { encodeLatin1 } from '../../fallback/latin1.js'
 import { describe, test } from 'node:test'
 
 const { unescape } = globalThis
 
-// query percent-encode set
+// C0 control percent-encode set: < 0x20 || > 0x7e
+// query percent-encode set: C0 control percent-encode set + ` "#<>`
 const querySet = (x) => x < 0x21 || x > 0x7e || x === 0x22 || x === 0x23 || x === 0x3c || x === 0x3e
 const esc1 = (x) => '%' + x.toString(16).padStart(2, '0').toUpperCase()
 const escArr = (u) => [...u].map((x) => (querySet(x) ? esc1(x) : String.fromCharCode(x))).join('')
@@ -39,6 +41,7 @@ function testEncoder(encoding, fn) {
 
         // Full check
         t.assert.strictEqual(toUrl(encoding, input), escaped)
+        t.assert.strictEqual(percentEncodeAfterEncoding(encoding, input, ' "#<>'), escaped)
       })
     })
   })