Skip to content

Commit eacc182

Browse files
committed
feat: add percentEncodeAfterEncoding
1 parent 2da475b commit eacc182

File tree

6 files changed

+114
-7
lines changed

6 files changed

+114
-7
lines changed

encoding-lite.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@ export {
77
getBOMEncoding,
88
labelToName,
99
legacyHookDecode,
10+
percentEncodeAfterEncoding,
1011
} from './fallback/encoding.js'

encoding.js

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
import { createMultibyteDecoder } from '@exodus/bytes/multi-byte.js' // eslint-disable-line @exodus/import/no-unresolved
2-
import { setMultibyteDecoder } from './fallback/encoding.js'
1+
import { createMultibyteDecoder } from '@exodus/bytes/multi-byte.js'
2+
import { multibyteEncoder } from './fallback/multi-byte.js'
3+
import { setMultibyte } from './fallback/encoding.js'
34

4-
setMultibyteDecoder(createMultibyteDecoder)
5+
setMultibyte(createMultibyteDecoder, multibyteEncoder)
56

67
export {
78
TextDecoder,
@@ -12,4 +13,5 @@ export {
1213
getBOMEncoding,
1314
labelToName,
1415
legacyHookDecode,
16+
percentEncodeAfterEncoding,
1517
} from './fallback/encoding.js'

fallback/encoding.js

Lines changed: 70 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,14 @@
33

44
import { utf16toString, utf16toStringLoose } from '@exodus/bytes/utf16.js'
55
import { utf8fromStringLoose, utf8toString, utf8toStringLoose } from '@exodus/bytes/utf8.js'
6-
import { createSinglebyteDecoder } from '@exodus/bytes/single-byte.js'
6+
import { createSinglebyteDecoder, createSinglebyteEncoder } from '@exodus/bytes/single-byte.js'
77
import labels from './encoding.labels.js'
88
import { fromSource, getBOMEncoding, normalizeEncoding, E_ENCODING } from './encoding.api.js'
99
import { unfinishedBytes, mergePrefix } from './encoding.util.js'
10+
import { percentEncoder } from './percent.js'
11+
import { encodeMap } from './single-byte.js'
12+
import { E_STRICT_UNICODE } from './utf8.js'
13+
import { E_STRING } from './_utils.js'
1014

1115
export { labelToName, getBOMEncoding, normalizeEncoding } from './encoding.api.js'
1216

@@ -15,10 +19,11 @@ const E_MULTI =
1519
'Legacy multi-byte encodings are disabled in /encoding-lite.js, use /encoding.js for full encodings range support'
1620
const replacementChar = '\uFFFD'
1721
const multibyteSet = new Set(['big5', 'euc-kr', 'euc-jp', 'iso-2022-jp', 'shift_jis', 'gbk', 'gb18030']) // prettier-ignore
18-
let createMultibyteDecoder
22+
let createMultibyteDecoder, multibyteEncoder
1923

20-
export function setMultibyteDecoder(createDecoder) {
24+
/**
 * Registers the multi-byte codec hooks used by this module.
 *
 * Both arguments are stored as-is in the module-level `createMultibyteDecoder`
 * and `multibyteEncoder` bindings; nothing is validated or invoked here.
 *
 * @param {Function} createDecoder - stored as `createMultibyteDecoder`
 * @param {Function} createEncoder - stored as `multibyteEncoder`
 */
export function setMultibyte(createDecoder, createEncoder) {
  ;[createMultibyteDecoder, multibyteEncoder] = [createDecoder, createEncoder]
}
2328

2429
// Defines `key` on `obj` with a fixed `value` via Object.defineProperty, explicitly non-writable.
// `enumerable` / `configurable` are not specified, so the defineProperty defaults apply to new keys.
const define = (obj, key, value) => Object.defineProperty(obj, key, { value, writable: false })
@@ -311,3 +316,65 @@ export function legacyHookDecode(input, fallbackEncoding = 'utf-8') {
311316

312317
return createSinglebyteDecoder(enc, true)(u8)
313318
}
319+
320+
// https://url.spec.whatwg.org/#string-percent-encode-after-encoding
// Codepoints below 0x20, 0x7F specifically, and above 0x7F (non-ASCII) are always encoded
// > A C0 control is a code point in the range U+0000 NULL to U+001F INFORMATION SEPARATOR ONE, inclusive.
// > The C0 control percent-encode set are the C0 controls and all code points greater than U+007E (~).
/**
 * Encodes `input` with the given legacy encoding and percent-encodes the resulting bytes.
 *
 * Code points the encoding cannot represent are emitted as a percent-encoded numeric
 * character reference, i.e. `%26%23<decimal code point>%3B` (`&#cp;`).
 *
 * @param {string} encoding - encoding label, resolved through `normalizeEncoding`
 * @param {string} input - string to encode
 * @param {string} percentEncodeSet - extra characters to percent-encode, forwarded to `percentEncoder`
 * @param {boolean} [spaceAsPlus=false] - forwarded to `percentEncoder`
 * @returns {string} the percent-encoded result
 * @throws {RangeError} when the encoding resolves to `replacement` or UTF-16LE/BE, which have no encoder
 * @throws {SyntaxError} when the escaping fallback meets a lone surrogate (non scalar-value input)
 * @throws {TypeError} when `input` is not a string on the single-byte fallback path
 */
export function percentEncodeAfterEncoding(encoding, input, percentEncodeSet, spaceAsPlus = false) {
  const e = normalizeEncoding(encoding)
  // Ref: https://encoding.spec.whatwg.org/#get-an-encoder
  // NOTE(review): the WHATWG names are spelled 'utf-16le' / 'utf-16be'; the 'utf16-*' spellings that were
  // checked before are kept as well, so nothing that used to be rejected is accepted now.
  const isUtf16 = e === 'utf-16le' || e === 'utf-16be' || e === 'utf16-le' || e === 'utf16-be'
  if (e === 'replacement' || isUtf16) throw new RangeError(E_ENCODING)

  const percent = percentEncoder(percentEncodeSet, spaceAsPlus)
  if (e === 'utf-8') return percent(utf8fromStringLoose(input))

  // Fast path: try a throwing encoder first, input is fully representable in the common case
  const multi = multibyteSet.has(e)
  const fatal = multi ? multibyteEncoder(e) : createSinglebyteEncoder(e)
  try {
    return percent(fatal(input))
  } catch {
    // Deliberate best-effort: any failure falls through to the escaping paths, which re-validate input
  }

  let res = ''
  let last = 0 // start of the not yet flushed run of encoded bytes
  if (multi) {
    const escaping = multibyteEncoder(e, (cp, u, i) => {
      if (cp >= 0xd8_00 && cp < 0xe0_00) throw new SyntaxError(E_STRICT_UNICODE)
      res += `${percent(u, last, i)}%26%23${cp}%3B` // &#cp;
      last = i
      return 0 // no bytes emitted
    })

    const u = escaping(input) // has side effects on res
    res += percent(u, last)
  } else {
    if (typeof input !== 'string') throw new TypeError(E_STRING) // all other paths have their own validation
    const m = encodeMap(e)
    const len = input.length
    const u = new Uint8Array(len) // indexed by UTF-16 code unit position, escaped positions stay unused
    for (let i = 0; i < len; i++) {
      const x = input.charCodeAt(i)
      const b = m[x]
      if (b || !x) {
        u[i] = b // mapped code unit (U+0000 legitimately maps to byte 0)
        continue
      }

      // Unmappable: flush the bytes collected so far, then emit an escaped numeric character reference
      const segmentEnd = i // must be captured before a low surrogate is consumed below
      let cp = x
      if (x >= 0xd8_00 && x < 0xe0_00) {
        if (x >= 0xdc_00 || i + 1 === len) throw new SyntaxError(E_STRICT_UNICODE)
        const x1 = input.charCodeAt(i + 1)
        if (x1 < 0xdc_00 || x1 >= 0xe0_00) throw new SyntaxError(E_STRICT_UNICODE)
        cp = 0x1_00_00 + ((x1 & 0x3_ff) | ((x & 0x3_ff) << 10))
        i++
      }

      // Flushing up to `i` here would include the high surrogate's unused slot and emit a spurious %00
      res += `${percent(u, last, segmentEnd)}%26%23${cp}%3B` // &#cp;
      last = i + 1 // skip every code unit of the escaped code point
    }

    res += percent(u, last)
  }

  return res
}

fallback/percent.js

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import { decodeAscii, encodeLatin1 } from './latin1.js'
2+
import { decode2string } from './_utils.js'
3+
4+
const E_PERCENT_ENCODE_SET =
  'percentEncodeSet must be a string of unique increasing codepoints in range 0x20 - 0x7e'

// Encoders already built, keyed by the set string followed by the spaceAsPlus flag digit
const percentMap = new Map()
// Lazily built 256-entry tables: `hex` is always "%XX", `base` leaves 0x20-0x7e as literal characters
let hex
let base

/**
 * Returns a (cached) function that turns bytes into a percent-encoded string.
 *
 * @param {string} set - characters in range 0x20 - 0x7e to escape, unique and in increasing order
 * @param {boolean} [spaceAsPlus=false] - when true, byte 0x20 is written as `+`
 * @returns {(u8: Uint8Array, start?: number, end?: number) => string} encoder over `u8[start, end)`
 * @throws {TypeError} on an invalid `set` or a non-boolean `spaceAsPlus`
 */
export function percentEncoder(set, spaceAsPlus = false) {
  const isPrintableAscii = typeof set === 'string' && !/[^\x20-\x7E]/.test(set)
  if (!isPrintableAscii) throw new TypeError(E_PERCENT_ENCODE_SET)
  if (typeof spaceAsPlus !== 'boolean') throw new TypeError('spaceAsPlus must be boolean')

  const cacheKey = set + +spaceAsPlus
  const memoized = percentMap.get(cacheKey)
  if (memoized) return memoized

  // Deduplicated and sorted byte values; a mismatch with `set` means it had repeats or was out of order
  const codes = new Uint8Array(new Set(encodeLatin1(set))).sort() // string checked above to be ascii
  if (decodeAscii(codes) !== set) throw new TypeError(E_PERCENT_ENCODE_SET)

  if (!base) {
    hex = []
    for (let i = 0; i < 256; i++) hex.push(`%${i.toString(16).padStart(2, '0').toUpperCase()}`)
    base = hex.map((escaped, i) => (i >= 0x20 && i <= 0x7e ? String.fromCharCode(i) : escaped))
  }

  const table = [...base]
  codes.forEach((code) => {
    table[code] = hex[code]
  })
  if (spaceAsPlus) table[0x20] = '+' // overrides whatever percentEncodeSet thinks about it

  // Input is not typechecked, for internal use only
  const percentEncode = (u8, start = 0, end = u8.length) => decode2string(u8, start, end, table)
  percentMap.set(cacheKey, percentEncode)
  return percentEncode
}

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
"/fallback/encoding.util.js",
7171
"/fallback/hex.js",
7272
"/fallback/latin1.js",
73+
"/fallback/percent.js",
7374
"/fallback/multi-byte.encodings.cjs",
7475
"/fallback/multi-byte.encodings.json",
7576
"/fallback/multi-byte.js",

tests/wpt/mulibyte-encoder.test.js

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
1+
import { percentEncodeAfterEncoding } from '@exodus/bytes/encoding.js'
12
import { createMultibyteEncoder } from '@exodus/bytes/multi-byte.js'
23
import { multibyteEncoder } from '../../fallback/multi-byte.js'
34
import { encodeLatin1 } from '../../fallback/latin1.js'
45
import { describe, test } from 'node:test'
56

67
const { unescape } = globalThis
78

8-
// query percent-encode set
9+
// C0 control percent-encode set: < 0x20 || > 0x7e
10+
// query percent-encode set: C0 control percent-encode set + ` "#<>`
911
// True for bytes the reference implementation escapes: everything outside 0x21-0x7e plus `"`, `#`, `<`, `>`
const querySet = (byte) => {
  if (byte < 0x21 || byte > 0x7e) return true
  return byte === 0x22 || byte === 0x23 || byte === 0x3c || byte === 0x3e
}

// Formats one byte as an uppercase two-digit percent escape
const esc1 = (byte) => `%${byte.toString(16).toUpperCase().padStart(2, '0')}`

// Percent-encodes an iterable of bytes, leaving bytes outside the query set as literal characters
const escArr = (bytes) => {
  let out = ''
  for (const byte of bytes) out += querySet(byte) ? esc1(byte) : String.fromCharCode(byte)
  return out
}
@@ -39,6 +41,7 @@ function testEncoder(encoding, fn) {
3941

4042
// Full check
4143
t.assert.strictEqual(toUrl(encoding, input), escaped)
44+
t.assert.strictEqual(percentEncodeAfterEncoding(encoding, input, ' "#<>'), escaped)
4245
})
4346
})
4447
})

0 commit comments

Comments
 (0)