Skip to content

Commit f8e5f94

Browse files
committed
feat: add percentEncodeAfterEncoding
1 parent 7a4347e commit f8e5f94

File tree

8 files changed

+378
-21
lines changed

8 files changed

+378
-21
lines changed

README.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -801,6 +801,41 @@ do not provide sufficiently complete / non-buggy `TextDecoder` APIs.
801801
> but they are fixing them and the expected update window is short.\
802802
> If you want to circumvent browser bugs, use full `@exodus/bytes/encoding.js` import.
803803
804+
### `@exodus/bytes/whatwg.js`
805+
806+
WHATWG helpers
807+
808+
```js
809+
import '@exodus/bytes/encoding.js' // For full legacy multi-byte encodings support
810+
import { percentEncodeAfterEncoding } from '@exodus/bytes/whatwg.js'
811+
```
812+
813+
#### `percentEncodeAfterEncoding(encoding, input, percentEncodeSet, spaceAsPlus = false)`
814+
815+
Implements [percent-encode after encoding](https://url.spec.whatwg.org/#string-percent-encode-after-encoding)
816+
per WHATWG URL specification.
817+
818+
> [!IMPORTANT]
819+
> You must import `@exodus/bytes/encoding.js` for this API to accept legacy multi-byte encodings.
820+
821+
Encodings `utf-16le`, `utf-16be`, and `replacement` are not accepted.
822+
823+
[C0 control percent-encode set](https://url.spec.whatwg.org/#c0-control-percent-encode-set) is
824+
always percent-encoded.
825+
826+
`percentEncodeSet` is an addition to that, and must be a string of unique increasing codepoints
827+
in range 0x20 - 0x7e, e.g. `' "#<>'`.
828+
829+
This method accepts [DOMStrings](https://webidl.spec.whatwg.org/#idl-DOMString) and converts them
830+
to [USVStrings](https://webidl.spec.whatwg.org/#idl-USVString).
831+
This is different from e.g. `encodeURI` and `encodeURIComponent`, which throw on lone surrogates:
832+
```js
833+
> percentEncodeAfterEncoding('utf8', '\ud800', ' "#$%&+,/:;<=>?@[\\]^`{|}') // component
834+
'%EF%BF%BD'
835+
> encodeURIComponent('\ud800')
836+
Uncaught URIError: URI malformed
837+
```
838+
804839
## Changelog
805840
806841
See [GitHub Releases](https://github.com/ExodusOSS/bytes/releases) tab

fallback/percent.js

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import { decodeAscii, encodeLatin1 } from './latin1.js'
2+
import { decode2string } from './_utils.js'
3+
4+
const ERR = 'percentEncodeSet must be a string of unique increasing codepoints in range 0x20 - 0x7e'
5+
const percentMap = new Map()
6+
let hex, base
7+
8+
/**
 * Builds, and caches per `(set, spaceAsPlus)` pair, a function that hands a byte range and a
 * 256-entry byte -> string table to `decode2string`.
 *
 * In that table, bytes below 0x20 or above 0x7e always map to `%XX` (uppercase hex), bytes
 * listed in `set` map to `%XX` in addition to that, and all other bytes map to their own
 * character.
 *
 * @param set - String of unique increasing characters in range 0x20 - 0x7e to escape
 * @param spaceAsPlus - When true, byte 0x20 maps to `'+'` regardless of `set` (default: false)
 * @returns `(u8, start = 0, end = u8.length) => string`; repeated calls with the same
 *   arguments return the same cached function
 * @throws {TypeError} If `set` is not a string of unique increasing characters in range
 *   0x20 - 0x7e, or if `spaceAsPlus` is not a boolean
 */
export function percentEncoder(set, spaceAsPlus = false) {
9+
// Cheap rejection first: non-strings and any character outside 0x20 - 0x7e
if (typeof set !== 'string' || /[^\x20-\x7E]/.test(set)) throw new TypeError(ERR)
10+
if (typeof spaceAsPlus !== 'boolean') throw new TypeError('spaceAsPlus must be boolean')
11+
// Cache key is the set followed by '0' or '1' (unary plus turns the boolean into a number)
const id = set + +spaceAsPlus
12+
const cached = percentMap.get(id)
13+
if (cached) return cached
14+
15+
// If the sorted bytes decode back to the very same string, the set was already in
// increasing order; the Set size comparison on the next statement then rejects duplicates.
// NOTE(review): assumes encodeLatin1 returns a typed array of char codes, so .sort() is
// numeric - confirm in ./latin1.js
const n = encodeLatin1(set).sort() // string checked above to be ascii
16+
if (decodeAscii(n) !== set || new Set(n).size !== n.length) throw new TypeError(ERR)
17+
18+
// Shared tables are built lazily on first use and reused by every encoder created later
if (!base) {
19+
// hex[i] is '%XX' for byte i, uppercase and zero-padded to two digits
hex = Array.from({ length: 256 }, (_, i) => `%${i.toString(16).padStart(2, '0').toUpperCase()}`)
20+
// base[i] escapes bytes outside 0x20 - 0x7e and keeps the rest as literal characters
base = hex.map((h, i) => (i < 0x20 || i > 0x7e ? h : String.fromCharCode(i)))
21+
}
22+
23+
const map = base.slice() // copy
24+
// Additionally escape every byte listed in the set
for (const c of n) map[c] = hex[c]
25+
if (spaceAsPlus) map[0x20] = '+' // overrides whatever percentEncodeSet thinks about it
26+
27+
// Input is not typechecked, for internal use only
28+
// NOTE(review): decode2string is assumed to concatenate map[byte] for each byte in
// [start, end) - confirm in ./_utils.js
const percentEncode = (u8, start = 0, end = u8.length) => decode2string(u8, start, end, map)
29+
percentMap.set(id, percentEncode)
30+
return percentEncode
31+
}

package.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
"/fallback/encoding.util.js",
7171
"/fallback/hex.js",
7272
"/fallback/latin1.js",
73+
"/fallback/percent.js",
7374
"/fallback/multi-byte.encodings.cjs",
7475
"/fallback/multi-byte.encodings.json",
7576
"/fallback/multi-byte.js",
@@ -119,6 +120,8 @@
119120
"/utf8.js",
120121
"/utf8.d.ts",
121122
"/utf8.node.js",
123+
"/whatwg.js",
124+
"/whatwg.d.ts",
122125
"/wif.js",
123126
"/wif.d.ts"
124127
],
@@ -199,6 +202,10 @@
199202
"node": "./utf8.node.js",
200203
"default": "./utf8.js"
201204
},
205+
"./whatwg.js": {
206+
"types": "./whatwg.d.ts",
207+
"default": "./whatwg.js"
208+
},
202209
"./wif.js": {
203210
"types": "./wif.d.ts",
204211
"default": "./wif.js"

tests/whatwg.test.js

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
import '@exodus/bytes/encoding.js'
2+
import { percentEncodeAfterEncoding } from '@exodus/bytes/whatwg.js'
3+
import { describe, test } from 'node:test'
4+
import { labels } from './encoding/fixtures/encodings.cjs'
5+
6+
const userinfo = ' "#/:;<=>?@[\\]^`{|}' // https://url.spec.whatwg.org/#userinfo-percent-encode-set
7+
const jsuri = ' "%<>[\\]^`{|}' // https://tc39.es/ecma262/#sec-encodeuri-uri
8+
const jsuricomponent = ' "#$%&+,/:;<=>?@[\\]^`{|}' // https://tc39.es/ecma262/#sec-encodeuricomponent-uricomponent
9+
10+
const sets = ['', userinfo, jsuri, jsuricomponent]
11+
const invalid = ['replacement', 'utf-16le', 'utf-16be'] // https://encoding.spec.whatwg.org/#get-an-encoder
12+
13+
const slowEngine =
14+
process.env.EXODUS_TEST_PLATFORM === 'quickjs' ||
15+
process.env.EXODUS_TEST_PLATFORM === 'xs' ||
16+
process.env.EXODUS_TEST_PLATFORM === 'engine262'
17+
18+
describe('percent-encode after encoding', () => {
19+
const f = percentEncodeAfterEncoding
20+
21+
// https://url.spec.whatwg.org/#example-percent-encode-operations
22+
test('examples from spec', (t) => {
23+
t.assert.strictEqual(f('Shift_JIS', ' ', userinfo), '%20')
24+
t.assert.strictEqual(f('Shift_JIS', '≡', userinfo), '%81%DF')
25+
t.assert.strictEqual(f('Shift_JIS', '‽', userinfo), '%26%238253%3B')
26+
t.assert.strictEqual(f('ISO-2022-JP', '¥', userinfo), '%1B(J%5C%1B(B')
27+
t.assert.strictEqual(
28+
f('Shift_JIS', '1+1 ≡ 2%20‽', userinfo, true),
29+
'1+1+%81%DF+2%20%26%238253%3B'
30+
)
31+
t.assert.strictEqual(f('UTF-8', '≡', userinfo), '%E2%89%A1')
32+
t.assert.strictEqual(f('UTF-8', '‽', userinfo), '%E2%80%BD')
33+
t.assert.strictEqual(f('UTF-8', 'Say what‽', userinfo), 'Say%20what%E2%80%BD')
34+
})
35+
36+
// https://encoding.spec.whatwg.org/#get-an-encoder
37+
describe('throws on unknown, utf-16 and replacement', () => {
38+
for (const encoding of [...invalid, 'what', 'UTF-16', 'unicode']) {
39+
test(encoding, (t) => {
40+
for (const set of sets) {
41+
t.assert.throws(() => f(encoding, '', set), /encoding/)
42+
t.assert.throws(() => f(encoding, ' ', set), /encoding/)
43+
t.assert.throws(() => f(encoding, ' ', set, true), /encoding/)
44+
t.assert.throws(() => f(encoding, '\uFFFD', set, true), /encoding/)
45+
}
46+
})
47+
}
48+
})
49+
50+
describe('all valid encodings are recognized', () => {
51+
for (const encoding of labels) {
52+
if (invalid.includes(encoding)) continue
53+
test(encoding, (t) => {
54+
for (const set of sets) {
55+
t.assert.strictEqual(f(encoding, '', set), '')
56+
// Even non-ASCII encodings passthrough on a lone space
57+
t.assert.strictEqual(f(encoding, ' ', set), set.includes(' ') ? '%20' : ' ')
58+
t.assert.strictEqual(f(encoding, ' ', set, true), '+')
59+
}
60+
})
61+
}
62+
})
63+
64+
describe('replaces non-scalarvalue', () => {
65+
for (const encoding of labels) {
66+
if (invalid.includes(encoding)) continue
67+
test(encoding, (t) => {
68+
const a = f(encoding, '\uFFFD', userinfo)
69+
const b = f(encoding, '\uFFFD', jsuri)
70+
for (let cp = 0xd8_00; cp < 0xe0_00; cp++) {
71+
const s = String.fromCodePoint(cp)
72+
t.assert.strictEqual(f(encoding, s, userinfo), a)
73+
t.assert.strictEqual(f(encoding, s, jsuri), b)
74+
}
75+
})
76+
}
77+
})
78+
79+
describe('encodeURI / encodeURIComponent', () => {
80+
test('coherence', (t) => {
81+
// https://tc39.es/ecma262/#sec-encodeuri-uri step 2, coherence check
82+
t.assert.deepStrictEqual([...(jsuri + ';/?:@&=+$,#')].sort(), [...jsuricomponent].sort())
83+
})
84+
85+
describe('ASCII supersets', (t) => {
86+
const ascii = Array.from({ length: 128 }, (_, i) => String.fromCharCode(i)).join('')
87+
for (const encoding of labels) {
88+
if (invalid.includes(encoding)) continue
89+
if (encoding === 'iso-2022-jp') continue // not an ASCII superset
90+
test(encoding, (t) => {
91+
t.assert.strictEqual(f(encoding, ascii, jsuricomponent), encodeURIComponent(ascii))
92+
t.assert.strictEqual(f(encoding, ascii, jsuri), encodeURI(ascii))
93+
for (let i = 0; i < 128; i++) {
94+
const s = String.fromCharCode(i)
95+
t.assert.strictEqual(f(encoding, s, jsuricomponent), encodeURIComponent(s))
96+
t.assert.strictEqual(f(encoding, s, jsuri), encodeURI(s))
97+
}
98+
})
99+
}
100+
})
101+
102+
test('UTF-8: full Unicode', (t) => {
103+
const MAX = slowEngine ? 0x1_ff_ff : 0x10_ff_ff // Max Unicode codepoint
104+
for (let cp = 0; cp <= MAX; cp++) {
105+
if (cp >= 0xd8_00 && cp < 0xe0_00) continue
106+
const s = String.fromCodePoint(cp)
107+
t.assert.strictEqual(f('utf8', s, jsuricomponent), encodeURIComponent(s))
108+
t.assert.strictEqual(f('utf8', s, jsuri), encodeURI(s))
109+
}
110+
})
111+
})
112+
})
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
[
2+
"Tests for percent-encoding.",
3+
{
4+
"input": "\u2020",
5+
"output": {
6+
"big5": "%26%238224%3B",
7+
"euc-kr": "%A2%D3",
8+
"utf-8": "%E2%80%A0",
9+
"windows-1252": "%86"
10+
}
11+
},
12+
"This uses a trailing A to prevent the URL parser from trimming the C0 control.",
13+
{
14+
"input": "\u000EA",
15+
"output": {
16+
"big5": "%0EA",
17+
"iso-2022-jp": "%26%2365533%3BA",
18+
"utf-8": "%0EA"
19+
}
20+
},
21+
{
22+
"input": "\u203E\u005C",
23+
"output": {
24+
"iso-2022-jp": "%1B(J~%1B(B\\",
25+
"utf-8": "%E2%80%BE\\"
26+
}
27+
},
28+
{
29+
"input": "\uE5E5",
30+
"output": {
31+
"gb18030": "%26%2358853%3B",
32+
"utf-8": "%EE%97%A5"
33+
}
34+
},
35+
{
36+
"input": "\u2212",
37+
"output": {
38+
"shift_jis": "%81|",
39+
"utf-8": "%E2%88%92"
40+
}
41+
},
42+
{
43+
"input": "á|",
44+
"output": {
45+
"utf-8": "%C3%A1|"
46+
}
47+
}
48+
]

tests/wpt/mulibyte-encoder.test.js

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,13 @@
11
import { createMultibyteEncoder } from '@exodus/bytes/multi-byte.js'
2-
import { multibyteEncoder } from '../../fallback/multi-byte.js'
2+
import { percentEncodeAfterEncoding } from '@exodus/bytes/whatwg.js'
3+
import '@exodus/bytes/encoding.js'
34
import { encodeLatin1 } from '../../fallback/latin1.js'
45
import { describe, test } from 'node:test'
6+
import { readFileSync } from 'node:fs'
7+
import { join } from 'node:path'
58

69
const { unescape } = globalThis
710

8-
// query percent-encode set
9-
const querySet = (x) => x < 0x21 || x > 0x7e || x === 0x22 || x === 0x23 || x === 0x3c || x === 0x3e
10-
const esc1 = (x) => '%' + x.toString(16).padStart(2, '0').toUpperCase()
11-
const escArr = (u) => [...u].map((x) => (querySet(x) ? esc1(x) : String.fromCharCode(x))).join('')
12-
13-
function toUrl(encoding, input) {
14-
let encoded = ''
15-
let last = 0
16-
const escaping = multibyteEncoder(encoding, (cp, u, i) => {
17-
encoded += `${escArr(u.subarray(last, i))}%26%23${cp}%3B` // &#cp;
18-
last = i
19-
return 0 // no bytes emitted
20-
})
21-
22-
const u = escaping(input)
23-
encoded += escArr(u.subarray(last))
24-
return encoded
25-
}
26-
2711
function testEncoder(encoding, fn) {
2812
describe(encoding, () => {
2913
const fatal = createMultibyteEncoder(encoding)
@@ -38,7 +22,7 @@ function testEncoder(encoding, fn) {
3822
}
3923

4024
// Full check
41-
t.assert.strictEqual(toUrl(encoding, input), escaped)
25+
t.assert.strictEqual(percentEncodeAfterEncoding(encoding, input, ' "#<>'), escaped)
4226
})
4327
})
4428
})
@@ -109,3 +93,20 @@ testEncoder('iso-2022-jp', (encode) => {
10993
encode('\uFF61\uFFFD', '%1B$B!%23%1B(B%26%2365533%3B', 'Katakana U+FFFD')
11094
encode('\u0393\uFFFD', '%1B$B&%23%1B(B%26%2365533%3B', 'jis0208 U+FFFD')
11195
})
96+
97+
test('url/resources/percent-encoding.json', (t) => {
98+
const data = JSON.parse(
99+
readFileSync(join(import.meta.dirname, `fixtures/url/resources/percent-encoding.json`), 'utf8')
100+
)
101+
102+
// Doc: https://github.com/web-platform-tests/wpt/blob/master/url/README.md
103+
// > _percentEncodeSet_ set to special-query percent-encode set and _spaceAsPlus_ set to false.
104+
const set = ` "#'<>` // https://url.spec.whatwg.org/#special-query-percent-encode-set
105+
const spaceAsPlus = false
106+
for (const { input, output } of data) {
107+
if (!input && !output) continue // comment
108+
for (const [encoding, escaped] of Object.entries(output)) {
109+
t.assert.strictEqual(percentEncodeAfterEncoding(encoding, input, set, spaceAsPlus), escaped)
110+
}
111+
}
112+
})

whatwg.d.ts

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/**
2+
* WHATWG helpers
3+
*
4+
* ```js
5+
* import '@exodus/bytes/encoding.js' // For full legacy multi-byte encodings support
6+
* import { percentEncodeAfterEncoding } from '@exodus/bytes/whatwg.js'
7+
* ```
8+
*
9+
* @module @exodus/bytes/whatwg.js
10+
*/
11+
12+
/**
13+
* Implements [percent-encode after encoding](https://url.spec.whatwg.org/#string-percent-encode-after-encoding)
14+
* per WHATWG URL specification.
15+
*
16+
* > [!IMPORTANT]
17+
* > You must import `@exodus/bytes/encoding.js` for this API to accept legacy multi-byte encodings.
18+
*
19+
* Encodings `utf-16le`, `utf-16be`, and `replacement` are not accepted.
20+
*
21+
* [C0 control percent-encode set](https://url.spec.whatwg.org/#c0-control-percent-encode-set) is
22+
* always percent-encoded.
23+
*
24+
* `percentEncodeSet` is an addition to that, and must be a string of unique increasing codepoints
25+
* in range 0x20 - 0x7e, e.g. `' "#<>'`.
26+
*
27+
* This method accepts [DOMStrings](https://webidl.spec.whatwg.org/#idl-DOMString) and converts them
28+
* to [USVStrings](https://webidl.spec.whatwg.org/#idl-USVString).
29+
* This is different from e.g. `encodeURI` and `encodeURIComponent`, which throw on lone surrogates:
30+
* ```js
31+
* > percentEncodeAfterEncoding('utf8', '\ud800', ' "#$%&+,/:;<=>?@[\\]^`{|}') // component
32+
* '%EF%BF%BD'
33+
* > encodeURIComponent('\ud800')
34+
* Uncaught URIError: URI malformed
35+
* ```
36+
*
37+
* @param encoding - The encoding label per WHATWG Encoding spec
38+
* @param input - Input string to encode; lone surrogates are replaced with U+FFFD before encoding
39+
* @param percentEncodeSet - A string of ASCII chars to escape in addition to C0 control percent-encode set
40+
* @param spaceAsPlus - Whether to encode space as `'+'` instead of `'%20'` or `' '` (default: false)
41+
* @returns The percent-encoded string
42+
*/
43+
export function percentEncodeAfterEncoding(
44+
encoding: string,
45+
input: string,
46+
percentEncodeSet: string,
47+
spaceAsPlus?: boolean
48+
): string;

0 commit comments

Comments
 (0)