Skip to content

Commit 9b7f280

Browse files
committed
feat: add percentEncodeAfterEncoding
1 parent fa90577 commit 9b7f280

File tree

8 files changed

+417
-21
lines changed

8 files changed

+417
-21
lines changed

README.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -801,6 +801,41 @@ do not provide sufficiently complete / non-buggy `TextDecoder` APIs.
801801
> but they are fixing them and the expected update window is short.\
802802
> If you want to circumvent browser bugs, use full `@exodus/bytes/encoding.js` import.
803803
804+
### `@exodus/bytes/whatwg.js`
805+
806+
WHATWG helpers
807+
808+
```js
809+
import '@exodus/bytes/encoding.js' // For full legacy multi-byte encodings support
810+
import { percentEncodeAfterEncoding } from '@exodus/bytes/whatwg.js'
811+
```
812+
813+
#### `percentEncodeAfterEncoding(encoding, input, percentEncodeSet, spaceAsPlus = false)`
814+
815+
Implements [percent-encode after encoding](https://url.spec.whatwg.org/#string-percent-encode-after-encoding)
816+
per WHATWG URL specification.
817+
818+
> [!IMPORTANT]
819+
> You must import `@exodus/bytes/encoding.js` for this API to accept legacy multi-byte encodings.
820+
821+
Encodings `utf-16le`, `utf-16be`, and `replacement` are not accepted.
822+
823+
[C0 control percent-encode set](https://url.spec.whatwg.org/#c0-control-percent-encode-set) is
824+
always percent-encoded.
825+
826+
`percentEncodeSet` is an addition to that, and must be a string of unique increasing codepoints
827+
in range 0x20 - 0x7e, e.g. `' "#<>'`.
828+
829+
This method accepts [DOMStrings](https://webidl.spec.whatwg.org/#idl-DOMString) and converts them
830+
to [USVStrings](https://webidl.spec.whatwg.org/#idl-USVString).
831+
This is different from e.g. `encodeURI` and `encodeURIComponent`, which throw on lone (unpaired) surrogates:
832+
```js
833+
> percentEncodeAfterEncoding('utf8', '\ud800', ' "#$%&+,/:;<=>?@[\\]^`{|}') // component
834+
'%EF%BF%BD'
835+
> encodeURIComponent('\ud800')
836+
Uncaught URIError: URI malformed
837+
```
838+
804839
## Changelog
805840
806841
See [GitHub Releases](https://github.com/ExodusOSS/bytes/releases) tab

fallback/percent.js

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import { decodeAscii, encodeLatin1 } from './latin1.js'
2+
import { decode2string } from './_utils.js'
3+
// Message shared by every rejection of an invalid percentEncodeSet
const ERR = 'percentEncodeSet must be a string of unique increasing codepoints in range 0x20 - 0x7e'
// Encoders built so far, keyed by the set string followed by the spaceAsPlus flag digit
const percentMap = new Map()
// Lookup tables shared by all encoders, created on first use
let hex, base

/**
 * Returns a percent-encoding function for the given percent-encode set, reusing a cached one
 * when the same (set, spaceAsPlus) pair was requested before.
 *
 * The per-byte table always escapes bytes below 0x20 and above 0x7e as `%XX` (uppercase hex),
 * additionally escapes every character listed in `set`, and keeps all other bytes as their
 * ASCII character.
 *
 * @param {string} set - extra characters to escape: unique, increasing, within 0x20 - 0x7e
 * @param {boolean} [spaceAsPlus=false] - when true, byte 0x20 maps to '+' even if `set` lists it
 * @returns {Function} `(u8, start = 0, end = u8.length) => string`; hands the byte range and the
 *   table to `decode2string` (presumably one table lookup per byte - confirm in `_utils.js`)
 * @throws {TypeError} if `set` is not a valid set string or `spaceAsPlus` is not a boolean
 */
export function percentEncoder(set, spaceAsPlus = false) {
  const isPrintableAscii = typeof set === 'string' && !/[^\x20-\x7E]/.test(set)
  if (!isPrintableAscii) throw new TypeError(ERR)
  if (typeof spaceAsPlus !== 'boolean') throw new TypeError('spaceAsPlus must be boolean')

  const cacheKey = set + +spaceAsPlus
  const known = percentMap.get(cacheKey)
  if (known) return known

  // set is known to be ASCII here, so its bytes sort exactly like its characters;
  // a sorted copy equal to the input means the input was already in increasing order
  const sorted = encodeLatin1(set).sort()
  const inOrder = decodeAscii(sorted) === set
  if (!inOrder || new Set(sorted).size !== sorted.length) throw new TypeError(ERR)

  if (!base) {
    const escapeByte = (byte) => `%${byte.toString(16).padStart(2, '0').toUpperCase()}`
    hex = Array.from({ length: 256 }, (_, byte) => escapeByte(byte))
    base = hex.map((escaped, byte) =>
      byte >= 0x20 && byte <= 0x7e ? String.fromCharCode(byte) : escaped
    )
  }

  // Start from a private copy of the shared base table, then escape the requested characters
  const table = [...base]
  for (const byte of sorted) table[byte] = hex[byte]
  if (spaceAsPlus) table[0x20] = '+' // takes precedence over the set's own entry for space

  // Internal use only: arguments of the returned function are not type-checked
  const percentEncode = (u8, start = 0, end = u8.length) => decode2string(u8, start, end, table)
  percentMap.set(cacheKey, percentEncode)
  return percentEncode
}

package.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
"/fallback/encoding.util.js",
7171
"/fallback/hex.js",
7272
"/fallback/latin1.js",
73+
"/fallback/percent.js",
7374
"/fallback/multi-byte.encodings.cjs",
7475
"/fallback/multi-byte.encodings.json",
7576
"/fallback/multi-byte.js",
@@ -119,6 +120,8 @@
119120
"/utf8.js",
120121
"/utf8.d.ts",
121122
"/utf8.node.js",
123+
"/whatwg.js",
124+
"/whatwg.d.ts",
122125
"/wif.js",
123126
"/wif.d.ts"
124127
],
@@ -199,6 +202,10 @@
199202
"node": "./utf8.node.js",
200203
"default": "./utf8.js"
201204
},
205+
"./whatwg.js": {
206+
"types": "./whatwg.d.ts",
207+
"default": "./whatwg.js"
208+
},
202209
"./wif.js": {
203210
"types": "./wif.d.ts",
204211
"default": "./wif.js"

tests/whatwg.test.js

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
import '@exodus/bytes/encoding.js'
2+
import { percentEncodeAfterEncoding } from '@exodus/bytes/whatwg.js'
3+
import { describe, test } from 'node:test'
4+
import { labels } from './encoding/fixtures/encodings.cjs'
5+
// Percent-encode sets under test, each spelled as a string of ASCII characters in increasing
// code point order; the link next to each one points at its defining specification section.
const jsuri = ' "%<>[\\]^`{|}' // https://tc39.es/ecma262/#sec-encodeuri-uri
const jsuricomponent = ' "#$%&+,/:;<=>?@[\\]^`{|}' // https://tc39.es/ecma262/#sec-encodeuricomponent-uricomponent
const fragment = ' "<>`' // https://url.spec.whatwg.org/#fragment-percent-encode-set
const query = ' "#<>' // https://url.spec.whatwg.org/#query-percent-encode-set
const specialquery = ' "#\'<>' // https://url.spec.whatwg.org/#special-query-percent-encode-set
const path = ' "#<>?^`{}' // https://url.spec.whatwg.org/#path-percent-encode-set
const userinfo = ' "#/:;<=>?@[\\]^`{|}' // https://url.spec.whatwg.org/#userinfo-percent-encode-set
const component = ' "#$%&+,/:;<=>?@[\\]^`{|}' // https://url.spec.whatwg.org/#component-percent-encode-set
const form = ' !"#$%&\'()+,/:;<=>?@[\\]^`{|}~' // https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set

// Sets iterated by the per-encoding tests
const sets = ['', userinfo, jsuri, jsuricomponent]
// Encodings that have no encoder, see https://encoding.spec.whatwg.org/#get-an-encoder
const invalid = ['replacement', 'utf-16le', 'utf-16be']

// Platforms on which the exhaustive Unicode sweep is cut short
const slowPlatforms = ['quickjs', 'xs', 'engine262']
const slowEngine = slowPlatforms.includes(process.env.EXODUS_TEST_PLATFORM)
23+
// Cross-checks the percent-encode set constants of this file against how the specifications
// define them: each set is the previous one plus a few extra code points.
test('percent-encode sets coherence', (t) => {
  // a: set constant (already sorted); b: its spec-derived composition, sorted before comparing
  const eq = (a, b) => t.assert.deepStrictEqual([...a], [...b].sort())
  // https://tc39.es/ecma262/#sec-encodeuri-uri step 2
  eq(jsuricomponent, jsuri + ';/?:@&=+$,#')
  // https://url.spec.whatwg.org/#fragment-percent-encode-set
  eq(fragment, String.fromCharCode(0x20, 0x22, 0x3c, 0x3e, 0x60))
  // https://url.spec.whatwg.org/#query-percent-encode-set
  eq(query, String.fromCharCode(0x20, 0x22, 0x23, 0x3c, 0x3e))
  // https://url.spec.whatwg.org/#special-query-percent-encode-set
  eq(specialquery, query + String.fromCharCode(0x27))
  // https://url.spec.whatwg.org/#path-percent-encode-set
  eq(path, query + String.fromCharCode(0x3f, 0x5e, 0x60, 0x7b, 0x7d))
  // https://url.spec.whatwg.org/#userinfo-percent-encode-set
  eq(userinfo, path + String.fromCharCode(0x2f, 0x3a, 0x3b, 0x3d, 0x40, 0x5b, 0x5c, 0x5d, 0x7c))
  // https://url.spec.whatwg.org/#component-percent-encode-set
  eq(component, userinfo + String.fromCharCode(0x24, 0x25, 0x26, 0x2b, 0x2c))
  // The WHATWG component set and the set left escaped by encodeURIComponent must coincide
  t.assert.strictEqual(jsuricomponent, component)
  // https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set
  eq(form, component + String.fromCharCode(0x21, 0x27, 0x28, 0x29, 0x7e))
})
44+
// Behavioural tests for percentEncodeAfterEncoding: spec examples, rejected encodings,
// label coverage, surrogate replacement, and parity with encodeURI / encodeURIComponent.
describe('percent-encode after encoding', () => {
  const f = percentEncodeAfterEncoding

  // https://url.spec.whatwg.org/#example-percent-encode-operations
  test('examples from spec', (t) => {
    // At https://github.com/whatwg/url/commit/5c50135f8304dc8cb9bb49367b364699cc5bb031
    t.assert.strictEqual(f('Shift_JIS', ' ', userinfo), '%20')
    t.assert.strictEqual(f('Shift_JIS', '≡', userinfo), '%81%DF')
    t.assert.strictEqual(f('Shift_JIS', '‽', userinfo), '%26%238253%3B')
    t.assert.strictEqual(f('ISO-2022-JP', '¥', userinfo), '%1B(J%5C%1B(B')
    t.assert.strictEqual(
      f('Shift_JIS', '1+1 ≡ 2%20‽', userinfo, true),
      '1+1+%81%DF+2%20%26%238253%3B'
    )
    t.assert.strictEqual(f('UTF-8', '≡', userinfo), '%E2%89%A1')
    t.assert.strictEqual(f('UTF-8', '‽', userinfo), '%E2%80%BD')
    t.assert.strictEqual(f('UTF-8', 'Say what‽', userinfo), 'Say%20what%E2%80%BD')

    // At https://github.com/whatwg/url/pull/896
    t.assert.strictEqual(f('Shift_JIS', ' ', specialquery), '%20')
    t.assert.strictEqual(f('Shift_JIS', '≡', specialquery), '%81%DF')
    t.assert.strictEqual(f('Shift_JIS', '‽', specialquery), '%26%238253%3B')
    t.assert.strictEqual(f('ISO-2022-JP', '¥', specialquery), '%1B(J\\%1B(B')
    t.assert.strictEqual(
      f('Shift_JIS', '1+1 ≡ 2%20‽', form, true),
      '1%2B1+%81%DF+2%2520%26%238253%3B'
    )
    t.assert.strictEqual(f('UTF-8', '≡', userinfo), '%E2%89%A1')
    t.assert.strictEqual(f('UTF-8', '‽', userinfo), '%E2%80%BD')
    t.assert.strictEqual(f('UTF-8', 'Say what‽', userinfo), 'Say%20what%E2%80%BD')
  })

  // https://encoding.spec.whatwg.org/#get-an-encoder
  describe('throws on unknown, utf-16 and replacement', () => {
    for (const encoding of [...invalid, 'what', 'UTF-16', 'unicode']) {
      test(encoding, (t) => {
        // The encoding must be rejected whatever the input, set or spaceAsPlus flag is
        for (const set of sets) {
          t.assert.throws(() => f(encoding, '', set), /encoding/)
          t.assert.throws(() => f(encoding, ' ', set), /encoding/)
          t.assert.throws(() => f(encoding, ' ', set, true), /encoding/)
          t.assert.throws(() => f(encoding, '\uFFFD', set, true), /encoding/)
        }
      })
    }
  })

  describe('all valid encodings are recognized', () => {
    for (const encoding of labels) {
      if (invalid.includes(encoding)) continue
      test(encoding, (t) => {
        for (const set of sets) {
          t.assert.strictEqual(f(encoding, '', set), '')
          // Even non-ASCII encodings passthrough on a lone space
          t.assert.strictEqual(f(encoding, ' ', set), set.includes(' ') ? '%20' : ' ')
          t.assert.strictEqual(f(encoding, ' ', set, true), '+')
        }
      })
    }
  })

  describe('replaces non-scalarvalue', () => {
    for (const encoding of labels) {
      if (invalid.includes(encoding)) continue
      test(encoding, (t) => {
        // Every lone surrogate must encode exactly like U+FFFD does in the same encoding
        const a = f(encoding, '\uFFFD', userinfo)
        const b = f(encoding, '\uFFFD', jsuri)
        for (let cp = 0xd8_00; cp < 0xe0_00; cp++) {
          const s = String.fromCodePoint(cp)
          t.assert.strictEqual(f(encoding, s, userinfo), a)
          t.assert.strictEqual(f(encoding, s, jsuri), b)
        }
      })
    }
  })

  describe('encodeURI / encodeURIComponent', () => {
    // No context parameter here: only the inner test callbacks use one
    describe('ASCII supersets', () => {
      const ascii = Array.from({ length: 128 }, (_, i) => String.fromCharCode(i)).join('')
      for (const encoding of labels) {
        if (invalid.includes(encoding)) continue
        if (encoding === 'iso-2022-jp') continue // not an ASCII superset
        test(encoding, (t) => {
          // Whole ASCII range at once, then each character on its own
          t.assert.strictEqual(f(encoding, ascii, jsuricomponent), encodeURIComponent(ascii))
          t.assert.strictEqual(f(encoding, ascii, jsuri), encodeURI(ascii))
          for (let i = 0; i < 128; i++) {
            const s = String.fromCharCode(i)
            t.assert.strictEqual(f(encoding, s, jsuricomponent), encodeURIComponent(s))
            t.assert.strictEqual(f(encoding, s, jsuri), encodeURI(s))
          }
        })
      }
    })

    test('UTF-8: full Unicode', (t) => {
      // 0x10ffff is the max Unicode codepoint; slow engines only sweep up to 0x1ffff
      const MAX = slowEngine ? 0x1_ff_ff : 0x10_ff_ff
      for (let cp = 0; cp <= MAX; cp++) {
        if (cp >= 0xd8_00 && cp < 0xe0_00) continue // surrogates make encodeURI* throw
        const s = String.fromCodePoint(cp)
        t.assert.strictEqual(f('utf8', s, jsuricomponent), encodeURIComponent(s))
        t.assert.strictEqual(f('utf8', s, jsuri), encodeURI(s))
      }
    })
  })
})
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
[
2+
"Tests for percent-encoding.",
3+
{
4+
"input": "\u2020",
5+
"output": {
6+
"big5": "%26%238224%3B",
7+
"euc-kr": "%A2%D3",
8+
"utf-8": "%E2%80%A0",
9+
"windows-1252": "%86"
10+
}
11+
},
12+
"This uses a trailing A to prevent the URL parser from trimming the C0 control.",
13+
{
14+
"input": "\u000EA",
15+
"output": {
16+
"big5": "%0EA",
17+
"iso-2022-jp": "%26%2365533%3BA",
18+
"utf-8": "%0EA"
19+
}
20+
},
21+
{
22+
"input": "\u203E\u005C",
23+
"output": {
24+
"iso-2022-jp": "%1B(J~%1B(B\\",
25+
"utf-8": "%E2%80%BE\\"
26+
}
27+
},
28+
{
29+
"input": "\uE5E5",
30+
"output": {
31+
"gb18030": "%26%2358853%3B",
32+
"utf-8": "%EE%97%A5"
33+
}
34+
},
35+
{
36+
"input": "\u2212",
37+
"output": {
38+
"shift_jis": "%81|",
39+
"utf-8": "%E2%88%92"
40+
}
41+
},
42+
{
43+
"input": "á|",
44+
"output": {
45+
"utf-8": "%C3%A1|"
46+
}
47+
}
48+
]

tests/wpt/mulibyte-encoder.test.js

Lines changed: 24 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,14 @@
11
import { createMultibyteEncoder } from '@exodus/bytes/multi-byte.js'
2-
import { multibyteEncoder } from '../../fallback/multi-byte.js'
2+
import { percentEncodeAfterEncoding } from '@exodus/bytes/whatwg.js'
3+
import '@exodus/bytes/encoding.js'
34
import { encodeLatin1 } from '../../fallback/latin1.js'
45
import { describe, test } from 'node:test'
6+
import { readFileSync } from 'node:fs'
7+
import { join } from 'node:path'
58

6-
const { unescape } = globalThis
7-
8-
// query percent-encode set
9-
const querySet = (x) => x < 0x21 || x > 0x7e || x === 0x22 || x === 0x23 || x === 0x3c || x === 0x3e
10-
const esc1 = (x) => '%' + x.toString(16).padStart(2, '0').toUpperCase()
11-
const escArr = (u) => [...u].map((x) => (querySet(x) ? esc1(x) : String.fromCharCode(x))).join('')
9+
const specialquery = ` "#'<>` // https://url.spec.whatwg.org/#special-query-percent-encode-set
1210

13-
function toUrl(encoding, input) {
14-
let encoded = ''
15-
let last = 0
16-
const escaping = multibyteEncoder(encoding, (cp, u, i) => {
17-
encoded += `${escArr(u.subarray(last, i))}%26%23${cp}%3B` // &#cp;
18-
last = i
19-
return 0 // no bytes emitted
20-
})
21-
22-
const u = escaping(input)
23-
encoded += escArr(u.subarray(last))
24-
return encoded
25-
}
11+
const { unescape } = globalThis
2612

2713
function testEncoder(encoding, fn) {
2814
describe(encoding, () => {
@@ -38,7 +24,7 @@ function testEncoder(encoding, fn) {
3824
}
3925

4026
// Full check
41-
t.assert.strictEqual(toUrl(encoding, input), escaped)
27+
t.assert.strictEqual(percentEncodeAfterEncoding(encoding, input, specialquery), escaped)
4228
})
4329
})
4430
})
@@ -109,3 +95,20 @@ testEncoder('iso-2022-jp', (encode) => {
10995
encode('\uFF61\uFFFD', '%1B$B!%23%1B(B%26%2365533%3B', 'Katakana U+FFFD')
11096
encode('\u0393\uFFFD', '%1B$B&%23%1B(B%26%2365533%3B', 'jis0208 U+FFFD')
11197
})
98+
// Replays the percent-encoding fixture stored under fixtures/url/resources: every non-comment
// entry lists, per encoding, the expected escaped form of its input.
test('url/resources/percent-encoding.json', (t) => {
  const fixturePath = join(import.meta.dirname, `fixtures/url/resources/percent-encoding.json`)
  const entries = JSON.parse(readFileSync(fixturePath, 'utf8'))

  // Doc: https://github.com/web-platform-tests/wpt/blob/master/url/README.md
  // > _percentEncodeSet_ set to special-query percent-encode set and _spaceAsPlus_ set to false.
  const set = specialquery
  const spaceAsPlus = false
  for (const entry of entries) {
    const { input, output } = entry
    // Plain strings in the fixture are comments and have neither field
    if (!input && !output) continue
    for (const encoding of Object.keys(output)) {
      const escaped = output[encoding]
      t.assert.strictEqual(percentEncodeAfterEncoding(encoding, input, set, spaceAsPlus), escaped)
    }
  }
})

0 commit comments

Comments
 (0)