Skip to content

Commit c085e1b

Browse files
committed
test: extra multibyte encoder tests from WPT
1 parent 76ead9c commit c085e1b

File tree

1 file changed

+111
-0
lines changed

1 file changed

+111
-0
lines changed

tests/wpt/mulibyte-encoder.test.js

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
import { createMultibyteEncoder } from '@exodus/bytes/multi-byte.js'
2+
import { multibyteEncoder } from '../../fallback/multi-byte.js'
3+
import { encodeLatin1 } from '../../fallback/latin1.js'
4+
import { describe, test } from 'node:test'
5+
6+
// Capture the legacy global `unescape` when the runtime exposes it; the byte-level
// comparison in testEncoder is skipped when this is undefined.
const { unescape } = globalThis
7+
8+
// query percent-encode set
9+
// True when value `x` has to be percent-escaped: anything below 0x21 or above 0x7e,
// plus 0x22 ("), 0x23 (#), 0x3c (<) and 0x3e (>).
const querySet = (x) => {
  if (x < 0x21 || x > 0x7e) return true
  switch (x) {
    case 0x22:
    case 0x23:
    case 0x3c:
    case 0x3e:
      return true
    default:
      return false
  }
}
10+
// Formats `x` as a percent-escape: '%' followed by its uppercase hex form,
// left-padded with '0' to at least two digits.
const esc1 = (x) => {
  const hex = x.toString(16).toUpperCase()
  return `%${hex.padStart(2, '0')}`
}
11+
// Renders every element of the iterable `u` as text: elements accepted by querySet
// become a percent-escape via esc1, all others become their literal character.
const escArr = (u) => {
  let text = ''
  for (const x of u) {
    text += querySet(x) ? esc1(x) : String.fromCharCode(x)
  }
  return text
}
12+
13+
/**
 * Encodes `input` with the given multi-byte `encoding` and returns the result as a
 * percent-escaped string.
 *
 * The encoder from `multibyteEncoder` is built with a callback. Each time it fires,
 * the bytes produced since the previous call are escaped and followed by
 * `%26%23<cp>%3B` (the percent-escaped form of `&#cp;`).
 * NOTE(review): presumably the callback is invoked for code points the encoding
 * cannot represent — confirm against fallback/multi-byte.js.
 *
 * @param {string} encoding - encoding label handed to `multibyteEncoder`
 * @param {string} input - text to encode
 * @returns {string} escaped form of the encoder output
 */
function toUrl(encoding, input) {
  const parts = []
  let consumed = 0 // offset up to which the output has already been escaped

  const onCodePoint = (cp, bytes, pos) => {
    parts.push(`${escArr(bytes.subarray(consumed, pos))}%26%23${cp}%3B`) // &#cp;
    consumed = pos
    return 0 // no bytes emitted
  }

  const encode = multibyteEncoder(encoding, onCodePoint)
  const output = encode(input)

  // Flush whatever was written after the last callback invocation.
  parts.push(escArr(output.subarray(consumed)))
  return parts.join('')
}
26+
27+
/**
 * Declares a `describe` suite for `encoding` and passes `fn` a function that
 * registers one test per `(input, escaped, desc)` triple.
 *
 * Each registered test does two things:
 *  - Coherence: when `escaped` contains '%26%23' (the escaped `&#` marker that
 *    toUrl writes), the encoder from `createMultibyteEncoder(encoding)` must throw
 *    on `input`. Otherwise it must not throw, and, if the global `unescape` exists,
 *    its output must equal `encodeLatin1(unescape(escaped))`.
 *  - Full check: `toUrl(encoding, input)` must equal `escaped` exactly.
 *
 * @param {string} encoding - encoding label under test
 * @param {(encode: (input: string, escaped: string, desc: string) => void) => void} fn
 *   receives the registration function
 */
function testEncoder(encoding, fn) {
  describe(encoding, () => {
    const fatal = createMultibyteEncoder(encoding)

    const register = (input, escaped, desc) => {
      test(desc, (t) => {
        // Coherence
        const hasReference = escaped.includes('%26%23')
        if (hasReference) {
          t.assert.throws(() => fatal(input))
        } else {
          const bytes = fatal(input) // does not throw
          if (unescape) {
            t.assert.deepStrictEqual(bytes, encodeLatin1(unescape(escaped)))
          }
        }

        // Full check
        t.assert.strictEqual(toUrl(encoding, input), escaped)
      })
    }

    fn(register)
  })
}
46+
47+
// big5 encoder cases, registered in table order as [input, expected escaped output, description].
testEncoder('big5', (encode) => {
  // From https://github.com/web-platform-tests/wpt/blob/master/encoding/big5-encoder.html
  const cases = [
    ['ab', 'ab', 'very basic'],
    // edge cases
    ['\u9EA6', '%26%2340614%3B', 'Highest-pointer BMP character excluded from encoder'],
    ['\uD858\uDE6B', '%26%23156267%3B', 'Highest-pointer character excluded from encoder'],
    ['\u3000', '%A1@', 'Lowest-pointer character included in encoder'],
    [
      '\u20AC',
      '%A3%E1',
      'Euro; the highest-pointer character before a range of 30 unmapped pointers',
    ],
    ['\u4E00', '%A4@', 'The lowest-pointer character after the range of 30 unmapped pointers'],
    [
      '\uD85D\uDE07',
      '%C8%A4',
      'The highest-pointer character before a range of 41 unmapped pointers',
    ],
    ['\uFFE2', '%C8%CD', 'The lowest-pointer character after the range of 41 unmapped pointers'],
    ['\u79D4', '%FE%FE', 'The last character in the index'],
    // not in index
    ['\u2603', '%26%239731%3B', 'The canonical BMP test character that is not in the index'],
    [
      '\uD83D\uDCA9',
      '%26%23128169%3B',
      'The canonical astral test character that is not in the index',
    ],
    // duplicate low bits
    [
      '\uD840\uDFB5',
      '%FDj',
      'A Plane 2 character whose low 16 bits match a BMP character that has a lower pointer',
    ],
    // prefer last
    [
      '\u2550',
      '%F9%F9',
      'A duplicate-mapped code point that prefers the highest pointer in the encoder',
    ],
  ]

  for (const [input, escaped, desc] of cases) {
    encode(input, escaped, desc)
  }
})
88+
89+
// iso-2022-jp encoder cases, registered in table order as [input, expected escaped output, description].
testEncoder('iso-2022-jp', (encode) => {
  // From https://github.com/web-platform-tests/wpt/blob/master/encoding/iso-2022-jp-encoder.html
  const cases = [
    ['\x0E\x0F\x1Bx', '%26%2365533%3B%26%2365533%3B%26%2365533%3Bx', 'SO/SI ESC'],
    [
      '\u203E\x0E\x0F\x1Bx',
      '%1B(J~%26%2365533%3B%26%2365533%3B%26%2365533%3Bx%1B(B',
      'Roman SO/SI ESC',
    ],
    [
      '\uFF61\x0E\x0F\x1Bx',
      '%1B$B!%23%1B(B%26%2365533%3B%26%2365533%3B%26%2365533%3Bx',
      'Katakana SO/SI ESC',
    ],
    [
      '\u0393\x0E\x0F\x1Bx',
      '%1B$B&%23%1B(B%26%2365533%3B%26%2365533%3B%26%2365533%3Bx',
      'jis0208 SO/SI ESC',
    ],
    ['\uFFFD', '%26%2365533%3B', 'U+FFFD'],
    ['\u203E\uFFFD', '%1B(J~%26%2365533%3B%1B(B', 'Roman U+FFFD'],
    ['\uFF61\uFFFD', '%1B$B!%23%1B(B%26%2365533%3B', 'Katakana U+FFFD'],
    ['\u0393\uFFFD', '%1B$B&%23%1B(B%26%2365533%3B', 'jis0208 U+FFFD'],
  ]

  for (const [input, escaped, desc] of cases) {
    encode(input, escaped, desc)
  }
})

0 commit comments

Comments
 (0)