|
| 1 | +import { createMultibyteEncoder } from '@exodus/bytes/multi-byte.js' |
| 2 | +import { multibyteEncoder } from '../../fallback/multi-byte.js' |
| 3 | +import { encodeLatin1 } from '../../fallback/latin1.js' |
| 4 | +import { describe, test } from 'node:test' |
| 5 | + |
// `unescape` is a legacy global; read it off globalThis so that a runtime without it
// yields `undefined` here (checked before use below) rather than a ReferenceError.
const { unescape } = globalThis

// Query percent-encode set: true for byte values that get percent-encoded.
// That is everything below 0x21, everything above 0x7E, and the ASCII characters " # < >.
const querySet = (x) => x < 0x21 || x > 0x7e || x === 0x22 || x === 0x23 || x === 0x3c || x === 0x3e

// Percent-encodes one byte value as `%XX`, always two uppercase hex digits.
const esc1 = (x) => '%' + x.toString(16).padStart(2, '0').toUpperCase()

// Serializes an iterable of byte values to a string: bytes in the query set become `%XX`,
// all others are emitted as the single character with that code.
// The spread into a plain array matters when `u` is a typed array (callers pass
// `subarray()` results): a typed array's own `.map()` would produce another typed array, not strings.
const escArr = (u) => [...u].map((x) => (querySet(x) ? esc1(x) : String.fromCharCode(x))).join('')
| 12 | + |
/**
 * Encodes `input` in the given legacy `encoding` and serializes the bytes with the
 * query percent-encode set, writing each code point reported to the handler as a
 * percent-encoded numeric character reference (`%26%23<cp>%3B`, i.e. `&#cp;`).
 *
 * NOTE(review): assumes `multibyteEncoder` calls the handler once per code point it
 * cannot encode, passing (code point, output byte buffer, current write offset), and
 * that the handler's return value is the number of bytes it wrote — confirm against
 * ../../fallback/multi-byte.js.
 *
 * @param {string} encoding - encoding label passed to `multibyteEncoder`
 * @param {string} input - text to encode
 * @returns {string} percent-encoded serialization of the encoder output
 */
function toUrl(encoding, input) {
  let encoded = ''
  let last = 0 // offset of the first output byte not yet flushed into `encoded`
  const escaping = multibyteEncoder(encoding, (cp, out, pos) => {
    // Flush the bytes produced since the previous flush, then append the reference
    encoded += `${escArr(out.subarray(last, pos))}%26%23${cp}%3B` // &#cp;
    last = pos
    return 0 // no bytes emitted
  })

  const bytes = escaping(input)
  // Flush whatever was produced after the last handler call (or everything, if none)
  encoded += escArr(bytes.subarray(last))
  return encoded
}
| 26 | + |
/**
 * Declares a `describe` suite for `encoding` and hands `fn` a case-registration
 * callback `(input, escaped, desc)`. Each registered case becomes one `test` that:
 *
 *  1. Coherence: checks the encoder from `createMultibyteEncoder(encoding)` against the
 *     expectation — it must throw when `escaped` contains a percent-encoded `&#`
 *     (`%26%23`, the marker `toUrl` writes for code points reported to its handler);
 *     otherwise it must not throw and, when the `unescape` global exists, its bytes must
 *     equal the Latin-1 encoding of the unescaped expectation.
 *  2. Full check: `toUrl(encoding, input)` must equal `escaped` exactly.
 *
 * @param {string} encoding - encoding label under test
 * @param {(encode: (input: string, escaped: string, desc: string) => void) => void} fn -
 *   receives the case-registration callback
 */
function testEncoder(encoding, fn) {
  describe(encoding, () => {
    const fatal = createMultibyteEncoder(encoding)
    fn((input, escaped, desc) => {
      test(desc, (t) => {
        // Coherence
        if (escaped.includes('%26%23')) {
          t.assert.throws(() => fatal(input))
        } else {
          const bytes = fatal(input) // does not throw
          if (unescape) t.assert.deepStrictEqual(bytes, encodeLatin1(unescape(escaped)))
        }

        // Full check
        t.assert.strictEqual(toUrl(encoding, input), escaped)
      })
    })
  })
}
| 46 | + |
// Big5 encoder cases. Each `encode(input, escaped, desc)` call registers one test:
// `input` is the text to encode, `escaped` the expected percent-encoded output,
// `desc` the test name.
testEncoder('big5', (encode) => {
  // From https://github.com/web-platform-tests/wpt/blob/master/encoding/big5-encoder.html

  encode('ab', 'ab', 'very basic')
  // edge cases
  encode('\u9EA6', '%26%2340614%3B', 'Highest-pointer BMP character excluded from encoder')
  encode('\uD858\uDE6B', '%26%23156267%3B', 'Highest-pointer character excluded from encoder')
  encode('\u3000', '%A1@', 'Lowest-pointer character included in encoder')
  encode(
    '\u20AC',
    '%A3%E1',
    'Euro; the highest-pointer character before a range of 30 unmapped pointers'
  )
  encode('\u4E00', '%A4@', 'The lowest-pointer character after the range of 30 unmapped pointers')
  encode(
    '\uD85D\uDE07',
    '%C8%A4',
    'The highest-pointer character before a range of 41 unmapped pointers'
  )
  encode('\uFFE2', '%C8%CD', 'The lowest-pointer character after the range of 41 unmapped pointers')
  encode('\u79D4', '%FE%FE', 'The last character in the index')
  // not in index
  encode('\u2603', '%26%239731%3B', 'The canonical BMP test character that is not in the index')
  encode(
    '\uD83D\uDCA9',
    '%26%23128169%3B',
    'The canonical astral test character that is not in the index'
  )
  // duplicate low bits
  encode(
    '\uD840\uDFB5',
    '%FDj',
    'A Plane 2 character whose low 16 bits match a BMP character that has a lower pointer'
  )
  // prefer last
  encode(
    '\u2550',
    '%F9%F9',
    'A duplicate-mapped code point that prefers the highest pointer in the encoder'
  )
})
| 88 | + |
// ISO-2022-JP encoder cases. Each `encode(input, escaped, desc)` call registers one test:
// `input` is the text to encode, `escaped` the expected percent-encoded output,
// `desc` the test name.
testEncoder('iso-2022-jp', (encode) => {
  // From https://github.com/web-platform-tests/wpt/blob/master/encoding/iso-2022-jp-encoder.html
  encode('\x0E\x0F\x1Bx', '%26%2365533%3B%26%2365533%3B%26%2365533%3Bx', 'SO/SI ESC')
  encode(
    '\u203E\x0E\x0F\x1Bx',
    '%1B(J~%26%2365533%3B%26%2365533%3B%26%2365533%3Bx%1B(B',
    'Roman SO/SI ESC'
  )
  encode(
    '\uFF61\x0E\x0F\x1Bx',
    '%1B$B!%23%1B(B%26%2365533%3B%26%2365533%3B%26%2365533%3Bx',
    'Katakana SO/SI ESC'
  )
  encode(
    '\u0393\x0E\x0F\x1Bx',
    '%1B$B&%23%1B(B%26%2365533%3B%26%2365533%3B%26%2365533%3Bx',
    'jis0208 SO/SI ESC'
  )
  encode('\uFFFD', '%26%2365533%3B', 'U+FFFD')
  encode('\u203E\uFFFD', '%1B(J~%26%2365533%3B%1B(B', 'Roman U+FFFD')
  encode('\uFF61\uFFFD', '%1B$B!%23%1B(B%26%2365533%3B', 'Katakana U+FFFD')
  encode('\u0393\uFFFD', '%1B$B&%23%1B(B%26%2365533%3B', 'jis0208 U+FFFD')
})
0 commit comments