Skip to content

Commit 5d085d7

Browse files
committed
feat: add iso-2022-jp fatal encoder
1 parent 1219f1f commit 5d085d7

File tree

2 files changed

+89
-20
lines changed

2 files changed

+89
-20
lines changed

fallback/multi-byte.js

Lines changed: 87 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -688,6 +688,7 @@ const preencoders = {
688688
const t = p % 188
689689
return ((l + (l < 0x1f ? 0x81 : 0xc1)) << 8) | ((t < 0x3f ? 0x40 : 0x41) + t)
690690
},
691+
'iso-2022-jp': (p) => ((((p / 94) | 0) + 0x21) << 8) | ((p % 94) + 0x21),
691692
'euc-jp': (p) => ((((p / 94) | 0) + 0xa1) << 8) | ((p % 94) + 0xa1),
692693
'euc-kr': (p) => ((((p / 190) | 0) + 0x81) << 8) | ((p % 190) + 0x41),
693694
gb18030: (p) => ((((p / 190) | 0) + 0x81) << 8) | ((p % 190 < 0x3f ? 0x40 : 0x41) + (p % 190)),
@@ -697,11 +698,13 @@ preencoders.gbk = preencoders.gb18030
697698

698699
// We accept that encoders use non-trivial amount of mem, for perf
699700
// most are 128 KiB mem, big5 is 380 KiB, lazy-loaded at first use
700-
function getMap(id, size) {
701+
function getMap(id, size, ascii) {
701702
const cached = maps.get(id)
702703
if (cached) return cached
703704
let tname = id
704705
const sjis = id === 'shift_jis'
706+
const iso2022jp = id === 'iso-2022-jp'
707+
if (iso2022jp) tname = 'jis0208'
705708
if (id === 'gbk') tname = 'gb18030'
706709
if (id === 'euc-jp' || sjis) tname = 'jis0208'
707710
const table = getTable(tname)
@@ -738,7 +741,7 @@ function getMap(id, size) {
738741
}
739742
}
740743

741-
for (let i = 0; i < 0x80; i++) map[i] = i
744+
if (ascii) for (let i = 0; i < 0x80; i++) map[i] = i
742745
if (sjis || id === 'euc-jp') {
743746
if (sjis) map[0x80] = 0x80
744747
const d = sjis ? 0xfe_c0 : 0x70_c0
@@ -757,32 +760,38 @@ function getMap(id, size) {
757760
return map
758761
}
759762

760-
const encoders = new Set(['big5', 'euc-kr', 'euc-jp', 'shift_jis', 'gbk', 'gb18030'])
761763
const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
762-
let gb18030r
764+
let gb18030r, katakana
763765

764766
export function multibyteEncoder(enc, onError) {
765-
if (!encoders.has(enc)) throw new RangeError('Unsupported encoding')
767+
if (!Object.hasOwn(mappers, enc)) throw new RangeError('Unsupported encoding')
766768
const size = enc === 'big5' ? 0x2_f8_a7 : 0x1_00_00 // for big5, max codepoint in table + 1
767-
const width = enc === 'gb18030' ? 4 : 2
768-
const map = getMap(enc, size)
769-
if (enc === 'gb18030' && !gb18030r) gb18030r = getTable('gb18030-ranges')
770-
769+
const iso2022jp = enc === 'iso-2022-jp'
770+
const gb18030 = enc === 'gb18030'
771+
const ascii = isAsciiSuperset(enc)
772+
const width = iso2022jp ? 5 : gb18030 ? 4 : 2
773+
const tailsize = iso2022jp ? 3 : 0
774+
const map = getMap(enc, size, ascii)
775+
if (gb18030 && !gb18030r) gb18030r = getTable('gb18030-ranges')
776+
if (iso2022jp && !katakana) katakana = getTable('iso-2022-jp-katakana')
771777
return (str) => {
772778
if (typeof str !== 'string') throw new TypeError(E_STRING)
773-
if (!NON_LATIN.test(str)) {
779+
if (ascii && !NON_LATIN.test(str)) {
774780
try {
775781
return encodeAscii(str, E_STRICT)
776782
} catch {}
777783
}
778784

779785
const length = str.length
780-
const u8 = new Uint8Array(length * width)
786+
const u8 = new Uint8Array(length * width + tailsize)
781787
let i = 0
782-
while (i < length) {
783-
const x = str.charCodeAt(i)
784-
if (x >= 128) break
785-
u8[i++] = x
788+
789+
if (ascii) {
790+
while (i < length) {
791+
const x = str.charCodeAt(i)
792+
if (x >= 128) break
793+
u8[i++] = x
794+
}
786795
}
787796

788797
// eslint-disable-next-line unicorn/consistent-function-scoping
@@ -793,7 +802,69 @@ export function multibyteEncoder(enc, onError) {
793802

794803
if (!map || map.length < size) /* c8 ignore next */ throw new Error('Unreachable') // Important for perf
795804

796-
if (enc === 'gb18030') {
805+
if (iso2022jp) {
806+
let state = 0 // 0 = ASCII, 1 = Roman, 2 = jis0208
807+
const restore = () => {
808+
state = 0
809+
u8[i++] = 0x1b
810+
u8[i++] = 0x28
811+
u8[i++] = 0x42
812+
}
813+
814+
for (let j = 0; j < length; j++) {
815+
let x = str.charCodeAt(j)
816+
if (x >= 0xd8_00 && x < 0xe0_00) {
817+
if (state === 2) restore()
818+
if (x >= 0xdc_00 || j + 1 === length) {
819+
i += err(x) // lone
820+
} else {
821+
const x1 = str.charCodeAt(j + 1)
822+
if (x1 < 0xdc_00 || x1 >= 0xe0_00) {
823+
i += err(x) // lone
824+
} else {
825+
j++ // consume x1
826+
i += err(0x1_00_00 + ((x1 & 0x3_ff) | ((x & 0x3_ff) << 10)))
827+
}
828+
}
829+
} else if (x < 0x80) {
830+
if (state === 2 || (state === 1 && (x === 0x5c || x === 0x7e))) restore()
831+
if (x === 0xe || x === 0xf || x === 0x1b) {
832+
i += err(0xff_fd) // 12.2.2. step 3: This returns U+FFFD rather than codePoint to prevent attacks
833+
} else {
834+
u8[i++] = x
835+
}
836+
} else if (x === 0xa5 || x === 0x20_3e) {
837+
if (state !== 1) {
838+
state = 1
839+
u8[i++] = 0x1b
840+
u8[i++] = 0x28
841+
u8[i++] = 0x4a
842+
}
843+
844+
u8[i++] = x === 0xa5 ? 0x5c : 0x7e
845+
} else {
846+
if (x === 0x22_12) x = 0xff_0d
847+
if (x >= 0xff_61 && x <= 0xff_9f) x = katakana[x - 0xff_61]
848+
const e = map[x]
849+
if (e) {
850+
if (state !== 2) {
851+
state = 2
852+
u8[i++] = 0x1b
853+
u8[i++] = 0x24
854+
u8[i++] = 0x42
855+
}
856+
857+
u8[i++] = e >> 8
858+
u8[i++] = e & 0xff
859+
} else {
860+
if (state === 2) restore()
861+
i += err(x)
862+
}
863+
}
864+
}
865+
866+
if (state) restore()
867+
} else if (gb18030) {
797868
// Deduping this branch hurts other encoders perf
798869
const encode = (cp) => {
799870
let a = 0, b = 0 // prettier-ignore

tests/wpt/loader.cjs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -147,8 +147,7 @@ function loadTextDecoderHtml(fullName) {
147147
assert.ok(encoding && encoding.length > 0)
148148
const decoder = new globalThis.TextDecoder(encoding)
149149
const fatal = new globalThis.TextDecoder(encoding, { fatal: true })
150-
const encode =
151-
decoder.encoding === 'iso-2022-jp' ? null : createMultibyteEncoder(decoder.encoding) // TODO: iso-2022-jp
150+
const encode = createMultibyteEncoder(decoder.encoding)
152151

153152
if (fullName.endsWith('_errors.html')) {
154153
const sep0 = '<span>'
@@ -223,8 +222,7 @@ function loadTextDecoderHtml(fullName) {
223222
// This is limited, encoders are asymmetrical
224223
if (
225224
!(decoder.encoding === 'euc-jp' && bytes.length === 3) && // no jis0212 encoding in spec
226-
!(decoder.encoding === 'big5' && bytes[0] > 0x7f && bytes[0] <= 0xa0) && // encoding excludes pointers less than (0xA1 - 0x81) × 157.
227-
decoder.encoding !== 'iso-2022-jp' // Not implemented yet
225+
!(decoder.encoding === 'big5' && bytes[0] > 0x7f && bytes[0] <= 0xa0) // encoding excludes pointers less than (0xA1 - 0x81) × 157.
228226
) {
229227
t.assert.doesNotThrow(
230228
() => t.assert.deepEqual(encode(String.fromCodePoint(cp)), bytes),

0 commit comments

Comments
 (0)