Skip to content

Commit 0a6149e

Browse files
committed
feat: add iso-2022-jp fatal encoder
1 parent 8993271 commit 0a6149e

File tree

2 files changed

+94
-17
lines changed

2 files changed

+94
-17
lines changed

fallback/multi-byte.js

Lines changed: 92 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -688,6 +688,7 @@ const preencoders = {
688688
const t = p % 188
689689
return ((l + (l < 0x1f ? 0x81 : 0xc1)) << 8) | ((t < 0x3f ? 0x40 : 0x41) + t)
690690
},
691+
'iso-2022-jp': (p) => ((((p / 94) | 0) + 0x21) << 8) | ((p % 94) + 0x21),
691692
'euc-jp': (p) => ((((p / 94) | 0) + 0xa1) << 8) | ((p % 94) + 0xa1),
692693
'euc-kr': (p) => ((((p / 190) | 0) + 0x81) << 8) | ((p % 190) + 0x41),
693694
gb18030: (p) => ((((p / 190) | 0) + 0x81) << 8) | ((p % 190 < 0x3f ? 0x40 : 0x41) + (p % 190)),
@@ -702,6 +703,8 @@ function getMap(id, size) {
702703
if (cached) return cached
703704
let tname = id
704705
const sjis = id === 'shift_jis'
706+
const iso2022jp = id === 'iso-2022-jp'
707+
if (iso2022jp) tname = 'jis0208'
705708
if (id === 'gbk') tname = 'gb18030'
706709
if (id === 'euc-jp' || sjis) tname = 'jis0208'
707710
const table = getTable(tname)
@@ -738,7 +741,7 @@ function getMap(id, size) {
738741
}
739742
}
740743

741-
for (let i = 0; i < 0x80; i++) map[i] = i
744+
if (isAsciiSuperset(id)) for (let i = 0; i < 0x80; i++) map[i] = i
742745
if (sjis || id === 'euc-jp') {
743746
if (sjis) map[0x80] = 0x80
744747
const d = sjis ? 0xfe_c0 : 0x70_c0
@@ -757,17 +760,19 @@ function getMap(id, size) {
757760
return map
758761
}
759762

760-
const encoders = new Set(['big5', 'euc-kr', 'euc-jp', 'shift_jis', 'gbk', 'gb18030'])
761763
const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
762-
let gb18030r
764+
let gb18030r, katakana
763765

764766
export function multibyteEncoder(enc, onError) {
765-
if (!encoders.has(enc)) throw new RangeError('Unsupported encoding')
767+
if (!Object.hasOwn(mappers, enc)) throw new RangeError('Unsupported encoding')
766768
const size = enc === 'big5' ? 0x2_f8_a7 : 0x1_00_00 // for big5, max codepoint in table + 1
767-
const width = enc === 'gb18030' ? 4 : 2
769+
const iso2022jp = enc === 'iso-2022-jp'
770+
const gb18030 = enc === 'gb18030'
771+
const width = iso2022jp ? 5 : gb18030 ? 4 : 2
772+
const tailsize = iso2022jp ? 3 : 0
768773
const map = getMap(enc, size)
769-
if (enc === 'gb18030' && !gb18030r) gb18030r = getTable('gb18030-ranges')
770-
774+
if (gb18030 && !gb18030r) gb18030r = getTable('gb18030-ranges')
775+
if (iso2022jp && !katakana) katakana = getTable('iso-2022-jp-katakana')
771776
return (str) => {
772777
if (typeof str !== 'string') throw new TypeError(E_STRING)
773778
if (!NON_LATIN.test(str)) {
@@ -777,12 +782,15 @@ export function multibyteEncoder(enc, onError) {
777782
}
778783

779784
const length = str.length
780-
const u8 = new Uint8Array(length * width)
785+
const u8 = new Uint8Array(length * width + tailsize)
781786
let i = 0
782-
while (i < length) {
783-
const x = str.charCodeAt(i)
784-
if (x >= 128) break
785-
u8[i++] = x
787+
788+
if (isAsciiSuperset(enc)) {
789+
while (i < length) {
790+
const x = str.charCodeAt(i)
791+
if (x >= 128) break
792+
u8[i++] = x
793+
}
786794
}
787795

788796
// eslint-disable-next-line unicorn/consistent-function-scoping
@@ -793,7 +801,78 @@ export function multibyteEncoder(enc, onError) {
793801

794802
if (!map || map.length < size) /* c8 ignore next */ throw new Error('Unreachable') // Important for perf
795803

796-
if (enc === 'gb18030') {
804+
if (iso2022jp) {
805+
let state = 0 // 0 = ASCII, 1 = Roman, 2 = jis0208
806+
for (let j = 0; j < length; j++) {
807+
let x = str.charCodeAt(j)
808+
if (x >= 0xd8_00 && x < 0xe0_00) {
809+
if (x >= 0xdc_00 || j + 1 === length) {
810+
i += err(x) // lone
811+
} else {
812+
const x1 = str.charCodeAt(j + 1)
813+
if (x1 < 0xdc_00 || x1 >= 0xe0_00) {
814+
i += err(x) // lone
815+
} else {
816+
j++ // consume x1
817+
i += err(0x1_00_00 + ((x1 & 0x3_ff) | ((x & 0x3_ff) << 10)))
818+
}
819+
}
820+
} else if (x < 0x80) {
821+
if (state === 2 || (state === 1 && (x === 0x5c || x === 0x7e))) {
822+
state = 0
823+
u8[i++] = 0x1b
824+
u8[i++] = 0x28
825+
u8[i++] = 0x42
826+
}
827+
828+
if (x === 0xe || x === 0xf || x === 0x1b) {
829+
i += err(0xff_fd) // 12.2.2. step 3: This returns U+FFFD rather than codePoint to prevent attacks
830+
} else {
831+
u8[i++] = x
832+
}
833+
} else if (x === 0xa5 || x === 0x20_3e) {
834+
if (state !== 1) {
835+
state = 1
836+
u8[i++] = 0x1b
837+
u8[i++] = 0x28
838+
u8[i++] = 0x4a
839+
}
840+
841+
u8[i++] = x === 0xa5 ? 0x5c : 0x7e
842+
} else {
843+
// Checks above are unaffected by these, all these are above 0x30_00
844+
if (x === 0x22_12) x = 0xff_0d
845+
if (x >= 0xff_61 && x <= 0xff_9f) x = katakana[x - 0xff_61]
846+
const e = map[x]
847+
if (e) {
848+
if (state !== 2) {
849+
state = 2
850+
u8[i++] = 0x1b
851+
u8[i++] = 0x24
852+
u8[i++] = 0x42
853+
}
854+
855+
u8[i++] = e >> 8
856+
u8[i++] = e & 0xff
857+
} else {
858+
if (state === 2) {
859+
state = 0
860+
u8[i++] = 0x1b
861+
u8[i++] = 0x28
862+
u8[i++] = 0x42
863+
}
864+
865+
i += err(x)
866+
}
867+
}
868+
}
869+
870+
if (state) {
871+
u8[i++] = 0x1b
872+
u8[i++] = 0x28
873+
u8[i++] = 0x42
874+
}
875+
} else if (gb18030) {
797876
// Deduping this branch hurts other encoders perf
798877
const encode = (cp) => {
799878
let a = 0, b = 0 // prettier-ignore

tests/wpt/loader.cjs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -147,8 +147,7 @@ function loadTextDecoderHtml(fullName) {
147147
assert.ok(encoding && encoding.length > 0)
148148
const decoder = new globalThis.TextDecoder(encoding)
149149
const fatal = new globalThis.TextDecoder(encoding, { fatal: true })
150-
const encode =
151-
decoder.encoding === 'iso-2022-jp' ? null : createMultibyteEncoder(decoder.encoding) // TODO: iso-2022-jp
150+
const encode = createMultibyteEncoder(decoder.encoding)
152151

153152
if (fullName.endsWith('_errors.html')) {
154153
const sep0 = '<span>'
@@ -223,8 +222,7 @@ function loadTextDecoderHtml(fullName) {
223222
// This is limited, encoders are asymmetrical
224223
if (
225224
!(decoder.encoding === 'euc-jp' && bytes.length === 3) && // no jis0212 encoding in spec
226-
!(decoder.encoding === 'big5' && bytes[0] > 0x7f && bytes[0] <= 0xa0) && // encoding excludes pointers less than (0xA1 - 0x81) × 157.
227-
decoder.encoding !== 'iso-2022-jp' // Not implemented yet
225+
!(decoder.encoding === 'big5' && bytes[0] > 0x7f && bytes[0] <= 0xa0) // encoding excludes pointers less than (0xA1 - 0x81) × 157.
228226
) {
229227
t.assert.doesNotThrow(
230228
() => t.assert.deepEqual(encode(String.fromCodePoint(cp)), bytes),

0 commit comments

Comments
 (0)