Skip to content

Commit c64807c

Browse files
committed
perf: move iso-2022-jp to fast impls, unify multibyte decoders
1 parent 5c3064c commit c64807c

File tree

1 file changed

+77
-58
lines changed

1 file changed

+77
-58
lines changed

fallback/multi-byte.js

Lines changed: 77 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ const mappers = {
2424
return b < 128 ? String.fromCharCode(err(), b) : String.fromCharCode(err())
2525
}
2626

27-
const fast = (arr, start, end, stream) => {
27+
const decode = (arr, start, end, stream) => {
2828
let res = ''
2929
let i = start
3030

@@ -49,7 +49,7 @@ const mappers = {
4949
return res
5050
}
5151

52-
return { fast, isAscii: () => lead === 0 }
52+
return { decode, isAscii: () => lead === 0 }
5353
},
5454
// https://encoding.spec.whatwg.org/#euc-jp-decoder
5555
'euc-jp': (err) => {
@@ -81,7 +81,7 @@ const mappers = {
8181
return b < 128 ? String.fromCharCode(err(), b) : String.fromCharCode(err())
8282
}
8383

84-
const fast = (arr, start, end, stream) => {
84+
const decode = (arr, start, end, stream) => {
8585
let res = ''
8686
let i = start
8787

@@ -109,26 +109,20 @@ const mappers = {
109109
return res
110110
}
111111

112-
return { fast, isAscii: () => lead === 0 } // j12 can be true only when lead is non-zero
112+
return { decode, isAscii: () => lead === 0 } // j12 can be true only when lead is non-zero
113113
},
114114
// https://encoding.spec.whatwg.org/#iso-2022-jp-decoder
115-
// Per-letter of the spec, don't shortcut on state changes on EOF. Some code is regrouped but preserving the logic
116115
'iso-2022-jp': (err) => {
117116
const jis0208 = getTable('jis0208')
118-
const EOF = -1
119117
let dState = 1
120118
let oState = 1
121-
let lead = 0
119+
let lead = 0 // 0 or 0x21-0x7e
122120
let out = false
123121

124-
const pushback = []
125-
const bytes = (b) => {
126-
if (dState < 5) {
127-
if (b === EOF) return null
128-
if (b === 0x1b) {
129-
dState = 6 // escape start
130-
return
131-
}
122+
const bytes = (pushback, b) => {
123+
if (dState < 5 && b === 0x1b) {
124+
dState = 6 // escape start
125+
return
132126
}
133127

134128
switch (dState) {
@@ -180,7 +174,7 @@ const mappers = {
180174

181175
out = false
182176
dState = oState
183-
if (b !== EOF) pushback.push(b)
177+
pushback.push(b)
184178
return err()
185179
case 7: {
186180
// Escape
@@ -209,16 +203,72 @@ const mappers = {
209203

210204
out = false
211205
dState = oState
212-
if (b !== EOF) pushback.push(b)
213-
pushback.push(l)
206+
pushback.push(b, l)
214207
return err()
215208
}
216209
}
217210
}
218211

219-
const eof = () => bytes(EOF)
212+
const eof = (pushback) => {
213+
if (dState < 5) return null
214+
switch (dState) {
215+
case 5:
216+
out = false
217+
dState = 4
218+
return err()
219+
case 6:
220+
out = false
221+
dState = oState
222+
return err()
223+
case 7: {
224+
out = false
225+
dState = oState
226+
pushback.push(lead) // lead is always ASCII
227+
lead = 0
228+
return err()
229+
}
230+
}
231+
}
232+
233+
const decode = (arr, start, end, stream) => {
234+
let res = ''
235+
let i = start
236+
const pushback = [] // local and auto-cleared
237+
238+
// First, dump everything until EOF
239+
// Same as the full loop, but without EOF handling
240+
while (i < end || pushback.length > 0) {
241+
const c = bytes(pushback, pushback.length > 0 ? pushback.pop() : arr[i++])
242+
if (c !== undefined) res += String.fromCodePoint(c)
243+
}
244+
245+
// Then, dump EOF. This needs the same loop as the characters can be pushed back
246+
if (!stream) {
247+
while (i <= end || pushback.length > 0) {
248+
if (i < end || pushback.length > 0) {
249+
const c = bytes(pushback, pushback.length > 0 ? pushback.pop() : arr[i++])
250+
if (c !== undefined) res += String.fromCodePoint(c)
251+
} else {
252+
const c = eof(pushback)
253+
if (c === null) break // clean exit
254+
if (c !== undefined) res += String.fromCodePoint(c)
255+
}
256+
}
257+
}
220258

221-
return { bytes, eof, pushback }
259+
// Chrome and WebKit fail on this, we don't: completely destroy the old decoder state when finished streaming
260+
// > If this’s do not flush is false, then set this’s decoder to a new instance of this’s encoding’s decoder,
261+
// > Set this’s do not flush to options["stream"]
262+
if (!stream) {
263+
dState = oState = 1
264+
lead = 0
265+
out = false
266+
}
267+
268+
return res
269+
}
270+
271+
return { decode, isAscii: () => false }
222272
},
223273
// https://encoding.spec.whatwg.org/#shift_jis-decoder
224274
shift_jis: (err) => {
@@ -238,7 +288,7 @@ const mappers = {
238288
return b < 128 ? String.fromCharCode(err(), b) : String.fromCharCode(err())
239289
}
240290

241-
const fast = (arr, start, end, stream) => {
291+
const decode = (arr, start, end, stream) => {
242292
let res = ''
243293
let i = start
244294

@@ -265,7 +315,7 @@ const mappers = {
265315
return res
266316
}
267317

268-
return { fast, isAscii: () => lead === 0 }
318+
return { decode, isAscii: () => lead === 0 }
269319
},
270320
// https://encoding.spec.whatwg.org/#gbk-decoder
271321
gbk: (err) => mappers.gb18030(err), // 10.1.1. GBK’s decoder is gb18030’s decoder
@@ -291,7 +341,7 @@ const mappers = {
291341
// g2 is 0 or 0x30-0x39
292342
// g3 is 0 or 0x81-0xfe
293343

294-
const fast = (arr, start, end, stream) => {
344+
const decode = (arr, start, end, stream) => {
295345
let res = ''
296346
let i = start
297347
const pushback = [] // local and auto-cleared
@@ -359,7 +409,7 @@ const mappers = {
359409
return res
360410
}
361411

362-
return { fast, isAscii: () => g1 === 0 } // if g1 = 0 then g2 = g3 = 0
412+
return { decode, isAscii: () => g1 === 0 } // if g1 = 0 then g2 = g3 = 0
363413
},
364414
// https://encoding.spec.whatwg.org/#big5
365415
big5: (err) => {
@@ -380,7 +430,7 @@ const mappers = {
380430
// The only decoder which returns multiple codepoints per byte, also has non-charcode codepoints
381431
// We store that as strings
382432
// eslint-disable-next-line sonarjs/no-identical-functions
383-
const fast = (arr, start, end, stream) => {
433+
const decode = (arr, start, end, stream) => {
384434
let res = ''
385435
let i = start
386436

@@ -405,7 +455,7 @@ const mappers = {
405455
return res
406456
}
407457

408-
return { fast, isAscii: () => lead === 0 }
458+
return { decode, isAscii: () => lead === 0 }
409459
},
410460
}
411461

@@ -421,7 +471,6 @@ export function multibyteDecoder(enc, loose = false) {
421471
const onErr = loose
422472
? () => REP
423473
: () => {
424-
if (mapper.pushback) mapper.pushback.length = 0 // the queue is cleared on returning an error
425474
// The correct way per spec seems to be not destroying the decoder state in stream mode, even when fatal
426475
// Decoders big5, euc-jp, euc-kr, shift_jis, gb18030 / gbk - all clear state before throwing unless EOF, so not affected
427476
// iso-2022-jp is the only tricky one where this !stream check matters in non-stream mode
@@ -431,43 +480,13 @@ export function multibyteDecoder(enc, loose = false) {
431480

432481
return (arr, stream = false) => {
433482
let res = ''
434-
const length = arr.length
435483
if (asciiSuperset && (!mapper || mapper.isAscii?.())) {
436484
res = decodeLatin1(arr, 0, asciiPrefix(arr))
437485
if (res.length === arr.length) return res // ascii
438486
}
439487

440488
streaming = stream // affects onErr
441489
if (!mapper) mapper = mappers[enc](onErr)
442-
if (mapper.fast) return res + mapper.fast(arr, res.length, arr.length, stream) // does not need mapper deletion
443-
const { bytes, eof, pushback } = mapper
444-
let i = res.length
445-
446-
// First, dump everything until EOF
447-
// Same as the full loop, but without EOF handling
448-
while (i < length || pushback.length > 0) {
449-
const c = bytes(pushback.length > 0 ? pushback.pop() : arr[i++])
450-
if (c === undefined) continue // consuming
451-
res += String.fromCodePoint(c) // gb18030 returns codepoints above 0xFFFF from ranges
452-
}
453-
454-
// Then, dump EOF. This needs the same loop as the characters can be pushed back
455-
// TODO: only some encodings need this, most can be optimized
456-
if (!stream) {
457-
while (i <= length || pushback.length > 0) {
458-
const isEOF = i === length && pushback.length === 0
459-
const c = isEOF ? eof() : bytes(pushback.length > 0 ? pushback.pop() : arr[i++])
460-
if (isEOF && c === null) break // clean exit
461-
if (c === undefined) continue // consuming
462-
res += String.fromCodePoint(c) // gb18030 returns codepoints above 0xFFFF from ranges
463-
}
464-
}
465-
466-
// Chrome and WebKit fail on this, we don't: completely destroy the old decoder instance when finished streaming
467-
// > If this’s do not flush is false, then set this’s decoder to a new instance of this’s encoding’s decoder,
468-
// > Set this’s do not flush to options["stream"]
469-
if (!stream) mapper = null
470-
471-
return res
490+
return res + mapper.decode(arr, res.length, arr.length, stream)
472491
}
473492
}

0 commit comments

Comments
 (0)