Skip to content

Commit 9df2491

Browse files
committed
refactor: add fast unified decoder for big5
1 parent c790a71 commit 9df2491

File tree

1 file changed

+46
-58
lines changed

1 file changed

+46
-58
lines changed

fallback/multi-byte.js

Lines changed: 46 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -331,12 +331,57 @@ const mappers = {
331331

332332
return { bytes, eof, pushback }
333333
},
334+
// https://encoding.spec.whatwg.org/#big5
335+
big5: (err) => {
336+
const big5 = getTable('big5')
337+
let lead = 0
338+
339+
const decodeLead = (b) => {
340+
let cp
341+
if ((b >= 0x40 && b <= 0x7e) || (b >= 0xa1 && b !== 0xff)) {
342+
cp = big5[(lead - 0x81) * 157 + b - (b < 0x7f ? 0x40 : 0x62)]
343+
}
344+
345+
lead = 0
346+
if (cp) return cp // strings
347+
return b < 128 ? String.fromCharCode(err(), b) : String.fromCharCode(err())
348+
}
349+
350+
// The only decoder which returns multiple codepoints per byte, also has non-charcode codepoints
351+
// We store that as strings
352+
// eslint-disable-next-line sonarjs/no-identical-functions
353+
const fast = (arr, start, end, stream) => {
354+
let res = ''
355+
let i = start
356+
357+
if (lead && i < end) res += decodeLead(arr[i++])
358+
while (i < end) {
359+
const b = arr[i++]
360+
if (b < 128) {
361+
res += String.fromCharCode(b)
362+
} else if (b < 0x81 || b === 0xff) {
363+
res += String.fromCharCode(err())
364+
} else {
365+
lead = b
366+
if (i < end) res += decodeLead(arr[i++])
367+
}
368+
}
369+
370+
if (lead && !stream) {
371+
lead = 0
372+
res += String.fromCharCode(err())
373+
}
374+
375+
return res
376+
}
377+
378+
return { fast, isAscii: () => lead === 0 }
379+
},
334380
}
335381

336382
/**
 * Tell whether an encoding is an ASCII superset and can therefore use the fast path.
 * @param {string} enc - encoding name
 * @returns {boolean} false only for 'iso-2022-jp'
 */
export const isAsciiSuperset = (enc) => {
  // all others are ASCII supersets and can use fast path
  return !(enc === 'iso-2022-jp')
}
337383

338384
export function multibyteDecoder(enc, loose = false) {
339-
if (enc === 'big5') return big5decoder(loose)
340385
if (!Object.hasOwn(mappers, enc)) throw new RangeError('Unsupported encoding')
341386

342387
// Input is assumed to be typechecked already
@@ -394,60 +439,3 @@ export function multibyteDecoder(enc, loose = false) {
394439
return res
395440
}
396441
}
397-
398-
// The only decoder that can return multiple code points for one byte sequence; it also has code points that don't fit in a single char code
399-
// We store that as strings
400-
/**
 * Build a stateful Big5 decoder.
 * Input is assumed to be typechecked already.
 * @param {boolean} loose - when truthy, errors produce '\uFFFD'; otherwise a TypeError is thrown
 * @returns {(arr: ArrayLike<number>, stream?: boolean) => string} decode function;
 *   with `stream` set, a trailing lead byte is kept for the next call
 */
function big5decoder(loose) {
  let lead = 0 // lead byte waiting for its trail byte, 0 when none
  let table // loaded on first non-ASCII input

  return (arr, stream = false) => {
    const fail = () => {
      if (loose) return '\uFFFD'
      // Lead is always already cleared before throwing
      throw new TypeError(E_STRICT)
    }

    let out = ''
    const total = arr.length

    if (!lead) {
      // No pending sequence: decode the leading ASCII run in one go
      // (asciiPrefix presumably returns the length of that run - defined elsewhere)
      out = decodeLatin1(arr, 0, asciiPrefix(arr))
      if (out.length === arr.length) return out // ascii
    }

    table ||= getTable('big5')

    for (let pos = out.length; pos < total; pos++) {
      const byte = arr[pos]

      if (!lead) {
        if (byte < 128) out += String.fromCharCode(byte)
        else if (byte < 0x81 || byte === 0xff) out += fail()
        else lead = byte
        continue
      }

      // A lead byte is pending: this byte is its trail candidate
      const isTrail = (byte >= 0x40 && byte <= 0x7e) || (byte >= 0xa1 && byte !== 0xff)
      const mapped = isTrail
        ? table[(lead - 0x81) * 157 + byte - (byte < 0x7f ? 0x40 : 0x62)]
        : undefined

      lead = 0
      if (mapped) {
        out += mapped // strings
        continue
      }

      out += fail()
      // same as pushing it back: lead is cleared, pushed back can't contain more than 1 byte
      if (byte < 128) out += String.fromCharCode(byte)
    }

    if (lead && !stream) {
      // Destroy decoder state
      lead = 0
      out += fail()
    }

    return out
  }
}

0 commit comments

Comments
 (0)