Skip to content

Commit 109c52d

Browse files
committed
perf: simplify multi-byte logic a bit
1 parent 6bc5e18 commit 109c52d

1 file changed

Lines changed: 44 additions & 58 deletions

File tree

fallback/multi-byte.js

Lines changed: 44 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ export const E_STRICT = 'Input is not well-formed for this encoding'
1313
const REP = 0xff_fd
1414
const mappers = {
1515
// https://encoding.spec.whatwg.org/#euc-kr-decoder
16-
'euc-kr': () => {
16+
'euc-kr': (err) => {
1717
const euc = getTable('euc-kr')
1818
let lead = 0
1919

@@ -24,25 +24,24 @@ const mappers = {
2424
lead = 0
2525
if (cp !== undefined && cp !== REP) return cp
2626
if (b < 128) pushback.push(b)
27-
return -2
27+
return err()
2828
}
2929

3030
if (b < 128) return b
31-
if (b < 0x81 || b === 0xff) return -2
31+
if (b < 0x81 || b === 0xff) return err()
3232
lead = b
33-
return -1
3433
}
3534

3635
const eof = () => {
3736
if (!lead) return null
3837
lead = 0
39-
return -2
38+
return err()
4039
}
4140

4241
return { bytes, eof, pushback }
4342
},
4443
// https://encoding.spec.whatwg.org/#euc-jp-decoder
45-
'euc-jp': () => {
44+
'euc-jp': (err) => {
4645
const jis0208 = getTable('jis0208')
4746
const jis0212 = getTable('jis0212')
4847
let j12 = false
@@ -58,7 +57,7 @@ const mappers = {
5857
if (lead === 0x8f && b >= 0xa1 && b <= 0xfe) {
5958
j12 = true
6059
lead = b
61-
return -1
60+
return
6261
}
6362

6463
if (lead) {
@@ -71,27 +70,26 @@ const mappers = {
7170
j12 = false
7271
if (cp !== undefined && cp !== REP) return cp
7372
if (b < 128) pushback.push(b)
74-
return -2
73+
return err()
7574
}
7675

7776
if (b < 128) return b
78-
if ((b < 0xa1 && b !== 0x8e && b !== 0x8f) || b === 0xff) return -2
77+
if ((b < 0xa1 && b !== 0x8e && b !== 0x8f) || b === 0xff) return err()
7978
lead = b
80-
return -1
8179
}
8280

8381
// eslint-disable-next-line sonarjs/no-identical-functions
8482
const eof = () => {
8583
if (!lead) return null
8684
lead = 0
87-
return -2
85+
return err()
8886
}
8987

9088
return { bytes, eof, pushback }
9189
},
9290
// https://encoding.spec.whatwg.org/#iso-2022-jp-decoder
9391
// Per the letter of the spec, don't shortcut on state changes on EOF. Some code is regrouped, but the logic is preserved
94-
'iso-2022-jp': () => {
92+
'iso-2022-jp': (err) => {
9593
const jis0208 = getTable('jis0208')
9694
const EOF = -1
9795
let dState = 1
@@ -105,7 +103,7 @@ const mappers = {
105103
if (b === EOF) return null
106104
if (b === 0x1b) {
107105
dState = 6 // escape start
108-
return -1
106+
return
109107
}
110108
}
111109

@@ -120,49 +118,46 @@ const mappers = {
120118
}
121119

122120
if (b <= 0x7f && b !== 0x0e && b !== 0x0f) return b
123-
return -2
121+
return err()
124122
case 3:
125123
// Katakana
126124
out = false
127125
if (b >= 0x21 && b <= 0x5f) return 0xff_40 + b
128-
return -2
126+
return err()
129127
case 4:
130128
// Leading byte
131129
out = false
132-
if ((b >= 0x21) & (b <= 0x7e)) {
133-
lead = b
134-
dState = 5
135-
return -1
136-
}
137-
138-
return -2
130+
if (b < 0x21 || b > 0x7e) return err()
131+
lead = b
132+
dState = 5
133+
return
139134
case 5:
140135
// Trailing byte
141136
out = false
142137
if (b === 0x1b) {
143138
dState = 6 // escape start
144-
return -2
139+
return err()
145140
}
146141

147142
dState = 4
148143
if (b >= 0x21 && b <= 0x7e) {
149144
const cp = jis0208[(lead - 0x21) * 94 + b - 0x21]
150-
return cp !== undefined && cp !== REP ? cp : -2
145+
if (cp !== undefined && cp !== REP) return cp
151146
}
152147

153-
return -2
148+
return err()
154149
case 6:
155150
// Escape start
156151
if (b === 0x24 || b === 0x28) {
157152
lead = b
158153
dState = 7
159-
return -1
154+
return
160155
}
161156

162157
out = false
163158
dState = oState
164159
if (b !== EOF) pushback.push(b)
165-
return -2
160+
return err()
166161
case 7: {
167162
// Escape
168163
const l = lead
@@ -185,14 +180,14 @@ const mappers = {
185180
dState = oState = s
186181
const output = out
187182
out = true
188-
return output ? -2 : -1
183+
return output ? err() : undefined
189184
}
190185

191186
out = false
192187
dState = oState
193188
if (b !== EOF) pushback.push(b)
194189
pushback.push(l)
195-
return -2
190+
return err()
196191
}
197192
}
198193
}
@@ -202,7 +197,7 @@ const mappers = {
202197
return { bytes, eof, pushback }
203198
},
204199
// https://encoding.spec.whatwg.org/#shift_jis-decoder
205-
shift_jis: () => {
200+
shift_jis: (err) => {
206201
const jis0208 = getTable('jis0208')
207202
let lead = 0
208203

@@ -219,29 +214,28 @@ const mappers = {
219214
}
220215

221216
if (b < 128) pushback.push(b)
222-
return -2
217+
return err()
223218
}
224219

225220
if (b <= 0x80) return b // 0x80 is allowed
226221
if (b >= 0xa1 && b <= 0xdf) return 0xff_61 - 0xa1 + b
227-
if (b < 0x81 || (b > 0x9f && b < 0xe0) || b > 0xfc) return -2
222+
if (b < 0x81 || (b > 0x9f && b < 0xe0) || b > 0xfc) return err()
228223
lead = b
229-
return -1
230224
}
231225

232226
// eslint-disable-next-line sonarjs/no-identical-functions
233227
const eof = () => {
234228
if (!lead) return null
235229
lead = 0 // this clears state completely on EOF
236-
return -2
230+
return err()
237231
}
238232

239233
return { bytes, eof, pushback }
240234
},
241235
// https://encoding.spec.whatwg.org/#gbk-decoder
242-
gbk: () => mappers.gb18030(), // 10.1.1. GBK’s decoder is gb18030’s decoder
236+
gbk: (err) => mappers.gb18030(err), // 10.1.1. GBK’s decoder is gb18030’s decoder
243237
// https://encoding.spec.whatwg.org/#gb18030-decoder
244-
gb18030: () => {
238+
gb18030: (err) => {
245239
const gb18030 = getTable('gb18030')
246240
const gb18030r = getTable('gb18030-ranges')
247241
let g1 = 0, g2 = 0, g3 = 0 // prettier-ignore
@@ -264,30 +258,30 @@ const mappers = {
264258
if (b < 0x30 || b > 0x39) {
265259
pushback.push(b, g3, g2)
266260
g1 = g2 = g3 = 0
267-
return -2
261+
return err()
268262
}
269263

270264
const cp = index((g1 - 0x81) * 12_600 + (g2 - 0x30) * 1260 + (g3 - 0x81) * 10 + b - 0x30)
271265
g1 = g2 = g3 = 0
272266
if (cp !== undefined) return cp // Can validly return replacement
273-
return -2
267+
return err()
274268
}
275269

276270
if (g2) {
277271
if (b >= 0x81 && b <= 0xfe) {
278272
g3 = b
279-
return -1
273+
return
280274
}
281275

282276
pushback.push(b, g2)
283277
g1 = g2 = 0
284-
return -2
278+
return err()
285279
}
286280

287281
if (g1) {
288282
if (b >= 0x30 && b <= 0x39) {
289283
g2 = b
290-
return -1
284+
return
291285
}
292286

293287
let cp
@@ -298,20 +292,19 @@ const mappers = {
298292
g1 = 0
299293
if (cp !== undefined && cp !== REP) return cp
300294
if (b < 128) pushback.push(b)
301-
return -2
295+
return err()
302296
}
303297

304298
if (b < 128) return b
305299
if (b === 0x80) return 0x20_ac
306-
if (b === 0xff) return -2
300+
if (b === 0xff) return err()
307301
g1 = b
308-
return -1
309302
}
310303

311304
const eof = () => {
312305
if (!g1 && !g2 && !g3) return null
313306
g1 = g2 = g3 = 0
314-
return -2
307+
return err()
315308
}
316309

317310
return { bytes, eof, pushback }
@@ -329,7 +322,7 @@ export function multibyteDecoder(enc, loose = false) {
329322
const asciiSuperset = isAsciiSuperset(enc)
330323
return (arr, stream = false) => {
331324
const onErr = loose
332-
? () => '\uFFFD'
325+
? () => REP
333326
: () => {
334327
mapper.pushback.length = 0 // the queue is cleared on returning an error
335328
// The correct way per spec seems to be not destroying the decoder state in stream mode, even when fatal
@@ -346,19 +339,16 @@ export function multibyteDecoder(enc, loose = false) {
346339
if (res.length === arr.length) return res // ascii
347340
}
348341

349-
if (!mapper) mapper = mappers[enc]()
342+
if (!mapper) mapper = mappers[enc](onErr)
350343
const { bytes, eof, pushback } = mapper
351344
let i = res.length
352345

353346
// First, dump everything until EOF
354347
// Same as the full loop, but without EOF handling
355348
while (i < length || pushback.length > 0) {
356349
const c = bytes(pushback.length > 0 ? pushback.pop() : arr[i++])
357-
if (c >= 0) {
358-
res += String.fromCodePoint(c) // gb18030 returns codepoints above 0xFFFF from ranges
359-
} else if (c === -2) {
360-
res += onErr()
361-
}
350+
if (c === undefined) continue // consuming
351+
res += String.fromCodePoint(c) // gb18030 returns codepoints above 0xFFFF from ranges
362352
}
363353

364354
// Then, dump EOF. This needs the same loop as the characters can be pushed back
@@ -368,12 +358,8 @@ export function multibyteDecoder(enc, loose = false) {
368358
const isEOF = i === length && pushback.length === 0
369359
const c = isEOF ? eof() : bytes(pushback.length > 0 ? pushback.pop() : arr[i++])
370360
if (isEOF && c === null) break // clean exit
371-
if (c === -1) continue // consuming
372-
if (c === -2) {
373-
res += onErr()
374-
} else {
375-
res += String.fromCodePoint(c) // gb18030 returns codepoints above 0xFFFF from ranges
376-
}
361+
if (c === undefined) continue // consuming
362+
res += String.fromCodePoint(c) // gb18030 returns codepoints above 0xFFFF from ranges
377363
}
378364
}
379365

0 commit comments

Comments
 (0)