Skip to content

Commit edaaf76

Browse files
committed
chore: getting closer at utf-32 impl
1 parent 6b01b50 commit edaaf76

File tree

2 files changed

+12
-26
lines changed

2 files changed

+12
-26
lines changed

fallback/utf32.js

Lines changed: 9 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ const to32 = (a) => new Uint32Array(a.buffer, a.byteOffset, a.byteLength / 4) //
88

99
/* eslint-disable @exodus/mutable/no-param-reassign-prop-only */
1010

11+
// Surrogates are a UTF-16 construct and cannot be represented in UTF-32; iconv-lite got this wrong
12+
// See https://unicode.org/faq/utf_bom#utf32-7
13+
1114
// Assumes checked length % 4 === 0, otherwise does not swap tail
1215
export function swap32(u8) {
1316
let i = 0
@@ -31,12 +34,8 @@ export function to32input(u8, le) {
3134
return to32(swap32(Uint8Array.from(u8)))
3235
}
3336

34-
export function decode(u32) {
35-
return String.fromCodePoint.apply(String, u32) // TODO: max len
36-
}
37-
3837
// No surrogates (paired or unpaired), no out of range codepoints
39-
export function isStrict(u32) {
38+
export function isWellFormed(u32) {
4039
const length = u32.length
4140
for (let i = 0; i < length; i++) {
4241
const x = u32[i]
@@ -50,27 +49,13 @@ export function toWellFormed(u32) {
5049
const length = u32.length
5150
for (let i = 0; i < length; i++) {
5251
const x = u32[i]
53-
if (x >= 0xd8_00) {
54-
if (x < 0xe0_00) {
55-
// An unexpected trail or a lead at the very end of input
56-
if (x > 0xdb_ff || i + 1 >= length) {
57-
u32[i] = replacementCodepoint
58-
} else {
59-
const next = u32[i + 1] // Process valid pairs immediately
60-
if (next < 0xdc_00 || next >= 0xe0_00) {
61-
u32[i] = replacementCodepoint
62-
} else {
63-
i++ // consume next
64-
}
65-
}
66-
} else if (x >= 0x11_00_00) {
67-
// also fix out-of-range in the same pass, both are unlikely
68-
u32[i] = replacementCodepoint
69-
}
70-
}
52+
if (x >= 0xd8_00 && (x < 0xe0_00 || x >= 0x11_00_00)) u32[i] = replacementCodepoint
7153
}
54+
}
7255

73-
return u32
56+
// Only defined on valid input
57+
export function decode(u32) {
58+
return String.fromCodePoint.apply(String, u32) // TODO: max len
7459
}
7560

7661
// Only defined on valid input

utf32.js

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,10 @@ function decode(input, loose = false, format = 'uint32') {
4646

4747
// TODO: recheck spidermonkey/Firefox/jsc perf
4848

49-
if (!js.isStrict(u32)) {
49+
if (!js.isWellFormed(u32)) {
5050
if (!loose) throw new RangeError(js.E_STRICT)
51-
u32 = js.toWellFormed(new Uint32Array(u32))
51+
if (u32.buffer === input.buffer) u32 = new Uint32Array(u32)
52+
js.toWellFormed(u32)
5253
}
5354

5455
// Significantly faster on Hermes

0 commit comments

Comments
 (0)