perf: store low charcodes as numbers in big5

ChALkeR · ChALkeR · commit 2bc302c5b3c6 · 2025-12-26T02:53:15.000+04:00
diff --git a/fallback/multi-byte.js b/fallback/multi-byte.js
@@ -15,13 +15,13 @@ function bigDecoder(err, pair) {
   let o16
 
   const decodeLead = (b) => {
-    const str = pair(lead, b)
+    const p = pair(lead, b)
     lead = 0
-    if (typeof str === 'number') {
-      o16[oi++] = str
-    } else if (str) {
+    if (typeof p === 'number') {
+      o16[oi++] = p
+    } else if (p) {
       // This is still faster than string concatenation. Can we optimize strings though?
-      for (let i = 0; i < str.length; i++) o16[oi++] = str.charCodeAt(i)
+      for (let i = 0; i < p.length; i++) o16[oi++] = p.charCodeAt(i)
     } else {
       o16[oi++] = err()
       if (b < 128) o16[oi++] = b
diff --git a/fallback/multi-byte.table.js b/fallback/multi-byte.table.js
@@ -56,7 +56,9 @@ function unwrap(res, t, pos, stringMode = false) {
         }
 
         if (stringMode) {
-          for (let k = 0; k < x; k++, pos++, code++) res[pos] = String.fromCodePoint(code)
+          for (let k = 0; k < x; k++, pos++, code++) {
+            res[pos] = code <= 0xff_ff ? code : String.fromCodePoint(code)
+          }
         } else {
           for (let k = 0; k < x; k++, pos++, code++) res[pos] = code
         }
@@ -65,8 +67,13 @@ function unwrap(res, t, pos, stringMode = false) {
       pos = unwrap(res, indices[x], pos, stringMode) // self-reference using shared chunks
     } else if (stringMode) {
       const s = [...utf16toString(loadBase64(x), 'uint8-le')] // splits by codepoints
-      for (let i = 0; i < s.length; ) res[pos++] = s[i++] // TODO: splice?
-      code = s[s.length - 1].codePointAt(0) + 1
+      let char
+      for (let i = 0; i < s.length; ) {
+        char = s[i++]
+        res[pos++] = char.length === 1 ? char.charCodeAt(0) : char // strings only for high codepoints
+      }
+
+      code = char.codePointAt(0) + 1
     } else {
       const u16 = to16input(loadBase64(x), true) // data is little-endian
       res.set(u16, pos)
diff --git a/tests/multi-byte.test.js b/tests/multi-byte.test.js
@@ -62,9 +62,11 @@ describe('multi-byte encodings tables', () => {
           t.assert.strictEqual(typeof table[i], 'string')
           t.assert.strictEqual(table[i].length, 2)
         } else if (row) {
-          const expected = non16bit ? String.fromCodePoint(row.code) : row.code
+          const expected =
+            non16bit && typeof table[i] === 'string' ? String.fromCodePoint(row.code) : row.code
           t.assert.strictEqual(i, row.i)
           t.assert.strictEqual(table[i], expected, `Offset ${i}: ${row.description}`)
+          if (typeof expected === 'number') t.assert.ok(expected > 0 && expected < 0xff_fd)
         } else {
           t.assert.strictEqual(table[i], non16bit ? undefined : 0xff_fd, `Offset ${i}`)
         }