node/lib/internal/encoding/util.js at b1f8aa331b5e66e049089404a8cbd62ac4f3413e · nodejs/node · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
// From https://npmjs.com/package/@exodus/bytes
// Copyright Exodus Movement. Licensed under MIT License.

'use strict';

const {
  Uint8Array,
} = primordials;

/**
 * Count the bytes immediately before position `len` in `data` that do not yet
 * form a complete code point but could still become one if more bytes follow.
 *
 * For 'utf-8' the tail is only counted when it is a valid prefix of a
 * multi-byte sequence: lead bytes outside 0xc2-0xf4 are rejected and the first
 * continuation byte must fall in the range allowed for that lead.
 * For 'utf-16le' / 'utf-16be' a dangling odd byte counts as 1 and a trailing
 * lead surrogate (0xd800-0xdbff) counts as 2 more.
 * Any other `enc` value falls through and yields `undefined`.
 * @param {Uint8Array} data Uint8Array of potentially UTF-8/UTF-16 bytes
 * @param {number} len Position to look behind from
 * @param {string} enc Encoding to use: utf-8, utf-16le, or utf-16be
 * @returns {number} Number (0-3) of unfinished potentially valid bytes ending at position `len`
 */
function unfinishedBytes(data, len, enc) {
  if (enc === 'utf-8') {
    // Step back over at most two continuation bytes (bit pattern 10xxxxxx).
    let trail = 0;
    for (; trail < 2 && trail < len; trail++) {
      if ((data[len - 1 - trail] & 0xc0) !== 0x80) break;
    }
    // Everything we could look at was a continuation byte: no room for a lead.
    if (trail === len) return 0;

    const leadIndex = len - 1 - trail;
    const lead = data[leadIndex];
    // Only 0xc2-0xf4 can start a multi-byte sequence.
    if (lead < 0xc2 || lead > 0xf4) return 0;
    // A bare lead byte is always exactly one unfinished byte.
    if (trail === 0) return 1;

    // The sequence is already complete: a 2-byte lead followed by a
    // continuation byte, or a lead below 0xf0 followed by two of them.
    const isTwoByteLead = lead < 0xe0;
    const isFullThreeByte = lead < 0xf0 && trail >= 2;
    if (isTwoByteLead || isFullThreeByte) return 0;

    // Some leads narrow the range allowed for the first continuation byte.
    let min = 0x80;
    if (lead === 0xf0) {
      min = 0x90;
    } else if (lead === 0xe0) {
      min = 0xa0;
    }
    let max = 0xbf;
    if (lead === 0xf4) {
      max = 0x8f;
    } else if (lead === 0xed) {
      max = 0x9f;
    }

    const first = data[leadIndex + 1];
    if (first >= min && first <= max) return trail + 1;
    return 0;
  }

  if (enc === 'utf-16le' || enc === 'utf-16be') {
    // A dangling single byte contributes 1 to the result.
    const odd = len % 2;
    if (len < 2) return odd;

    // Read the last complete 16-bit unit in the requested byte order.
    const end = len - odd - 1;
    const littleEndian = enc === 'utf-16le';
    const high = littleEndian ? data[end] : data[end - 1];
    const low = littleEndian ? data[end - 1] : data[end];
    const unit = (high << 8) ^ low;

    // A lead surrogate still waits for its trail surrogate: 2 more bytes.
    const isLeadSurrogate = unit >= 0xd8_00 && unit < 0xdc_00;
    return isLeadSurrogate ? odd + 2 : odd;
  }

  // Unsupported encoding: no value, same as falling out of a switch.
  return undefined;
}

/**
 * Combine the pending prefix `chunk` with the start of `data` and return the
 * resulting prefix.
 *
 * - Empty `data`: `chunk` is returned as is.
 * - `data.length < 3`: all of `data` is consumed and the plain concatenation is
 *   returned; it may still end in an unfinished sequence.
 * - Otherwise: `chunk` followed by the first 0-3 bytes of `data`, cut at the
 *   first offset where the unfinished tail is empty or lies entirely within
 *   the bytes taken from `data`. If no byte of `data` is needed, `chunk`
 *   itself is returned.
 * @param {Uint8Array} data Uint8Array of potentially UTF-8/UTF-16 bytes
 * @param {Uint8Array} chunk Prefix to prepend before `data`
 * @param {string} enc Encoding to use: utf-8, utf-16le, or utf-16be
 * @returns {Uint8Array} If data.length >= 3: an Uint8Array containing `chunk` and a slice of `data`
 *   so that the result has no unfinished codepoints. If data.length < 3: concat(chunk, data).
 */
function mergePrefix(data, chunk, enc) {
  const prefixLen = chunk.length;
  const dataLen = data.length;

  if (dataLen === 0) return chunk;

  if (dataLen < 3) {
    // Too short to be worth probing offsets, and the sequence may not even
    // be finished by these bytes: hand back everything we have.
    const joined = new Uint8Array(dataLen + prefixLen);
    joined.set(chunk);
    joined.set(data, prefixLen);
    return joined;
  }

  // Work on a small scratch copy (prefix + up to 3 more bytes) so the prefix
  // can be decoded on its own without growing the main data array.
  const scratch = new Uint8Array(prefixLen + 3);
  scratch.set(chunk);
  scratch.set(data.subarray(0, 3), prefixLen);

  // Try taking 1, 2, then 3 bytes of data; accept the first offset whose
  // unfinished tail (0-3 bytes) does not reach back into the prefix.
  for (let taken = 1; taken <= 3; taken += 1) {
    const pending = unfinishedBytes(scratch, prefixLen + taken, enc);
    if (!(pending <= taken)) continue;

    // Drop the unfinished tail again; what remains is how much data to keep.
    const keep = taken - pending;
    if (keep > 0) return scratch.subarray(0, prefixLen + keep);
    return chunk;
  }

  // Not expected to be reached while unfinishedBytes() reports 0-3, since
  // the check above always succeeds once three bytes have been taken.
  return null;
}

module.exports = { unfinishedBytes, mergePrefix };