feat(perf): Add fast path for decodeLatin1() #1037
Conversation
|
@wh201906 is attempting to deploy a commit to the Margelo Team on Vercel. A member of the Team first needs to authorize it. |
|
Test cases from Node.js v24.15.0.

Try to create 0-length buffers. Should not throw.

Current:

test(
SUITE,
'[Node.js] Try to create 0-length buffers. Should not throw.',
() => {
const encodings = ['ascii', 'latin1', 'binary'] as const;
for (const encoding of encodings) {
const ab = stringToBuffer('', encoding);
expect(ab.byteLength).to.equal(0);
expect(bufferToString(ab, encoding)).to.equal('');
}
},
);

Original Node.js:

// Try to create 0-length buffers. Should not throw.
Buffer.from('');
Buffer.from('', 'ascii');
Buffer.from('', 'latin1');
new Buffer('', 'binary');

Buffer.from('foo', encoding).toString(encoding) returns 'foo'.

Current:

test(
SUITE,
"[Node.js] Buffer.from('foo', encoding).toString(encoding) returns 'foo'.",
() => {
const encodings = ['utf8', 'utf16le', 'ascii', 'latin1', 'binary'] as const;
for (const encoding of encodings) {
const ab = stringToBuffer('foo', encoding);
expect(bufferToString(ab, encoding)).to.equal('foo');
}
},
);

Original Node.js:

// utf8, ucs2, ascii, latin1, utf16le
for (const encoding of [
'utf8',
'utf-8',
'ucs2',
'ucs-2',
'ascii',
'latin1',
'binary',
'utf16le',
'utf-16le',
].flatMap(e => [e, e.toUpperCase()])) {
assert.strictEqual(Buffer.from('foo', encoding).toString(encoding), 'foo');
}

Data "Hello, ÆÊÎÖÿ".

Current:

test(SUITE, '[Node.js] Data "Hello, ÆÊÎÖÿ".', () => {
const str = 'Hello, ÆÊÎÖÿ';
const expected = new Uint8Array([
...Array.from('Hello, ', c => c.charCodeAt(0)),
0xc6,
0xca,
0xce,
0xd6,
0xff,
]);
const ab = stringToBuffer(str, 'latin1');
expect(toU8(ab)).to.deep.equal(expected);
expect(bufferToString(expected.buffer as ArrayBuffer, 'latin1')).to.equal(
str,
);
});

Original Node.js:

// Data "Hello, ÆÊÎÖÿ"
static const char latin1_data[] = "Hello, \xC6\xCA\xCE\xD6\xFF";
static const char utf8_data[] = "Hello, ÆÊÎÖÿ";

Verify that StringBytes::Write converts two-byte characters to one-byte characters, even if there is no valid one-byte representation.

Current:

test(
SUITE,
'[Node.js] Verify that StringBytes::Write converts two-byte characters to one-byte characters, even if there is no valid one-byte representation.',
() => {
const expected = new Uint8Array([
...Array.from('Hello, ', c => c.charCodeAt(0)),
0x16,
0x4c,
]);
const ab = stringToBuffer('Hello, 世界', 'latin1');
expect(toU8(ab)).to.deep.equal(expected);
expect(bufferToString(ab, 'latin1')).to.equal(
String.fromCharCode(...expected),
);
},
);

Original Node.js:

// Verify that StringBytes::Write converts two-byte characters to one-byte
// characters, even if there is no valid one-byte representation.
Local<String> utf8_str =
String::NewFromUtf8(isolate_, "Hello, 世界").ToLocalChecked();
ASSERT_STREQ("Hello, \x16\x4C", buf.out());

Manually controlled string for checking binary output.

Current:

test(
SUITE,
'[Node.js] Manually controlled string for checking binary output',
() => {
const ucs2Control = 'a\u0000';
const writeStr = 'a';
const bytes = toU8(stringToBuffer(writeStr, 'utf16le'));
expect(bytes[0]).to.equal(0x61);
expect(bytes[1]).to.equal(0);
expect(bufferToString(bytes.buffer as ArrayBuffer, 'latin1')).to.equal(
ucs2Control,
);
expect(bufferToString(bytes.buffer as ArrayBuffer, 'binary')).to.equal(
ucs2Control,
);
},
);

Original Node.js:

// Manually controlled string for checking binary output
let ucs2_control = 'a\u0000';
let write_str = 'a';
// first check latin1
let c = b.toString('latin1');
// now check binary
c = b.toString('binary');

Correspondence: Node creates ASCII slice test.

Current:

test(SUITE, '[Node.js] ASCII slice test', () => {
{
const asciiString = 'hello world';
const bytes = new Uint8Array(128);
for (let i = 0; i < asciiString.length; i++) {
bytes[i] = asciiString.charCodeAt(i);
}
const asciiSlice = bufferToString(
bytes.buffer as ArrayBuffer,
'ascii',
0,
asciiString.length,
);
expect(asciiSlice).to.equal(asciiString);
}
{
const asciiString = 'hello world';
const offset = 100;
const bytes = new Uint8Array(128);
bytes.set(toU8(stringToBuffer(asciiString, 'ascii')), offset);
const asciiSlice = bufferToString(
bytes.buffer as ArrayBuffer,
'ascii',
offset,
offset + asciiString.length,
);
expect(asciiSlice).to.equal(asciiString);
}
});

Original Node.js:

// ASCII slice test
{
const asciiString = 'hello world';
for (let i = 0; i < asciiString.length; i++) {
b[i] = asciiString.charCodeAt(i);
}
const asciiSlice = b.toString('ascii', 0, asciiString.length);
assert.strictEqual(asciiString, asciiSlice);
}
{
const asciiString = 'hello world';
const offset = 100;
assert.strictEqual(asciiString.length, b.write(asciiString, offset, 'ascii'));
const asciiSlice = b.toString('ascii', offset, offset + asciiString.length);
assert.strictEqual(asciiString, asciiSlice);
}

Correspondence: the first RNQC block writes |
boorad
left a comment
There was a problem hiding this comment.
Nice perf win — reusing the decodeUtf16Le chunk-callback pattern keeps this very readable, and the Node.js-derived tests ('Hello, 世界' truncation, manually-controlled binary output, ASCII slice with offset) directly exercise the new path. A few small things below — mostly nits, plus one question about why the Hermes-only gate differs from decodeUtf16Le's unconditional use of getStringData.
Verified bun tsc passes on both packages.
- Avoid zero-filling for pure ASCII path
- Allocate less memory than single resize() (might work)
- Unify the private member name to xxx_
- Remove this pointer
- Add comment


This PR adds a fast path for `decodeLatin1()` when Hermes and `getStringData()` are available.