diff --git a/benchmark/fs/readfile-utf8-fastpath.js b/benchmark/fs/readfile-utf8-fastpath.js
new file mode 100644
index 00000000000000..9bf00717c5f0b2
--- /dev/null
+++ b/benchmark/fs/readfile-utf8-fastpath.js
@@ -0,0 +1,73 @@
+'use strict';
+
+const common = require('../common.js');
+const fs = require('fs');
+const path = require('path');
+const tmpdir = require('../../test/common/tmpdir');
+
+const bench = common.createBenchmark(main, {
+  size: [64, 1024, 16384, 262144, 4194304],
+  content: ['ascii', 'latin1', 'utf8_mixed'],
+  source: ['path', 'fd'],
+  n: [3e3],
+});
+
+// Builds a `size`-byte buffer of valid UTF-8 for the requested content kind.
+// Tail bytes too short for a whole unit stay 0x00 (Buffer.alloc zero-fills).
+function buildContent(kind, size) {
+  if (kind === 'ascii') {
+    return Buffer.alloc(size, 0x61); // 'a'
+  }
+  if (kind === 'latin1') {
+    // 'é' in UTF-8 is 0xC3 0xA9 (2 bytes per char)
+    const pair = Buffer.from([0xC3, 0xA9]);
+    const buf = Buffer.alloc(size);
+    for (let i = 0; i + 2 <= size; i += 2) pair.copy(buf, i);
+    return buf;
+  }
+  if (kind === 'utf8_mixed') {
+    // mixed ASCII + 3-byte CJK (U+4E2D 中 = E4 B8 AD)
+    const cjk = Buffer.from([0xE4, 0xB8, 0xAD]);
+    const buf = Buffer.alloc(size);
+    let i = 0;
+    while (i + 4 <= size) {
+      buf[i++] = 0x61;
+      cjk.copy(buf, i);
+      i += 3;
+    }
+    return buf;
+  }
+  throw new Error('unknown content: ' + kind);
+}
+
+// Reads `file` as UTF-8 through a freshly opened descriptor, then closes it.
+// fs.readFileSync(fd) starts at the descriptor's current position, so one
+// descriptor shared by every iteration would be at EOF after the first read
+// and all later iterations would time an empty read. The open/close cost is
+// therefore part of the 'fd' measurement.
+function readViaFd(file) {
+  const fd = fs.openSync(file, 'r');
+  try {
+    return fs.readFileSync(fd, 'utf8');
+  } finally {
+    fs.closeSync(fd);
+  }
+}
+
+// Benchmark entry point: writes the fixture once, then times `n` UTF-8 reads
+// of it, addressed either by path or through a file descriptor.
+function main({ n, size, content, source }) {
+  tmpdir.refresh();
+  const file = path.join(tmpdir.path, `bench-${content}-${size}.bin`);
+  fs.writeFileSync(file, buildContent(content, size));
+
+  const read = source === 'fd' ?
+    () => readViaFd(file) :
+    () => fs.readFileSync(file, 'utf8');
+
+  bench.start();
+  for (let i = 0; i < n; i++) {
+    read();
+  }
+  bench.end(n);
+}
diff --git a/benchmark/util/text-decoder.js b/benchmark/util/text-decoder.js
index 1aa60f2dd0bcd6..ecfba045c52fab 100644
--- a/benchmark/util/text-decoder.js
+++ b/benchmark/util/text-decoder.js
@@ -6,26 +6,42 @@ const bench = common.createBenchmark(main, {
   encoding: 
['utf-8', 'windows-1252', 'iso-8859-3'],
   ignoreBOM: [0, 1],
   fatal: [0, 1],
+  type: ['SharedArrayBuffer', 'ArrayBuffer', 'Buffer'],
+  content: ['ascii', 'one-byte-string', 'two-byte-string'],
   len: [256, 1024 * 16, 1024 * 128],
   n: [1e3],
-  type: ['SharedArrayBuffer', 'ArrayBuffer', 'Buffer'],
 });

-function main({ encoding, len, n, ignoreBOM, type, fatal }) {
+function buildContent(content, len) {
+  let base;
+  switch (content) {
+    case 'ascii': base = 'a'; break;
+    case 'one-byte-string': base = '\xff'; break; // U+00FF, inside Latin-1
+    case 'two-byte-string': base = 'ğ'; break; // U+011F, outside Latin-1
+  }
+  const unitBytes = Buffer.byteLength(base, 'utf8');
+  const copies = Math.max(1, Math.floor(len / unitBytes)); // Never empty.
+  return Buffer.from(base.repeat(copies));
+}
+
+function main({ encoding, len, n, ignoreBOM, type, fatal, content }) {
   const decoder = new TextDecoder(encoding, { ignoreBOM, fatal });
+  const seed = buildContent(content, len);
   let buf;

   switch (type) {
     case 'SharedArrayBuffer': {
-      buf = new SharedArrayBuffer(len);
+      buf = new SharedArrayBuffer(seed.length);
+      new Uint8Array(buf).set(seed); // Copy the seed bytes into the buffer.
       break;
     }
     case 'ArrayBuffer': {
-      buf = new ArrayBuffer(len);
+      buf = new ArrayBuffer(seed.length);
+      new Uint8Array(buf).set(seed); // Copy the seed bytes into the buffer.
       break;
     }
     case 'Buffer': {
-      buf = Buffer.allocUnsafe(len);
+      buf = seed; // The seed is already a Buffer; use it directly.
       break;
     }
   }
diff --git a/src/encoding_binding.cc b/src/encoding_binding.cc
index c569375383e8d9..9c84d24c84576d 100644
--- a/src/encoding_binding.cc
+++ b/src/encoding_binding.cc
@@ -459,14 +459,15 @@ void BindingData::DecodeUTF8(const FunctionCallbackInfo& args) {
       return node::THROW_ERR_ENCODING_INVALID_ENCODED_DATA(
           env->isolate(), "The encoded data was not valid for encoding utf-8");
     }
-
-    // TODO(chalker): save on utf8 validity recheck in StringBytes::Encode()
   }

   if (length == 0) return args.GetReturnValue().SetEmptyString();

   Local ret;
-  if (StringBytes::Encode(env->isolate(), data, length, UTF8).ToLocal(&ret)) {
+  v8::MaybeLocal encoded =
+      has_fatal ? 
StringBytes::EncodeValidUtf8(env->isolate(), data, length)
+                : StringBytes::Encode(env->isolate(), data, length, UTF8);
+  if (encoded.ToLocal(&ret)) {
     args.GetReturnValue().Set(ret);
   }
 }
diff --git a/src/string_bytes.cc b/src/string_bytes.cc
index 865302bfd1b4de..1d4ee3a81803b2 100644
--- a/src/string_bytes.cc
+++ b/src/string_bytes.cc
@@ -671,6 +671,40 @@ MaybeLocal StringBytes::Encode(Isolate* isolate,
   }
 }

+MaybeLocal StringBytes::EncodeValidUtf8(Isolate* isolate,
+                                        const char* buf,
+                                        size_t buflen) {
+  CHECK_BUFLEN_IN_RANGE(buflen);
+  if (!buflen) return String::Empty(isolate);
+  buflen = keep_buflen_in_range(buflen);
+
+  // ASCII fast path: pure ASCII is copied straight into a one-byte string.
+  if (!simdutf::validate_ascii_with_errors(buf, buflen).error) {
+    return ExternOneByteString::NewFromCopy(isolate, buf, buflen);
+  }
+
+  if (buflen >= 32) {
+    size_t u16size = simdutf::utf16_length_from_utf8(buf, buflen);
+    if (u16size > static_cast(v8::String::kMaxLength)) {
+      isolate->ThrowException(ERR_STRING_TOO_LONG(isolate));
+      return MaybeLocal();
+    }
+    return EncodeTwoByteString(
+        isolate, u16size, [buf, buflen, u16size](uint16_t* dst) {
+          size_t written = simdutf::convert_valid_utf8_to_utf16(
+              buf, buflen, reinterpret_cast(dst));
+          CHECK_EQ(written, u16size);  // Must match the precomputed length.
+        });
+  }
+
+  Local str;  // Non-ASCII input shorter than 32 bytes: decoded by V8 itself.
+  if (!String::NewFromUtf8(isolate, buf, v8::NewStringType::kNormal, buflen)
+           .ToLocal(&str)) {
+    isolate->ThrowException(node::ERR_STRING_TOO_LONG(isolate));
+  }
+  return str;  // Still empty if NewFromUtf8() failed above.
+}
+
 MaybeLocal StringBytes::Encode(Isolate* isolate,
                                const uint16_t* buf,
                                size_t buflen) {
diff --git a/src/string_bytes.h b/src/string_bytes.h
index 9949f508f83ffe..71aa9ff1f90a7c 100644
--- a/src/string_bytes.h
+++ b/src/string_bytes.h
@@ -83,6 +83,11 @@ class StringBytes {
                                       size_t buflen,
                                       enum encoding encoding);

+  // Like Encode(..., UTF8) but does not re-validate. Input must be valid UTF-8.
+  static v8::MaybeLocal EncodeValidUtf8(v8::Isolate* isolate,
+                                        const char* buf,
+                                        size_t buflen);
+
   // Warning: This reverses endianness on BE platforms, even though the
   // signature using uint16_t implies that it should not.
   // However, the brokenness is already public API and can't therefore
diff --git a/src/util-inl.h b/src/util-inl.h
index d59e30a635b08b..e357d15a14496d 100644
--- a/src/util-inl.h
+++ b/src/util-inl.h
@@ -341,22 +341,6 @@ v8::Maybe FromV8Array(v8::Local context,
   return js_array->Iterate(context, PushItemToVector, &data);
 }

-v8::MaybeLocal ToV8Value(v8::Local context,
-                         std::string_view str,
-                         v8::Isolate* isolate) {
-  if (isolate == nullptr) isolate = v8::Isolate::GetCurrent();
-  if (str.size() >= static_cast(v8::String::kMaxLength)) [[unlikely]] {
-    // V8 only has a TODO comment about adding an exception when the maximum
-    // string size is exceeded.
-    ThrowErrStringTooLong(isolate);
-    return v8::MaybeLocal();
-  }
-
-  return v8::String::NewFromUtf8(
-             isolate, str.data(), v8::NewStringType::kNormal, str.size())
-      .FromMaybe(v8::Local());
-}
-
 v8::MaybeLocal ToV8Value(v8::Local context,
                          std::u16string_view str,
                          v8::Isolate* isolate) {
diff --git a/src/util.cc b/src/util.cc
index 1ea51cf7012963..317b8db0daac69 100644
--- a/src/util.cc
+++ b/src/util.cc
@@ -812,4 +812,15 @@ v8::Maybe GetValidFileMode(Environment* env,
   return v8::Just(mode);
 }

+v8::MaybeLocal ToV8Value(v8::Local context,
+                         std::string_view str,
+                         v8::Isolate* isolate) {
+  if (isolate == nullptr) isolate = v8::Isolate::GetCurrent();
+  if (str.size() >= static_cast(v8::String::kMaxLength)) [[unlikely]] {
+    ThrowErrStringTooLong(isolate);
+    return v8::MaybeLocal();  // Empty result after ThrowErrStringTooLong().
+  }
+  return StringBytes::Encode(isolate, str.data(), str.size(), UTF8);
+}
+
 }  // namespace node
diff --git a/src/util.h b/src/util.h
index 3dedeca4d227e9..48305bfdc13143 100644
--- a/src/util.h
+++ b/src/util.h
@@ -701,9 +701,9 @@ inline v8::Maybe FromV8Array(v8::Local context,
                              v8::Local js_array,
                              std::vector>* out);

-inline v8::MaybeLocal ToV8Value(v8::Local context,
-                                std::string_view str,
-                                v8::Isolate* isolate = nullptr);
+v8::MaybeLocal ToV8Value(v8::Local context,
+                         std::string_view str,
+                         v8::Isolate* isolate = nullptr);
 inline v8::MaybeLocal ToV8Value(v8::Local context,
                                 std::u16string_view str,
                                 v8::Isolate* isolate = nullptr);
diff --git a/test/parallel/test-fs-readfile-utf8-fast-path.js b/test/parallel/test-fs-readfile-utf8-fast-path.js
new file mode 100644
index 00000000000000..18d0d884dfa455
--- /dev/null
+++ b/test/parallel/test-fs-readfile-utf8-fast-path.js
@@ -0,0 +1,103 @@
+'use strict';
+
+require('../common');
+const fs = require('node:fs');
+const path = require('node:path');
+const assert = require('node:assert');
+const { describe, it } = require('node:test');
+const tmpdir = require('../common/tmpdir');
+
+tmpdir.refresh();
+
+function writeFile(name, buf) {
+  const p = path.join(tmpdir.path, name);
+  fs.writeFileSync(p, buf);
+  return p; // Path of the file just written.
+}
+
+function expectMatches(filePath, rawBuf) {
+  assert.strictEqual(
+    fs.readFileSync(filePath, 'utf8'),
+    rawBuf.toString('utf8'), // Reference decoding of the same bytes.
+  );
+}
+
+describe('fs.readFileSync utf8 simdutf dispatch', () => {
+  it('empty file', () => {
+    const p = writeFile('empty.txt', Buffer.alloc(0));
+    assert.strictEqual(fs.readFileSync(p, 'utf8'), '');
+  });
+
+  it('ascii small', () => {
+    const buf = Buffer.from('hello');
+    expectMatches(writeFile('tiny-ascii.txt', buf), buf);
+  });
+
+  it('ascii 20KB', () => {
+    const buf = Buffer.alloc(20 * 1024, 0x41); // 'A'
+    expectMatches(writeFile('medium-ascii.txt', buf), buf);
+  });
+
+  it('ascii 1MB', () => {
+    const buf = Buffer.alloc(1024 * 1024, 0x61); // 'a'
+    expectMatches(writeFile('large-ascii.txt', buf), buf);
+  });
+
+  it('fd input', () => {
+    const buf = Buffer.alloc(50 * 1024, 0x62); // 'b'
+    const p = writeFile('fd-ascii.txt', buf);
+    const fd = fs.openSync(p, 'r');
+    try {
+      assert.strictEqual(fs.readFileSync(fd, 'utf8'), buf.toString('utf8'));
+    } finally {
+      fs.closeSync(fd); // The test opened the descriptor, so it releases it.
+    }
+  });
+
+
it('multibyte UTF-8', () => {
+    const buf = Buffer.from('中文测试 — café — 🚀'.repeat(500), 'utf8');
+    expectMatches(writeFile('multibyte.txt', buf), buf);
+  });
+
+  it('latin1-fits utf8', () => {
+    const buf = Buffer.from('naïve café résumé — niño Köln '.repeat(500), 'utf8');
+    expectMatches(writeFile('latin1-fits.txt', buf), buf);
+  });
+
+  it('invalid: lone continuation byte', () => {
+    const buf = Buffer.from([0x68, 0x69, 0x80, 0x21]); // 'h', 'i', stray 0x80, '!'
+    expectMatches(writeFile('invalid-cont.txt', buf), buf);
+  });
+
+  it('invalid: overlong', () => {
+    const buf = Buffer.from([0x41, 0xC0, 0xAF, 0x42]); // C0 AF: overlong form of U+002F
+    expectMatches(writeFile('invalid-overlong.txt', buf), buf);
+  });
+
+  it('invalid: surrogate', () => {
+    const buf = Buffer.from([0x41, 0xED, 0xA0, 0x80, 0x42]); // ED A0 80: encoded U+D800
+    expectMatches(writeFile('invalid-surrogate.txt', buf), buf);
+  });
+
+  it('latin1 boundary U+00FF', () => {
+    const buf = Buffer.from('ÿ'.repeat(2048), 'utf8');
+    expectMatches(writeFile('latin1-boundary.txt', buf), buf);
+  });
+
+  it('above latin1 U+0100', () => {
+    const buf = Buffer.from('ĀāĂ'.repeat(1024), 'utf8');
+    expectMatches(writeFile('above-latin1.txt', buf), buf);
+  });
+
+  it('single codepoint each UTF-8 length', () => {
+    for (const cp of [0x41, 0x00E9, 0x4E2D, 0x1F600]) { // 1-, 2-, 3-, 4-byte forms
+      const buf = Buffer.from(String.fromCodePoint(cp), 'utf8');
+      expectMatches(writeFile(`single-cp-${cp.toString(16)}.txt`, buf), buf);
+    }
+  });
+
+  it('truncated multibyte at EOF', () => {
+    const buf = Buffer.from([0x41, 0xE4, 0xB8]); // 'A' + first two bytes of E4 B8 AD
+    expectMatches(writeFile('truncated-multibyte.txt', buf), buf);
+  });
+});