Skip to content

Commit a92f8f1

Browse files
committed
src: optimize utf-8 length calculation for small and large strings
1 parent f6464c5 commit a92f8f1

File tree

1 file changed

+45
-40
lines changed

1 file changed

+45
-40
lines changed

src/node_buffer.cc

Lines changed: 45 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -761,9 +761,35 @@ void StringWrite(const FunctionCallbackInfo<Value>& args) {
761761
void SlowByteLengthUtf8(const FunctionCallbackInfo<Value>& args) {
762762
CHECK(args[0]->IsString());
763763

764-
// Fast case: avoid StringBytes on UTF8 string. Jump to v8.
765-
size_t result = args[0].As<String>()->Utf8LengthV2(args.GetIsolate());
766-
args.GetReturnValue().Set(static_cast<uint64_t>(result));
764+
Isolate* isolate = args.GetIsolate();
765+
Local<String> source = args[0].As<String>();
766+
767+
// For small strings, use V8's own Utf8LengthV2 path for better performance.
768+
static constexpr int kSmallStringThreshold = 128;
769+
if (source->Length() <= kSmallStringThreshold) {
770+
size_t result = source->Utf8LengthV2(isolate);
771+
args.GetReturnValue().Set(static_cast<uint64_t>(result));
772+
return;
773+
}
774+
775+
String::ValueView view(isolate, source);
776+
size_t length = view.length();
777+
size_t utf8_length;
778+
779+
if (view.is_one_byte()) {
780+
auto data = reinterpret_cast<const char*>(view.data8());
781+
simdutf::result result = simdutf::validate_ascii_with_errors(data, length);
782+
if (result.error == simdutf::SUCCESS) {
783+
utf8_length = length; // Pure ASCII, length stays the same
784+
} else {
785+
utf8_length = simdutf::utf8_length_from_latin1(data, length);
786+
}
787+
} else {
788+
auto data = reinterpret_cast<const char16_t*>(view.data16());
789+
utf8_length = simdutf::utf8_length_from_utf16(data, length);
790+
}
791+
792+
args.GetReturnValue().Set(static_cast<uint64_t>(utf8_length));
767793
}
768794

769795
uint32_t FastByteLengthUtf8(
@@ -776,49 +802,28 @@ uint32_t FastByteLengthUtf8(
776802
CHECK(sourceValue->IsString());
777803
Local<String> sourceStr = sourceValue.As<String>();
778804

779-
if (!sourceStr->IsExternalOneByte()) {
805+
// For short inputs, use V8's path: the function call overhead to simdutf is not worth it.
806+
static constexpr int kSmallStringThreshold = 128;
807+
if (sourceStr->Length() <= kSmallStringThreshold) {
780808
return sourceStr->Utf8LengthV2(isolate);
781809
}
782-
auto source = sourceStr->GetExternalOneByteStringResource();
783-
// For short inputs, the function call overhead to simdutf is maybe
784-
// not worth it, reserve simdutf for long strings.
785-
if (source->length() > 128) {
786-
return simdutf::utf8_length_from_latin1(source->data(), source->length());
787-
}
788-
789-
uint32_t length = source->length();
790-
const auto input = reinterpret_cast<const uint8_t*>(source->data());
791-
792-
uint32_t answer = length;
793-
uint32_t i = 0;
794810

795-
auto pop = [](uint64_t v) {
796-
return static_cast<size_t>(((v >> 7) & UINT64_C(0x0101010101010101)) *
797-
UINT64_C(0x0101010101010101) >>
798-
56);
799-
};
811+
// For large strings, use simdutf with String::ValueView for direct access
812+
// to the string's character data. This is ~6x faster for large strings.
813+
String::ValueView view(isolate, sourceStr);
814+
size_t length = view.length();
800815

801-
for (; i + 32 <= length; i += 32) {
802-
uint64_t v;
803-
memcpy(&v, input + i, 8);
804-
answer += pop(v);
805-
memcpy(&v, input + i + 8, 8);
806-
answer += pop(v);
807-
memcpy(&v, input + i + 16, 8);
808-
answer += pop(v);
809-
memcpy(&v, input + i + 24, 8);
810-
answer += pop(v);
811-
}
812-
for (; i + 8 <= length; i += 8) {
813-
uint64_t v;
814-
memcpy(&v, input + i, 8);
815-
answer += pop(v);
816-
}
817-
for (; i + 1 <= length; i += 1) {
818-
answer += input[i] >> 7;
816+
if (view.is_one_byte()) {
817+
auto data = reinterpret_cast<const char*>(view.data8());
818+
simdutf::result result = simdutf::validate_ascii_with_errors(data, length);
819+
if (result.error == simdutf::SUCCESS) {
820+
return length; // Pure ASCII, length stays the same
821+
}
822+
return simdutf::utf8_length_from_latin1(data, length);
819823
}
820824

821-
return answer;
825+
auto data = reinterpret_cast<const char16_t*>(view.data16());
826+
return simdutf::utf8_length_from_utf16(data, length);
822827
}
823828

824829
static CFunction fast_byte_length_utf8(CFunction::Make(FastByteLengthUtf8));

0 commit comments

Comments
 (0)