Skip to content

Commit 163fc13

Browse files
committed
reduce memory allocations
1 parent bed2d6c commit 163fc13

2 files changed

Lines changed: 54 additions & 33 deletions

File tree

extras/rapidfuzz_amalgamated.hpp

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
22
// SPDX-License-Identifier: MIT
33
// RapidFuzz v1.0.2
4-
// Generated: 2023-10-31 11:09:46.332642
4+
// Generated: 2023-10-31 11:32:43.133377
55
// ----------------------------------------------------------
66
// This file is an amalgamation of multiple different files.
77
// You probably shouldn't edit it directly.
@@ -5822,29 +5822,43 @@ jaro_similarity_simd_long_s2(Range<double*> scores, const detail::BlockPatternMa
58225822
assert(block.size() % vecs == 0);
58235823
assert(static_cast<size_t>(s2.size()) > sizeof(VecType) * 8);
58245824

5825+
// Small owning helper for one raw, explicitly aligned heap buffer.
// The constructor requests `size` bytes from the std::align_val_t overload of operator new[]
// with native_simd<VecType>::alignment as the alignment; the destructor returns the same
// pointer to the matching aligned operator delete[]. The storage is raw: nothing is
// constructed in it here.
// NOTE(review): no copy/move members are declared, so an implicitly generated copy would
// duplicate `memory` and release it twice -- instances must not be copied.
struct AlignedAlloc {
5826+
// `size` is the number of bytes to allocate.
AlignedAlloc(size_t size)
5827+
{
5828+
// work around compilation failure in msvc
// NOTE(review): the comment does not say which construct failed under MSVC -- confirm
// against the commit history before changing how the buffer is obtained.
5829+
memory = operator new[](size, std::align_val_t(native_simd<VecType>::alignment));
5830+
}
5831+
5832+
// Frees the buffer with the same alignment value that was used to allocate it.
~AlignedAlloc()
5833+
{
5834+
::operator delete[](memory, std::align_val_t(native_simd<VecType>::alignment));
5835+
}
5836+
5837+
// Start of the allocated buffer; callers cast it to the element type they need.
void* memory = nullptr;
5838+
};
5839+
58255840
native_simd<VecType> zero(VecType(0));
58265841
native_simd<VecType> one(1);
58275842
size_t result_index = 0;
58285843

58295844
size_t s2_block_count = static_cast<size_t>(detail::ceil_div(s2.size(), sizeof(VecType) * 8));
5830-
std::vector<native_simd<VecType>> T_flag;
5831-
T_flag.resize(s2_block_count);
5832-
5833-
std::vector<native_simd<VecType>> counter;
5834-
counter.resize(s2_block_count);
5845+
AlignedAlloc memory(2 * s2_block_count * sizeof(native_simd<VecType>));
58355846

5836-
std::vector<std::array<VecType, vec_width>> T_flags;
5837-
T_flags.resize(s2_block_count);
5847+
native_simd<VecType>* T_flag = static_cast<native_simd<VecType>*>(memory.memory);
5848+
// reuse the same memory since counter is only required in the first half of the algorithm while
5849+
// T_flags is required in the second half
5850+
native_simd<VecType>* counter = static_cast<native_simd<VecType>*>(memory.memory) + s2_block_count;
5851+
VecType* T_flags = static_cast<VecType*>(memory.memory) + s2_block_count * vec_width;
58385852

58395853
for (size_t cur_vec = 0; cur_vec < block.size(); cur_vec += vecs) {
58405854
auto s2_cur = s2;
58415855
auto bounds = jaro_similarity_prepare_bound_long_s2(s1_lengths + result_index, s2_cur);
58425856

58435857
native_simd<VecType> P_flag(VecType(0));
58445858

5845-
std::fill(T_flag.begin(), T_flag.begin() + detail::ceil_div(s2_cur.size(), sizeof(VecType) * 8),
5859+
std::fill(T_flag, T_flag + detail::ceil_div(s2_cur.size(), sizeof(VecType) * 8),
58465860
native_simd<VecType>(VecType(0)));
5847-
std::fill(counter.begin(), counter.begin() + detail::ceil_div(s2_cur.size(), sizeof(VecType) * 8),
5861+
std::fill(counter, counter + detail::ceil_div(s2_cur.size(), sizeof(VecType) * 8),
58485862
native_simd<VecType>(VecType(1)));
58495863

58505864
// In case s2 is longer than all of the elements in s1_lengths boundMaskSize
@@ -5886,11 +5900,7 @@ jaro_similarity_simd_long_s2(Range<double*> scores, const detail::BlockPatternMa
58865900
P_flag.store(P_flags.data());
58875901

58885902
for (size_t i = 0; i < static_cast<size_t>(detail::ceil_div(s2_cur.size(), sizeof(VecType) * 8)); ++i)
5889-
{
5890-
alignas(alignment) std::array<VecType, vec_width> T_flags_;
5891-
T_flag[i].store(T_flags_.data());
5892-
T_flags[i] = T_flags_;
5893-
}
5903+
T_flag[i].store(T_flags + i * vec_width);
58945904

58955905
for (size_t i = 0; i < vec_width; ++i) {
58965906
VecType CommonChars = counts[i];
@@ -5911,11 +5921,11 @@ jaro_similarity_simd_long_s2(Range<double*> scores, const detail::BlockPatternMa
59115921

59125922
{
59135923
size_t T_word_index = 0;
5914-
VecType T_flag_cur = T_flags[T_word_index][i];
5924+
VecType T_flag_cur = T_flags[T_word_index * vec_width + i];
59155925
while (P_flag_cur) {
59165926
while (!T_flag_cur) {
59175927
++T_word_index;
5918-
T_flag_cur = T_flags[T_word_index][i];
5928+
T_flag_cur = T_flags[T_word_index * vec_width + i];
59195929
}
59205930

59215931
VecType PatternFlagMask = blsi(P_flag_cur);

rapidfuzz/distance/Jaro_impl.hpp

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -585,28 +585,43 @@ static inline void jaro_similarity_simd_long_s2(Range<double*> scores, const det
585585
assert(block.size() % vecs == 0);
586586
assert(static_cast<size_t>(s2.size()) > sizeof(VecType)*8);
587587

588+
// Small owning helper for one raw, explicitly aligned heap buffer.
// The constructor requests `size` bytes from the std::align_val_t overload of operator new[]
// with native_simd<VecType>::alignment as the alignment; the destructor returns the same
// pointer to the matching aligned operator delete[]. The storage is raw: nothing is
// constructed in it here.
// NOTE(review): no copy/move members are declared, so an implicitly generated copy would
// duplicate `memory` and release it twice -- instances must not be copied.
struct AlignedAlloc
589+
{
590+
// `size` is the number of bytes to allocate.
AlignedAlloc(size_t size)
591+
{
592+
// work around compilation failure in msvc
// NOTE(review): the comment does not say which construct failed under MSVC -- confirm
// against the commit history before changing how the buffer is obtained.
593+
memory = operator new[](size, std::align_val_t(native_simd<VecType>::alignment));
594+
}
595+
596+
// Frees the buffer with the same alignment value that was used to allocate it.
~AlignedAlloc()
597+
{
598+
::operator delete[] (memory, std::align_val_t(native_simd<VecType>::alignment));
599+
}
600+
601+
// Start of the allocated buffer; callers cast it to the element type they need.
void* memory = nullptr;
602+
};
603+
588604
native_simd<VecType> zero(VecType(0));
589605
native_simd<VecType> one(1);
590606
size_t result_index = 0;
591607

592608
size_t s2_block_count = static_cast<size_t>(detail::ceil_div(s2.size(), sizeof(VecType)*8));
593-
std::vector<native_simd<VecType>> T_flag;
594-
T_flag.resize(s2_block_count);
595-
596-
std::vector<native_simd<VecType>> counter;
597-
counter.resize(s2_block_count);
609+
AlignedAlloc memory(2 * s2_block_count * sizeof(native_simd<VecType>));
598610

599-
std::vector<std::array<VecType, vec_width>> T_flags;
600-
T_flags.resize(s2_block_count);
611+
native_simd<VecType>* T_flag = static_cast<native_simd<VecType>*>(memory.memory);
612+
// reuse the same memory since counter is only required in the first half of the algorithm while
613+
// T_flags is required in the second half
614+
native_simd<VecType>* counter = static_cast<native_simd<VecType>*>(memory.memory) + s2_block_count;
615+
VecType* T_flags = static_cast<VecType*>(memory.memory) + s2_block_count * vec_width;
601616

602617
for (size_t cur_vec = 0; cur_vec < block.size(); cur_vec += vecs) {
603618
auto s2_cur = s2;
604619
auto bounds = jaro_similarity_prepare_bound_long_s2(s1_lengths + result_index, s2_cur);
605620

606621
native_simd<VecType> P_flag(VecType(0));
607622

608-
std::fill(T_flag.begin(), T_flag.begin() + detail::ceil_div(s2_cur.size(), sizeof(VecType)*8), native_simd<VecType>(VecType(0)));
609-
std::fill(counter.begin(), counter.begin() + detail::ceil_div(s2_cur.size(), sizeof(VecType)*8), native_simd<VecType>(VecType(1)));
623+
std::fill(T_flag, T_flag + detail::ceil_div(s2_cur.size(), sizeof(VecType)*8), native_simd<VecType>(VecType(0)));
624+
std::fill(counter, counter + detail::ceil_div(s2_cur.size(), sizeof(VecType)*8), native_simd<VecType>(VecType(1)));
610625

611626
// In case s2 is longer than all of the elements in s1_lengths boundMaskSize
612627
// might have all bits set and therefore the condition ((boundMask <= boundMaskSize) & one)
@@ -649,11 +664,7 @@ static inline void jaro_similarity_simd_long_s2(Range<double*> scores, const det
649664
P_flag.store(P_flags.data());
650665

651666
for(size_t i = 0; i < static_cast<size_t>(detail::ceil_div(s2_cur.size(), sizeof(VecType)*8)); ++i)
652-
{
653-
alignas(alignment) std::array<VecType, vec_width> T_flags_;
654-
T_flag[i].store(T_flags_.data());
655-
T_flags[i] = T_flags_;
656-
}
667+
T_flag[i].store(T_flags + i * vec_width);
657668

658669
for (size_t i = 0; i < vec_width; ++i) {
659670
VecType CommonChars = counts[i];
@@ -672,13 +683,13 @@ static inline void jaro_similarity_simd_long_s2(Range<double*> scores, const det
672683

673684
{
674685
size_t T_word_index = 0;
675-
VecType T_flag_cur = T_flags[T_word_index][i];
686+
VecType T_flag_cur = T_flags[T_word_index * vec_width + i];
676687
while(P_flag_cur)
677688
{
678689
while(!T_flag_cur)
679690
{
680691
++T_word_index;
681-
T_flag_cur = T_flags[T_word_index][i];
692+
T_flag_cur = T_flags[T_word_index * vec_width + i];
682693
}
683694

684695
VecType PatternFlagMask = blsi(P_flag_cur);

0 commit comments

Comments
 (0)