Skip to content

Commit 7a30d6b

Browse files
committed
fix edge case in simd implementation of Jaro/JaroWinkler
1 parent 74551df commit 7a30d6b

6 files changed

Lines changed: 54 additions & 6 deletions

File tree

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
## Changelog
22

3+
## [2.1.1] - 2023-10-08
4+
### Fixed
5+
- fix edge case in new simd implementation of Jaro and Jaro Winkler
6+
37
## [2.1.0] - 2023-10-08
48
### Changed
59
- add support for bidirectional iterators

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ if (CMAKE_BINARY_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
3232
message(FATAL_ERROR "Building in-source is not supported! Create a build dir and remove ${CMAKE_SOURCE_DIR}/CMakeCache.txt")
3333
endif()
3434

35-
project(rapidfuzz LANGUAGES CXX VERSION 2.1.0)
35+
project(rapidfuzz LANGUAGES CXX VERSION 2.1.1)
3636

3737
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
3838
include(GNUInstallDirs)

extras/rapidfuzz_amalgamated.hpp

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
22
// SPDX-License-Identifier: MIT
33
// RapidFuzz v1.0.2
4-
// Generated: 2023-10-08 19:49:45.137761
4+
// Generated: 2023-10-08 21:27:21.591281
55
// ----------------------------------------------------------
66
// This file is an amalgamation of multiple different files.
77
// You probably shouldn't edit it directly.
@@ -5632,9 +5632,15 @@ void jaro_similarity_simd(Range<double*> scores, const detail::BlockPatternMatch
56325632

56335633
native_simd<VecType> counter(VecType(1));
56345634

5635-
for (const auto& ch : s2_cur) {
5635+
// In case s2 is longer than all of the elements in s1_lengths boundMaskSize
5636+
// might have all bits set and therefor the condition ((boundMask <= boundMaskSize) & one)
5637+
// would incorrectly always set the first bit to 1.
5638+
// this is solved by splitting the loop into two parts where after this boundary is reached
5639+
// the first bit inside boundMask is no longer set
5640+
int64_t j = 0;
5641+
for (; j < maxBound; ++j) {
56365642
alignas(32) std::array<uint64_t, vecs> stored;
5637-
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, ch); });
5643+
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, s2_cur[j]); });
56385644
native_simd<VecType> X(stored.data());
56395645
native_simd<VecType> PM_j = andnot(X & boundMask, P_flag);
56405646

@@ -5645,6 +5651,19 @@ void jaro_similarity_simd(Range<double*> scores, const detail::BlockPatternMatch
56455651
boundMask = (boundMask << 1) | ((boundMask <= boundMaskSize) & one);
56465652
}
56475653

5654+
for (; j < s2_cur.size(); ++j) {
5655+
alignas(32) std::array<uint64_t, vecs> stored;
5656+
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, s2_cur[j]); });
5657+
native_simd<VecType> X(stored.data());
5658+
native_simd<VecType> PM_j = andnot(X & boundMask, P_flag);
5659+
5660+
P_flag |= blsi(PM_j);
5661+
T_flag |= andnot(counter, (PM_j == zero));
5662+
5663+
counter = counter << 1;
5664+
boundMask = boundMask << 1;
5665+
}
5666+
56485667
auto counts = popcount(P_flag);
56495668
alignas(32) std::array<VecType, vec_width> P_flags;
56505669
P_flag.store(P_flags.data());

rapidfuzz/distance/Jaro_impl.hpp

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -516,9 +516,16 @@ void jaro_similarity_simd(Range<double*> scores, const detail::BlockPatternMatch
516516

517517
native_simd<VecType> counter(VecType(1));
518518

519-
for (const auto& ch : s2_cur) {
519+
// In case s2 is longer than all of the elements in s1_lengths boundMaskSize
520+
// might have all bits set and therefor the condition ((boundMask <= boundMaskSize) & one)
521+
// would incorrectly always set the first bit to 1.
522+
// this is solved by splitting the loop into two parts where after this boundary is reached
523+
// the first bit inside boundMask is no longer set
524+
int64_t j = 0;
525+
for(; j < maxBound; ++j)
526+
{
520527
alignas(32) std::array<uint64_t, vecs> stored;
521-
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, ch); });
528+
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, s2_cur[j]); });
522529
native_simd<VecType> X(stored.data());
523530
native_simd<VecType> PM_j = andnot(X & boundMask, P_flag);
524531

@@ -529,6 +536,20 @@ void jaro_similarity_simd(Range<double*> scores, const detail::BlockPatternMatch
529536
boundMask = (boundMask << 1) | ((boundMask <= boundMaskSize) & one);
530537
}
531538

539+
for(; j < s2_cur.size(); ++j)
540+
{
541+
alignas(32) std::array<uint64_t, vecs> stored;
542+
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, s2_cur[j]); });
543+
native_simd<VecType> X(stored.data());
544+
native_simd<VecType> PM_j = andnot(X & boundMask, P_flag);
545+
546+
P_flag |= blsi(PM_j);
547+
T_flag |= andnot(counter, (PM_j == zero));
548+
549+
counter = counter << 1;
550+
boundMask = boundMask << 1;
551+
}
552+
532553
auto counts = popcount(P_flag);
533554
alignas(32) std::array<VecType, vec_width> P_flags;
534555
P_flag.store(P_flags.data());

test/distance/tests-Jaro.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,8 @@ TEST_CASE("JaroTest")
128128
REQUIRE(jaro_sim_test(str_multiply(std::string("0"), 64), str_multiply(std::string("0"), 65)) == Approx(0.994872));
129129
REQUIRE(jaro_sim_test(str_multiply(std::string("0"), 63), str_multiply(std::string("0"), 65)) == Approx(0.989744));
130130

131+
REQUIRE(jaro_sim_test(std::string("01"), std::string("1111100000")) == Approx(0.53333333));
132+
131133
REQUIRE(jaro_sim_test(std::string("10000000000000000000000000000000000000000000000000000000000000020"), std::string("00000000000000000000000000000000000000000000000000000000000000000")) == Approx(0.979487));
132134
REQUIRE(jaro_sim_test(std::string("00000000000000100000000000000000000000010000000000000000000000000"), std::string("0000000000000000000000000000000000000000000000000000000000000000000000000000001")) == Approx(0.922233));
133135
REQUIRE(jaro_sim_test(

test/distance/tests-JaroWinkler.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,8 @@ TEST_CASE("JaroWinklerTest")
135135
REQUIRE(jaro_winkler_sim_test(str_multiply(std::string("0"), 64), str_multiply(std::string("0"), 65)) == Approx(0.996923));
136136
REQUIRE(jaro_winkler_sim_test(str_multiply(std::string("0"), 63), str_multiply(std::string("0"), 65)) == Approx(0.993846));
137137

138+
REQUIRE(jaro_winkler_sim_test(std::string("01"), std::string("1111100000")) == Approx(0.53333333));
139+
138140
REQUIRE(jaro_winkler_sim_test(std::string("10000000000000000000000000000000000000000000000000000000000000020"), std::string("00000000000000000000000000000000000000000000000000000000000000000")) == Approx(0.979487));
139141
REQUIRE(jaro_winkler_sim_test(std::string("00000000000000100000000000000000000000010000000000000000000000000"), std::string("0000000000000000000000000000000000000000000000000000000000000000000000000000001")) == Approx(0.95334));
140142
REQUIRE(jaro_winkler_sim_test(

0 commit comments

Comments
 (0)