Skip to content

Commit d13f1f7

Browse files
committed
fix edge case in jaro simd implementation
1 parent a2cc696 commit d13f1f7

12 files changed

Lines changed: 162 additions & 81 deletions

bench/bench-jarowinkler.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ std::basic_string<T> str_multiply(std::basic_string<T> a, unsigned int b)
2828
return output;
2929
}
3030

31-
3231
static void BM_JaroLongSimilarSequence(benchmark::State& state)
3332
{
3433
size_t len = state.range(0);

extras/rapidfuzz_amalgamated.hpp

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
22
// SPDX-License-Identifier: MIT
33
// RapidFuzz v1.0.2
4-
// Generated: 2023-10-08 18:06:53.104764
4+
// Generated: 2023-10-08 19:49:45.137761
55
// ----------------------------------------------------------
66
// This file is an amalgamation of multiple different files.
77
// You probably shouldn't edit it directly.
@@ -5438,13 +5438,10 @@ static inline int64_t jaro_bounds(int64_t P_len, int64_t T_len)
54385438
/* since jaro uses a sliding window some parts of T/P might never be in
54395439
* range an can be removed ahead of time
54405440
*/
5441-
int64_t Bound = 0;
5442-
if (T_len > P_len) {
5443-
Bound = T_len / 2 - 1;
5444-
}
5445-
else {
5446-
Bound = P_len / 2 - 1;
5447-
}
5441+
int64_t Bound = (T_len > P_len) ? T_len : P_len;
5442+
Bound /= 2;
5443+
if (Bound > 0) Bound--;
5444+
54485445
return Bound;
54495446
}
54505447

@@ -5457,6 +5454,10 @@ int64_t jaro_bounds(Range<InputIt1>& P, Range<InputIt2>& T)
54575454
int64_t P_len = P.size();
54585455
int64_t T_len = T.size();
54595456

5457+
// this is currently an early exit condition
5458+
// if this is changed handle this below, so Bound is never below 0
5459+
assert(P_len != 0 || T_len != 0);
5460+
54605461
/* since jaro uses a sliding window some parts of T/P might never be in
54615462
* range an can be removed ahead of time
54625463
*/

fuzzing/fuzz_jaro_similarity.cpp

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,51 @@ bool is_close(double a, double b, double epsilon)
1313
return fabs(a - b) <= ((fabs(a) < fabs(b) ? fabs(b) : fabs(a)) * epsilon);
1414
}
1515

16+
template <size_t MaxLen>
17+
void validate_simd(const std::basic_string<uint8_t>& s1, const std::basic_string<uint8_t>& s2)
18+
{
19+
#ifdef RAPIDFUZZ_SIMD
20+
size_t count = s1.size() / MaxLen + ((s1.size() % MaxLen) != 0);
21+
if (count == 0) return;
22+
23+
rapidfuzz::experimental::MultiJaro<MaxLen> scorer(count);
24+
25+
std::vector<std::basic_string<uint8_t>> strings;
26+
27+
for (auto it1 = s1.begin(); it1 != s1.end(); it1 += MaxLen) {
28+
if (std::distance(it1, s1.end()) < static_cast<ptrdiff_t>(MaxLen)) {
29+
strings.emplace_back(it1, s1.end());
30+
break;
31+
}
32+
else {
33+
strings.emplace_back(it1, it1 + MaxLen);
34+
}
35+
}
36+
37+
for (const auto& s : strings)
38+
scorer.insert(s);
39+
40+
std::vector<double> simd_results(scorer.result_count());
41+
scorer.similarity(&simd_results[0], simd_results.size(), s2);
42+
43+
for (size_t i = 0; i < strings.size(); ++i) {
44+
double reference_sim = rapidfuzz_reference::jaro_similarity(strings[i], s2);
45+
46+
if (!is_close(simd_results[i], reference_sim, 0.0001)) {
47+
print_seq("s1", s1);
48+
print_seq("s2", s2);
49+
throw std::logic_error(std::string("jaro similarity using simd failed (reference_score = ") +
50+
std::to_string(reference_sim) + std::string(", score = ") +
51+
std::to_string(simd_results[i]) + ")");
52+
}
53+
}
54+
55+
#else
56+
(void)s1;
57+
(void)s2;
58+
#endif
59+
}
60+
1661
void validate_distance(const std::basic_string<uint8_t>& s1, const std::basic_string<uint8_t>& s2)
1762
{
1863
double reference_sim = rapidfuzz_reference::jaro_similarity(s1, s2);
@@ -25,6 +70,11 @@ void validate_distance(const std::basic_string<uint8_t>& s1, const std::basic_st
2570
std::to_string(reference_sim) + std::string(", score = ") +
2671
std::to_string(sim) + ")");
2772
}
73+
74+
validate_simd<8>(s1, s2);
75+
validate_simd<16>(s1, s2);
76+
validate_simd<32>(s1, s2);
77+
validate_simd<64>(s1, s2);
2878
}
2979

3080
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)

fuzzing/fuzz_levenshtein_editops.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
#include <stdexcept>
88
#include <string>
99

10-
void validate_editops(const std::basic_string<uint8_t>& s1, const std::basic_string<uint8_t>& s2, int64_t score, int64_t score_hint = std::numeric_limits<int64_t>::max())
10+
void validate_editops(const std::basic_string<uint8_t>& s1, const std::basic_string<uint8_t>& s2,
11+
int64_t score, int64_t score_hint = std::numeric_limits<int64_t>::max())
1112
{
1213
rapidfuzz::Editops ops = rapidfuzz::levenshtein_editops(s1, s2, score_hint);
1314
if (static_cast<int64_t>(ops.size()) == score && s2 != rapidfuzz::editops_apply<uint8_t>(ops, s1, s2))

rapidfuzz/distance/Jaro_impl.hpp

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -321,13 +321,11 @@ static inline int64_t jaro_bounds(int64_t P_len, int64_t T_len)
321321
/* since jaro uses a sliding window some parts of T/P might never be in
322322
* range an can be removed ahead of time
323323
*/
324-
int64_t Bound = 0;
325-
if (T_len > P_len) {
326-
Bound = T_len / 2 - 1;
327-
}
328-
else {
329-
Bound = P_len / 2 - 1;
330-
}
324+
int64_t Bound = (T_len > P_len) ? T_len : P_len;
325+
Bound /= 2;
326+
if(Bound > 0)
327+
Bound--;
328+
331329
return Bound;
332330
}
333331

@@ -340,6 +338,10 @@ int64_t jaro_bounds(Range<InputIt1>& P, Range<InputIt2>& T)
340338
int64_t P_len = P.size();
341339
int64_t T_len = T.size();
342340

341+
// this is currently an early exit condition
342+
// if this is changed handle this below, so Bound is never below 0
343+
assert(P_len != 0 || T_len != 0);
344+
343345
/* since jaro uses a sliding window some parts of T/P might never be in
344346
* range an can be removed ahead of time
345347
*/

test/common.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,13 @@ class BidirectionalIterWrapper {
5757
private:
5858
T iter;
5959
};
60+
61+
template <typename T>
62+
std::basic_string<T> str_multiply(std::basic_string<T> a, size_t b)
63+
{
64+
std::basic_string<T> output;
65+
while (b--)
66+
output += a;
67+
68+
return output;
69+
}

test/distance/tests-DamerauLevenshtein.cpp

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,6 @@
88

99
#include "../common.hpp"
1010

11-
template <typename T>
12-
std::basic_string<T> str_multiply(std::basic_string<T> a, unsigned int b)
13-
{
14-
std::basic_string<T> output;
15-
while (b--)
16-
output += a;
17-
18-
return output;
19-
}
20-
2111
template <typename Sentence1, typename Sentence2>
2212
int64_t damerau_levenshtein_distance(const Sentence1& s1, const Sentence2& s2,
2313
int64_t max = std::numeric_limits<int64_t>::max())

test/distance/tests-Jaro.cpp

Lines changed: 41 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
#include <catch2/catch_test_macros.hpp>
44
#include <rapidfuzz/distance/Jaro.hpp>
55

6+
#include "../common.hpp"
7+
68
using Catch::Approx;
79

810
template <typename Sentence1, typename Sentence2>
@@ -80,10 +82,25 @@ double jaro_distance(const Sentence1& s1, const Sentence2& s2, double score_cuto
8082
return res1;
8183
}
8284

83-
/**
84-
* @name JaroWinklerFlagCharsTest
85-
*/
86-
TEST_CASE("JaroWinklerTest")
85+
template <typename Sentence1, typename Sentence2>
86+
double jaro_sim_test(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
87+
{
88+
INFO("name1: " << s1 << ", name2: " << s2 << ", score_cutoff: " << score_cutoff);
89+
double Sim_original = rapidfuzz_reference::jaro_similarity(s1, s2, score_cutoff);
90+
double Sim_bitparallel = jaro_similarity(s1, s2, score_cutoff);
91+
double Dist_bitparallel = jaro_distance(s1, s2, 1.0 - score_cutoff);
92+
double Sim_bitparallel2 = jaro_similarity(s2, s1, score_cutoff);
93+
double Dist_bitparallel2 = jaro_distance(s2, s1, 1.0 - score_cutoff);
94+
95+
96+
REQUIRE(Sim_original == Approx(Sim_bitparallel));
97+
REQUIRE((1.0 - Sim_original) == Approx(Dist_bitparallel));
98+
REQUIRE(Sim_original == Approx(Sim_bitparallel2));
99+
REQUIRE((1.0 - Sim_original) == Approx(Dist_bitparallel2));
100+
return Sim_original;
101+
}
102+
103+
TEST_CASE("JaroTest")
87104
{
88105
std::array<std::string, 20> names = {"james", "robert", "john", "michael", "william",
89106
"david", "joseph", "thomas", "charles", "mary",
@@ -96,14 +113,26 @@ TEST_CASE("JaroWinklerTest")
96113

97114
for (double score_cutoff : score_cutoffs)
98115
for (const auto& name1 : names)
99-
for (const auto& name2 : names) {
100-
INFO("name1: " << name1 << ", name2: " << name2 << ", score_cutoff: " << score_cutoff);
101-
double Sim_original = rapidfuzz_reference::jaro_similarity(name1, name2, score_cutoff);
102-
double Sim_bitparallel = jaro_similarity(name1, name2, score_cutoff);
103-
double Dist_bitparallel = jaro_distance(name1, name2, 1.0 - score_cutoff);
116+
for (const auto& name2 : names)
117+
jaro_sim_test(name1, name2, score_cutoff);
118+
}
119+
120+
SECTION("testEdgeCaseLengths")
121+
{
122+
REQUIRE(jaro_sim_test(std::string(""), std::string("")) == Approx(1));
123+
REQUIRE(jaro_sim_test(std::string("0"), std::string("0")) == Approx(1));
124+
REQUIRE(jaro_sim_test(std::string("00"), std::string("00")) == Approx(1));
125+
REQUIRE(jaro_sim_test(std::string("0"), std::string("00")) == Approx(0.833333));
126+
127+
REQUIRE(jaro_sim_test(str_multiply(std::string("0"), 65), str_multiply(std::string("0"), 65)) == Approx(1));
128+
REQUIRE(jaro_sim_test(str_multiply(std::string("0"), 64), str_multiply(std::string("0"), 65)) == Approx(0.994872));
129+
REQUIRE(jaro_sim_test(str_multiply(std::string("0"), 63), str_multiply(std::string("0"), 65)) == Approx(0.989744));
104130

105-
REQUIRE(Sim_original == Approx(Sim_bitparallel));
106-
REQUIRE((1.0 - Sim_original) == Approx(Dist_bitparallel));
107-
}
131+
REQUIRE(jaro_sim_test(std::string("10000000000000000000000000000000000000000000000000000000000000020"), std::string("00000000000000000000000000000000000000000000000000000000000000000")) == Approx(0.979487));
132+
REQUIRE(jaro_sim_test(std::string("00000000000000100000000000000000000000010000000000000000000000000"), std::string("0000000000000000000000000000000000000000000000000000000000000000000000000000001")) == Approx(0.922233));
133+
REQUIRE(jaro_sim_test(
134+
std::string("00000000000000000000000000000000000000000000000000000000000000000"),
135+
std::string("01000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000")
136+
) == Approx(0.8359375));
108137
}
109138
}

test/distance/tests-JaroWinkler.cpp

Lines changed: 41 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
#include <catch2/catch_test_macros.hpp>
44
#include <rapidfuzz/distance/JaroWinkler.hpp>
55

6+
#include "../common.hpp"
7+
68
using Catch::Approx;
79

810
template <typename Sentence1, typename Sentence2>
@@ -86,9 +88,24 @@ double jaro_winkler_distance(const Sentence1& s1, const Sentence2& s2, double pr
8688
return res1;
8789
}
8890

89-
/**
90-
* @name JaroWinklerFlagCharsTest
91-
*/
91+
template <typename Sentence1, typename Sentence2>
92+
double jaro_winkler_sim_test(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
93+
{
94+
INFO("name1: " << s1 << ", name2: " << s2 << ", score_cutoff: " << score_cutoff);
95+
double Sim_original = rapidfuzz_reference::jaro_winkler_similarity(s1, s2, 0.1, score_cutoff);
96+
double Sim_bitparallel = jaro_winkler_similarity(s1, s2, 0.1, score_cutoff);
97+
double Dist_bitparallel = jaro_winkler_distance(s1, s2, 0.1, 1.0 - score_cutoff);
98+
double Sim_bitparallel2 = jaro_winkler_similarity(s2, s1, 0.1, score_cutoff);
99+
double Dist_bitparallel2 = jaro_winkler_distance(s2, s1, 0.1, 1.0 - score_cutoff);
100+
101+
102+
REQUIRE(Sim_original == Approx(Sim_bitparallel));
103+
REQUIRE((1.0 - Sim_original) == Approx(Dist_bitparallel));
104+
REQUIRE(Sim_original == Approx(Sim_bitparallel2));
105+
REQUIRE((1.0 - Sim_original) == Approx(Dist_bitparallel2));
106+
return Sim_original;
107+
}
108+
92109
TEST_CASE("JaroWinklerTest")
93110
{
94111
std::array<std::string, 22> names = {"james", "robert", "john", "michael", "william",
@@ -103,14 +120,26 @@ TEST_CASE("JaroWinklerTest")
103120

104121
for (double score_cutoff : score_cutoffs)
105122
for (const auto& name1 : names)
106-
for (const auto& name2 : names) {
107-
INFO("name1: " << name1 << ", name2: " << name2 << ", score_cutoff: " << score_cutoff);
108-
double Sim_original =
109-
rapidfuzz_reference::jaro_winkler_similarity(name1, name2, 0.1, score_cutoff);
110-
double Sim_bitparallel = jaro_winkler_similarity(name1, name2, 0.1, score_cutoff);
111-
double Dist_bitparallel = jaro_winkler_distance(name1, name2, 0.1, 1.0 - score_cutoff);
112-
REQUIRE(Sim_original == Approx(Sim_bitparallel));
113-
REQUIRE((1.0 - Sim_original) == Approx(Dist_bitparallel));
114-
}
123+
for (const auto& name2 : names)
124+
jaro_winkler_sim_test(name1, name2, score_cutoff);
125+
}
126+
127+
SECTION("testEdgeCaseLengths")
128+
{
129+
REQUIRE(jaro_winkler_sim_test(std::string(""), std::string("")) == Approx(1));
130+
REQUIRE(jaro_winkler_sim_test(std::string("0"), std::string("0")) == Approx(1));
131+
REQUIRE(jaro_winkler_sim_test(std::string("00"), std::string("00")) == Approx(1));
132+
REQUIRE(jaro_winkler_sim_test(std::string("0"), std::string("00")) == Approx(0.85));
133+
134+
REQUIRE(jaro_winkler_sim_test(str_multiply(std::string("0"), 65), str_multiply(std::string("0"), 65)) == Approx(1));
135+
REQUIRE(jaro_winkler_sim_test(str_multiply(std::string("0"), 64), str_multiply(std::string("0"), 65)) == Approx(0.996923));
136+
REQUIRE(jaro_winkler_sim_test(str_multiply(std::string("0"), 63), str_multiply(std::string("0"), 65)) == Approx(0.993846));
137+
138+
REQUIRE(jaro_winkler_sim_test(std::string("10000000000000000000000000000000000000000000000000000000000000020"), std::string("00000000000000000000000000000000000000000000000000000000000000000")) == Approx(0.979487));
139+
REQUIRE(jaro_winkler_sim_test(std::string("00000000000000100000000000000000000000010000000000000000000000000"), std::string("0000000000000000000000000000000000000000000000000000000000000000000000000000001")) == Approx(0.95334));
140+
REQUIRE(jaro_winkler_sim_test(
141+
std::string("00000000000000000000000000000000000000000000000000000000000000000"),
142+
std::string("01000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000")
143+
) == Approx(0.852344));
115144
}
116145
}

test/distance/tests-LCSseq.cpp

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,6 @@
77

88
#include "../common.hpp"
99

10-
template <typename T>
11-
std::basic_string<T> str_multiply(std::basic_string<T> a, unsigned int b)
12-
{
13-
std::basic_string<T> output;
14-
while (b--)
15-
output += a;
16-
17-
return output;
18-
}
19-
2010
template <typename Sentence1, typename Sentence2>
2111
int64_t lcs_seq_distance(const Sentence1& s1, const Sentence2& s2,
2212
int64_t max = std::numeric_limits<int64_t>::max())

0 commit comments

Comments
 (0)