Skip to content

Commit 1678beb

Browse files
committed
Fix out-of-bounds reads
1 parent 7a30d6b commit 1678beb

3 files changed

Lines changed: 44 additions & 27 deletions

File tree

extras/rapidfuzz_amalgamated.hpp

Lines changed: 11 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
22
// SPDX-License-Identifier: MIT
33
// RapidFuzz v1.0.2
4-
// Generated: 2023-10-08 21:27:21.591281
4+
// Generated: 2023-10-09 03:12:47.555069
55
// ----------------------------------------------------------
66
// This file is an amalgamation of multiple different files.
77
// You probably shouldn't edit it directly.
@@ -5229,17 +5229,18 @@ FlaggedCharsWord flag_similar_characters_word(const PM_Vec& PM, [[maybe_unused]]
52295229
uint64_t BoundMask = bit_mask_lsb<uint64_t>(Bound + 1);
52305230

52315231
int64_t j = 0;
5232-
for (; j < std::min(static_cast<int64_t>(Bound), static_cast<int64_t>(T.size())); ++j) {
5233-
uint64_t PM_j = PM.get(0, T[j]) & BoundMask & (~flagged.P_flag);
5232+
auto T_iter = T.begin();
5233+
for (; j < std::min(static_cast<int64_t>(Bound), static_cast<int64_t>(T.size())); ++j, ++T_iter) {
5234+
uint64_t PM_j = PM.get(0, *T_iter) & BoundMask & (~flagged.P_flag);
52345235

52355236
flagged.P_flag |= blsi(PM_j);
52365237
flagged.T_flag |= static_cast<uint64_t>(PM_j != 0) << j;
52375238

52385239
BoundMask = (BoundMask << 1) | 1;
52395240
}
52405241

5241-
for (; j < T.size(); ++j) {
5242-
uint64_t PM_j = PM.get(0, T[j]) & BoundMask & (~flagged.P_flag);
5242+
for (; j < T.size(); ++j, ++T_iter) {
5243+
uint64_t PM_j = PM.get(0, *T_iter) & BoundMask & (~flagged.P_flag);
52435244

52445245
flagged.P_flag |= blsi(PM_j);
52455246
flagged.T_flag |= static_cast<uint64_t>(PM_j != 0) << j;
@@ -5348,8 +5349,9 @@ static inline FlaggedCharsMultiword flag_similar_characters_block(const BlockPat
53485349
BoundMask.last_mask = (1ull << (start_range % 64)) - 1;
53495350
BoundMask.first_mask = ~UINT64_C(0);
53505351

5351-
for (int64_t j = 0; j < T.size(); ++j) {
5352-
flag_similar_characters_step(PM, T[j], flagged, static_cast<size_t>(j), BoundMask);
5352+
auto T_iter = T.begin();
5353+
for (int64_t j = 0; j < T.size(); ++j, ++T_iter) {
5354+
flag_similar_characters_step(PM, *T_iter, flagged, static_cast<size_t>(j), BoundMask);
53535355

53545356
if (j + Bound + 1 < P.size()) {
53555357
BoundMask.last_mask = (BoundMask.last_mask << 1) | 1;
@@ -5486,7 +5488,7 @@ double jaro_similarity(Range<InputIt1> P, Range<InputIt2> T, double score_cutoff
54865488
/* filter out based on the length difference between the two strings */
54875489
if (!jaro_length_filter(P_len, T_len, score_cutoff)) return 0.0;
54885490

5489-
if (P_len == 1 && T_len == 1) return static_cast<double>(P[0] == T[0]);
5491+
if (P_len == 1 && T_len == 1) return static_cast<double>(P.front() == T.front());
54905492

54915493
int64_t Bound = jaro_bounds(P, T);
54925494

@@ -5638,7 +5640,7 @@ void jaro_similarity_simd(Range<double*> scores, const detail::BlockPatternMatch
56385640
// this is solved by splitting the loop into two parts where after this boundary is reached
56395641
// the first bit inside boundMask is no longer set
56405642
int64_t j = 0;
5641-
for (; j < maxBound; ++j) {
5643+
for (; j < std::min(maxBound, s2_cur.size()); ++j) {
56425644
alignas(32) std::array<uint64_t, vecs> stored;
56435645
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, s2_cur[j]); });
56445646
native_simd<VecType> X(stored.data());

rapidfuzz/distance/Jaro_impl.hpp

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -112,17 +112,18 @@ FlaggedCharsWord flag_similar_characters_word(const PM_Vec& PM, [[maybe_unused]]
112112
uint64_t BoundMask = bit_mask_lsb<uint64_t>(Bound + 1);
113113

114114
int64_t j = 0;
115-
for (; j < std::min(static_cast<int64_t>(Bound), static_cast<int64_t>(T.size())); ++j) {
116-
uint64_t PM_j = PM.get(0, T[j]) & BoundMask & (~flagged.P_flag);
115+
auto T_iter = T.begin();
116+
for (; j < std::min(static_cast<int64_t>(Bound), static_cast<int64_t>(T.size())); ++j,++T_iter) {
117+
uint64_t PM_j = PM.get(0, *T_iter) & BoundMask & (~flagged.P_flag);
117118

118119
flagged.P_flag |= blsi(PM_j);
119120
flagged.T_flag |= static_cast<uint64_t>(PM_j != 0) << j;
120121

121122
BoundMask = (BoundMask << 1) | 1;
122123
}
123124

124-
for (; j < T.size(); ++j) {
125-
uint64_t PM_j = PM.get(0, T[j]) & BoundMask & (~flagged.P_flag);
125+
for (; j < T.size(); ++j,++T_iter) {
126+
uint64_t PM_j = PM.get(0, *T_iter) & BoundMask & (~flagged.P_flag);
126127

127128
flagged.P_flag |= blsi(PM_j);
128129
flagged.T_flag |= static_cast<uint64_t>(PM_j != 0) << j;
@@ -231,8 +232,9 @@ static inline FlaggedCharsMultiword flag_similar_characters_block(const BlockPat
231232
BoundMask.last_mask = (1ull << (start_range % 64)) - 1;
232233
BoundMask.first_mask = ~UINT64_C(0);
233234

234-
for (int64_t j = 0; j < T.size(); ++j) {
235-
flag_similar_characters_step(PM, T[j], flagged, static_cast<size_t>(j), BoundMask);
235+
auto T_iter = T.begin();
236+
for (int64_t j = 0; j < T.size(); ++j,++T_iter) {
237+
flag_similar_characters_step(PM, *T_iter, flagged, static_cast<size_t>(j), BoundMask);
236238

237239
if (j + Bound + 1 < P.size()) {
238240
BoundMask.last_mask = (BoundMask.last_mask << 1) | 1;
@@ -370,7 +372,7 @@ double jaro_similarity(Range<InputIt1> P, Range<InputIt2> T, double score_cutoff
370372
/* filter out based on the length difference between the two strings */
371373
if (!jaro_length_filter(P_len, T_len, score_cutoff)) return 0.0;
372374

373-
if (P_len == 1 && T_len == 1) return static_cast<double>(P[0] == T[0]);
375+
if (P_len == 1 && T_len == 1) return static_cast<double>(P.front() == T.front());
374376

375377
int64_t Bound = jaro_bounds(P, T);
376378

@@ -522,7 +524,7 @@ void jaro_similarity_simd(Range<double*> scores, const detail::BlockPatternMatch
522524
// this is solved by splitting the loop into two parts where after this boundary is reached
523525
// the first bit inside boundMask is no longer set
524526
int64_t j = 0;
525-
for(; j < maxBound; ++j)
527+
for(; j < std::min(maxBound, s2_cur.size()); ++j)
526528
{
527529
alignas(32) std::array<uint64_t, vecs> stored;
528530
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, s2_cur[j]); });

test/distance/tests-Jaro.cpp

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,17 @@ double jaro_similarity(const Sentence1& s1, const Sentence2& s2, double score_cu
1515
double res3 = rapidfuzz::jaro_normalized_similarity(s1, s2, score_cutoff);
1616
double res4 =
1717
rapidfuzz::jaro_normalized_similarity(s1.begin(), s1.end(), s2.begin(), s2.end(), score_cutoff);
18+
#if 0 // todo
19+
double res5 = rapidfuzz::jaro_similarity(
20+
BidirectionalIterWrapper(s1.begin()), BidirectionalIterWrapper(s1.end()),
21+
BidirectionalIterWrapper(s2.begin()), BidirectionalIterWrapper(s2.end()), score_cutoff);
22+
#endif
23+
1824
rapidfuzz::CachedJaro scorer(s1);
19-
double res5 = scorer.similarity(s2, score_cutoff);
20-
double res6 = scorer.similarity(s2.begin(), s2.end(), score_cutoff);
21-
double res7 = scorer.normalized_similarity(s2, score_cutoff);
22-
double res8 = scorer.normalized_similarity(s2.begin(), s2.end(), score_cutoff);
25+
double res6 = scorer.similarity(s2, score_cutoff);
26+
double res7 = scorer.similarity(s2.begin(), s2.end(), score_cutoff);
27+
double res8 = scorer.normalized_similarity(s2, score_cutoff);
28+
double res9 = scorer.normalized_similarity(s2.begin(), s2.end(), score_cutoff);
2329

2430
#ifdef RAPIDFUZZ_SIMD
2531
std::vector<double> results(256 / 8);
@@ -52,10 +58,11 @@ double jaro_similarity(const Sentence1& s1, const Sentence2& s2, double score_cu
5258
REQUIRE(res1 == Approx(res2));
5359
REQUIRE(res1 == Approx(res3));
5460
REQUIRE(res1 == Approx(res4));
55-
REQUIRE(res1 == Approx(res5));
61+
//REQUIRE(res1 == Approx(res5));
5662
REQUIRE(res1 == Approx(res6));
5763
REQUIRE(res1 == Approx(res7));
5864
REQUIRE(res1 == Approx(res8));
65+
REQUIRE(res1 == Approx(res9));
5966
return res1;
6067
}
6168

@@ -67,18 +74,24 @@ double jaro_distance(const Sentence1& s1, const Sentence2& s2, double score_cuto
6774
double res3 = rapidfuzz::jaro_normalized_distance(s1, s2, score_cutoff);
6875
double res4 =
6976
rapidfuzz::jaro_normalized_distance(s1.begin(), s1.end(), s2.begin(), s2.end(), score_cutoff);
77+
#if 0 // todo
78+
double res5 = rapidfuzz::jaro_distance(
79+
BidirectionalIterWrapper(s1.begin()), BidirectionalIterWrapper(s1.end()),
80+
BidirectionalIterWrapper(s2.begin()), BidirectionalIterWrapper(s2.end()), score_cutoff);
81+
#endif
7082
rapidfuzz::CachedJaro scorer(s1);
71-
double res5 = scorer.distance(s2, score_cutoff);
72-
double res6 = scorer.distance(s2.begin(), s2.end(), score_cutoff);
73-
double res7 = scorer.normalized_distance(s2, score_cutoff);
74-
double res8 = scorer.normalized_distance(s2.begin(), s2.end(), score_cutoff);
83+
double res6 = scorer.distance(s2, score_cutoff);
84+
double res7 = scorer.distance(s2.begin(), s2.end(), score_cutoff);
85+
double res8 = scorer.normalized_distance(s2, score_cutoff);
86+
double res9 = scorer.normalized_distance(s2.begin(), s2.end(), score_cutoff);
7587
REQUIRE(res1 == Approx(res2));
7688
REQUIRE(res1 == Approx(res3));
7789
REQUIRE(res1 == Approx(res4));
78-
REQUIRE(res1 == Approx(res5));
90+
//REQUIRE(res1 == Approx(res5));
7991
REQUIRE(res1 == Approx(res6));
8092
REQUIRE(res1 == Approx(res7));
8193
REQUIRE(res1 == Approx(res8));
94+
REQUIRE(res1 == Approx(res9));
8295
return res1;
8396
}
8497

0 commit comments

Comments
 (0)