Skip to content

Commit 05bc74e

Browse files
authored
Deprecate the multi-patterns cudf::strings::replace_re API (#22380)
Deprecates the `cudf::strings::replace_re` function that accepts multiple regex patterns and replacements. This API does not follow the other regex APIs, which all accept a `regex_program` parameter, and it has become difficult to maintain. This function pattern is not supported by Pandas, and there is no JNI wrapper for it either. After trying to create a libcudf benchmark for this API, it was found that the function crashes if called with more than a few dozen rows, even with only 2 patterns. The crash is due to a bug in the code that has never been reported (the bug was introduced 4 years ago according to git). Therefore, I have complete confidence that this API has never been used and can be removed in a future release. The gtests have also been removed to prevent deprecation warnings. This PR also includes a fix for the bug for completeness. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Bradley Dice (https://github.com/bdice) - Lawrence Mitchell (https://github.com/wence-) - Yunsong Wang (https://github.com/PointKernel) URL: #22380
1 parent e5b4e1d commit 05bc74e

7 files changed

Lines changed: 21 additions & 77 deletions

File tree

cpp/include/cudf/strings/replace_re.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2019-2024, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55
#pragma once
@@ -53,6 +53,8 @@ std::unique_ptr<column> replace_re(
5353
* @brief For each string, replaces any character sequence matching the given patterns
5454
* with the corresponding string in the `replacements` column.
5555
*
56+
* @deprecated in 26.06. To be removed in a future release.
57+
*
5658
* Any null string entries return corresponding null output column entries.
5759
*
5860
* See the @ref md_regex "Regex Features" page for details on patterns supported by this API.
@@ -65,7 +67,7 @@ std::unique_ptr<column> replace_re(
6567
* @param mr Device memory resource used to allocate the returned column's device memory
6668
* @return New strings column
6769
*/
68-
std::unique_ptr<column> replace_re(
70+
[[deprecated]] std::unique_ptr<column> replace_re(
6971
strings_column_view const& input,
7072
std::vector<std::string> const& patterns,
7173
strings_column_view const& replacements,

cpp/src/strings/regex/regexec.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ void reprog_device::set_working_memory(void* buffer, int32_t thread_count, int32
145145
{
146146
_buffer = buffer;
147147
_thread_count = thread_count;
148-
_max_insts = _max_insts > 0 ? _max_insts : _insts_count;
148+
_max_insts = max_insts > 0 ? max_insts : _insts_count;
149149
}
150150

151151
int32_t reprog_device::compute_shared_memory_size() const

cpp/src/strings/replace/multi_re.cu

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ std::unique_ptr<column> replace_re(strings_column_view const& input,
158158
});
159159

160160
auto d_max_prog = **max_prog;
161+
auto const max_insts = d_max_prog.insts_counts();
161162
auto const buffer_size = d_max_prog.working_memory_size(input.size());
162163
auto d_buffer = rmm::device_buffer(buffer_size, stream);
163164

@@ -166,8 +167,8 @@ std::unique_ptr<column> replace_re(strings_column_view const& input,
166167
std::transform(h_progs.begin(),
167168
h_progs.end(),
168169
std::back_inserter(progs),
169-
[d_buffer = d_buffer.data(), size = input.size()](auto& prog) {
170-
prog->set_working_memory(d_buffer, size);
170+
[d_buffer = d_buffer.data(), size = input.size(), max_insts](auto& prog) {
171+
prog->set_working_memory(d_buffer, size, max_insts);
171172
return *prog;
172173
});
173174
auto d_progs =
@@ -184,6 +185,8 @@ std::unique_ptr<column> replace_re(strings_column_view const& input,
184185
stream,
185186
mr);
186187

188+
stream.synchronize();
189+
187190
return make_strings_column(input.size(),
188191
std::move(offsets_column),
189192
chars.release(),

cpp/tests/streams/strings/replace_test.cpp

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

@@ -25,17 +25,6 @@ TEST_F(StringsReplaceTest, Replace)
2525
cudf::strings::replace(view, target, repl, -1, cudf::test::get_default_stream());
2626
cudf::strings::replace_multiple(view, view, view, cudf::test::get_default_stream());
2727
cudf::strings::replace_slice(view, repl, 1, 2, cudf::test::get_default_stream());
28-
29-
auto const pattern = std::string("[a-z]");
30-
auto const prog = cudf::strings::regex_program::create(pattern);
31-
cudf::strings::replace_re(view, *prog, repl, 1, cudf::test::get_default_stream());
32-
33-
cudf::test::strings_column_wrapper repls({"1", "a", " "});
34-
cudf::strings::replace_re(view,
35-
{pattern, pattern, pattern},
36-
cudf::strings_column_view(repls),
37-
cudf::strings::regex_flags::DEFAULT,
38-
cudf::test::get_default_stream());
3928
}
4029

4130
TEST_F(StringsReplaceTest, ReplaceRegex)
@@ -47,13 +36,6 @@ TEST_F(StringsReplaceTest, ReplaceRegex)
4736
auto const pattern = std::string("[a-z]");
4837
auto const prog = cudf::strings::regex_program::create(pattern);
4938
cudf::strings::replace_re(view, *prog, repl, 1, cudf::test::get_default_stream());
50-
51-
cudf::test::strings_column_wrapper repls({"1", "a", " "});
52-
cudf::strings::replace_re(view,
53-
{pattern, pattern, pattern},
54-
cudf::strings_column_view(repls),
55-
cudf::strings::regex_flags::DEFAULT,
56-
cudf::test::get_default_stream());
5739
}
5840

5941
TEST_F(StringsReplaceTest, ReplaceRegexBackref)

cpp/tests/strings/replace_regex_tests.cpp

Lines changed: 3 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2019-2024, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

@@ -51,38 +51,6 @@ TEST_F(StringsReplaceRegexTest, ReplaceRegexTest)
5151
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
5252
}
5353

54-
TEST_F(StringsReplaceRegexTest, ReplaceMultiRegexTest)
55-
{
56-
std::vector<char const*> h_strings{"the quick brown fox jumps over the lazy dog",
57-
"the fat cat lays next to the other accénted cat",
58-
"a slow moving turtlé cannot catch the bird",
59-
"which can be composéd together to form a more complete",
60-
"thé result does not include the value in the sum in",
61-
"",
62-
nullptr};
63-
64-
cudf::test::strings_column_wrapper strings(
65-
h_strings.begin(), h_strings.end(), cudf::test::iterators::nulls_from_nullptrs(h_strings));
66-
auto strings_view = cudf::strings_column_view(strings);
67-
68-
std::vector<char const*> h_expected{" quick brown fox jumps over lazy dog",
69-
" fat cat lays next to other accénted cat",
70-
"** slow moving turtlé cannot catch bird",
71-
"which can be composéd together to form ** more complete",
72-
"thé result does not include value N sum N",
73-
"",
74-
nullptr};
75-
76-
std::vector<std::string> patterns{"\\bthe\\b", "\\bin\\b", "\\ba\\b"};
77-
std::vector<std::string> h_repls{"", "N", "**"};
78-
cudf::test::strings_column_wrapper repls(h_repls.begin(), h_repls.end());
79-
auto repls_view = cudf::strings_column_view(repls);
80-
auto results = cudf::strings::replace_re(strings_view, patterns, repls_view);
81-
cudf::test::strings_column_wrapper expected(
82-
h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected));
83-
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
84-
}
85-
8654
TEST_F(StringsReplaceRegexTest, InvalidRegex)
8755
{
8856
// these are quantifiers that do not have a preceding character/class
@@ -103,13 +71,8 @@ TEST_F(StringsReplaceRegexTest, WithEmptyPattern)
10371

10472
auto empty_pattern = std::string("");
10573
auto repl = cudf::string_scalar("bbb");
106-
std::vector<std::string> patterns({empty_pattern});
107-
cudf::test::strings_column_wrapper repls({"bbb"});
108-
auto repls_view = cudf::strings_column_view(repls);
109-
auto results = cudf::strings::replace_re(strings_view, patterns, repls_view);
110-
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, strings);
111-
auto prog = cudf::strings::regex_program::create(empty_pattern);
112-
results = cudf::strings::replace_re(strings_view, *prog, repl);
74+
auto prog = cudf::strings::regex_program::create(empty_pattern);
75+
auto results = cudf::strings::replace_re(strings_view, *prog, repl);
11376
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, strings);
11477
}
11578

@@ -237,17 +200,6 @@ TEST_F(StringsReplaceRegexTest, Multiline)
237200
results = cudf::strings::replace_re(sv, *prog, repl);
238201
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
239202

240-
// multi-replace
241-
std::vector<std::string> patterns({"aba$", "^aba"});
242-
cudf::test::strings_column_wrapper repls({">", "<"});
243-
results = cudf::strings::replace_re(sv, patterns, cudf::strings_column_view(repls), multiline);
244-
cudf::test::strings_column_wrapper multi_expected_ml({"bcd\n>\nefg", ">\n< abab\n>", ">"});
245-
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, multi_expected_ml);
246-
247-
results = cudf::strings::replace_re(sv, patterns, cudf::strings_column_view(repls));
248-
cudf::test::strings_column_wrapper multi_expected({"bcd\naba\nefg", "<\naba abab\n>", ">"});
249-
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, multi_expected);
250-
251203
// backref-replace
252204
auto repl_template = std::string("[\\1]");
253205
pattern = std::string("(^aba)");

python/cudf/cudf/core/accessors/string.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1027,6 +1027,11 @@ def replace(
10271027
)
10281028

10291029
if regex:
1030+
warnings.warn(
1031+
"regex support for multiple replace patterns "
1032+
"will be removed in a future version.",
1033+
FutureWarning,
1034+
)
10301035
result = self._column.replace_re(
10311036
list(pat),
10321037
as_column(repl, dtype=CUDF_STRING_DTYPE), # type: ignore[arg-type]

python/cudf/cudf/tests/series/accessors/test_str.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1535,15 +1535,15 @@ def test_string_replace_multi():
15351535
ps = pd.Series(["hello", "goodbye"])
15361536
gs = cudf.Series(["hello", "goodbye"])
15371537
expect = ps.str.replace("e", "E").str.replace("o", "O")
1538-
got = gs.str.replace(["e", "o"], ["E", "O"])
1538+
got = gs.str.replace(["e", "o"], ["E", "O"], regex=False)
15391539

15401540
assert_eq(expect, got)
15411541

15421542
ps = pd.Series(["foo", "fuz", np.nan])
15431543
gs = cudf.Series(ps)
15441544

15451545
expect = ps.str.replace("f.", "ba", regex=True)
1546-
got = gs.str.replace(["f."], ["ba"], regex=True)
1546+
got = gs.str.replace("f.", "ba", regex=True)
15471547
assert_eq(expect, got)
15481548

15491549
ps = pd.Series(["f.o", "fuz", np.nan])

0 commit comments

Comments
 (0)