Skip to content

Commit 7b802c7

Browse files
authored
Remove deprecated nvtext::edit_distance_matrix (rapidsai#22644)
Removes the `nvtext::edit_distance_matrix()` API, along with its pylibcudf and Python equivalents, all of which were deprecated in 26.04.

Authors:
- David Wendt (https://github.com/davidwendt)

Approvers:
- Bradley Dice (https://github.com/bdice)
- Igor Peshansky (https://github.com/igorpeshansky)

URL: rapidsai#22644
1 parent 797d730 commit 7b802c7

9 files changed

Lines changed: 1 addition & 332 deletions

File tree

cpp/include/nvtext/edit_distance.hpp

Lines changed: 0 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -54,46 +54,5 @@ std::unique_ptr<cudf::column> edit_distance(
5454
rmm::cuda_stream_view stream = cudf::get_default_stream(),
5555
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
5656

57-
/**
58-
* @brief Compute the edit distance between all the strings in the input column.
59-
*
60-
* @deprecated Deprecated since release 26.04
61-
*
62-
* This uses the Levenshtein algorithm to calculate the edit distance between
63-
* two strings as documented here: https://www.cuelogic.com/blog/the-levenshtein-algorithm
64-
*
65-
* The output is essentially a `input.size() x input.size()` square matrix of integers.
66-
* All values at diagonal `row == col` are 0 since the edit distance between two identical
67-
* strings is zero. All values above the diagonal are reflected below since the edit distance
68-
* calculation is also commutative.
69-
*
70-
* @code{.pseudo}
71-
* Example:
72-
* s = ["hello", "hallo", "hella"]
73-
* d = edit_distance_matrix(s)
74-
* d is now [[0, 1, 1],
75-
* [1, 0, 2],
76-
* [1, 2, 0]]
77-
* @endcode
78-
*
79-
* Null entries for `input` are ignored and the edit distance
80-
* is computed as though the null entry is an empty string.
81-
*
82-
* The output is a lists column of size `input.size()` in which each list
83-
* contains `input.size()` elements.
84-
*
85-
* @throw std::invalid_argument if `input.size() == 1`
86-
* @throw std::overflow_error if `input.size() * input.size()` is greater than or equal to max size_type
87-
*
88-
* @param input Strings column of input strings
89-
* @param stream CUDA stream used for device memory operations and kernel launches
90-
* @param mr Device memory resource used to allocate the returned column's device memory
91-
* @return New lists column of edit distance values
92-
*/
93-
[[deprecated]] std::unique_ptr<cudf::column> edit_distance_matrix(
94-
cudf::strings_column_view const& input,
95-
rmm::cuda_stream_view stream = cudf::get_default_stream(),
96-
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
97-
9857
/** @} */ // end of group
9958
} // namespace CUDF_EXPORT nvtext

cpp/src/text/edit_distance.cu

Lines changed: 0 additions & 169 deletions
Original file line numberDiff line numberDiff line change
@@ -37,50 +37,6 @@ namespace nvtext {
3737
namespace detail {
3838
namespace {
3939

40-
/**
41-
* @brief Compute the Levenshtein distance for each string pair
42-
*
43-
* Documentation here: https://www.cuelogic.com/blog/the-levenshtein-algorithm
44-
* And here: https://en.wikipedia.org/wiki/Levenshtein_distance
45-
*
46-
* @param d_str First string
47-
* @param d_tgt Second string
48-
* @param buffer Working buffer for intermediate calculations
49-
* @return The edit distance value
50-
*/
51-
__device__ cudf::size_type compute_distance(cudf::string_view const& d_str,
52-
cudf::string_view const& d_tgt,
53-
cudf::size_type* buffer)
54-
{
55-
auto const str_length = d_str.length();
56-
auto const tgt_length = d_tgt.length();
57-
if (str_length == 0) return tgt_length;
58-
if (tgt_length == 0) return str_length;
59-
60-
auto begin = str_length < tgt_length ? d_str.begin() : d_tgt.begin();
61-
auto itr = str_length < tgt_length ? d_tgt.begin() : d_str.begin();
62-
auto const n = cuda::std::min(str_length, tgt_length);
63-
auto const m = cuda::std::max(str_length, tgt_length);
64-
// setup compute buffer pointers
65-
auto v0 = buffer;
66-
auto v1 = v0 + n + 1;
67-
// initialize v0
68-
thrust::sequence(thrust::seq, v0, v1);
69-
70-
for (int i = 0; i < m; ++i, ++itr) {
71-
auto itr_tgt = begin;
72-
v1[0] = i + 1;
73-
for (int j = 0; j < n; ++j, ++itr_tgt) {
74-
auto sub_cost = v0[j] + (*itr != *itr_tgt);
75-
auto del_cost = v0[j + 1] + 1;
76-
auto ins_cost = v1[j] + 1;
77-
v1[j + 1] = cuda::std::min(cuda::std::min(sub_cost, del_cost), ins_cost);
78-
}
79-
cuda::std::swap(v0, v1);
80-
}
81-
return v0[n];
82-
}
83-
8440
constexpr cudf::size_type row_pad_size = 2; // each row has potentially 2 extra values
8541

8642
struct calculate_compute_buffer_fn {
@@ -322,120 +278,6 @@ std::unique_ptr<cudf::column> edit_distance(cudf::strings_column_view const& inp
322278
return results;
323279
}
324280

325-
namespace {
326-
struct edit_distance_matrix_levenshtein_algorithm {
327-
cudf::column_device_view d_strings; // computing these against itself
328-
cudf::size_type* d_buffer; // compute buffer for each string
329-
std::ptrdiff_t const* d_offsets; // locate sub-buffer for each string
330-
cudf::size_type* d_results; // edit distance values
331-
332-
__device__ void operator()(cudf::size_type idx) const
333-
{
334-
auto const strings_count = d_strings.size();
335-
auto const row = idx / strings_count;
336-
auto const col = idx % strings_count;
337-
if (row > col) return; // bottom half is computed with the top half of matrix
338-
cudf::string_view d_str1 =
339-
d_strings.is_null(row) ? cudf::string_view{} : d_strings.element<cudf::string_view>(row);
340-
cudf::string_view d_str2 =
341-
d_strings.is_null(col) ? cudf::string_view{} : d_strings.element<cudf::string_view>(col);
342-
auto work_buffer = d_buffer + d_offsets[idx - ((row + 1L) * (row + 2L)) / 2L];
343-
auto const distance = (row == col) ? 0 : compute_distance(d_str1, d_str2, work_buffer);
344-
d_results[idx] = distance; // top half of matrix
345-
d_results[col * strings_count + row] = distance; // bottom half of matrix
346-
}
347-
};
348-
349-
struct calculate_matrix_compute_buffer_fn {
350-
cudf::column_device_view d_strings;
351-
std::ptrdiff_t* d_sizes;
352-
353-
__device__ void operator()(cudf::size_type idx) const
354-
{
355-
auto const row = idx / d_strings.size();
356-
auto const col = idx % d_strings.size();
357-
if (row >= col) { return; } // compute only the top half
358-
cudf::string_view const d_str1 =
359-
d_strings.is_null(row) ? cudf::string_view{} : d_strings.element<cudf::string_view>(row);
360-
cudf::string_view const d_str2 =
361-
d_strings.is_null(col) ? cudf::string_view{} : d_strings.element<cudf::string_view>(col);
362-
if (d_str1.empty() || d_str2.empty()) { return; }
363-
// the temp size needed is 2 integers per character of the shorter string
364-
d_sizes[idx - ((row + 1L) * (row + 2L)) / 2L] =
365-
(cuda::std::min(d_str1.length(), d_str2.length()) + 1L) * 2L;
366-
}
367-
};
368-
369-
} // namespace
370-
371-
/**
372-
* @copydoc nvtext::edit_distance_matrix
373-
*/
374-
std::unique_ptr<cudf::column> edit_distance_matrix(cudf::strings_column_view const& input,
375-
rmm::cuda_stream_view stream,
376-
rmm::device_async_resource_ref mr)
377-
{
378-
auto const output_type = cudf::data_type{cudf::type_to_id<cudf::size_type>()};
379-
if (input.is_empty()) { return cudf::make_empty_column(output_type); }
380-
CUDF_EXPECTS(
381-
input.size() > 1, "the input strings must include at least 2 strings", std::invalid_argument);
382-
CUDF_EXPECTS(input.size() * static_cast<size_t>(input.size()) <
383-
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>().max()),
384-
"too many strings to create the output column",
385-
std::overflow_error);
386-
387-
// create device column of the input strings column
388-
auto d_strings = cudf::column_device_view::create(input.parent(), stream);
389-
390-
// Calculate the size of the compute-buffer.
391-
// We only need memory for half the size of the output matrix since the edit distance calculation
392-
// is commutative -- `distance(strings[i],strings[j]) == distance(strings[j],strings[i])`
393-
auto const n_upper = (input.size() * (input.size() - 1L)) / 2L;
394-
auto const output_size = input.size() * input.size();
395-
rmm::device_uvector<std::ptrdiff_t> offsets(n_upper + 1, stream);
396-
thrust::uninitialized_fill(
397-
rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()),
398-
offsets.begin(),
399-
offsets.end(),
400-
0);
401-
thrust::for_each_n(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()),
402-
cuda::counting_iterator<cudf::size_type>{0},
403-
output_size,
404-
calculate_matrix_compute_buffer_fn{*d_strings, offsets.data()});
405-
406-
// get the total size for the compute buffer
407-
// and convert sizes to offsets in-place
408-
auto const compute_size =
409-
cudf::detail::sizes_to_offsets(offsets.begin(), offsets.end(), offsets.begin(), 0, stream);
410-
411-
// create the compute buffer
412-
rmm::device_uvector<cudf::size_type> compute_buffer(compute_size, stream);
413-
auto d_buffer = compute_buffer.data();
414-
415-
// compute the edit distance into the output column
416-
auto results = cudf::make_fixed_width_column(
417-
output_type, output_size, rmm::device_buffer{0, stream, mr}, 0, stream, mr);
418-
auto d_results = results->mutable_view().data<cudf::size_type>();
419-
thrust::for_each_n(
420-
rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()),
421-
cuda::counting_iterator<cudf::size_type>{0},
422-
output_size,
423-
edit_distance_matrix_levenshtein_algorithm{*d_strings, d_buffer, offsets.data(), d_results});
424-
425-
// build a lists column of the results
426-
auto offsets_column =
427-
cudf::detail::sequence(input.size() + 1,
428-
cudf::numeric_scalar<cudf::size_type>(0, true, stream),
429-
cudf::numeric_scalar<cudf::size_type>(input.size(), true, stream),
430-
stream,
431-
mr);
432-
return cudf::make_lists_column(input.size(),
433-
std::move(offsets_column),
434-
std::move(results),
435-
0, // no nulls
436-
rmm::device_buffer{0, stream, mr});
437-
}
438-
439281
} // namespace detail
440282

441283
// external APIs
@@ -452,15 +294,4 @@ std::unique_ptr<cudf::column> edit_distance(cudf::strings_column_view const& inp
452294
return detail::edit_distance(input, targets, stream, mr);
453295
}
454296

455-
/**
456-
* @copydoc nvtext::edit_distance_matrix
457-
*/
458-
std::unique_ptr<cudf::column> edit_distance_matrix(cudf::strings_column_view const& input,
459-
rmm::cuda_stream_view stream,
460-
rmm::device_async_resource_ref mr)
461-
{
462-
CUDF_FUNC_RANGE();
463-
return detail::edit_distance_matrix(input, stream, mr);
464-
}
465-
466297
} // namespace nvtext

docs/cudf/source/cudf/api_docs/series.rst

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -354,7 +354,6 @@ String handling
354354
count
355355
detokenize
356356
edit_distance
357-
edit_distance_matrix
358357
endswith
359358
extract
360359
filter_alphanum

python/cudf/cudf/core/accessors/string.py

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -5371,45 +5371,6 @@ def edit_distance(self, targets) -> Series | Index:
53715371
self._column.edit_distance(targets_column) # type: ignore[arg-type]
53725372
)
53735373

5374-
def edit_distance_matrix(self) -> Series | Index:
5375-
"""Computes the edit distance between strings in the series.
5376-
5377-
The series to compute the matrix should have at least 2 strings and
5378-
should not contain nulls.
5379-
5380-
Edit distance is measured based on the `Levenshtein edit distance
5381-
algorithm <https://www.cuelogic.com/blog/the-levenshtein-algorithm>`_.
5382-
5383-
Returns
5384-
-------
5385-
Series of ListDtype(int32)
5386-
Assume ``N`` is the length of this series. The return series
5387-
contains ``N`` lists of size ``N``, where the ``j`` th number in
5388-
the ``i`` th row of the series tells the edit distance between the
5389-
``i`` th string and the ``j`` th string of this series. The matrix
5390-
is symmetric. Diagonal elements are 0.
5391-
5392-
Examples
5393-
--------
5394-
>>> import cudf
5395-
>>> s = cudf.Series(['abc', 'bc', 'cba'])
5396-
>>> s.str.edit_distance_matrix()
5397-
0 [0, 1, 2]
5398-
1 [1, 0, 2]
5399-
2 [2, 2, 0]
5400-
dtype: list
5401-
"""
5402-
if self._column.size < 2:
5403-
raise ValueError(
5404-
"Require size >= 2 to compute edit distance matrix."
5405-
)
5406-
if self._column.has_nulls():
5407-
raise ValueError(
5408-
"Cannot compute edit distance between null strings. "
5409-
"Consider removing them using `dropna` or fill with `fillna`."
5410-
)
5411-
return self._return_or_inplace(self._column.edit_distance_matrix())
5412-
54135374
def minhash(
54145375
self, seed: int, a: ColumnLike, b: ColumnLike, width: int
54155376
) -> Series | Index:

python/cudf/cudf/core/column/string.py

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55

66
import itertools
77
import re
8-
import warnings
98
from functools import lru_cache
109
from typing import TYPE_CHECKING, Any, Self, cast
1110

@@ -751,25 +750,6 @@ def edit_distance(self, targets: Self) -> NumericalColumn:
751750
),
752751
)
753752

754-
def edit_distance_matrix(self) -> ListColumn:
755-
warnings.warn(
756-
"edit_distance_matrix is deprecated. Use edit_distance instead.",
757-
FutureWarning,
758-
)
759-
with self.access(mode="read", scope="internal"):
760-
result = plc.nvtext.edit_distance.edit_distance_matrix(
761-
self.plc_column
762-
)
763-
return cast(
764-
cudf.core.column.lists.ListColumn,
765-
ColumnBase.create(
766-
result,
767-
cudf.ListDtype(
768-
get_dtype_of_same_kind(self.dtype, np.dtype(np.int32))
769-
),
770-
),
771-
)
772-
773753
def byte_pair_encoding(
774754
self,
775755
merge_pairs: plc.nvtext.byte_pair_encode.BPEMergePairs,

python/pylibcudf/pylibcudf/libcudf/nvtext/edit_distance.pxd

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,3 @@ cdef extern from "nvtext/edit_distance.hpp" namespace "nvtext" nogil:
1818
cudaStream_t stream,
1919
device_async_resource_ref mr
2020
) except +libcudf_exception_handler
21-
22-
cdef unique_ptr[column] edit_distance_matrix(
23-
const column_view & strings,
24-
cudaStream_t stream,
25-
device_async_resource_ref mr
26-
) except +libcudf_exception_handler

python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,3 @@ cpdef Column edit_distance(
1111
object stream = *,
1212
DeviceMemoryResource mr=*,
1313
)
14-
15-
cpdef Column edit_distance_matrix(
16-
Column input,
17-
object stream = *,
18-
DeviceMemoryResource mr=*,
19-
)

python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,3 @@ def edit_distance(
1212
stream: CudaStreamLike | None = None,
1313
mr: DeviceMemoryResource | None = None,
1414
) -> Column: ...
15-
def edit_distance_matrix(
16-
input: Column,
17-
stream: CudaStreamLike | None = None,
18-
mr: DeviceMemoryResource | None = None,
19-
) -> Column: ...

0 commit comments

Comments
 (0)