@@ -37,50 +37,6 @@ namespace nvtext {
3737namespace detail {
3838namespace {
3939
/**
 * @brief Compute the Levenshtein edit distance between two strings
 *
 * Only two rows of the classic distance table are kept at any time. The
 * shorter string is laid out along the row so each row needs
 * `min(length) + 1` entries.
 *
 * Background: https://www.cuelogic.com/blog/the-levenshtein-algorithm
 * and: https://en.wikipedia.org/wiki/Levenshtein_distance
 *
 * @param d_str First string
 * @param d_tgt Second string
 * @param buffer Working memory for the two rows; this function reads and writes
 *               `2 * (min(d_str.length(), d_tgt.length()) + 1)` entries of it
 * @return The edit distance value
 */
__device__ cudf::size_type compute_distance(cudf::string_view const& d_str,
                                            cudf::string_view const& d_tgt,
                                            cudf::size_type* buffer)
{
  auto const str_length = d_str.length();
  auto const tgt_length = d_tgt.length();
  // an empty string is exactly `length` insertions away from the other one
  if (str_length == 0) { return tgt_length; }
  if (tgt_length == 0) { return str_length; }

  // the shorter string is walked by the inner loop, the longer by the outer loop;
  // when the lengths are equal d_tgt takes the inner role
  bool const str_is_shorter = str_length < tgt_length;
  auto const& shorter       = str_is_shorter ? d_str : d_tgt;
  auto const& longer        = str_is_shorter ? d_tgt : d_str;
  auto const short_len      = str_is_shorter ? str_length : tgt_length;
  auto const long_len       = str_is_shorter ? tgt_length : str_length;

  // carve the two rows out of the working buffer
  auto prev_row = buffer;
  auto curr_row = buffer + short_len + 1;
  // first row is 0,1,2,...,short_len
  thrust::sequence(thrust::seq, prev_row, curr_row);

  auto long_itr = longer.begin();
  for (int row = 0; row < long_len; ++row, ++long_itr) {
    auto const long_chr = *long_itr;
    auto short_itr      = shorter.begin();
    curr_row[0]         = row + 1;
    for (int col = 0; col < short_len; ++col, ++short_itr) {
      auto const substitute = prev_row[col] + (long_chr != *short_itr);
      auto const remove     = prev_row[col + 1] + 1;
      auto const insert     = curr_row[col] + 1;
      curr_row[col + 1]     = cuda::std::min(substitute, cuda::std::min(remove, insert));
    }
    // the row just computed becomes the previous row for the next character
    cuda::std::swap(prev_row, curr_row);
  }
  // after the final swap the last computed row is in prev_row
  return prev_row[short_len];
}
83-
// each row has potentially 2 extra values
constexpr cudf::size_type row_pad_size{2};
8541
8642struct calculate_compute_buffer_fn {
@@ -322,120 +278,6 @@ std::unique_ptr<cudf::column> edit_distance(cudf::strings_column_view const& inp
322278 return results;
323279}
324280
325- namespace {
/**
 * @brief Functor computing one cell of the all-pairs edit distance matrix
 *
 * Invoked once per flattened matrix index `idx = row * size + col`.
 * Only cells with `row <= col` do any work; each off-diagonal result is also
 * written to the mirrored cell `(col, row)`.
 *
 * Null rows are treated as empty strings.
 */
struct edit_distance_matrix_levenshtein_algorithm {
  cudf::column_device_view d_strings;  // computing these against itself
  cudf::size_type* d_buffer;           // compute buffer for each string
  std::ptrdiff_t const* d_offsets;     // locate sub-buffer for each string
  cudf::size_type* d_results;          // edit distance values

  __device__ void operator()(cudf::size_type idx) const
  {
    auto const strings_count = d_strings.size();
    auto const row           = idx / strings_count;
    auto const col           = idx % strings_count;
    if (row > col) { return; }  // bottom half is computed with the top half of matrix

    if (row == col) {
      // A string compared with itself has distance 0. Handle this before
      // touching d_offsets: the offsets index below is only meaningful for
      // row < col and evaluates to -1 for idx == 0, which would read out of bounds.
      d_results[idx] = 0;
      return;
    }

    cudf::string_view const d_str1 =
      d_strings.is_null(row) ? cudf::string_view{} : d_strings.element<cudf::string_view>(row);
    cudf::string_view const d_str2 =
      d_strings.is_null(col) ? cudf::string_view{} : d_strings.element<cudf::string_view>(col);

    // subtracting (row+1)(row+2)/2 skips the diagonal and lower-half cells that
    // precede this one, giving the position of this pair among the row < col cells
    auto const work_buffer = d_buffer + d_offsets[idx - ((row + 1L) * (row + 2L)) / 2L];
    auto const distance    = compute_distance(d_str1, d_str2, work_buffer);
    d_results[idx]                       = distance;  // top half of matrix
    d_results[col * strings_count + row] = distance;  // bottom half of matrix
  }
};
348-
/**
 * @brief Functor recording the working-buffer size needed for each string pair
 *
 * Invoked once per flattened matrix index `idx = row * size + col`.
 * Only cells strictly above the diagonal (`row < col`) whose strings are both
 * non-empty write a size; every other entry of `d_sizes` is left untouched.
 */
struct calculate_matrix_compute_buffer_fn {
  cudf::column_device_view d_strings;
  std::ptrdiff_t* d_sizes;

  __device__ void operator()(cudf::size_type idx) const
  {
    auto const strings_count = d_strings.size();
    auto const row           = idx / strings_count;
    auto const col           = idx % strings_count;
    // compute only the top half
    if (col <= row) { return; }

    // null rows are treated as empty strings
    auto d_str1 = cudf::string_view{};
    if (!d_strings.is_null(row)) { d_str1 = d_strings.element<cudf::string_view>(row); }
    auto d_str2 = cudf::string_view{};
    if (!d_strings.is_null(col)) { d_str2 = d_strings.element<cudf::string_view>(col); }

    if (!d_str1.empty() && !d_str2.empty()) {
      // position of this pair among the row < col cells
      auto const pair_index = idx - ((row + 1L) * (row + 2L)) / 2L;
      // two rows of (shorter length + 1) integers each
      d_sizes[pair_index] = (cuda::std::min(d_str1.length(), d_str2.length()) + 1L) * 2L;
    }
  }
};
368-
369- } // namespace
370-
/**
 * @copydoc nvtext::edit_distance_matrix
 */
std::unique_ptr<cudf::column> edit_distance_matrix(cudf::strings_column_view const& input,
                                                   rmm::cuda_stream_view stream,
                                                   rmm::device_async_resource_ref mr)
{
  auto const output_type = cudf::data_type{cudf::type_to_id<cudf::size_type>()};
  if (input.is_empty()) { return cudf::make_empty_column(output_type); }

  auto const strings_count = input.size();
  CUDF_EXPECTS(
    strings_count > 1, " the input strings must include at least 2 strings", std::invalid_argument);
  // the flattened matrix has strings_count^2 entries and must fit in a size_type
  CUDF_EXPECTS(strings_count * static_cast<size_t>(strings_count) <
                 static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
               " too many strings to create the output column",
               std::overflow_error);

  // device view of the input strings column
  auto d_strings = cudf::column_device_view::create(input.parent(), stream);

  // Working memory is only needed for the pairs above the diagonal since
  // `distance(strings[i],strings[j]) == distance(strings[j],strings[i])`
  auto const pair_count  = (strings_count * (strings_count - 1L)) / 2L;
  auto const output_size = strings_count * strings_count;
  auto const first_index = cuda::counting_iterator<cudf::size_type>{0};

  // per-pair buffer sizes: zero everything, then let the functor fill in
  // the entries that actually need working memory
  rmm::device_uvector<std::ptrdiff_t> work_offsets(pair_count + 1, stream);
  thrust::uninitialized_fill(
    rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()),
    work_offsets.begin(),
    work_offsets.end(),
    0);
  thrust::for_each_n(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()),
                     first_index,
                     output_size,
                     calculate_matrix_compute_buffer_fn{*d_strings, work_offsets.data()});

  // turn the sizes into offsets in-place; the return value is the total buffer size
  auto const total_work_size = cudf::detail::sizes_to_offsets(
    work_offsets.begin(), work_offsets.end(), work_offsets.begin(), 0, stream);

  // allocate the working memory shared by all pairs
  rmm::device_uvector<cudf::size_type> work_buffer(total_work_size, stream);

  // compute every distance directly into the output values column
  auto distances = cudf::make_fixed_width_column(
    output_type, output_size, rmm::device_buffer{0, stream, mr}, 0, stream, mr);
  auto d_distances = distances->mutable_view().data<cudf::size_type>();
  thrust::for_each_n(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()),
                     first_index,
                     output_size,
                     edit_distance_matrix_levenshtein_algorithm{
                       *d_strings, work_buffer.data(), work_offsets.data(), d_distances});

  // wrap the values in a lists column: one list of strings_count values per input row
  auto list_offsets =
    cudf::detail::sequence(strings_count + 1,
                           cudf::numeric_scalar<cudf::size_type>(0, true, stream),
                           cudf::numeric_scalar<cudf::size_type>(strings_count, true, stream),
                           stream,
                           mr);
  return cudf::make_lists_column(strings_count,
                                 std::move(list_offsets),
                                 std::move(distances),
                                 0,  // no nulls
                                 rmm::device_buffer{0, stream, mr});
}
438-
439281} // namespace detail
440282
441283// external APIs
@@ -452,15 +294,4 @@ std::unique_ptr<cudf::column> edit_distance(cudf::strings_column_view const& inp
452294 return detail::edit_distance (input, targets, stream, mr);
453295}
454296
/**
 * @copydoc nvtext::edit_distance_matrix
 */
std::unique_ptr<cudf::column> edit_distance_matrix(cudf::strings_column_view const& input,
                                                   rmm::cuda_stream_view stream,
                                                   rmm::device_async_resource_ref mr)
{
  CUDF_FUNC_RANGE();
  // public entry point: all of the work happens in the detail implementation
  auto matrix = detail::edit_distance_matrix(input, stream, mr);
  return matrix;
}
465-
466297} // namespace nvtext
0 commit comments