From 0edf65f8e6af3672901ae56c82be5854e308188c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 10 Apr 2026 13:22:28 -0700 Subject: [PATCH 01/29] Add partitioned probe support for hash joins --- cpp/CMakeLists.txt | 4 + cpp/include/cudf/detail/join/hash_join.hpp | 36 ++- cpp/include/cudf/join/hash_join.hpp | 97 ++++++ cpp/include/cudf/join/join.hpp | 6 + .../join/hash_join/full_join_complement.cu | 92 ++++++ cpp/src/join/hash_join/hash_join.cu | 27 ++ .../join/hash_join/partitioned_full_join.cu | 26 ++ .../join/hash_join/partitioned_inner_join.cu | 26 ++ .../join/hash_join/partitioned_left_join.cu | 26 ++ cpp/src/join/hash_join/retrieve_impl.cuh | 100 ++++++ cpp/tests/join/join_tests.cpp | 306 ++++++++++++++++++ 11 files changed, 745 insertions(+), 1 deletion(-) create mode 100644 cpp/src/join/hash_join/full_join_complement.cu create mode 100644 cpp/src/join/hash_join/partitioned_full_join.cu create mode 100644 cpp/src/join/hash_join/partitioned_inner_join.cu create mode 100644 cpp/src/join/hash_join/partitioned_left_join.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 911191a3895..5d21e5a4060 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -682,6 +682,10 @@ add_library( src/join/hash_join/left_join_retrieve.cu src/join/hash_join/left_join_size.cu src/join/hash_join/match_context.cu + src/join/hash_join/partitioned_full_join.cu + src/join/hash_join/partitioned_inner_join.cu + src/join/hash_join/partitioned_left_join.cu + src/join/hash_join/full_join_complement.cu src/join/mark_join.cu src/join/filter_join_indices_jit.cu src/join/join.cu diff --git a/cpp/include/cudf/detail/join/hash_join.hpp b/cpp/include/cudf/detail/join/hash_join.hpp index b1b96ca7218..ca94b1d02bb 100644 --- a/cpp/include/cudf/detail/join/hash_join.hpp +++ b/cpp/include/cudf/detail/join/hash_join.hpp @@ -149,6 +149,33 @@ class hash_join { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; + /** + * @copydoc cudf::hash_join::partitioned_inner_join + */ + [[nodiscard]] std::pair>, + std::unique_ptr>> + partitioned_inner_join(cudf::join_partition_context const& context, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; + + /** + * @copydoc cudf::hash_join::partitioned_left_join + */ + [[nodiscard]] std::pair>, + std::unique_ptr>> + partitioned_left_join(cudf::join_partition_context const& context, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; + + /** + * @copydoc cudf::hash_join::partitioned_full_join + */ + [[nodiscard]] std::pair>, + std::unique_ptr>> + partitioned_full_join(cudf::join_partition_context const& context, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; + private: bool const _is_empty; ///< true if `_hash_table` is empty bool const _has_nulls; ///< true if nulls are present in either right table or any left table @@ -167,7 +194,14 @@ class hash_join { template [[nodiscard]] std::pair>, std::unique_ptr>> - join_retrieve(cudf::table_view const& left, + partitioned_join_retrieve(cudf::join_partition_context const& context, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; + + template + [[nodiscard]] std::pair>, + std::unique_ptr>> + join_retrieve(cudf::table_view const& probe, std::optional output_size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; diff --git a/cpp/include/cudf/join/hash_join.hpp b/cpp/include/cudf/join/hash_join.hpp index 0865fb784cb..c94d9242aa2 100644 --- a/cpp/include/cudf/join/hash_join.hpp +++ b/cpp/include/cudf/join/hash_join.hpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -308,6 +309,102 @@ class hash_join { rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; + /** + * @brief Performs an inner join on a partition of the probe table. + * + * This method executes an inner join between a specific partition of the probe table + * (defined by the join_partition_context) and the build table. The context must have been + * previously created by calling inner_join_match_context(). + * + * The returned left_indices are relative to the original complete probe table, not just the + * partition, so they can be used directly with the original probe table. + * + * @param context The partition context containing match information and partition bounds + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the join indices' device memory + * + * @return A pair of device vectors [`left_indices`, `right_indices`] for this partition + */ + [[nodiscard]] std::pair>, + std::unique_ptr>> + partitioned_inner_join( + cudf::join_partition_context const& context, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; + + /** + * @brief Performs a left join on a partition of the probe table. + * + * This method executes a left join between a specific partition of the probe table + * (defined by the join_partition_context) and the build table. The context must have been + * previously created by calling left_join_match_context(). + * + * The returned left_indices are relative to the original complete probe table. + * + * @param context The partition context containing match information and partition bounds + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the join indices' device memory + * + * @return A pair of device vectors [`left_indices`, `right_indices`] for this partition + */ + [[nodiscard]] std::pair>, + std::unique_ptr>> + partitioned_left_join( + cudf::join_partition_context const& context, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; + + /** + * @brief Performs a full join probe on a partition of the probe table. + * + * This method executes the probe-side of a full join between a specific partition of the probe + * table (defined by the join_partition_context) and the build table. The context must have been + * previously created by calling full_join_match_context(). + * + * @note This method does NOT include unmatched build rows (the complement). Since the complement + * is a global property across all partitions, it must be computed separately after all partitions + * are processed using full_join_complement(). + * + * The returned left_indices are relative to the original complete probe table. + * + * @param context The partition context containing match information and partition bounds + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the join indices' device memory + * + * @return A pair of device vectors [`left_indices`, `right_indices`] for this partition + */ + [[nodiscard]] std::pair>, + std::unique_ptr>> + partitioned_full_join( + cudf::join_partition_context const& context, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; + + /** + * @brief Computes the full join complement: unmatched build rows. + * + * After performing partitioned_full_join() on all partitions, call this method with the + * concatenated right_indices from all partitions to obtain the unmatched build table rows. + * The result can be concatenated with the partitioned results to form the complete full join. + * + * @param right_indices Concatenated right (build) indices from all partitioned_full_join() calls + * @param probe_table_num_rows Total number of rows in the probe table + * @param build_table_num_rows Total number of rows in the build table + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the result device memory + * + * @return A pair of device vectors [`left_indices`, `right_indices`] for unmatched build rows, + * where left_indices are all JoinNoMatch and right_indices are the unmatched build row + * indices + */ + [[nodiscard]] static std::pair>, + std::unique_ptr>> + full_join_complement(cudf::device_span right_indices, + size_type probe_table_num_rows, + size_type build_table_num_rows, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + private: std::unique_ptr _impl; }; diff --git a/cpp/include/cudf/join/join.hpp b/cpp/include/cudf/join/join.hpp index 86bc2a5a8f5..67620505422 100644 --- a/cpp/include/cudf/join/join.hpp +++ b/cpp/include/cudf/join/join.hpp @@ -87,6 +87,12 @@ struct join_match_context { : _left_table{left_table}, _match_counts{std::move(match_counts)} { } + join_match_context(join_match_context&&) = default; ///< Move constructor + /** + * @brief Move assignment operator + * @return Reference to this object + */ + join_match_context& operator=(join_match_context&&) = default; virtual ~join_match_context() = default; ///< Virtual destructor for proper polymorphic deletion }; diff --git a/cpp/src/join/hash_join/full_join_complement.cu b/cpp/src/join/hash_join/full_join_complement.cu new file mode 100644 index 00000000000..372755d991e --- /dev/null +++ b/cpp/src/join/hash_join/full_join_complement.cu @@ -0,0 +1,92 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace cudf { + +namespace { + +template +struct valid_range { + T start, stop; + __device__ constexpr bool operator()(T index) const { return index >= start && index < stop; } +}; + +} // namespace + +std::pair>, + std::unique_ptr>> +hash_join::full_join_complement(cudf::device_span right_indices, + size_type probe_table_num_rows, + size_type build_table_num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto right_indices_complement = + std::make_unique>(build_table_num_rows, stream, mr); + + if (probe_table_num_rows == 0) { + // All build rows are unmatched + thrust::sequence(rmm::exec_policy_nosync(stream), + right_indices_complement->begin(), + right_indices_complement->end(), + 0); + } else { + auto invalid_index_map = + std::make_unique>(build_table_num_rows, stream); + thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), + invalid_index_map->begin(), + invalid_index_map->end(), + int32_t{1}); + + valid_range valid{0, build_table_num_rows}; + + thrust::scatter_if(rmm::exec_policy_nosync(stream), + cuda::make_constant_iterator(0), + cuda::make_constant_iterator(0) + right_indices.size(), + right_indices.begin(), + right_indices.begin(), + invalid_index_map->begin(), + valid); + + auto const begin_counter = static_cast(0); + auto const end_counter = static_cast(build_table_num_rows); + + size_type const indices_count = thrust::copy_if(rmm::exec_policy_nosync(stream), + cuda::counting_iterator{begin_counter}, + cuda::counting_iterator{end_counter}, + invalid_index_map->begin(), + right_indices_complement->begin(), + cuda::std::identity{}) - + right_indices_complement->begin(); + right_indices_complement->resize(indices_count, stream); + } + + auto left_invalid_indices = + std::make_unique>(right_indices_complement->size(), stream, mr); + thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), + left_invalid_indices->begin(), + left_invalid_indices->end(), + cudf::JoinNoMatch); + + return std::pair(std::move(left_invalid_indices), std::move(right_indices_complement)); +} + +} // namespace cudf diff --git a/cpp/src/join/hash_join/hash_join.cu b/cpp/src/join/hash_join/hash_join.cu index 5c7679ae588..26f25001c4c 100644 --- a/cpp/src/join/hash_join/hash_join.cu +++ b/cpp/src/join/hash_join/hash_join.cu @@ -259,4 +259,31 @@ cudf::join_match_context hash_join::full_join_match_context(cudf::table_view con return _impl->full_join_match_context(left, stream, mr); } +std::pair>, + std::unique_ptr>> +hash_join::partitioned_inner_join(cudf::join_partition_context const& context, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const +{ + return _impl->partitioned_inner_join(context, stream, mr); +} + +std::pair>, + std::unique_ptr>> +hash_join::partitioned_left_join(cudf::join_partition_context const& context, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const +{ + return _impl->partitioned_left_join(context, stream, mr); +} + +std::pair>, + std::unique_ptr>> +hash_join::partitioned_full_join(cudf::join_partition_context const& context, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const +{ + return _impl->partitioned_full_join(context, stream, mr); +} + } // namespace cudf diff --git a/cpp/src/join/hash_join/partitioned_full_join.cu b/cpp/src/join/hash_join/partitioned_full_join.cu new file mode 100644 index 00000000000..5934574dbda --- /dev/null +++ b/cpp/src/join/hash_join/partitioned_full_join.cu @@ -0,0 +1,26 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "retrieve_impl.cuh" + +namespace cudf::detail { + +template +std::pair>, + std::unique_ptr>> +hash_join::partitioned_full_join(cudf::join_partition_context const& context, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const +{ + return this->template partitioned_join_retrieve(context, stream, mr); +} + +template std::pair>, + std::unique_ptr>> +hash_join::partitioned_full_join(cudf::join_partition_context const& context, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; + +} // namespace cudf::detail diff --git a/cpp/src/join/hash_join/partitioned_inner_join.cu b/cpp/src/join/hash_join/partitioned_inner_join.cu new file mode 100644 index 00000000000..38ba93374e3 --- /dev/null +++ b/cpp/src/join/hash_join/partitioned_inner_join.cu @@ -0,0 +1,26 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "retrieve_impl.cuh" + +namespace cudf::detail { + +template +std::pair>, + std::unique_ptr>> +hash_join::partitioned_inner_join(cudf::join_partition_context const& context, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const +{ + return this->template partitioned_join_retrieve(context, stream, mr); +} + +template std::pair>, + std::unique_ptr>> +hash_join::partitioned_inner_join(cudf::join_partition_context const& context, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; + +} // namespace cudf::detail diff --git a/cpp/src/join/hash_join/partitioned_left_join.cu b/cpp/src/join/hash_join/partitioned_left_join.cu new file mode 100644 index 00000000000..de360af305c --- /dev/null +++ b/cpp/src/join/hash_join/partitioned_left_join.cu @@ -0,0 +1,26 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "retrieve_impl.cuh" + +namespace cudf::detail { + +template +std::pair>, + std::unique_ptr>> +hash_join::partitioned_left_join(cudf::join_partition_context const& context, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const +{ + return this->template partitioned_join_retrieve(context, stream, mr); +} + +template std::pair>, + std::unique_ptr>> +hash_join::partitioned_left_join(cudf::join_partition_context const& context, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; + +} // namespace cudf::detail diff --git a/cpp/src/join/hash_join/retrieve_impl.cuh b/cpp/src/join/hash_join/retrieve_impl.cuh index 58b29562a2c..aacd8f79970 100644 --- a/cpp/src/join/hash_join/retrieve_impl.cuh +++ b/cpp/src/join/hash_join/retrieve_impl.cuh @@ -10,12 +10,20 @@ #include "join/join_common_utils.hpp" #include "size_impl.cuh" +#include #include +#include #include +#include + #include +#include #include #include +#include +#include +#include namespace cudf::detail { @@ -190,4 +198,96 @@ hash_join::join_retrieve(cudf::table_view const& left, } } +template +template +std::pair>, + std::unique_ptr>> +hash_join::partitioned_join_retrieve(cudf::join_partition_context const& context, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const +{ + CUDF_FUNC_RANGE(); + + static_assert(Join == join_kind::INNER_JOIN || Join == join_kind::LEFT_JOIN || + Join == join_kind::FULL_JOIN); + + auto const& match_ctx = *context.left_table_context; + auto const left_start_idx = context.left_start_idx; + auto const left_end_idx = context.left_end_idx; + + // Empty partition + if (left_start_idx >= left_end_idx) { + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); + } + + auto const partition_size = left_end_idx - left_start_idx; + + // Trivial case: build table is empty + if (_is_empty) { + if constexpr (Join == join_kind::INNER_JOIN) { + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); + } else { + auto left_indices = + std::make_unique>(partition_size, stream, mr); + auto right_indices = + std::make_unique>(partition_size, stream, mr); + thrust::sequence(rmm::exec_policy_nosync(stream), + left_indices->begin(), + left_indices->end(), + left_start_idx); + thrust::fill( + rmm::exec_policy_nosync(stream), right_indices->begin(), right_indices->end(), JoinNoMatch); + return std::pair(std::move(left_indices), std::move(right_indices)); + } + } + + // Compute output size from pre-computed match counts + auto const output_size = thrust::reduce(rmm::exec_policy_nosync(stream), + match_ctx._match_counts->begin() + left_start_idx, + match_ctx._match_counts->begin() + left_end_idx, + std::size_t{0}); + + if (output_size == 0) { + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); + } + + // Slice the probe table to the partition range + auto const probe_partition_view = + cudf::slice(match_ctx._left_table, {left_start_idx, left_end_idx})[0]; + + validate_hash_join_probe(_build, probe_partition_view, _has_nulls); + + auto const preprocessed_probe = + cudf::detail::row::equality::preprocessed_table::create(probe_partition_view, stream); + + // For FULL_JOIN, probe with LEFT_JOIN semantics (no complement here) + constexpr auto probe_join = Join == join_kind::FULL_JOIN ? join_kind::LEFT_JOIN : Join; + + auto join_indices = cudf::detail::probe_join_hash_table(_build, + probe_partition_view, + _preprocessed_build, + preprocessed_probe, + _impl->_hash_table, + _has_nulls, + _nulls_equal, + output_size, + stream, + mr); + + // Offset left indices to be relative to the original complete probe table + if (left_start_idx > 0 && join_indices.first->size() > 0) { + thrust::transform(rmm::exec_policy_nosync(stream), + join_indices.first->begin(), + join_indices.first->end(), + cuda::make_constant_iterator(left_start_idx), + join_indices.first->begin(), + cuda::std::plus{}); + } + + return join_indices; +} + } // namespace cudf::detail diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index d08c7fcf8a6..61f173e0199 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -3137,4 +3137,310 @@ TEST_F(SortMergeJoinThreadSafetyTest, ConcurrentPartitionedJoins) } } +TEST_F(JoinTest, HashJoinPartitionedInnerJoin) +{ + column_wrapper col0_0{{3, 1, 2, 0, 2}}; + strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {true, true, false, true, true}); + column_wrapper col0_2{{0, 1, 2, 4, 1}}; + + column_wrapper col1_0{{2, 2, 0, 4, 3}}; + strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}, {true, false, true, true, true}); + column_wrapper col1_2{{1, 0, 1, 2, 1}, {true, false, true, true, true}}; + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols0.push_back(col0_2.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + cols1.push_back(col1_2.release()); + + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + auto const left_on = std::vector({0, 1}); + auto const right_on = std::vector({0, 1}); + auto const compare_nulls = cudf::null_equality::EQUAL; + auto const stream = cudf::get_default_stream(); + auto const mr = cudf::get_current_device_resource_ref(); + + // Reference result from full inner join + auto expected_result = inner_join(t0, t1, left_on, right_on, compare_nulls); + auto expected_sort_order = cudf::sorted_order(expected_result->view()); + auto expected_sorted = cudf::gather(expected_result->view(), *expected_sort_order); + + // Partitioned inner join + cudf::hash_join hash_joiner(t1.select(right_on), compare_nulls, stream); + auto match_ctx = hash_joiner.inner_join_match_context(t0.select(left_on), stream, mr); + auto part_ctx = cudf::join_partition_context{ + std::make_unique(std::move(match_ctx)), 0, 0}; + + auto join_and_gather = [&](cudf::join_partition_context const& ctx) { + auto const [left_idx, right_idx] = hash_joiner.partitioned_inner_join(ctx, stream, mr); + auto left_col = cudf::column_view{cudf::device_span{*left_idx}}; + auto right_col = cudf::column_view{cudf::device_span{*right_idx}}; + auto left_res = cudf::gather(t0, left_col, cudf::out_of_bounds_policy::DONT_CHECK); + auto right_res = cudf::gather(t1, right_col, cudf::out_of_bounds_policy::DONT_CHECK); + auto joined = left_res->release(); + auto right_c = right_res->release(); + joined.insert(joined.end(), + std::make_move_iterator(right_c.begin()), + std::make_move_iterator(right_c.end())); + return std::make_unique(std::move(joined)); + }; + + // Process row by row + std::vector> partials; + std::vector partial_views; + for (cudf::size_type i = 0; i < t0.num_rows(); i++) { + part_ctx.left_start_idx = i; + part_ctx.left_end_idx = i + 1; + partials.push_back(join_and_gather(part_ctx)); + partial_views.push_back(partials.back()->view()); + } + + auto concat_result = cudf::concatenate(partial_views, stream, mr); + auto concat_sort_order = cudf::sorted_order(concat_result->view()); + auto concat_sorted = cudf::gather(concat_result->view(), *concat_sort_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*expected_sorted, *concat_sorted); +} + +TEST_F(JoinTest, HashJoinPartitionedLeftJoin) +{ + column_wrapper col0_0{{3, 1, 2, 0, 2}}; + strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {true, true, false, true, true}); + column_wrapper col0_2{{0, 1, 2, 4, 1}}; + + column_wrapper col1_0{{2, 2, 0, 4, 3}}; + strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}, {true, false, true, true, true}); + column_wrapper col1_2{{1, 0, 1, 2, 1}, {true, false, true, true, true}}; + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols0.push_back(col0_2.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + cols1.push_back(col1_2.release()); + + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + auto const left_on = std::vector({0, 1}); + auto const right_on = std::vector({0, 1}); + auto const compare_nulls = cudf::null_equality::EQUAL; + auto const stream = cudf::get_default_stream(); + auto const mr = cudf::get_current_device_resource_ref(); + + // Reference result from full left join + auto expected_result = left_join(t0, t1, left_on, right_on, compare_nulls); + auto expected_sort_order = cudf::sorted_order(expected_result->view()); + auto expected_sorted = cudf::gather(expected_result->view(), *expected_sort_order); + + // Partitioned left join + cudf::hash_join hash_joiner(t1.select(right_on), compare_nulls, stream); + auto match_ctx = hash_joiner.left_join_match_context(t0.select(left_on), stream, mr); + auto part_ctx = cudf::join_partition_context{ + std::make_unique(std::move(match_ctx)), 0, 0}; + + auto join_and_gather = [&](cudf::join_partition_context const& ctx) { + auto const [left_idx, right_idx] = hash_joiner.partitioned_left_join(ctx, stream, mr); + auto left_col = cudf::column_view{cudf::device_span{*left_idx}}; + auto right_col = cudf::column_view{cudf::device_span{*right_idx}}; + auto left_res = cudf::gather(t0, left_col, cudf::out_of_bounds_policy::NULLIFY); + auto right_res = cudf::gather(t1, right_col, cudf::out_of_bounds_policy::NULLIFY); + auto joined = left_res->release(); + auto right_c = right_res->release(); + joined.insert(joined.end(), + std::make_move_iterator(right_c.begin()), + std::make_move_iterator(right_c.end())); + return std::make_unique(std::move(joined)); + }; + + std::vector> partials; + std::vector partial_views; + for (cudf::size_type i = 0; i < t0.num_rows(); i++) { + part_ctx.left_start_idx = i; + part_ctx.left_end_idx = i + 1; + partials.push_back(join_and_gather(part_ctx)); + partial_views.push_back(partials.back()->view()); + } + + auto concat_result = cudf::concatenate(partial_views, stream, mr); + auto concat_sort_order = cudf::sorted_order(concat_result->view()); + auto concat_sorted = cudf::gather(concat_result->view(), *concat_sort_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*expected_sorted, *concat_sorted); +} + +TEST_F(JoinTest, HashJoinPartitionedFullJoin) +{ + column_wrapper col0_0{{3, 1, 2, 0, 2}}; + strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {true, true, false, true, true}); + column_wrapper col0_2{{0, 1, 2, 4, 1}}; + + column_wrapper col1_0{{2, 2, 0, 4, 3}}; + strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}, {true, false, true, true, true}); + column_wrapper col1_2{{1, 0, 1, 2, 1}, {true, false, true, true, true}}; + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols0.push_back(col0_2.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + cols1.push_back(col1_2.release()); + + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + auto const left_on = std::vector({0, 1}); + auto const right_on = std::vector({0, 1}); + auto const compare_nulls = cudf::null_equality::EQUAL; + auto const stream = cudf::get_default_stream(); + auto const mr = cudf::get_current_device_resource_ref(); + + // Reference result from full join + auto expected_result = full_join(t0, t1, left_on, right_on, compare_nulls); + auto expected_sort_order = cudf::sorted_order(expected_result->view()); + auto expected_sorted = cudf::gather(expected_result->view(), *expected_sort_order); + + // Partitioned full join (probe side) + cudf::hash_join hash_joiner(t1.select(right_on), compare_nulls, stream); + auto match_ctx = hash_joiner.full_join_match_context(t0.select(left_on), stream, mr); + auto part_ctx = cudf::join_partition_context{ + std::make_unique(std::move(match_ctx)), 0, 0}; + + // Collect right indices on device for complement computation + std::vector>> right_idx_parts; + + std::vector> partials; + std::vector partial_views; + for (cudf::size_type i = 0; i < t0.num_rows(); i++) { + part_ctx.left_start_idx = i; + part_ctx.left_end_idx = i + 1; + auto [left_idx, right_idx] = hash_joiner.partitioned_full_join(part_ctx, stream, mr); + + auto left_col = cudf::column_view{cudf::device_span{*left_idx}}; + auto right_col = cudf::column_view{cudf::device_span{*right_idx}}; + auto left_res = cudf::gather(t0, left_col, cudf::out_of_bounds_policy::NULLIFY); + auto right_res = cudf::gather(t1, right_col, cudf::out_of_bounds_policy::NULLIFY); + auto joined = left_res->release(); + auto right_c = right_res->release(); + joined.insert(joined.end(), + std::make_move_iterator(right_c.begin()), + std::make_move_iterator(right_c.end())); + partials.push_back(std::make_unique(std::move(joined))); + partial_views.push_back(partials.back()->view()); + right_idx_parts.push_back(std::move(right_idx)); + } + + // Concatenate all right indices on device for complement + std::vector right_idx_views; + for (auto const& part : right_idx_parts) { + right_idx_views.push_back(cudf::column_view{cudf::device_span{*part}}); + } + auto all_right_indices_col = cudf::concatenate(right_idx_views, stream, mr); + + // Compute complement (unmatched build rows) + auto [complement_left, complement_right] = cudf::hash_join::full_join_complement( + cudf::device_span{ + all_right_indices_col->view().data(), + static_cast(all_right_indices_col->size())}, + t0.select(left_on).num_rows(), + t1.select(right_on).num_rows(), + stream, + mr); + + // Gather complement rows + if (complement_left->size() > 0) { + auto left_col = cudf::column_view{cudf::device_span{*complement_left}}; + auto right_col = cudf::column_view{cudf::device_span{*complement_right}}; + auto left_res = cudf::gather(t0, left_col, cudf::out_of_bounds_policy::NULLIFY); + auto right_res = cudf::gather(t1, right_col, cudf::out_of_bounds_policy::NULLIFY); + auto joined = left_res->release(); + auto right_c = right_res->release(); + joined.insert(joined.end(), + std::make_move_iterator(right_c.begin()), + std::make_move_iterator(right_c.end())); + partials.push_back(std::make_unique(std::move(joined))); + partial_views.push_back(partials.back()->view()); + } + + auto concat_result = cudf::concatenate(partial_views, stream, mr); + auto concat_sort_order = cudf::sorted_order(concat_result->view()); + auto concat_sorted = cudf::gather(concat_result->view(), *concat_sort_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*expected_sorted, *concat_sorted); +} + +TEST_F(JoinTest, HashJoinPartitionedEmptyPartition) +{ + column_wrapper col0{{3, 1, 2, 0, 2}}; + column_wrapper col1{{2, 2, 0, 4, 3}}; + + CVector cols0, cols1; + cols0.push_back(col0.release()); + cols1.push_back(col1.release()); + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + auto const stream = cudf::get_default_stream(); + auto const mr = cudf::get_current_device_resource_ref(); + + cudf::hash_join hash_joiner(t1.select({0}), cudf::null_equality::EQUAL, stream); + auto match_ctx = hash_joiner.inner_join_match_context(t0.select({0}), stream, mr); + auto part_ctx = cudf::join_partition_context{ + std::make_unique(std::move(match_ctx)), 0, 0}; + + // Empty partition (start == end) + part_ctx.left_start_idx = 2; + part_ctx.left_end_idx = 2; + auto [left_idx, right_idx] = hash_joiner.partitioned_inner_join(part_ctx, stream, mr); + EXPECT_EQ(left_idx->size(), 0); + EXPECT_EQ(right_idx->size(), 0); +} + +TEST_F(JoinTest, HashJoinPartitionedWholeTable) +{ + column_wrapper col0{{3, 1, 2, 0, 2}}; + column_wrapper col1{{2, 2, 0, 4, 3}}; + + CVector cols0, cols1; + cols0.push_back(col0.release()); + cols1.push_back(col1.release()); + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + auto const stream = cudf::get_default_stream(); + auto const mr = cudf::get_current_device_resource_ref(); + + // Reference: full inner join + auto expected = inner_join(t0, t1, {0}, {0}); + auto expected_order = cudf::sorted_order(expected->view()); + auto expected_sort = cudf::gather(expected->view(), *expected_order); + + // Partitioned: entire table as one partition + cudf::hash_join hash_joiner(t1.select({0}), cudf::null_equality::EQUAL, stream); + auto match_ctx = hash_joiner.inner_join_match_context(t0.select({0}), stream, mr); + auto part_ctx = cudf::join_partition_context{ + std::make_unique(std::move(match_ctx)), 0, t0.num_rows()}; + + auto [left_idx, right_idx] = hash_joiner.partitioned_inner_join(part_ctx, stream, mr); + auto left_col = cudf::column_view{cudf::device_span{*left_idx}}; + auto right_col = cudf::column_view{cudf::device_span{*right_idx}}; + auto left_res = cudf::gather(t0, left_col, cudf::out_of_bounds_policy::DONT_CHECK); + auto right_res = cudf::gather(t1, right_col, cudf::out_of_bounds_policy::DONT_CHECK); + auto joined = left_res->release(); + auto right_c = right_res->release(); + joined.insert( + joined.end(), std::make_move_iterator(right_c.begin()), std::make_move_iterator(right_c.end())); + auto result = std::make_unique(std::move(joined)); + auto result_order = cudf::sorted_order(result->view()); + auto result_sort = cudf::gather(result->view(), *result_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*expected_sort, *result_sort); +} + CUDF_TEST_PROGRAM_MAIN() From 2a3ef4c26cf546bfc6fd859149e755ee20a0195f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 10 Apr 2026 13:27:12 -0700 Subject: [PATCH 02/29] Clean up rmm policy mr usage --- .../join/hash_join/full_join_complement.cu | 38 ++++++++++--------- cpp/src/join/hash_join/retrieve_impl.cuh | 22 +++++++---- 2 files changed, 35 insertions(+), 25 deletions(-) diff --git a/cpp/src/join/hash_join/full_join_complement.cu b/cpp/src/join/hash_join/full_join_complement.cu index 372755d991e..128bccf479d 100644 --- a/cpp/src/join/hash_join/full_join_complement.cu +++ b/cpp/src/join/hash_join/full_join_complement.cu @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -44,21 +45,22 @@ hash_join::full_join_complement(cudf::device_span right_indices if (probe_table_num_rows == 0) { // All build rows are unmatched - thrust::sequence(rmm::exec_policy_nosync(stream), + thrust::sequence(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), right_indices_complement->begin(), right_indices_complement->end(), 0); } else { auto invalid_index_map = std::make_unique>(build_table_num_rows, stream); - thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), - invalid_index_map->begin(), - invalid_index_map->end(), - int32_t{1}); + thrust::uninitialized_fill( + rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), + invalid_index_map->begin(), + invalid_index_map->end(), + int32_t{1}); valid_range valid{0, build_table_num_rows}; - thrust::scatter_if(rmm::exec_policy_nosync(stream), + thrust::scatter_if(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), cuda::make_constant_iterator(0), cuda::make_constant_iterator(0) + right_indices.size(), right_indices.begin(), @@ -69,22 +71,24 @@ hash_join::full_join_complement(cudf::device_span right_indices auto const begin_counter = static_cast(0); auto const end_counter = static_cast(build_table_num_rows); - size_type const indices_count = thrust::copy_if(rmm::exec_policy_nosync(stream), - cuda::counting_iterator{begin_counter}, - cuda::counting_iterator{end_counter}, - invalid_index_map->begin(), - right_indices_complement->begin(), - cuda::std::identity{}) - - right_indices_complement->begin(); + size_type const indices_count = + thrust::copy_if(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), + cuda::counting_iterator{begin_counter}, + cuda::counting_iterator{end_counter}, + invalid_index_map->begin(), + right_indices_complement->begin(), + cuda::std::identity{}) - + right_indices_complement->begin(); right_indices_complement->resize(indices_count, stream); } auto left_invalid_indices = std::make_unique>(right_indices_complement->size(), stream, mr); - thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), - left_invalid_indices->begin(), - left_invalid_indices->end(), - cudf::JoinNoMatch); + thrust::uninitialized_fill( + rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), + left_invalid_indices->begin(), + left_invalid_indices->end(), + cudf::JoinNoMatch); return std::pair(std::move(left_invalid_indices), std::move(right_indices_complement)); } diff --git a/cpp/src/join/hash_join/retrieve_impl.cuh b/cpp/src/join/hash_join/retrieve_impl.cuh index aacd8f79970..e031ee18ed8 100644 --- a/cpp/src/join/hash_join/retrieve_impl.cuh +++ b/cpp/src/join/hash_join/retrieve_impl.cuh @@ -12,7 +12,9 @@ #include #include +#include #include +#include #include #include @@ -20,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -233,21 +236,24 @@ hash_join::partitioned_join_retrieve(cudf::join_partition_context const& std::make_unique>(partition_size, stream, mr); auto right_indices = std::make_unique>(partition_size, stream, mr); - thrust::sequence(rmm::exec_policy_nosync(stream), + thrust::sequence(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), left_indices->begin(), left_indices->end(), left_start_idx); - thrust::fill( - rmm::exec_policy_nosync(stream), right_indices->begin(), right_indices->end(), JoinNoMatch); + thrust::fill(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), + right_indices->begin(), + right_indices->end(), + JoinNoMatch); return std::pair(std::move(left_indices), std::move(right_indices)); } } // Compute output size from pre-computed match counts - auto const output_size = thrust::reduce(rmm::exec_policy_nosync(stream), - match_ctx._match_counts->begin() + left_start_idx, - match_ctx._match_counts->begin() + left_end_idx, - std::size_t{0}); + auto const output_size = + thrust::reduce(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), + match_ctx._match_counts->begin() + left_start_idx, + match_ctx._match_counts->begin() + left_end_idx, + std::size_t{0}); if (output_size == 0) { return std::pair(std::make_unique>(0, stream, mr), @@ -279,7 +285,7 @@ hash_join::partitioned_join_retrieve(cudf::join_partition_context const& // Offset left indices to be relative to the original complete probe table if (left_start_idx > 0 && join_indices.first->size() > 0) { - thrust::transform(rmm::exec_policy_nosync(stream), + thrust::transform(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), join_indices.first->begin(), join_indices.first->end(), cuda::make_constant_iterator(left_start_idx), From d89b38436f5ec71fb6c296497536fdf513a558a2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 13 Apr 2026 20:07:16 -0700 Subject: [PATCH 03/29] Use custom kernels + file split --- cpp/CMakeLists.txt | 4 + cpp/src/join/hash_join/count_each.cu | 23 +++ cpp/src/join/hash_join/count_each_outer.cu | 23 +++ cpp/src/join/hash_join/count_kernels.cuh | 77 ++++++++++ cpp/src/join/hash_join/count_kernels.hpp | 24 ++++ cpp/src/join/hash_join/kernels_common.cuh | 29 ++++ cpp/src/join/hash_join/match_context.cu | 42 +++--- cpp/src/join/hash_join/ref_types.cuh | 42 ++++++ cpp/src/join/hash_join/retrieve.cu | 35 +++++ cpp/src/join/hash_join/retrieve_impl.cuh | 62 +++++--- cpp/src/join/hash_join/retrieve_kernels.cuh | 149 ++++++++++++++++++++ cpp/src/join/hash_join/retrieve_kernels.hpp | 23 +++ cpp/src/join/hash_join/retrieve_outer.cu | 35 +++++ 13 files changed, 527 insertions(+), 41 deletions(-) create mode 100644 cpp/src/join/hash_join/count_each.cu create mode 100644 cpp/src/join/hash_join/count_each_outer.cu create mode 100644 cpp/src/join/hash_join/count_kernels.cuh create mode 100644 cpp/src/join/hash_join/count_kernels.hpp create mode 100644 cpp/src/join/hash_join/kernels_common.cuh create mode 100644 cpp/src/join/hash_join/ref_types.cuh create mode 100644 cpp/src/join/hash_join/retrieve.cu create mode 100644 cpp/src/join/hash_join/retrieve_kernels.cuh create mode 100644 cpp/src/join/hash_join/retrieve_kernels.hpp create mode 100644 cpp/src/join/hash_join/retrieve_outer.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5d21e5a4060..ecb079aa31b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -682,6 +682,10 @@ add_library( src/join/hash_join/left_join_retrieve.cu src/join/hash_join/left_join_size.cu src/join/hash_join/match_context.cu + src/join/hash_join/count_each.cu + src/join/hash_join/count_each_outer.cu + src/join/hash_join/retrieve.cu + src/join/hash_join/retrieve_outer.cu src/join/hash_join/partitioned_full_join.cu src/join/hash_join/partitioned_inner_join.cu src/join/hash_join/partitioned_left_join.cu diff --git a/cpp/src/join/hash_join/count_each.cu b/cpp/src/join/hash_join/count_each.cu new file mode 100644 index 00000000000..db4024f44a0 --- /dev/null +++ b/cpp/src/join/hash_join/count_each.cu @@ -0,0 +1,23 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "count_kernels.cuh" +#include "ref_types.cuh" + +namespace cudf::detail { + +template void launch_count_each(probe_key_type const*, + cuda::std::int64_t, + size_type*, + primitive_count_ref_t, + rmm::cuda_stream_view); + +template void launch_count_each( + probe_key_type const*, cuda::std::int64_t, size_type*, nested_count_ref_t, rmm::cuda_stream_view); + +template void launch_count_each( + probe_key_type const*, cuda::std::int64_t, size_type*, flat_count_ref_t, rmm::cuda_stream_view); + +} // namespace cudf::detail diff --git a/cpp/src/join/hash_join/count_each_outer.cu b/cpp/src/join/hash_join/count_each_outer.cu new file mode 100644 index 00000000000..1dd029b08cd --- /dev/null +++ b/cpp/src/join/hash_join/count_each_outer.cu @@ -0,0 +1,23 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "count_kernels.cuh" +#include "ref_types.cuh" + +namespace cudf::detail { + +template void launch_count_each(probe_key_type const*, + cuda::std::int64_t, + size_type*, + primitive_count_ref_t, + rmm::cuda_stream_view); + +template void launch_count_each( + probe_key_type const*, cuda::std::int64_t, size_type*, nested_count_ref_t, rmm::cuda_stream_view); + +template void launch_count_each( + probe_key_type const*, cuda::std::int64_t, size_type*, flat_count_ref_t, rmm::cuda_stream_view); + +} // namespace cudf::detail diff --git a/cpp/src/join/hash_join/count_kernels.cuh b/cpp/src/join/hash_join/count_kernels.cuh new file mode 100644 index 00000000000..38c4ba76e1a --- /dev/null +++ b/cpp/src/join/hash_join/count_kernels.cuh @@ -0,0 +1,77 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +// Ported from cuco's open_addressing count_each kernel. + +#pragma once + +#include "kernels_common.cuh" + +#include + +#include + +#include +#include + +namespace cudf::detail { + +template +CUDF_KERNEL void __launch_bounds__(PROBE_BLOCK_SIZE) + count_each_kernel(probe_key_type const* __restrict__ keys, + cuda::std::int64_t n, + size_type* __restrict__ output, + Ref ref) +{ + auto constexpr cg_size = PROBE_CG_SIZE; + + auto idx = grid_1d::global_thread_id() / cg_size; + auto const stride = grid_1d::grid_stride() / cg_size; + + while (idx < n) { + auto const key = keys[idx]; + if constexpr (cg_size == 1) { + auto const cnt = ref.count(key); + if constexpr (IsOuter) { + output[idx] = (cnt == 0) ? size_type{1} : cnt; + } else { + output[idx] = cnt; + } + } else { + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + if constexpr (IsOuter) { + auto temp_count = static_cast(ref.count(tile, key)); + if (tile.all(temp_count == 0) and tile.thread_rank() == 0) { ++temp_count; } + auto const cnt = + cooperative_groups::reduce(tile, temp_count, cooperative_groups::plus()); + if (tile.thread_rank() == 0) { output[idx] = cnt; } + } else { + auto const cnt = cooperative_groups::reduce(tile, + static_cast(ref.count(tile, key)), + cooperative_groups::plus()); + if (tile.thread_rank() == 0) { output[idx] = cnt; } + } + } + idx += stride; + } +} + +template +void launch_count_each(probe_key_type const* keys, + cuda::std::int64_t n, + size_type* output, + Ref ref, + rmm::cuda_stream_view stream) +{ + if (n == 0) { return; } + + auto const config = grid_1d{static_cast(n * PROBE_CG_SIZE), PROBE_BLOCK_SIZE}; + + count_each_kernel + <<>>(keys, n, output, ref); +} + +} // namespace cudf::detail diff --git a/cpp/src/join/hash_join/count_kernels.hpp b/cpp/src/join/hash_join/count_kernels.hpp new file mode 100644 index 00000000000..9ea1e58d0fb --- /dev/null +++ b/cpp/src/join/hash_join/count_kernels.hpp @@ -0,0 +1,24 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include "kernels_common.cuh" + +#include + +#include + +namespace cudf::detail { + +/// Launch the count_each kernel. +template +void launch_count_each(probe_key_type const* keys, + cuda::std::int64_t n, + size_type* output, + Ref ref, + rmm::cuda_stream_view stream); + +} // namespace cudf::detail diff --git a/cpp/src/join/hash_join/kernels_common.cuh b/cpp/src/join/hash_join/kernels_common.cuh new file mode 100644 index 00000000000..7b86fe5da46 --- /dev/null +++ b/cpp/src/join/hash_join/kernels_common.cuh @@ -0,0 +1,29 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +// Ported from cuco's open_addressing kernels and ref_impl to give cudf direct +// control over hash-join probe kernel launches. The device-side probing logic +// is identical to cuco's static_multiset::count / count_each / retrieve / +// retrieve_outer. We keep the cuco ref type for hash-table access (storage, +// probing scheme, predicate) and only replace the host-side launch. + +#pragma once + +#include +#include +#include + +#include + +namespace cudf::detail { + +// Constants matching the cuco static_multiset configuration used by hash_join. +inline constexpr int PROBE_BLOCK_SIZE = 128; // cuco::detail::default_block_size() +inline constexpr int PROBE_CG_SIZE = DEFAULT_JOIN_CG_SIZE; // 2 + +/// The probe key type stored in the hash table: {hash_value, row_index}. +using probe_key_type = cuco::pair; + +} // namespace cudf::detail diff --git a/cpp/src/join/hash_join/match_context.cu b/cpp/src/join/hash_join/match_context.cu index b6653be623c..6404de843de 100644 --- a/cpp/src/join/hash_join/match_context.cu +++ b/cpp/src/join/hash_join/match_context.cu @@ -4,6 +4,7 @@ */ #include "common.cuh" +#include "count_kernels.hpp" #include "dispatch.cuh" #include "join/join_common_utils.cuh" @@ -12,18 +13,12 @@ #include #include +#include #include -#include +#include namespace cudf::detail { -namespace { -/// Functor that ensures a minimum count of 1 for LEFT/FULL join match counts. -struct clamp_zero_to_one { - __device__ size_type operator()(size_type count) const { return count == 0 ? 1 : count; } -}; -} // namespace - std::unique_ptr> make_join_match_counts( table_view const& right, std::shared_ptr const& preprocessed_right, @@ -55,24 +50,23 @@ std::unique_ptr> make_join_match_counts( auto const left_table_num_rows = left.num_rows(); auto count_matches = [&](auto equality, auto d_hasher) { - auto const iter = cudf::detail::make_counting_transform_iterator(0, pair_fn{d_hasher}); + // Precompute probe keys: {hash(row_idx), row_idx} for each probe row. + auto const n = static_cast(probe_table_num_rows); + rmm::device_uvector probe_keys(n, stream); + thrust::transform(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), + cuda::counting_iterator(0), + cuda::counting_iterator(probe_table_num_rows), + probe_keys.begin(), + pair_fn{d_hasher}); + + auto const ref = hash_table.ref(cuco::op::count) + .rebind_key_eq(equality) + .rebind_hash_function(hash_table.hash_function()); if (join == join_kind::INNER_JOIN) { - hash_table.count_each(iter, - iter + left_table_num_rows, - equality, - hash_table.hash_function(), - match_counts->begin(), - stream.value()); + launch_count_each(probe_keys.data(), n, match_counts->begin(), ref, stream); } else { - // For LEFT/FULL joins, fuse the clamp into the output to avoid a separate kernel launch. - auto const output = - thrust::make_transform_output_iterator(match_counts->begin(), clamp_zero_to_one{}); - hash_table.count_each(iter, - iter + left_table_num_rows, - equality, - hash_table.hash_function(), - output, - stream.value()); + // IsOuter=true handles the clamp (zero → 1) for LEFT/FULL joins internally. + launch_count_each(probe_keys.data(), n, match_counts->begin(), ref, stream); } }; diff --git a/cpp/src/join/hash_join/ref_types.cuh b/cpp/src/join/hash_join/ref_types.cuh new file mode 100644 index 00000000000..b05601d930b --- /dev/null +++ b/cpp/src/join/hash_join/ref_types.cuh @@ -0,0 +1,42 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +// Type aliases for the cuco hash table ref types and equality comparators +// used across hash join probe kernels. There are 3 dispatch paths: +// primitive, nested, non-nested. + +#pragma once + +#include "dispatch.cuh" +#include "hash_join_impl.cuh" + +#include + +namespace cudf::detail { + +// --- Equality types from the 3 dispatch paths --- + +using primitive_equality_t = primitive_pair_equal; + +using nested_equality_t = pair_equal>>; + +using flat_equality_t = pair_equal>>; + +// --- Count ref types (used by count_each kernel) --- + +template +using count_ref_t = + decltype(std::declval() + .ref(cuco::op::count) + .rebind_key_eq(std::declval()) + .rebind_hash_function(std::declval().hash_function())); + +using primitive_count_ref_t = count_ref_t; +using nested_count_ref_t = count_ref_t; +using flat_count_ref_t = count_ref_t; + +} // namespace cudf::detail diff --git a/cpp/src/join/hash_join/retrieve.cu b/cpp/src/join/hash_join/retrieve.cu new file mode 100644 index 00000000000..e33ed2be109 --- /dev/null +++ b/cpp/src/join/hash_join/retrieve.cu @@ -0,0 +1,35 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "ref_types.cuh" +#include "retrieve_kernels.cuh" + +namespace cudf::detail { + +template std::size_t launch_retrieve(probe_key_type const*, + cuda::std::int64_t, + size_type*, + size_type*, + size_type const*, + primitive_count_ref_t, + rmm::cuda_stream_view); + +template std::size_t launch_retrieve(probe_key_type const*, + cuda::std::int64_t, + size_type*, + size_type*, + size_type const*, + nested_count_ref_t, + rmm::cuda_stream_view); + +template std::size_t launch_retrieve(probe_key_type const*, + cuda::std::int64_t, + size_type*, + size_type*, + size_type const*, + flat_count_ref_t, + rmm::cuda_stream_view); + +} // namespace cudf::detail diff --git a/cpp/src/join/hash_join/retrieve_impl.cuh b/cpp/src/join/hash_join/retrieve_impl.cuh index e031ee18ed8..aa83eb4bcc6 100644 --- a/cpp/src/join/hash_join/retrieve_impl.cuh +++ b/cpp/src/join/hash_join/retrieve_impl.cuh @@ -8,6 +8,7 @@ #include "dispatch.cuh" #include "join/join_common_utils.cuh" #include "join/join_common_utils.hpp" +#include "retrieve_kernels.hpp" #include "size_impl.cuh" #include @@ -17,6 +18,7 @@ #include #include +#include #include #include @@ -260,6 +262,9 @@ hash_join::partitioned_join_retrieve(cudf::join_partition_context const& std::make_unique>(0, stream, mr)); } + auto left_indices = std::make_unique>(output_size, stream, mr); + auto right_indices = std::make_unique>(output_size, stream, mr); + // Slice the probe table to the partition range auto const probe_partition_view = cudf::slice(match_ctx._left_table, {left_start_idx, left_end_idx})[0]; @@ -270,30 +275,53 @@ hash_join::partitioned_join_retrieve(cudf::join_partition_context const& cudf::detail::row::equality::preprocessed_table::create(probe_partition_view, stream); // For FULL_JOIN, probe with LEFT_JOIN semantics (no complement here) - constexpr auto probe_join = Join == join_kind::FULL_JOIN ? join_kind::LEFT_JOIN : Join; - - auto join_indices = cudf::detail::probe_join_hash_table(_build, - probe_partition_view, - _preprocessed_build, - preprocessed_probe, - _impl->_hash_table, - _has_nulls, - _nulls_equal, - output_size, - stream, - mr); + constexpr bool is_outer = (Join != join_kind::INNER_JOIN); + + auto retrieve_partition = [&](auto equality, auto d_hasher) { + // Precompute probe keys for this partition slice. + auto const n = static_cast(partition_size); + rmm::device_uvector probe_keys(n, stream); + thrust::transform(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), + cuda::counting_iterator(0), + cuda::counting_iterator(partition_size), + probe_keys.begin(), + pair_fn{d_hasher}); + + auto const ref = _impl->_hash_table.ref(cuco::op::count) + .rebind_key_eq(equality) + .rebind_hash_function(_impl->_hash_table.hash_function()); + + // Pass the partition's slice of pre-computed match counts. + auto const* partition_counts = match_ctx._match_counts->data() + left_start_idx; + + launch_retrieve(probe_keys.data(), + n, + left_indices->data(), + right_indices->data(), + partition_counts, + ref, + stream); + }; + + dispatch_join_comparator(_build, + probe_partition_view, + _preprocessed_build, + preprocessed_probe, + _has_nulls, + _nulls_equal, + retrieve_partition); // Offset left indices to be relative to the original complete probe table - if (left_start_idx > 0 && join_indices.first->size() > 0) { + if (left_start_idx > 0 && left_indices->size() > 0) { thrust::transform(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - join_indices.first->begin(), - join_indices.first->end(), + left_indices->begin(), + left_indices->end(), cuda::make_constant_iterator(left_start_idx), - join_indices.first->begin(), + left_indices->begin(), cuda::std::plus{}); } - return join_indices; + return std::pair(std::move(left_indices), std::move(right_indices)); } } // namespace cudf::detail diff --git a/cpp/src/join/hash_join/retrieve_kernels.cuh b/cpp/src/join/hash_join/retrieve_kernels.cuh new file mode 100644 index 00000000000..2e339b6d013 --- /dev/null +++ b/cpp/src/join/hash_join/retrieve_kernels.cuh @@ -0,0 +1,149 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +// Hash join retrieve kernel using prefix-scan offsets. Each CG knows +// exactly where to write — no atomics, no shared-memory buffering. +// Uses cuco ref public APIs: storage_ref(), probing_scheme(), empty_key_sentinel(), key_eq(). + +#pragma once + +#include "kernels_common.cuh" + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +namespace cudf::detail { + +template +CUDF_KERNEL void __launch_bounds__(PROBE_BLOCK_SIZE) + retrieve_kernel(probe_key_type const* __restrict__ input_probe, + size_type const* __restrict__ offsets, + cuda::std::int64_t n, + size_type* __restrict__ left_output, + size_type* __restrict__ right_output, + Ref ref) +{ + namespace cg = cooperative_groups; + + auto constexpr cg_size = Ref::cg_size; + auto constexpr bucket_size = Ref::bucket_size; + auto const empty_sentinel = ref.empty_key_sentinel(); + auto const key_equal = ref.key_eq(); + + auto const tile = cg::tiled_partition(cg::this_thread_block()); + auto idx = grid_1d::global_thread_id() / cg_size; + auto const stride = grid_1d::grid_stride() / cg_size; + + while (idx < n) { + auto const probe_key = input_probe[idx]; + auto const left_index = probe_key.second; + auto write_pos = static_cast(offsets[idx]); + + auto probing_iter = ref.probing_scheme().template make_iterator( + tile, probe_key, ref.storage_ref().extent()); + auto const init_probing_idx = *probing_iter; + + bool running = true; + [[maybe_unused]] bool found_match = false; + + while (tile.any(running)) { + if (running) { + auto const bucket_slots = ref.storage_ref()[*probing_iter]; + + bool equals[bucket_size]; + for (int i = 0; i < bucket_size; ++i) { + equals[i] = false; + if (running) { + if (bucket_slots[i] == empty_sentinel) { + running = false; + } else if (key_equal(probe_key, bucket_slots[i])) { + equals[i] = true; + } + } + } + + tile.sync(); + running = tile.all(running); + + for (int i = 0; i < bucket_size; ++i) { + auto const match_mask = tile.ballot(equals[i]); + auto const num_matches = __popc(match_mask); + + if (equals[i]) { + auto const lane_offset = + cuco::detail::count_least_significant_bits(match_mask, tile.thread_rank()); + left_output[write_pos + lane_offset] = left_index; + right_output[write_pos + lane_offset] = bucket_slots[i].second; + if constexpr (IsOuter) { found_match = true; } + } + + if (tile.thread_rank() == 0) { write_pos += num_matches; } + write_pos = tile.shfl(write_pos, 0); + } + } + + ++probing_iter; + if (*probing_iter == init_probing_idx) { running = false; } + } + + if constexpr (IsOuter) { + if (!found_match && tile.thread_rank() == 0) { + left_output[write_pos] = left_index; + right_output[write_pos] = JoinNoMatch; + } + } + + idx += stride; + } +} + +template +std::size_t launch_retrieve(probe_key_type const* keys, + cuda::std::int64_t n, + size_type* left_output, + size_type* right_output, + size_type const* match_counts, + Ref ref, + rmm::cuda_stream_view stream) +{ + if (n == 0) { return 0; } + + // Exclusive scan of match counts to get per-row output offsets. + rmm::device_uvector offsets(n, stream); + thrust::exclusive_scan(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), + match_counts, + match_counts + n, + offsets.begin()); + + // Total output size = last offset + last count. Batch both D2H copies. + size_type last_offset = 0; + size_type last_count = 0; + void* const dsts[] = {&last_offset, &last_count}; + void const* const srcs[] = {offsets.data() + n - 1, match_counts + n - 1}; + std::size_t const sizes[] = {sizeof(size_type), sizeof(size_type)}; + CUDF_CUDA_TRY(cudf::detail::memcpy_batch_async(dsts, srcs, sizes, 2, stream)); + stream.synchronize(); + auto const total_output = static_cast(last_offset) + last_count; + + if (total_output == 0) { return 0; } + + auto const config = grid_1d{static_cast(n * PROBE_CG_SIZE), PROBE_BLOCK_SIZE}; + + retrieve_kernel<<>>( + keys, offsets.data(), n, left_output, right_output, ref); + + return total_output; +} + +} // namespace cudf::detail diff --git a/cpp/src/join/hash_join/retrieve_kernels.hpp b/cpp/src/join/hash_join/retrieve_kernels.hpp new file mode 100644 index 00000000000..becfc0e7040 --- /dev/null +++ b/cpp/src/join/hash_join/retrieve_kernels.hpp @@ -0,0 +1,23 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include "kernels_common.cuh" + +#include + +namespace cudf::detail { + +template +std::size_t launch_retrieve(probe_key_type const* keys, + cuda::std::int64_t n, + size_type* left_output, + size_type* right_output, + size_type const* match_counts, + Ref ref, + rmm::cuda_stream_view stream); + +} // namespace cudf::detail diff --git a/cpp/src/join/hash_join/retrieve_outer.cu b/cpp/src/join/hash_join/retrieve_outer.cu new file mode 100644 index 00000000000..ff226102057 --- /dev/null +++ b/cpp/src/join/hash_join/retrieve_outer.cu @@ -0,0 +1,35 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "ref_types.cuh" +#include "retrieve_kernels.cuh" + +namespace cudf::detail { + +template std::size_t launch_retrieve(probe_key_type const*, + cuda::std::int64_t, + size_type*, + size_type*, + size_type const*, + primitive_count_ref_t, + rmm::cuda_stream_view); + +template std::size_t launch_retrieve(probe_key_type const*, + cuda::std::int64_t, + size_type*, + size_type*, + size_type const*, + nested_count_ref_t, + rmm::cuda_stream_view); + +template std::size_t launch_retrieve(probe_key_type const*, + cuda::std::int64_t, + size_type*, + size_type*, + size_type const*, + flat_count_ref_t, + rmm::cuda_stream_view); + +} // namespace cudf::detail From 903f5ec7e90b1ce7e40586a81e1049b9f2e3459d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 14 Apr 2026 13:22:34 -0700 Subject: [PATCH 04/29] Remove redundant thrust::reduce in partitioned join retrieve --- cpp/src/join/hash_join/retrieve.cu | 45 +++++++++++---------- cpp/src/join/hash_join/retrieve_impl.cuh | 42 +++++++------------ cpp/src/join/hash_join/retrieve_kernels.cuh | 32 ++++++++++----- cpp/src/join/hash_join/retrieve_kernels.hpp | 30 ++++++++++---- cpp/src/join/hash_join/retrieve_outer.cu | 45 +++++++++++---------- 5 files changed, 106 insertions(+), 88 deletions(-) diff --git a/cpp/src/join/hash_join/retrieve.cu b/cpp/src/join/hash_join/retrieve.cu index e33ed2be109..9609933f522 100644 --- a/cpp/src/join/hash_join/retrieve.cu +++ b/cpp/src/join/hash_join/retrieve.cu @@ -8,28 +8,31 @@ namespace cudf::detail { -template std::size_t launch_retrieve(probe_key_type const*, - cuda::std::int64_t, - size_type*, - size_type*, - size_type const*, - primitive_count_ref_t, - rmm::cuda_stream_view); +template std::pair>, + std::unique_ptr>> +launch_retrieve(probe_key_type const*, + cuda::std::int64_t, + size_type const*, + primitive_count_ref_t, + rmm::cuda_stream_view, + rmm::device_async_resource_ref); -template std::size_t launch_retrieve(probe_key_type const*, - cuda::std::int64_t, - size_type*, - size_type*, - size_type const*, - nested_count_ref_t, - rmm::cuda_stream_view); +template std::pair>, + std::unique_ptr>> +launch_retrieve(probe_key_type const*, + cuda::std::int64_t, + size_type const*, + nested_count_ref_t, + rmm::cuda_stream_view, + rmm::device_async_resource_ref); -template std::size_t launch_retrieve(probe_key_type const*, - cuda::std::int64_t, - size_type*, - size_type*, - size_type const*, - flat_count_ref_t, - rmm::cuda_stream_view); +template std::pair>, + std::unique_ptr>> +launch_retrieve(probe_key_type const*, + cuda::std::int64_t, + size_type const*, + flat_count_ref_t, + rmm::cuda_stream_view, + rmm::device_async_resource_ref); } // namespace cudf::detail diff --git a/cpp/src/join/hash_join/retrieve_impl.cuh b/cpp/src/join/hash_join/retrieve_impl.cuh index aa83eb4bcc6..27c41f35988 100644 --- a/cpp/src/join/hash_join/retrieve_impl.cuh +++ b/cpp/src/join/hash_join/retrieve_impl.cuh @@ -26,7 +26,6 @@ #include #include #include -#include #include #include @@ -250,21 +249,6 @@ hash_join::partitioned_join_retrieve(cudf::join_partition_context const& } } - // Compute output size from pre-computed match counts - auto const output_size = - thrust::reduce(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - match_ctx._match_counts->begin() + left_start_idx, - match_ctx._match_counts->begin() + left_end_idx, - std::size_t{0}); - - if (output_size == 0) { - return std::pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); - } - - auto left_indices = std::make_unique>(output_size, stream, mr); - auto right_indices = std::make_unique>(output_size, stream, mr); - // Slice the probe table to the partition range auto const probe_partition_view = cudf::slice(match_ctx._left_table, {left_start_idx, left_end_idx})[0]; @@ -277,9 +261,17 @@ hash_join::partitioned_join_retrieve(cudf::join_partition_context const& // For FULL_JOIN, probe with LEFT_JOIN semantics (no complement here) constexpr bool is_outer = (Join != join_kind::INNER_JOIN); + // launch_retrieve computes output size from match counts via exclusive scan + // (total = last_offset + last_count), allocates output buffers, and launches the kernel. + auto const* partition_counts = match_ctx._match_counts->data() + left_start_idx; + auto const n = static_cast(partition_size); + + std::pair>, + std::unique_ptr>> + join_indices; + auto retrieve_partition = [&](auto equality, auto d_hasher) { // Precompute probe keys for this partition slice. - auto const n = static_cast(partition_size); rmm::device_uvector probe_keys(n, stream); thrust::transform(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), cuda::counting_iterator(0), @@ -291,16 +283,8 @@ hash_join::partitioned_join_retrieve(cudf::join_partition_context const& .rebind_key_eq(equality) .rebind_hash_function(_impl->_hash_table.hash_function()); - // Pass the partition's slice of pre-computed match counts. - auto const* partition_counts = match_ctx._match_counts->data() + left_start_idx; - - launch_retrieve(probe_keys.data(), - n, - left_indices->data(), - right_indices->data(), - partition_counts, - ref, - stream); + join_indices = + launch_retrieve(probe_keys.data(), n, partition_counts, ref, stream, mr); }; dispatch_join_comparator(_build, @@ -311,6 +295,8 @@ hash_join::partitioned_join_retrieve(cudf::join_partition_context const& _nulls_equal, retrieve_partition); + auto& [left_indices, right_indices] = join_indices; + // Offset left indices to be relative to the original complete probe table if (left_start_idx > 0 && left_indices->size() > 0) { thrust::transform(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), @@ -321,7 +307,7 @@ hash_join::partitioned_join_retrieve(cudf::join_partition_context const& cuda::std::plus{}); } - return std::pair(std::move(left_indices), std::move(right_indices)); + return join_indices; } } // namespace cudf::detail diff --git a/cpp/src/join/hash_join/retrieve_kernels.cuh b/cpp/src/join/hash_join/retrieve_kernels.cuh index 2e339b6d013..fa57d06dd4d 100644 --- a/cpp/src/join/hash_join/retrieve_kernels.cuh +++ b/cpp/src/join/hash_join/retrieve_kernels.cuh @@ -109,15 +109,19 @@ CUDF_KERNEL void __launch_bounds__(PROBE_BLOCK_SIZE) } template -std::size_t launch_retrieve(probe_key_type const* keys, - cuda::std::int64_t n, - size_type* left_output, - size_type* right_output, - size_type const* match_counts, - Ref ref, - rmm::cuda_stream_view stream) +std::pair>, + std::unique_ptr>> +launch_retrieve(probe_key_type const* keys, + cuda::std::int64_t n, + size_type const* match_counts, + Ref ref, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { - if (n == 0) { return 0; } + if (n == 0) { + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); + } // Exclusive scan of match counts to get per-row output offsets. rmm::device_uvector offsets(n, stream); @@ -136,14 +140,20 @@ std::size_t launch_retrieve(probe_key_type const* keys, stream.synchronize(); auto const total_output = static_cast(last_offset) + last_count; - if (total_output == 0) { return 0; } + if (total_output == 0) { + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); + } + + auto left_indices = std::make_unique>(total_output, stream, mr); + auto right_indices = std::make_unique>(total_output, stream, mr); auto const config = grid_1d{static_cast(n * PROBE_CG_SIZE), PROBE_BLOCK_SIZE}; retrieve_kernel<<>>( - keys, offsets.data(), n, left_output, right_output, ref); + keys, offsets.data(), n, left_indices->data(), right_indices->data(), ref); - return total_output; + return std::pair(std::move(left_indices), std::move(right_indices)); } } // namespace cudf::detail diff --git a/cpp/src/join/hash_join/retrieve_kernels.hpp b/cpp/src/join/hash_join/retrieve_kernels.hpp index becfc0e7040..a5d772821b1 100644 --- a/cpp/src/join/hash_join/retrieve_kernels.hpp +++ b/cpp/src/join/hash_join/retrieve_kernels.hpp @@ -7,17 +7,33 @@ #include "kernels_common.cuh" +#include + #include +#include + +#include +#include namespace cudf::detail { +/** + * @brief Probes the hash table for each key and writes matching index pairs. + * + * Internally computes per-row output offsets via exclusive scan on match_counts, + * derives the total output size, allocates output buffers, and launches the + * retrieve kernel. + * + * @return A pair of device vectors [left_indices, right_indices]. + */ template -std::size_t launch_retrieve(probe_key_type const* keys, - cuda::std::int64_t n, - size_type* left_output, - size_type* right_output, - size_type const* match_counts, - Ref ref, - rmm::cuda_stream_view stream); +std::pair>, + std::unique_ptr>> +launch_retrieve(probe_key_type const* keys, + cuda::std::int64_t n, + size_type const* match_counts, + Ref ref, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); } // namespace cudf::detail diff --git a/cpp/src/join/hash_join/retrieve_outer.cu b/cpp/src/join/hash_join/retrieve_outer.cu index ff226102057..4bc5170b6fd 100644 --- a/cpp/src/join/hash_join/retrieve_outer.cu +++ b/cpp/src/join/hash_join/retrieve_outer.cu @@ -8,28 +8,31 @@ namespace cudf::detail { -template std::size_t launch_retrieve(probe_key_type const*, - cuda::std::int64_t, - size_type*, - size_type*, - size_type const*, - primitive_count_ref_t, - rmm::cuda_stream_view); +template std::pair>, + std::unique_ptr>> +launch_retrieve(probe_key_type const*, + cuda::std::int64_t, + size_type const*, + primitive_count_ref_t, + rmm::cuda_stream_view, + rmm::device_async_resource_ref); -template std::size_t launch_retrieve(probe_key_type const*, - cuda::std::int64_t, - size_type*, - size_type*, - size_type const*, - nested_count_ref_t, - rmm::cuda_stream_view); +template std::pair>, + std::unique_ptr>> +launch_retrieve(probe_key_type const*, + cuda::std::int64_t, + size_type const*, + nested_count_ref_t, + rmm::cuda_stream_view, + rmm::device_async_resource_ref); -template std::size_t launch_retrieve(probe_key_type const*, - cuda::std::int64_t, - size_type*, - size_type*, - size_type const*, - flat_count_ref_t, - rmm::cuda_stream_view); +template std::pair>, + std::unique_ptr>> +launch_retrieve(probe_key_type const*, + cuda::std::int64_t, + size_type const*, + flat_count_ref_t, + rmm::cuda_stream_view, + rmm::device_async_resource_ref); } // namespace cudf::detail From c03dd98206725adb3385ebce498f54f5836c7518 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 14 Apr 2026 15:34:17 -0700 Subject: [PATCH 05/29] Use DEFAULT_JOIN_BLOCK_SIZE and DEFAULT_JOIN_CG_SIZE directly --- cpp/src/join/hash_join/count_kernels.cuh | 7 ++++--- cpp/src/join/hash_join/kernels_common.cuh | 4 ---- cpp/src/join/hash_join/retrieve_kernels.cuh | 5 +++-- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/cpp/src/join/hash_join/count_kernels.cuh b/cpp/src/join/hash_join/count_kernels.cuh index 38c4ba76e1a..d252e79ff56 100644 --- a/cpp/src/join/hash_join/count_kernels.cuh +++ b/cpp/src/join/hash_join/count_kernels.cuh @@ -19,13 +19,13 @@ namespace cudf::detail { template -CUDF_KERNEL void __launch_bounds__(PROBE_BLOCK_SIZE) +CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) count_each_kernel(probe_key_type const* __restrict__ keys, cuda::std::int64_t n, size_type* __restrict__ output, Ref ref) { - auto constexpr cg_size = PROBE_CG_SIZE; + auto constexpr cg_size = DEFAULT_JOIN_CG_SIZE; auto idx = grid_1d::global_thread_id() / cg_size; auto const stride = grid_1d::grid_stride() / cg_size; @@ -68,7 +68,8 @@ void launch_count_each(probe_key_type const* keys, { if (n == 0) { return; } - auto const config = grid_1d{static_cast(n * PROBE_CG_SIZE), PROBE_BLOCK_SIZE}; + auto const config = + grid_1d{static_cast(n * DEFAULT_JOIN_CG_SIZE), DEFAULT_JOIN_BLOCK_SIZE}; count_each_kernel <<>>(keys, n, output, ref); diff --git a/cpp/src/join/hash_join/kernels_common.cuh b/cpp/src/join/hash_join/kernels_common.cuh index 7b86fe5da46..714421a84d3 100644 --- a/cpp/src/join/hash_join/kernels_common.cuh +++ b/cpp/src/join/hash_join/kernels_common.cuh @@ -19,10 +19,6 @@ namespace cudf::detail { -// Constants matching the cuco static_multiset configuration used by hash_join. -inline constexpr int PROBE_BLOCK_SIZE = 128; // cuco::detail::default_block_size() -inline constexpr int PROBE_CG_SIZE = DEFAULT_JOIN_CG_SIZE; // 2 - /// The probe key type stored in the hash table: {hash_value, row_index}. using probe_key_type = cuco::pair; diff --git a/cpp/src/join/hash_join/retrieve_kernels.cuh b/cpp/src/join/hash_join/retrieve_kernels.cuh index fa57d06dd4d..743acbebf58 100644 --- a/cpp/src/join/hash_join/retrieve_kernels.cuh +++ b/cpp/src/join/hash_join/retrieve_kernels.cuh @@ -26,7 +26,7 @@ namespace cudf::detail { template -CUDF_KERNEL void __launch_bounds__(PROBE_BLOCK_SIZE) +CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) retrieve_kernel(probe_key_type const* __restrict__ input_probe, size_type const* __restrict__ offsets, cuda::std::int64_t n, @@ -148,7 +148,8 @@ launch_retrieve(probe_key_type const* keys, auto left_indices = std::make_unique>(total_output, stream, mr); auto right_indices = std::make_unique>(total_output, stream, mr); - auto const config = grid_1d{static_cast(n * PROBE_CG_SIZE), PROBE_BLOCK_SIZE}; + auto const config = + grid_1d{static_cast(n * DEFAULT_JOIN_CG_SIZE), DEFAULT_JOIN_BLOCK_SIZE}; retrieve_kernel<<>>( keys, offsets.data(), n, left_indices->data(), right_indices->data(), ref); From e586b569de9de5f1813313becd26ce0038991be7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 15 Apr 2026 09:43:22 -0700 Subject: [PATCH 06/29] Minor cleanup with modern CCCL --- cpp/src/join/hash_join/retrieve_kernels.cuh | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/cpp/src/join/hash_join/retrieve_kernels.cuh b/cpp/src/join/hash_join/retrieve_kernels.cuh index 743acbebf58..858fde457fc 100644 --- a/cpp/src/join/hash_join/retrieve_kernels.cuh +++ b/cpp/src/join/hash_join/retrieve_kernels.cuh @@ -20,9 +20,21 @@ #include #include -#include +#include #include +namespace { + +/** + * @brief Count the number of set bits below a given position in a bitmask. + */ +__device__ __forceinline__ int count_lower_set_bits(unsigned int mask, int pos) +{ + return cuda::std::popcount(mask & ((1u << pos) - 1)); +} + +} // namespace + namespace cudf::detail { template @@ -78,11 +90,10 @@ CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) for (int i = 0; i < bucket_size; ++i) { auto const match_mask = tile.ballot(equals[i]); - auto const num_matches = __popc(match_mask); + auto const num_matches = cuda::std::popcount(match_mask); if (equals[i]) { - auto const lane_offset = - cuco::detail::count_least_significant_bits(match_mask, tile.thread_rank()); + auto const lane_offset = count_lower_set_bits(match_mask, tile.thread_rank()); left_output[write_pos + lane_offset] = left_index; right_output[write_pos + lane_offset] = bucket_slots[i].second; if constexpr (IsOuter) { found_match = true; } From a5879bcf52040e399fae172f191be7a170aedbe1 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 15 Apr 2026 10:51:02 -0700 Subject: [PATCH 07/29] Cleanups --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/detail/join/hash_join.hpp | 4 +- cpp/src/join/hash_join/kernels_common.cuh | 2 + .../join/hash_join/partitioned_full_join.cu | 4 +- .../join/hash_join/partitioned_inner_join.cu | 4 +- .../hash_join/partitioned_join_retrieve.cu | 164 ++++++++++++++++++ .../join/hash_join/partitioned_left_join.cu | 4 +- cpp/src/join/hash_join/retrieve_impl.cuh | 113 ------------ 8 files changed, 175 insertions(+), 121 deletions(-) create mode 100644 cpp/src/join/hash_join/partitioned_join_retrieve.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ecb079aa31b..19eb85fb1dc 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -688,6 +688,7 @@ add_library( src/join/hash_join/retrieve_outer.cu src/join/hash_join/partitioned_full_join.cu src/join/hash_join/partitioned_inner_join.cu + src/join/hash_join/partitioned_join_retrieve.cu src/join/hash_join/partitioned_left_join.cu src/join/hash_join/full_join_complement.cu src/join/mark_join.cu diff --git a/cpp/include/cudf/detail/join/hash_join.hpp b/cpp/include/cudf/detail/join/hash_join.hpp index ca94b1d02bb..81ab53fd5f3 100644 --- a/cpp/include/cudf/detail/join/hash_join.hpp +++ b/cpp/include/cudf/detail/join/hash_join.hpp @@ -191,10 +191,10 @@ class hash_join { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; - template [[nodiscard]] std::pair>, std::unique_ptr>> - partitioned_join_retrieve(cudf::join_partition_context const& context, + partitioned_join_retrieve(join_kind join, + cudf::join_partition_context const& context, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; diff --git a/cpp/src/join/hash_join/kernels_common.cuh b/cpp/src/join/hash_join/kernels_common.cuh index 714421a84d3..3f0740a4160 100644 --- a/cpp/src/join/hash_join/kernels_common.cuh +++ b/cpp/src/join/hash_join/kernels_common.cuh @@ -11,6 +11,8 @@ #pragma once +#include "join/join_common_utils.hpp" + #include #include #include diff --git a/cpp/src/join/hash_join/partitioned_full_join.cu b/cpp/src/join/hash_join/partitioned_full_join.cu index 5934574dbda..06a77a46121 100644 --- a/cpp/src/join/hash_join/partitioned_full_join.cu +++ b/cpp/src/join/hash_join/partitioned_full_join.cu @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "retrieve_impl.cuh" +#include "common.cuh" namespace cudf::detail { @@ -14,7 +14,7 @@ hash_join::partitioned_full_join(cudf::join_partition_context const& con rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - return this->template partitioned_join_retrieve(context, stream, mr); + return this->partitioned_join_retrieve(join_kind::FULL_JOIN, context, stream, mr); } template std::pair>, diff --git a/cpp/src/join/hash_join/partitioned_inner_join.cu b/cpp/src/join/hash_join/partitioned_inner_join.cu index 38ba93374e3..26a0f873de9 100644 --- a/cpp/src/join/hash_join/partitioned_inner_join.cu +++ b/cpp/src/join/hash_join/partitioned_inner_join.cu @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "retrieve_impl.cuh" +#include "common.cuh" namespace cudf::detail { @@ -14,7 +14,7 @@ hash_join::partitioned_inner_join(cudf::join_partition_context const& co rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - return this->template partitioned_join_retrieve(context, stream, mr); + return this->partitioned_join_retrieve(join_kind::INNER_JOIN, context, stream, mr); } template std::pair>, diff --git a/cpp/src/join/hash_join/partitioned_join_retrieve.cu b/cpp/src/join/hash_join/partitioned_join_retrieve.cu new file mode 100644 index 00000000000..187a5b08ba1 --- /dev/null +++ b/cpp/src/join/hash_join/partitioned_join_retrieve.cu @@ -0,0 +1,164 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "common.cuh" +#include "dispatch.cuh" +#include "join/join_common_utils.cuh" +#include "join/join_common_utils.hpp" +#include "retrieve_kernels.hpp" + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf::detail { +namespace { + +/** + * @brief Returns trivial left/right index pairs for an outer join when the build side is empty. + */ +std::pair>, + std::unique_ptr>> +make_trivial_outer_indices(size_type left_start_idx, + size_type partition_size, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto left_indices = std::make_unique>(partition_size, stream, mr); + auto right_indices = std::make_unique>(partition_size, stream, mr); + auto out = cuda::zip_iterator(left_indices->begin(), right_indices->begin()); + thrust::tabulate(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), + out, + out + partition_size, + cuda::proclaim_return_type>( + [left_start_idx] __device__(auto i) { + return cuda::std::tuple{static_cast(left_start_idx + i), + JoinNoMatch}; + })); + return std::pair(std::move(left_indices), std::move(right_indices)); +} + +} // namespace + +template +std::pair>, + std::unique_ptr>> +hash_join::partitioned_join_retrieve(join_kind join, + cudf::join_partition_context const& context, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const +{ + CUDF_FUNC_RANGE(); + + CUDF_EXPECTS( + join == join_kind::INNER_JOIN || join == join_kind::LEFT_JOIN || join == join_kind::FULL_JOIN, + "Unsupported join kind for partitioned retrieve"); + + auto const& match_ctx = *context.left_table_context; + auto const left_start_idx = context.left_start_idx; + auto const left_end_idx = context.left_end_idx; + + // Empty partition + if (left_start_idx >= left_end_idx) { + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); + } + + auto const partition_size = left_end_idx - left_start_idx; + + // Trivial case: build table is empty + if (_is_empty) { + if (join == join_kind::INNER_JOIN) { + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); + } else { + return make_trivial_outer_indices(left_start_idx, partition_size, stream, mr); + } + } + + // Slice the probe table to the partition range + auto const probe_partition_view = + cudf::slice(match_ctx._left_table, {left_start_idx, left_end_idx})[0]; + + validate_hash_join_probe(_build, probe_partition_view, _has_nulls); + + auto const preprocessed_probe = + cudf::detail::row::equality::preprocessed_table::create(probe_partition_view, stream); + + // For FULL_JOIN, probe with LEFT_JOIN semantics (no complement here) + bool const is_outer = (join != join_kind::INNER_JOIN); + + // launch_retrieve computes output size from match counts via exclusive scan + // (total = last_offset + last_count), allocates output buffers, and launches the kernel. + auto const* partition_counts = match_ctx._match_counts->data() + left_start_idx; + auto const n = static_cast(partition_size); + + std::pair>, + std::unique_ptr>> + join_indices; + + auto retrieve_partition = [&](auto equality, auto d_hasher) { + // Precompute probe keys for this partition slice. + rmm::device_uvector probe_keys(n, stream); + thrust::transform(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), + cuda::counting_iterator(0), + cuda::counting_iterator(partition_size), + probe_keys.begin(), + pair_fn{d_hasher}); + + auto const ref = _impl->_hash_table.ref(cuco::op::count) + .rebind_key_eq(equality) + .rebind_hash_function(_impl->_hash_table.hash_function()); + + if (is_outer) { + join_indices = launch_retrieve(probe_keys.data(), n, partition_counts, ref, stream, mr); + } else { + join_indices = + launch_retrieve(probe_keys.data(), n, partition_counts, ref, stream, mr); + } + }; + + dispatch_join_comparator(_build, + probe_partition_view, + _preprocessed_build, + preprocessed_probe, + _has_nulls, + _nulls_equal, + retrieve_partition); + + auto& [left_indices, right_indices] = join_indices; + + // Offset left indices to be relative to the original complete probe table + if (left_start_idx > 0 && left_indices->size() > 0) { + thrust::transform(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), + left_indices->begin(), + left_indices->end(), + cuda::make_constant_iterator(left_start_idx), + left_indices->begin(), + cuda::std::plus{}); + } + + return join_indices; +} + +template std::pair>, + std::unique_ptr>> +hash_join::partitioned_join_retrieve(join_kind, + cudf::join_partition_context const&, + rmm::cuda_stream_view, + rmm::device_async_resource_ref) const; + +} // namespace cudf::detail diff --git a/cpp/src/join/hash_join/partitioned_left_join.cu b/cpp/src/join/hash_join/partitioned_left_join.cu index de360af305c..1200a9f6a0e 100644 --- a/cpp/src/join/hash_join/partitioned_left_join.cu +++ b/cpp/src/join/hash_join/partitioned_left_join.cu @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "retrieve_impl.cuh" +#include "common.cuh" namespace cudf::detail { @@ -14,7 +14,7 @@ hash_join::partitioned_left_join(cudf::join_partition_context const& con rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - return this->template partitioned_join_retrieve(context, stream, mr); + return this->partitioned_join_retrieve(join_kind::LEFT_JOIN, context, stream, mr); } template std::pair>, diff --git a/cpp/src/join/hash_join/retrieve_impl.cuh b/cpp/src/join/hash_join/retrieve_impl.cuh index 27c41f35988..8cff1c8b7c0 100644 --- a/cpp/src/join/hash_join/retrieve_impl.cuh +++ b/cpp/src/join/hash_join/retrieve_impl.cuh @@ -8,7 +8,6 @@ #include "dispatch.cuh" #include "join/join_common_utils.cuh" #include "join/join_common_utils.hpp" -#include "retrieve_kernels.hpp" #include "size_impl.cuh" #include @@ -22,12 +21,8 @@ #include #include -#include #include -#include #include -#include -#include namespace cudf::detail { @@ -202,112 +197,4 @@ hash_join::join_retrieve(cudf::table_view const& left, } } -template -template -std::pair>, - std::unique_ptr>> -hash_join::partitioned_join_retrieve(cudf::join_partition_context const& context, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const -{ - CUDF_FUNC_RANGE(); - - static_assert(Join == join_kind::INNER_JOIN || Join == join_kind::LEFT_JOIN || - Join == join_kind::FULL_JOIN); - - auto const& match_ctx = *context.left_table_context; - auto const left_start_idx = context.left_start_idx; - auto const left_end_idx = context.left_end_idx; - - // Empty partition - if (left_start_idx >= left_end_idx) { - return std::pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); - } - - auto const partition_size = left_end_idx - left_start_idx; - - // Trivial case: build table is empty - if (_is_empty) { - if constexpr (Join == join_kind::INNER_JOIN) { - return std::pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); - } else { - auto left_indices = - std::make_unique>(partition_size, stream, mr); - auto right_indices = - std::make_unique>(partition_size, stream, mr); - thrust::sequence(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - left_indices->begin(), - left_indices->end(), - left_start_idx); - thrust::fill(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - right_indices->begin(), - right_indices->end(), - JoinNoMatch); - return std::pair(std::move(left_indices), std::move(right_indices)); - } - } - - // Slice the probe table to the partition range - auto const probe_partition_view = - cudf::slice(match_ctx._left_table, {left_start_idx, left_end_idx})[0]; - - validate_hash_join_probe(_build, probe_partition_view, _has_nulls); - - auto const preprocessed_probe = - cudf::detail::row::equality::preprocessed_table::create(probe_partition_view, stream); - - // For FULL_JOIN, probe with LEFT_JOIN semantics (no complement here) - constexpr bool is_outer = (Join != join_kind::INNER_JOIN); - - // launch_retrieve computes output size from match counts via exclusive scan - // (total = last_offset + last_count), allocates output buffers, and launches the kernel. - auto const* partition_counts = match_ctx._match_counts->data() + left_start_idx; - auto const n = static_cast(partition_size); - - std::pair>, - std::unique_ptr>> - join_indices; - - auto retrieve_partition = [&](auto equality, auto d_hasher) { - // Precompute probe keys for this partition slice. - rmm::device_uvector probe_keys(n, stream); - thrust::transform(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - cuda::counting_iterator(0), - cuda::counting_iterator(partition_size), - probe_keys.begin(), - pair_fn{d_hasher}); - - auto const ref = _impl->_hash_table.ref(cuco::op::count) - .rebind_key_eq(equality) - .rebind_hash_function(_impl->_hash_table.hash_function()); - - join_indices = - launch_retrieve(probe_keys.data(), n, partition_counts, ref, stream, mr); - }; - - dispatch_join_comparator(_build, - probe_partition_view, - _preprocessed_build, - preprocessed_probe, - _has_nulls, - _nulls_equal, - retrieve_partition); - - auto& [left_indices, right_indices] = join_indices; - - // Offset left indices to be relative to the original complete probe table - if (left_start_idx > 0 && left_indices->size() > 0) { - thrust::transform(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - left_indices->begin(), - left_indices->end(), - cuda::make_constant_iterator(left_start_idx), - left_indices->begin(), - cuda::std::plus{}); - } - - return join_indices; -} - } // namespace cudf::detail From ef0d6f4954ac5d63d1a96f1fd6b01beefa78cf0c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 17 Apr 2026 11:40:15 -0700 Subject: [PATCH 08/29] Improve retrieve kenrel perf with shared memory buffer --- cpp/benchmarks/join/join.cu | 32 ++- .../hash_join/partitioned_join_retrieve.cu | 20 +- cpp/src/join/hash_join/retrieve.cu | 3 + cpp/src/join/hash_join/retrieve_kernels.cuh | 239 ++++++++++++------ cpp/src/join/hash_join/retrieve_kernels.hpp | 5 +- cpp/src/join/hash_join/retrieve_outer.cu | 3 + 6 files changed, 195 insertions(+), 107 deletions(-) diff --git a/cpp/benchmarks/join/join.cu b/cpp/benchmarks/join/join.cu index 88d2cf22dcd..8906bdfae7f 100644 --- a/cpp/benchmarks/join/join.cu +++ b/cpp/benchmarks/join/join.cu @@ -16,14 +16,31 @@ void nvbench_inner_join(nvbench::state& state, nvbench::enum_type>) { auto const num_keys = state.get_int64("num_keys"); + auto const mode = state.get_string("mode"); auto dtypes = cycle_dtypes(get_type_or_group(static_cast(DataType)), num_keys); - auto join = [](cudf::table_view const& left_input, - cudf::table_view const& right_input, - cudf::null_equality compare_nulls) { - return cudf::inner_join(left_input, right_input, compare_nulls); - }; - BM_join(state, dtypes, join); + if (mode == "normal") { + auto join = [](cudf::table_view const& left_input, + cudf::table_view const& right_input, + cudf::null_equality compare_nulls) { + return cudf::inner_join(left_input, right_input, compare_nulls); + }; + BM_join(state, dtypes, join); + } else { + // Partitioned code path: build hash join, compute match context, then retrieve the + // entire probe table as a single partition. This exercises the two-phase + // count-then-retrieve flow used for chunked probing. + auto join = [](cudf::table_view const& left_input, + cudf::table_view const& right_input, + cudf::null_equality compare_nulls) { + auto hash_joiner = cudf::hash_join(right_input, compare_nulls); + auto match_ctx = hash_joiner.inner_join_match_context(left_input); + auto part_ctx = cudf::join_partition_context{ + std::make_unique(std::move(match_ctx)), 0, left_input.num_rows()}; + return hash_joiner.partitioned_inner_join(part_ctx); + }; + BM_join(state, dtypes, join); + } } template @@ -92,7 +109,8 @@ NVBENCH_BENCH_TYPES(nvbench_inner_join, .add_int64_axis("num_keys", nvbench::range(1, 5, 1)) .add_int64_axis("left_size", JOIN_SIZE_RANGE) .add_int64_axis("right_size", JOIN_SIZE_RANGE) - .add_int64_axis("skip_large_sizes", {1}); + .add_int64_axis("skip_large_sizes", {1}) + .add_string_axis("mode", {"normal", "partitioned"}); NVBENCH_BENCH_TYPES(nvbench_left_join, NVBENCH_TYPE_AXES(JOIN_NULLABLE_RANGE, diff --git a/cpp/src/join/hash_join/partitioned_join_retrieve.cu b/cpp/src/join/hash_join/partitioned_join_retrieve.cu index 187a5b08ba1..3b573f6c2f2 100644 --- a/cpp/src/join/hash_join/partitioned_join_retrieve.cu +++ b/cpp/src/join/hash_join/partitioned_join_retrieve.cu @@ -19,7 +19,6 @@ #include #include -#include #include #include #include @@ -124,10 +123,11 @@ hash_join::partitioned_join_retrieve(join_kind join, .rebind_hash_function(_impl->_hash_table.hash_function()); if (is_outer) { - join_indices = launch_retrieve(probe_keys.data(), n, partition_counts, ref, stream, mr); + join_indices = launch_retrieve( + probe_keys.data(), n, partition_counts, ref, left_start_idx, stream, mr); } else { - join_indices = - launch_retrieve(probe_keys.data(), n, partition_counts, ref, stream, mr); + join_indices = launch_retrieve( + probe_keys.data(), n, partition_counts, ref, left_start_idx, stream, mr); } }; @@ -139,18 +139,6 @@ hash_join::partitioned_join_retrieve(join_kind join, _nulls_equal, retrieve_partition); - auto& [left_indices, right_indices] = join_indices; - - // Offset left indices to be relative to the original complete probe table - if (left_start_idx > 0 && left_indices->size() > 0) { - thrust::transform(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - left_indices->begin(), - left_indices->end(), - cuda::make_constant_iterator(left_start_idx), - left_indices->begin(), - cuda::std::plus{}); - } - return join_indices; } diff --git a/cpp/src/join/hash_join/retrieve.cu b/cpp/src/join/hash_join/retrieve.cu index 9609933f522..fa446f5d697 100644 --- a/cpp/src/join/hash_join/retrieve.cu +++ b/cpp/src/join/hash_join/retrieve.cu @@ -14,6 +14,7 @@ launch_retrieve(probe_key_type const*, cuda::std::int64_t, size_type const*, primitive_count_ref_t, + size_type, rmm::cuda_stream_view, rmm::device_async_resource_ref); @@ -23,6 +24,7 @@ launch_retrieve(probe_key_type const*, cuda::std::int64_t, size_type const*, nested_count_ref_t, + size_type, rmm::cuda_stream_view, rmm::device_async_resource_ref); @@ -32,6 +34,7 @@ launch_retrieve(probe_key_type const*, cuda::std::int64_t, size_type const*, flat_count_ref_t, + size_type, rmm::cuda_stream_view, rmm::device_async_resource_ref); diff --git a/cpp/src/join/hash_join/retrieve_kernels.cuh b/cpp/src/join/hash_join/retrieve_kernels.cuh index 858fde457fc..6db9165ee45 100644 --- a/cpp/src/join/hash_join/retrieve_kernels.cuh +++ b/cpp/src/join/hash_join/retrieve_kernels.cuh @@ -3,25 +3,29 @@ * SPDX-License-Identifier: Apache-2.0 */ -// Hash join retrieve kernel using prefix-scan offsets. Each CG knows -// exactly where to write — no atomics, no shared-memory buffering. -// Uses cuco ref public APIs: storage_ref(), probing_scheme(), empty_key_sentinel(), key_eq(). +// Hash join retrieve kernel ported from cuco's open_addressing retrieve. +// Uses a shared-memory buffer per flushing tile (warp) to coalesce global +// output writes and amortize the global atomic counter across many matches. #pragma once #include "kernels_common.cuh" -#include #include +#include #include #include +#include #include #include #include +#include +#include #include -#include +#include +#include namespace { @@ -40,83 +44,157 @@ namespace cudf::detail { template CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) retrieve_kernel(probe_key_type const* __restrict__ input_probe, - size_type const* __restrict__ offsets, cuda::std::int64_t n, + size_type left_offset, size_type* __restrict__ left_output, size_type* __restrict__ right_output, + size_type* __restrict__ output_counter, Ref ref) { namespace cg = cooperative_groups; - auto constexpr cg_size = Ref::cg_size; - auto constexpr bucket_size = Ref::bucket_size; - auto const empty_sentinel = ref.empty_key_sentinel(); - auto const key_equal = ref.key_eq(); - - auto const tile = cg::tiled_partition(cg::this_thread_block()); - auto idx = grid_1d::global_thread_id() / cg_size; - auto const stride = grid_1d::grid_stride() / cg_size; - - while (idx < n) { - auto const probe_key = input_probe[idx]; - auto const left_index = probe_key.second; - auto write_pos = static_cast(offsets[idx]); - - auto probing_iter = ref.probing_scheme().template make_iterator( - tile, probe_key, ref.storage_ref().extent()); - auto const init_probing_idx = *probing_iter; - - bool running = true; - [[maybe_unused]] bool found_match = false; - - while (tile.any(running)) { - if (running) { - auto const bucket_slots = ref.storage_ref()[*probing_iter]; - - bool equals[bucket_size]; - for (int i = 0; i < bucket_size; ++i) { - equals[i] = false; - if (running) { - if (bucket_slots[i] == empty_sentinel) { - running = false; - } else if (key_equal(probe_key, bucket_slots[i])) { - equals[i] = true; + auto constexpr cg_size = Ref::cg_size; + auto constexpr bucket_size = Ref::bucket_size; + auto constexpr flushing_tile_size = 32; // full warp for coalesced flushes + static_assert(flushing_tile_size >= cg_size); + static_assert(DEFAULT_JOIN_BLOCK_SIZE % flushing_tile_size == 0); + + auto constexpr num_flushing_tiles = DEFAULT_JOIN_BLOCK_SIZE / flushing_tile_size; + auto constexpr tiles_in_block = DEFAULT_JOIN_BLOCK_SIZE / cg_size; + auto constexpr max_matches_per_step = flushing_tile_size * bucket_size; + // buffer_size leaves headroom so one full probing step can't overflow. + auto constexpr buffer_size = max_matches_per_step + flushing_tile_size; + + using index_pair = cuco::pair; + __shared__ index_pair buffers[num_flushing_tiles][buffer_size]; + __shared__ cuda::std::int32_t counters[num_flushing_tiles]; + + auto const block = cg::this_thread_block(); + auto const flushing_tile = cg::tiled_partition(block); + auto const probing_tile = cg::tiled_partition(block); + auto const flushing_tile_id = flushing_tile.meta_group_rank(); + auto const empty_sentinel = ref.empty_key_sentinel(); + auto const key_equal = ref.key_eq(); + + if (flushing_tile.thread_rank() == 0) { counters[flushing_tile_id] = 0; } + flushing_tile.sync(); + + auto atomic_counter = cuda::atomic_ref{*output_counter}; + + auto flush_buffers = [&](auto const& tile) { + size_type offset = 0; + auto const count = counters[flushing_tile_id]; + auto const rank = tile.thread_rank(); + if (rank == 0) { + offset = atomic_counter.fetch_add(static_cast(count), cuda::memory_order_relaxed); + } + offset = tile.shfl(offset, 0); + for (int i = rank; i < count; i += tile.size()) { + left_output[offset + i] = buffers[flushing_tile_id][i].first; + right_output[offset + i] = buffers[flushing_tile_id][i].second; + } + }; + + auto const grid_stride_tiles = static_cast(gridDim.x) * tiles_in_block; + auto idx = + static_cast(blockIdx.x) * tiles_in_block + probing_tile.meta_group_rank(); + + while (flushing_tile.any(idx < n)) { + bool const active = idx < n; + auto const active_flushing_tile = + cg::binary_partition(flushing_tile, active); + + if (active) { + auto const probe_key = input_probe[idx]; + auto const left_index = probe_key.second + left_offset; + + auto probing_iter = ref.probing_scheme().template make_iterator( + probing_tile, probe_key, ref.storage_ref().extent()); + auto const init_probing_idx = *probing_iter; + + bool running = true; + [[maybe_unused]] bool found_match = false; + + while (active_flushing_tile.any(running)) { + if (running) { + auto const bucket_slots = ref.storage_ref()[*probing_iter]; + + bool equals[bucket_size]; + for (int i = 0; i < bucket_size; ++i) { + equals[i] = false; + if (running) { + if (bucket_slots[i] == empty_sentinel) { + running = false; + } else if (key_equal(probe_key, bucket_slots[i])) { + equals[i] = true; + } } } - } - tile.sync(); - running = tile.all(running); + probing_tile.sync(); + running = probing_tile.all(running); - for (int i = 0; i < bucket_size; ++i) { - auto const match_mask = tile.ballot(equals[i]); - auto const num_matches = cuda::std::popcount(match_mask); + cuda::std::int32_t exists[bucket_size]; + cuda::std::int32_t num_matches[bucket_size]; + cuda::std::int32_t total_matches = 0; + for (int i = 0; i < bucket_size; ++i) { + exists[i] = probing_tile.ballot(equals[i]); + num_matches[i] = cuda::std::popcount(static_cast(exists[i])); + total_matches += num_matches[i]; + } - if (equals[i]) { - auto const lane_offset = count_lower_set_bits(match_mask, tile.thread_rank()); - left_output[write_pos + lane_offset] = left_index; - right_output[write_pos + lane_offset] = bucket_slots[i].second; + auto const lane_id = probing_tile.thread_rank(); + + if (total_matches > 0) { if constexpr (IsOuter) { found_match = true; } + + cuda::std::int32_t output_idx = 0; + if (lane_id == 0) { + auto shared_ref = cuda::atomic_ref{ + counters[flushing_tile_id]}; + output_idx = shared_ref.fetch_add(total_matches, cuda::memory_order_relaxed); + } + output_idx = probing_tile.shfl(output_idx, 0); + + cuda::std::int32_t matches_offset = 0; + for (int i = 0; i < bucket_size; ++i) { + if (equals[i]) { + auto const lane_offset = count_lower_set_bits(exists[i], lane_id); + buffers[flushing_tile_id][output_idx + matches_offset + lane_offset] = { + left_index, bucket_slots[i].second}; + } + matches_offset += num_matches[i]; + } } - if (tile.thread_rank() == 0) { write_pos += num_matches; } - write_pos = tile.shfl(write_pos, 0); + if constexpr (IsOuter) { + if (!running && !found_match && lane_id == 0) { + auto shared_ref = cuda::atomic_ref{ + counters[flushing_tile_id]}; + auto const output_idx = shared_ref.fetch_add(1, cuda::memory_order_relaxed); + buffers[flushing_tile_id][output_idx] = {left_index, cudf::JoinNoMatch}; + } + } + } // if running + + active_flushing_tile.sync(); + if (counters[flushing_tile_id] > (buffer_size - max_matches_per_step)) { + flush_buffers(active_flushing_tile); + active_flushing_tile.sync(); + if (active_flushing_tile.thread_rank() == 0) { counters[flushing_tile_id] = 0; } + active_flushing_tile.sync(); } - } - ++probing_iter; - if (*probing_iter == init_probing_idx) { running = false; } - } + ++probing_iter; + if (*probing_iter == init_probing_idx) { running = false; } + } // while running + } // if active - if constexpr (IsOuter) { - if (!found_match && tile.thread_rank() == 0) { - left_output[write_pos] = left_index; - right_output[write_pos] = JoinNoMatch; - } - } + idx += grid_stride_tiles; + } // while idx < n - idx += stride; - } + flushing_tile.sync(); + if (counters[flushing_tile_id] > 0) { flush_buffers(flushing_tile); } } template @@ -126,6 +204,7 @@ launch_retrieve(probe_key_type const* keys, cuda::std::int64_t n, size_type const* match_counts, Ref ref, + size_type left_offset, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -134,22 +213,13 @@ launch_retrieve(probe_key_type const* keys, std::make_unique>(0, stream, mr)); } - // Exclusive scan of match counts to get per-row output offsets. - rmm::device_uvector offsets(n, stream); - thrust::exclusive_scan(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - match_counts, - match_counts + n, - offsets.begin()); - - // Total output size = last offset + last count. Batch both D2H copies. - size_type last_offset = 0; - size_type last_count = 0; - void* const dsts[] = {&last_offset, &last_count}; - void const* const srcs[] = {offsets.data() + n - 1, match_counts + n - 1}; - std::size_t const sizes[] = {sizeof(size_type), sizeof(size_type)}; - CUDF_CUDA_TRY(cudf::detail::memcpy_batch_async(dsts, srcs, sizes, 2, stream)); - stream.synchronize(); - auto const total_output = static_cast(last_offset) + last_count; + // Shared-memory buffered retrieve only needs the total output size, not + // per-row offsets. A reduce is cheaper than an exclusive_scan. + auto const total_output = + thrust::reduce(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), + match_counts, + match_counts + n, + size_type{0}); if (total_output == 0) { return std::pair(std::make_unique>(0, stream, mr), @@ -159,11 +229,14 @@ launch_retrieve(probe_key_type const* keys, auto left_indices = std::make_unique>(total_output, stream, mr); auto right_indices = std::make_unique>(total_output, stream, mr); - auto const config = - grid_1d{static_cast(n * DEFAULT_JOIN_CG_SIZE), DEFAULT_JOIN_BLOCK_SIZE}; + // Global atomic counter claimed in bulk by each flushing-tile buffer flush. + rmm::device_scalar output_counter(size_type{0}, stream); + + auto constexpr tiles_in_block = DEFAULT_JOIN_BLOCK_SIZE / Ref::cg_size; + auto const num_blocks = static_cast((n + tiles_in_block - 1) / tiles_in_block); - retrieve_kernel<<>>( - keys, offsets.data(), n, left_indices->data(), right_indices->data(), ref); + retrieve_kernel<<>>( + keys, n, left_offset, left_indices->data(), right_indices->data(), output_counter.data(), ref); return std::pair(std::move(left_indices), std::move(right_indices)); } diff --git a/cpp/src/join/hash_join/retrieve_kernels.hpp b/cpp/src/join/hash_join/retrieve_kernels.hpp index a5d772821b1..01431b97bca 100644 --- a/cpp/src/join/hash_join/retrieve_kernels.hpp +++ b/cpp/src/join/hash_join/retrieve_kernels.hpp @@ -22,7 +22,9 @@ namespace cudf::detail { * * Internally computes per-row output offsets via exclusive scan on match_counts, * derives the total output size, allocates output buffers, and launches the - * retrieve kernel. + * retrieve kernel. `left_offset` is added to each stored probe-row index when + * writing to `left_indices`, so callers can produce indices in the full probe + * table's coordinate space directly from a slice-local `keys` array. * * @return A pair of device vectors [left_indices, right_indices]. */ @@ -33,6 +35,7 @@ launch_retrieve(probe_key_type const* keys, cuda::std::int64_t n, size_type const* match_counts, Ref ref, + size_type left_offset, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/src/join/hash_join/retrieve_outer.cu b/cpp/src/join/hash_join/retrieve_outer.cu index 4bc5170b6fd..07d20c98c00 100644 --- a/cpp/src/join/hash_join/retrieve_outer.cu +++ b/cpp/src/join/hash_join/retrieve_outer.cu @@ -14,6 +14,7 @@ launch_retrieve(probe_key_type const*, cuda::std::int64_t, size_type const*, primitive_count_ref_t, + size_type, rmm::cuda_stream_view, rmm::device_async_resource_ref); @@ -23,6 +24,7 @@ launch_retrieve(probe_key_type const*, cuda::std::int64_t, size_type const*, nested_count_ref_t, + size_type, rmm::cuda_stream_view, rmm::device_async_resource_ref); @@ -32,6 +34,7 @@ launch_retrieve(probe_key_type const*, cuda::std::int64_t, size_type const*, flat_count_ref_t, + size_type, rmm::cuda_stream_view, rmm::device_async_resource_ref); From d91c171a115cafa697cff29439a090306dacad5c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 17 Apr 2026 13:58:45 -0700 Subject: [PATCH 09/29] Replace full_join_complement with full_join_finalize --- cpp/CMakeLists.txt | 2 +- cpp/include/cudf/join/hash_join.hpp | 37 +++-- .../join/hash_join/full_join_complement.cu | 96 ----------- cpp/src/join/hash_join/full_join_finalize.cu | 156 ++++++++++++++++++ cpp/tests/join/join_tests.cpp | 149 +++++++++++------ 5 files changed, 275 insertions(+), 165 deletions(-) delete mode 100644 cpp/src/join/hash_join/full_join_complement.cu create mode 100644 cpp/src/join/hash_join/full_join_finalize.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 19eb85fb1dc..b0eb9cbed8e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -690,7 +690,7 @@ add_library( src/join/hash_join/partitioned_inner_join.cu src/join/hash_join/partitioned_join_retrieve.cu src/join/hash_join/partitioned_left_join.cu - src/join/hash_join/full_join_complement.cu + src/join/hash_join/full_join_finalize.cu src/join/mark_join.cu src/join/filter_join_indices_jit.cu src/join/join.cu diff --git a/cpp/include/cudf/join/hash_join.hpp b/cpp/include/cudf/join/hash_join.hpp index c94d9242aa2..eeaf5dfaa2d 100644 --- a/cpp/include/cudf/join/hash_join.hpp +++ b/cpp/include/cudf/join/hash_join.hpp @@ -361,9 +361,9 @@ class hash_join { * table (defined by the join_partition_context) and the build table. The context must have been * previously created by calling full_join_match_context(). * - * @note This method does NOT include unmatched build rows (the complement). Since the complement - * is a global property across all partitions, it must be computed separately after all partitions - * are processed using full_join_complement(). + * @note This method does NOT include unmatched build rows (the complement). After all + * partitions have been processed, pass the collected results to `full_join_finalize()` to + * obtain the complete full join output. * * The returned left_indices are relative to the original complete probe table. * @@ -381,29 +381,32 @@ class hash_join { rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; /** - * @brief Computes the full join complement: unmatched build rows. + * @brief Finalizes a partitioned full join by concatenating all per-partition probe results + * and appending the unmatched build rows (the complement). * - * After performing partitioned_full_join() on all partitions, call this method with the - * concatenated right_indices from all partitions to obtain the unmatched build table rows. - * The result can be concatenated with the partitioned results to form the complete full join. + * Call this method after calling `partitioned_full_join()` for every partition. It combines + * the per-partition probe indices with the unmatched build row indices (a global property + * across all partitions) and returns a single `(left_indices, right_indices)` pair equivalent + * to the output of `full_join()`. * - * @param right_indices Concatenated right (build) indices from all partitioned_full_join() calls - * @param probe_table_num_rows Total number of rows in the probe table + * @param left_partials Per-partition `left_indices` views produced by `partitioned_full_join()` + * @param right_partials Per-partition `right_indices` views produced by `partitioned_full_join()` + * @param probe_table_num_rows Total number of rows in the original probe table * @param build_table_num_rows Total number of rows in the build table * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the result device memory * - * @return A pair of device vectors [`left_indices`, `right_indices`] for unmatched build rows, - * where left_indices are all JoinNoMatch and right_indices are the unmatched build row - * indices + * @return A pair of device vectors [`left_indices`, `right_indices`] representing the full + * join output. */ [[nodiscard]] static std::pair>, std::unique_ptr>> - full_join_complement(cudf::device_span right_indices, - size_type probe_table_num_rows, - size_type build_table_num_rows, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + full_join_finalize(cudf::host_span const> left_partials, + cudf::host_span const> right_partials, + size_type probe_table_num_rows, + size_type build_table_num_rows, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); private: std::unique_ptr _impl; diff --git a/cpp/src/join/hash_join/full_join_complement.cu b/cpp/src/join/hash_join/full_join_complement.cu deleted file mode 100644 index 128bccf479d..00000000000 --- a/cpp/src/join/hash_join/full_join_complement.cu +++ /dev/null @@ -1,96 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace cudf { - -namespace { - -template -struct valid_range { - T start, stop; - __device__ constexpr bool operator()(T index) const { return index >= start && index < stop; } -}; - -} // namespace - -std::pair>, - std::unique_ptr>> -hash_join::full_join_complement(cudf::device_span right_indices, - size_type probe_table_num_rows, - size_type build_table_num_rows, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto right_indices_complement = - std::make_unique>(build_table_num_rows, stream, mr); - - if (probe_table_num_rows == 0) { - // All build rows are unmatched - thrust::sequence(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - right_indices_complement->begin(), - right_indices_complement->end(), - 0); - } else { - auto invalid_index_map = - std::make_unique>(build_table_num_rows, stream); - thrust::uninitialized_fill( - rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - invalid_index_map->begin(), - invalid_index_map->end(), - int32_t{1}); - - valid_range valid{0, build_table_num_rows}; - - thrust::scatter_if(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - cuda::make_constant_iterator(0), - cuda::make_constant_iterator(0) + right_indices.size(), - right_indices.begin(), - right_indices.begin(), - invalid_index_map->begin(), - valid); - - auto const begin_counter = static_cast(0); - auto const end_counter = static_cast(build_table_num_rows); - - size_type const indices_count = - thrust::copy_if(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - cuda::counting_iterator{begin_counter}, - cuda::counting_iterator{end_counter}, - invalid_index_map->begin(), - right_indices_complement->begin(), - cuda::std::identity{}) - - right_indices_complement->begin(); - right_indices_complement->resize(indices_count, stream); - } - - auto left_invalid_indices = - std::make_unique>(right_indices_complement->size(), stream, mr); - thrust::uninitialized_fill( - rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - left_invalid_indices->begin(), - left_invalid_indices->end(), - cudf::JoinNoMatch); - - return std::pair(std::move(left_invalid_indices), std::move(right_indices_complement)); -} - -} // namespace cudf diff --git a/cpp/src/join/hash_join/full_join_finalize.cu b/cpp/src/join/hash_join/full_join_finalize.cu new file mode 100644 index 00000000000..7b50898d329 --- /dev/null +++ b/cpp/src/join/hash_join/full_join_finalize.cu @@ -0,0 +1,156 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf { +namespace { + +template +struct valid_range { + T start, stop; + __device__ constexpr bool operator()(T index) const { return index >= start && index < stop; } +}; + +/** + * @brief Writes the unmatched build-row indices into `comp_right_out` and returns the count. + */ +size_type compute_complement(cudf::device_span right_indices, + size_type probe_table_num_rows, + size_type build_table_num_rows, + size_type* comp_right_out, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + if (probe_table_num_rows == 0) { + thrust::sequence(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), + comp_right_out, + comp_right_out + build_table_num_rows, + 0); + return build_table_num_rows; + } + + rmm::device_uvector invalid_index_map(build_table_num_rows, stream); + thrust::uninitialized_fill( + rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), + invalid_index_map.begin(), + invalid_index_map.end(), + size_type{1}); + + thrust::scatter_if(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), + cuda::make_constant_iterator(size_type{0}), + cuda::make_constant_iterator(size_type{0}) + right_indices.size(), + right_indices.begin(), + right_indices.begin(), + invalid_index_map.begin(), + valid_range{0, build_table_num_rows}); + + auto const end = + thrust::copy_if(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), + cuda::counting_iterator{0}, + cuda::counting_iterator{build_table_num_rows}, + invalid_index_map.begin(), + comp_right_out, + cuda::std::identity{}); + return static_cast(end - comp_right_out); +} + +} // namespace + +std::pair>, + std::unique_ptr>> +hash_join::full_join_finalize( + cudf::host_span const> left_partials, + cudf::host_span const> right_partials, + size_type probe_table_num_rows, + size_type build_table_num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(left_partials.size() == right_partials.size(), + "left_partials and right_partials must have the same length", + std::invalid_argument); + + // Sum the probe partial sizes. + std::size_t probe_total = 0; + for (auto const& span : left_partials) { + probe_total += span.size(); + } + + // Upper-bound the output at (probe matches + all build rows unmatched). + auto const upper_bound = probe_total + static_cast(build_table_num_rows); + auto left_out = std::make_unique>(upper_bound, stream, mr); + auto right_out = std::make_unique>(upper_bound, stream, mr); + + // Concatenate every partial's (left, right) indices into the head of the output with a + // single batched memcpy (one driver submission regardless of partition count). + if (probe_total > 0) { + auto const n = left_partials.size(); + std::vector dsts; + std::vector srcs; + std::vector sizes; + dsts.reserve(2 * n); + srcs.reserve(2 * n); + sizes.reserve(2 * n); + std::size_t offset = 0; + for (std::size_t i = 0; i < n; ++i) { + CUDF_EXPECTS(left_partials[i].size() == right_partials[i].size(), + "matching partials must have equal left/right sizes", + std::invalid_argument); + auto const sz = left_partials[i].size() * sizeof(size_type); + dsts.push_back(left_out->data() + offset); + srcs.push_back(left_partials[i].data()); + sizes.push_back(sz); + dsts.push_back(right_out->data() + offset); + srcs.push_back(right_partials[i].data()); + sizes.push_back(sz); + offset += left_partials[i].size(); + } + CUDF_CUDA_TRY(cudf::detail::memcpy_batch_async( + dsts.data(), srcs.data(), sizes.data(), dsts.size(), stream)); + } + + // Append the complement into the tail of the output buffers. + auto const comp_size = + compute_complement(cudf::device_span{right_out->data(), probe_total}, + probe_table_num_rows, + build_table_num_rows, + right_out->data() + probe_total, + stream, + mr); + + thrust::uninitialized_fill( + rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), + left_out->begin() + probe_total, + left_out->begin() + probe_total + comp_size, + cudf::JoinNoMatch); + + auto const final_size = probe_total + static_cast(comp_size); + left_out->resize(final_size, stream); + right_out->resize(final_size, stream); + + return std::pair(std::move(left_out), std::move(right_out)); +} + +} // namespace cudf diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 61f173e0199..3e5de7d9cc9 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -49,7 +49,7 @@ using CVector = std::vector>; using Table = cudf::table; constexpr cudf::size_type NoneValue = std::numeric_limits::min(); // TODO: how to test if this isn't public? -enum class algorithm { HASH, SORT_MERGE, MERGE }; +enum class algorithm { HASH, HASH_PARTITIONED, SORT_MERGE, MERGE }; void expect_match_counts_equal(rmm::device_uvector const& actual_counts, std::vector const& expected_counts, @@ -139,6 +139,24 @@ std::unique_ptr inner_join( left_on, right_on, compare_nulls); + } else if (algo == algorithm::HASH_PARTITIONED) { + return join_and_gather( + [](cudf::table_view const& left, + cudf::table_view const& right, + cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { + cudf::hash_join hash_joiner(right, compare_nulls, stream); + auto match_ctx = hash_joiner.inner_join_match_context(left, stream, mr); + auto part_ctx = cudf::join_partition_context{ + std::make_unique(std::move(match_ctx)), 0, left.num_rows()}; + return hash_joiner.partitioned_inner_join(part_ctx, stream, mr); + }, + left_input, + right_input, + left_on, + right_on, + compare_nulls); } return join_and_gather( [](cudf::table_view const& left, @@ -212,6 +230,24 @@ std::unique_ptr left_join( left_on, right_on, compare_nulls); + } else if (algo == algorithm::HASH_PARTITIONED) { + return join_and_gather( + [](cudf::table_view const& left, + cudf::table_view const& right, + cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { + cudf::hash_join hash_joiner(right, compare_nulls, stream); + auto match_ctx = hash_joiner.left_join_match_context(left, stream, mr); + auto part_ctx = cudf::join_partition_context{ + std::make_unique(std::move(match_ctx)), 0, left.num_rows()}; + return hash_joiner.partitioned_left_join(part_ctx, stream, mr); + }, + left_input, + right_input, + left_on, + right_on, + compare_nulls); } return join_and_gather( [](cudf::table_view const& left, @@ -233,8 +269,35 @@ std::unique_ptr full_join( cudf::table_view const& right_input, std::vector const& full_on, std::vector const& right_on, - cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) + cudf::null_equality compare_nulls = cudf::null_equality::EQUAL, + algorithm algo = algorithm::HASH) { + if (algo == algorithm::HASH_PARTITIONED) { + return join_and_gather( + [](cudf::table_view const& left, + cudf::table_view const& right, + cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { + cudf::hash_join hash_joiner(right, compare_nulls, stream); + auto match_ctx = hash_joiner.full_join_match_context(left, stream, mr); + auto part_ctx = cudf::join_partition_context{ + std::make_unique(std::move(match_ctx)), 0, left.num_rows()}; + auto [probe_left, probe_right] = hash_joiner.partitioned_full_join(part_ctx, stream, mr); + + std::vector> left_partials{ + cudf::device_span{probe_left->data(), probe_left->size()}}; + std::vector> right_partials{ + cudf::device_span{probe_right->data(), probe_right->size()}}; + return cudf::hash_join::full_join_finalize( + left_partials, right_partials, left.num_rows(), right.num_rows(), stream, mr); + }, + full_input, + right_input, + full_on, + right_on, + compare_nulls); + } return join_and_gather( [](cudf::table_view const& left, cudf::table_view const& right, @@ -310,11 +373,15 @@ struct JoinParameterizedTestSortedInput : public JoinTest, // Parametrize qualifying join tests for supported algorithms INSTANTIATE_TEST_CASE_P(InnerJoinParameterizedTest, JoinParameterizedTest, - ::testing::Values(algorithm::HASH, algorithm::SORT_MERGE)); + ::testing::Values(algorithm::HASH, + algorithm::HASH_PARTITIONED, + algorithm::SORT_MERGE)); INSTANTIATE_TEST_CASE_P(InnerJoinParameterizedTestSortedInput, JoinParameterizedTestSortedInput, - ::testing::Values(algorithm::HASH, algorithm::MERGE)); + ::testing::Values(algorithm::HASH, + algorithm::HASH_PARTITIONED, + algorithm::MERGE)); TEST_P(JoinParameterizedTestSortedInput, SortedKeys) { @@ -3312,63 +3379,43 @@ TEST_F(JoinTest, HashJoinPartitionedFullJoin) auto part_ctx = cudf::join_partition_context{ std::make_unique(std::move(match_ctx)), 0, 0}; - // Collect right indices on device for complement computation + // Collect per-partition (left, right) indices for finalization. + std::vector>> left_idx_parts; std::vector>> right_idx_parts; - - std::vector> partials; - std::vector partial_views; for (cudf::size_type i = 0; i < t0.num_rows(); i++) { part_ctx.left_start_idx = i; part_ctx.left_end_idx = i + 1; auto [left_idx, right_idx] = hash_joiner.partitioned_full_join(part_ctx, stream, mr); - - auto left_col = cudf::column_view{cudf::device_span{*left_idx}}; - auto right_col = cudf::column_view{cudf::device_span{*right_idx}}; - auto left_res = cudf::gather(t0, left_col, cudf::out_of_bounds_policy::NULLIFY); - auto right_res = cudf::gather(t1, right_col, cudf::out_of_bounds_policy::NULLIFY); - auto joined = left_res->release(); - auto right_c = right_res->release(); - joined.insert(joined.end(), - std::make_move_iterator(right_c.begin()), - std::make_move_iterator(right_c.end())); - partials.push_back(std::make_unique(std::move(joined))); - partial_views.push_back(partials.back()->view()); + left_idx_parts.push_back(std::move(left_idx)); right_idx_parts.push_back(std::move(right_idx)); } - // Concatenate all right indices on device for complement - std::vector right_idx_views; - for (auto const& part : right_idx_parts) { - right_idx_views.push_back(cudf::column_view{cudf::device_span{*part}}); - } - auto all_right_indices_col = cudf::concatenate(right_idx_views, stream, mr); - - // Compute complement (unmatched build rows) - auto [complement_left, complement_right] = cudf::hash_join::full_join_complement( - cudf::device_span{ - all_right_indices_col->view().data(), - static_cast(all_right_indices_col->size())}, - t0.select(left_on).num_rows(), - t1.select(right_on).num_rows(), - stream, - mr); - - // Gather complement rows - if (complement_left->size() > 0) { - auto left_col = cudf::column_view{cudf::device_span{*complement_left}}; - auto right_col = cudf::column_view{cudf::device_span{*complement_right}}; - auto left_res = cudf::gather(t0, left_col, cudf::out_of_bounds_policy::NULLIFY); - auto right_res = cudf::gather(t1, right_col, cudf::out_of_bounds_policy::NULLIFY); - auto joined = left_res->release(); - auto right_c = right_res->release(); - joined.insert(joined.end(), - std::make_move_iterator(right_c.begin()), - std::make_move_iterator(right_c.end())); - partials.push_back(std::make_unique(std::move(joined))); - partial_views.push_back(partials.back()->view()); + std::vector> left_partials; + std::vector> right_partials; + left_partials.reserve(left_idx_parts.size()); + right_partials.reserve(right_idx_parts.size()); + for (std::size_t i = 0; i < left_idx_parts.size(); ++i) { + left_partials.emplace_back(left_idx_parts[i]->data(), left_idx_parts[i]->size()); + right_partials.emplace_back(right_idx_parts[i]->data(), right_idx_parts[i]->size()); } - auto concat_result = cudf::concatenate(partial_views, stream, mr); + auto [final_left, final_right] = + cudf::hash_join::full_join_finalize(left_partials, + right_partials, + t0.select(left_on).num_rows(), + t1.select(right_on).num_rows(), + stream, + mr); + + auto left_col = cudf::column_view{cudf::device_span{*final_left}}; + auto right_col = cudf::column_view{cudf::device_span{*final_right}}; + auto left_res = cudf::gather(t0, left_col, cudf::out_of_bounds_policy::NULLIFY); + auto right_res = cudf::gather(t1, right_col, cudf::out_of_bounds_policy::NULLIFY); + auto joined = left_res->release(); + auto right_c = right_res->release(); + joined.insert( + joined.end(), std::make_move_iterator(right_c.begin()), std::make_move_iterator(right_c.end())); + auto concat_result = std::make_unique(std::move(joined)); auto concat_sort_order = cudf::sorted_order(concat_result->view()); auto concat_sorted = cudf::gather(concat_result->view(), *concat_sort_order); From e0ecc1b1d62e4687f47b84e692c5809cbbeec540 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 17 Apr 2026 15:03:11 -0700 Subject: [PATCH 10/29] Use a common full join finalize logic --- cpp/CMakeLists.txt | 2 +- cpp/src/join/conditional_join.cu | 10 +- cpp/src/join/hash_join/full_join_finalize.cpp | 33 +++ cpp/src/join/hash_join/full_join_finalize.cu | 156 ------------- cpp/src/join/hash_join/retrieve_impl.cuh | 9 +- cpp/src/join/join_common_utils.hpp | 58 ++--- cpp/src/join/join_utils.cu | 213 +++++++++++------- cpp/src/join/mixed_join.cu | 10 +- cpp/tests/join/join_tests.cpp | 11 +- 9 files changed, 211 insertions(+), 291 deletions(-) create mode 100644 cpp/src/join/hash_join/full_join_finalize.cpp delete mode 100644 cpp/src/join/hash_join/full_join_finalize.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index b0eb9cbed8e..e7b91a1ea54 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -690,7 +690,7 @@ add_library( src/join/hash_join/partitioned_inner_join.cu src/join/hash_join/partitioned_join_retrieve.cu src/join/hash_join/partitioned_left_join.cu - src/join/hash_join/full_join_finalize.cu + src/join/hash_join/full_join_finalize.cpp src/join/mark_join.cu src/join/filter_join_indices_jit.cu src/join/join.cu diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index 96116028ba9..c7a2c2d2c71 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -23,6 +23,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -267,9 +268,12 @@ conditional_join(table_view const& left, // For full joins, get the indices in the right table that were not joined to // by any row in the left table. if (join_type == join_kind::FULL_JOIN) { - auto complement_indices = detail::get_left_join_indices_complement( - join_indices.second, left.num_rows(), right.num_rows(), stream, mr); - join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); + std::vector> const left_partials{ + cudf::device_span{join_indices.first->data(), join_indices.first->size()}}; + std::vector> const right_partials{ + cudf::device_span{join_indices.second->data(), join_indices.second->size()}}; + join_indices = detail::finalize_full_join( + left_partials, right_partials, left.num_rows(), right.num_rows(), stream, mr); } return join_indices; } diff --git a/cpp/src/join/hash_join/full_join_finalize.cpp b/cpp/src/join/hash_join/full_join_finalize.cpp new file mode 100644 index 00000000000..e5f68ef4038 --- /dev/null +++ b/cpp/src/join/hash_join/full_join_finalize.cpp @@ -0,0 +1,33 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "join/join_common_utils.hpp" + +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf { + +std::pair>, + std::unique_ptr>> +hash_join::full_join_finalize( + cudf::host_span const> left_partials, + cudf::host_span const> right_partials, + size_type probe_table_num_rows, + size_type build_table_num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + return cudf::detail::finalize_full_join( + left_partials, right_partials, probe_table_num_rows, build_table_num_rows, stream, mr); +} + +} // namespace cudf diff --git a/cpp/src/join/hash_join/full_join_finalize.cu b/cpp/src/join/hash_join/full_join_finalize.cu deleted file mode 100644 index 7b50898d329..00000000000 --- a/cpp/src/join/hash_join/full_join_finalize.cu +++ /dev/null @@ -1,156 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include - -namespace cudf { -namespace { - -template -struct valid_range { - T start, stop; - __device__ constexpr bool operator()(T index) const { return index >= start && index < stop; } -}; - -/** - * @brief Writes the unmatched build-row indices into `comp_right_out` and returns the count. - */ -size_type compute_complement(cudf::device_span right_indices, - size_type probe_table_num_rows, - size_type build_table_num_rows, - size_type* comp_right_out, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - if (probe_table_num_rows == 0) { - thrust::sequence(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - comp_right_out, - comp_right_out + build_table_num_rows, - 0); - return build_table_num_rows; - } - - rmm::device_uvector invalid_index_map(build_table_num_rows, stream); - thrust::uninitialized_fill( - rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - invalid_index_map.begin(), - invalid_index_map.end(), - size_type{1}); - - thrust::scatter_if(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - cuda::make_constant_iterator(size_type{0}), - cuda::make_constant_iterator(size_type{0}) + right_indices.size(), - right_indices.begin(), - right_indices.begin(), - invalid_index_map.begin(), - valid_range{0, build_table_num_rows}); - - auto const end = - thrust::copy_if(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - cuda::counting_iterator{0}, - cuda::counting_iterator{build_table_num_rows}, - invalid_index_map.begin(), - comp_right_out, - cuda::std::identity{}); - return static_cast(end - comp_right_out); -} - -} // namespace - -std::pair>, - std::unique_ptr>> -hash_join::full_join_finalize( - cudf::host_span const> left_partials, - cudf::host_span const> right_partials, - size_type probe_table_num_rows, - size_type build_table_num_rows, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_EXPECTS(left_partials.size() == right_partials.size(), - "left_partials and right_partials must have the same length", - std::invalid_argument); - - // Sum the probe partial sizes. - std::size_t probe_total = 0; - for (auto const& span : left_partials) { - probe_total += span.size(); - } - - // Upper-bound the output at (probe matches + all build rows unmatched). - auto const upper_bound = probe_total + static_cast(build_table_num_rows); - auto left_out = std::make_unique>(upper_bound, stream, mr); - auto right_out = std::make_unique>(upper_bound, stream, mr); - - // Concatenate every partial's (left, right) indices into the head of the output with a - // single batched memcpy (one driver submission regardless of partition count). - if (probe_total > 0) { - auto const n = left_partials.size(); - std::vector dsts; - std::vector srcs; - std::vector sizes; - dsts.reserve(2 * n); - srcs.reserve(2 * n); - sizes.reserve(2 * n); - std::size_t offset = 0; - for (std::size_t i = 0; i < n; ++i) { - CUDF_EXPECTS(left_partials[i].size() == right_partials[i].size(), - "matching partials must have equal left/right sizes", - std::invalid_argument); - auto const sz = left_partials[i].size() * sizeof(size_type); - dsts.push_back(left_out->data() + offset); - srcs.push_back(left_partials[i].data()); - sizes.push_back(sz); - dsts.push_back(right_out->data() + offset); - srcs.push_back(right_partials[i].data()); - sizes.push_back(sz); - offset += left_partials[i].size(); - } - CUDF_CUDA_TRY(cudf::detail::memcpy_batch_async( - dsts.data(), srcs.data(), sizes.data(), dsts.size(), stream)); - } - - // Append the complement into the tail of the output buffers. - auto const comp_size = - compute_complement(cudf::device_span{right_out->data(), probe_total}, - probe_table_num_rows, - build_table_num_rows, - right_out->data() + probe_total, - stream, - mr); - - thrust::uninitialized_fill( - rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - left_out->begin() + probe_total, - left_out->begin() + probe_total + comp_size, - cudf::JoinNoMatch); - - auto const final_size = probe_total + static_cast(comp_size); - left_out->resize(final_size, stream); - right_out->resize(final_size, stream); - - return std::pair(std::move(left_out), std::move(right_out)); -} - -} // namespace cudf diff --git a/cpp/src/join/hash_join/retrieve_impl.cuh b/cpp/src/join/hash_join/retrieve_impl.cuh index 8cff1c8b7c0..43d33d4159f 100644 --- a/cpp/src/join/hash_join/retrieve_impl.cuh +++ b/cpp/src/join/hash_join/retrieve_impl.cuh @@ -189,9 +189,12 @@ hash_join::join_retrieve(cudf::table_view const& left, mr); if constexpr (Join == join_kind::FULL_JOIN) { - auto complement_indices = detail::get_left_join_indices_complement( - join_indices.second, left.num_rows(), _right.num_rows(), stream, mr); - return detail::concatenate_vector_pairs(join_indices, complement_indices, stream); + std::vector> const left_partials{ + cudf::device_span{join_indices.first->data(), join_indices.first->size()}}; + std::vector> const right_partials{ + cudf::device_span{join_indices.second->data(), join_indices.second->size()}}; + return detail::finalize_full_join( + left_partials, right_partials, left.num_rows(), _right.num_rows(), stream, mr); } else { return join_indices; } diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 6a40692b59f..9e2ba456589 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -42,48 +43,35 @@ VectorPair get_trivial_left_join_indices(table_view const& left, rmm::device_async_resource_ref mr); /** - * @brief Takes two pairs of vectors and returns a single pair where the first - * element is a vector made from concatenating the first elements of both input - * pairs and the second element is a vector made from concatenating the second - * elements of both input pairs. + * @brief Finalize a full-join result: concatenate per-partition probe indices and append + * the complement (unmatched build-table rows paired with `JoinNoMatch`). * - * This function's primary use is for computing the indices of a full join by - * first performing a left join, then separately getting the complementary - * right join indices, then finally calling this function to concatenate the - * results. In this case, each input VectorPair contains the left and right - * indices from a join. + * Given the per-partition `(left, right)` probe index spans produced by a hash/mixed/conditional + * full-join probe pass, this helper builds the combined `(left_indices, right_indices)` result of + * a full outer join. The probe partials are concatenated in order into the head of the output, + * and unmatched build rows (build indices not appearing in any of the `right_partials`) are + * appended to the tail with their left-side index set to `cudf::JoinNoMatch`. * - * Note that this is a destructive operation, in that at least one of a or b - * will be invalidated (by a move) by this operation. Calling code should - * assume that neither input VectorPair is valid after this function executes. + * Callers that produce a single `(left, right)` vector pair (the non-partitioned case) wrap + * it into 1-element spans. * - * @param a The first pair of vectors. - * @param b The second pair of vectors. - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return A pair of vectors containing the concatenated output. - */ -VectorPair concatenate_vector_pairs(VectorPair& a, VectorPair& b, rmm::cuda_stream_view stream); - -/** - * @brief Creates a table containing the complement of left join indices. - * - * This table has two columns. The first one is filled with `JoinNoMatch` - * and the second one contains values from 0 to right_table_row_count - 1 - * excluding those found in the right_indices column. - * - * @param right_indices Vector of indices - * @param left_table_row_count Number of rows of left table - * @param right_table_row_count Number of rows of right table + * @param left_partials Per-partition probe-side (left) index spans. Must match `right_partials` + * in count and per-entry size. + * @param right_partials Per-partition probe-side (right) index spans. `JoinNoMatch` entries + * and any out-of-range values are ignored for complement computation. + * @param probe_table_num_rows Number of rows in the original probe table. When 0, every build + * row is treated as unmatched (fast path). + * @param build_table_num_rows Number of rows in the build table. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned vectors. * - * @return Pair of vectors containing the left join indices complement + * @return `[left_indices, right_indices]` sized `sum(left_partials[i].size()) + num_unmatched`. */ -VectorPair get_left_join_indices_complement( - std::unique_ptr>& right_indices, - size_type left_table_row_count, - size_type right_table_row_count, +VectorPair finalize_full_join( + cudf::host_span const> left_partials, + cudf::host_span const> right_partials, + size_type probe_table_num_rows, + size_type build_table_num_rows, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/src/join/join_utils.cu b/cpp/src/join/join_utils.cu index 0ebbd61464b..548a775e9e4 100644 --- a/cpp/src/join/join_utils.cu +++ b/cpp/src/join/join_utils.cu @@ -4,24 +4,34 @@ */ #include "join_common_utils.cuh" +#include "join_common_utils.hpp" +#include +#include +#include #include #include #include +#include #include #include #include #include +#include #include #include -#include -#include +#include +#include +#include +#include #include #include +#include #include +#include namespace cudf { namespace detail { @@ -45,100 +55,135 @@ VectorPair get_trivial_left_join_indices(table_view const& left, return std::pair(std::move(left_indices), std::move(right_indices)); } -VectorPair concatenate_vector_pairs(VectorPair& a, VectorPair& b, rmm::cuda_stream_view stream) -{ - CUDF_EXPECTS((a.first->size() == a.second->size()), - "Mismatch between sizes of vectors in vector pair"); - CUDF_EXPECTS((b.first->size() == b.second->size()), - "Mismatch between sizes of vectors in vector pair"); - if (a.first->is_empty()) { - return std::move(b); - } else if (b.first->is_empty()) { - return std::move(a); +namespace { + +// Predicate: build row `idx` is unmatched iff its bit in the packed uint32 mask is clear. +struct unmatched_bit { + uint32_t const* marks; + __device__ bool operator()(size_type idx) const noexcept + { + return ((marks[idx / 32] >> (idx % 32)) & 1u) == 0u; } - auto original_size = a.first->size(); - a.first->resize(a.first->size() + b.first->size(), stream); - a.second->resize(a.second->size() + b.second->size(), stream); - thrust::copy(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - b.first->begin(), - b.first->end(), - a.first->begin() + original_size); - thrust::copy(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - b.second->begin(), - b.second->end(), - a.second->begin() + original_size); - return std::move(a); -} +}; + +// Transform a selected (unmatched) build index into a (JoinNoMatch, idx) pair that is stored +// through a zip iterator over (left_out_tail, right_out_tail). +struct to_no_match_pair { + __device__ cuda::std::tuple operator()(size_type idx) const noexcept + { + return cuda::std::make_tuple(cudf::JoinNoMatch, idx); + } +}; -VectorPair get_left_join_indices_complement( - std::unique_ptr>& right_indices, - size_type left_table_row_count, - size_type right_table_row_count, +} // namespace + +VectorPair finalize_full_join( + cudf::host_span const> left_partials, + cudf::host_span const> right_partials, + size_type probe_table_num_rows, + size_type build_table_num_rows, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - // Get array of indices that do not appear in right_indices - - // Vector allocated for unmatched result - auto right_indices_complement = - std::make_unique>(right_table_row_count, stream); - - // If left table is empty in a full join call then all rows of the right table - // should be represented in the joined indices. This is an optimization since - // if left table is empty and full join is called all the elements in - // right_indices will be cudf::JoinNoMatch, i.e. `cuda::std::numeric_limits::min()`. - // This if path should produce exactly the same result as the else path but will be faster. - if (left_table_row_count == 0) { + CUDF_EXPECTS(left_partials.size() == right_partials.size(), + "left_partials and right_partials must have the same length", + std::invalid_argument); + + std::size_t probe_total = 0; + for (std::size_t i = 0; i < left_partials.size(); ++i) { + CUDF_EXPECTS(left_partials[i].size() == right_partials[i].size(), + "matching partials must have equal left/right sizes", + std::invalid_argument); + probe_total += left_partials[i].size(); + } + + auto const upper = probe_total + static_cast(build_table_num_rows); + auto left_out = std::make_unique>(upper, stream, mr); + auto right_out = std::make_unique>(upper, stream, mr); + + // Concatenate every probe partial into the head of the output via one batched memcpy. + if (probe_total > 0) { + auto const n = left_partials.size(); + std::vector dsts; + std::vector srcs; + std::vector sizes; + dsts.reserve(2 * n); + srcs.reserve(2 * n); + sizes.reserve(2 * n); + std::size_t offset = 0; + for (std::size_t i = 0; i < n; ++i) { + auto const sz = left_partials[i].size() * sizeof(size_type); + dsts.push_back(left_out->data() + offset); + srcs.push_back(left_partials[i].data()); + sizes.push_back(sz); + dsts.push_back(right_out->data() + offset); + srcs.push_back(right_partials[i].data()); + sizes.push_back(sz); + offset += left_partials[i].size(); + } + CUDF_CUDA_TRY(cudf::detail::memcpy_batch_async( + dsts.data(), srcs.data(), sizes.data(), dsts.size(), stream)); + } + + // Empty-probe fast path: every build row is unmatched. + if (probe_table_num_rows == 0) { + auto const tail = static_cast(build_table_num_rows); thrust::sequence(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - right_indices_complement->begin(), - right_indices_complement->end(), + right_out->begin() + probe_total, + right_out->begin() + probe_total + tail, 0); - } else { - // Assume all the indices in invalid_index_map are invalid - auto invalid_index_map = - std::make_unique>(right_table_row_count, stream); thrust::uninitialized_fill( rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - invalid_index_map->begin(), - invalid_index_map->end(), - int32_t{1}); - - // Functor to check for index validity since left joins can create invalid indices - valid_range valid(0, right_table_row_count); - - // invalid_index_map[index_ptr[i]] = 0 for i = 0 to right_table_row_count - // Thus specifying that those locations are valid - thrust::scatter_if(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - cuda::make_constant_iterator(0), - cuda::make_constant_iterator(0) + right_indices->size(), - right_indices->begin(), // Index locations - right_indices->begin(), // Stencil - Check if index location is valid - invalid_index_map->begin(), // Output indices - valid); // Stencil Predicate - size_type begin_counter = static_cast(0); - size_type end_counter = static_cast(right_table_row_count); - - // Create list of indices that have been marked as invalid - size_type indices_count = - thrust::copy_if(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - cuda::counting_iterator{begin_counter}, - cuda::counting_iterator{end_counter}, - invalid_index_map->begin(), - right_indices_complement->begin(), - cuda::std::identity{}) - - right_indices_complement->begin(); - right_indices_complement->resize(indices_count, stream); + left_out->begin() + probe_total, + left_out->begin() + probe_total + tail, + cudf::JoinNoMatch); + left_out->resize(probe_total + tail, stream); + right_out->resize(probe_total + tail, stream); + return std::pair(std::move(left_out), std::move(right_out)); } - auto left_invalid_indices = - std::make_unique>(right_indices_complement->size(), stream); - thrust::uninitialized_fill( - rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - left_invalid_indices->begin(), - left_invalid_indices->end(), - cudf::JoinNoMatch); + if (build_table_num_rows == 0) { + left_out->resize(probe_total, stream); + right_out->resize(probe_total, stream); + return std::pair(std::move(left_out), std::move(right_out)); + } - return std::pair(std::move(left_invalid_indices), std::move(right_indices_complement)); + // Mark matched build rows in a packed uint32_t bitmask. + auto const n_words = cudf::util::div_rounding_up_safe(build_table_num_rows, size_type{32}); + rmm::device_uvector marks(n_words, stream); + CUDF_CUDA_TRY(cudaMemsetAsync(marks.data(), 0, n_words * sizeof(uint32_t), stream.value())); + + auto const right_head = right_out->data(); + thrust::for_each_n(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), + cuda::counting_iterator{0}, + probe_total, + [right_head, + marks = marks.data(), + build_rows = build_table_num_rows] __device__(std::size_t i) { + auto const idx = right_head[i]; + if (idx < 0 || idx >= build_rows) return; + cuda::atomic_ref ref(marks[idx / 32]); + ref.fetch_or(1u << (idx % 32), cuda::std::memory_order_relaxed); + }); + + // Fused compaction: for each unmatched build row, emit (JoinNoMatch, build_idx) into + // (left_out_tail, right_out_tail) in a single CUB DeviceSelect pass. + auto zip_tail = + thrust::make_zip_iterator(left_out->data() + probe_total, right_out->data() + probe_total); + auto out_iter = thrust::make_transform_output_iterator(zip_tail, to_no_match_pair{}); + + auto const new_end = + cudf::detail::copy_if(cuda::counting_iterator{0}, + cuda::counting_iterator{build_table_num_rows}, + out_iter, + unmatched_bit{marks.data()}, + stream); + + auto const comp_size = static_cast(new_end - out_iter); + left_out->resize(probe_total + comp_size, stream); + right_out->resize(probe_total + comp_size, stream); + + return std::pair(std::move(left_out), std::move(right_out)); } } // namespace detail diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index e2aef657baf..b89a41fc5f5 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -34,6 +34,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -522,9 +523,12 @@ mixed_join( // For full joins, get the indices in the right table that were not joined to // by any row in the left table. if (join_type == join_kind::FULL_JOIN) { - auto complement_indices = detail::get_left_join_indices_complement( - join_indices.second, left_num_rows, right_num_rows, stream, mr); - join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); + std::vector> const left_partials{ + cudf::device_span{join_indices.first->data(), join_indices.first->size()}}; + std::vector> const right_partials{ + cudf::device_span{join_indices.second->data(), join_indices.second->size()}}; + join_indices = detail::finalize_full_join( + left_partials, right_partials, left_num_rows, right_num_rows, stream, mr); } return join_indices; } diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 3e5de7d9cc9..81fb6b6ed65 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -43,12 +43,11 @@ namespace { template -using column_wrapper = cudf::test::fixed_width_column_wrapper; -using strcol_wrapper = cudf::test::strings_column_wrapper; -using CVector = std::vector>; -using Table = cudf::table; -constexpr cudf::size_type NoneValue = - std::numeric_limits::min(); // TODO: how to test if this isn't public? +using column_wrapper = cudf::test::fixed_width_column_wrapper; +using strcol_wrapper = cudf::test::strings_column_wrapper; +using CVector = std::vector>; +using Table = cudf::table; +constexpr cudf::size_type NoneValue = cudf::JoinNoMatch; enum class algorithm { HASH, HASH_PARTITIONED, SORT_MERGE, MERGE }; void expect_match_counts_equal(rmm::device_uvector const& actual_counts, From fd735290da742f519ed74c1fb72824de8e54a82f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 17 Apr 2026 16:47:16 -0700 Subject: [PATCH 11/29] Speed up full_join finalize with consume-in-place and fused compact --- cpp/src/join/conditional_join.cu | 6 +- cpp/src/join/hash_join/retrieve_impl.cuh | 6 +- cpp/src/join/join_common_utils.hpp | 49 ++++--- cpp/src/join/join_utils.cu | 161 +++++++++++++---------- cpp/src/join/mixed_join.cu | 6 +- 5 files changed, 130 insertions(+), 98 deletions(-) diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index c7a2c2d2c71..d45912139c4 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -268,12 +268,8 @@ conditional_join(table_view const& left, // For full joins, get the indices in the right table that were not joined to // by any row in the left table. if (join_type == join_kind::FULL_JOIN) { - std::vector> const left_partials{ - cudf::device_span{join_indices.first->data(), join_indices.first->size()}}; - std::vector> const right_partials{ - cudf::device_span{join_indices.second->data(), join_indices.second->size()}}; join_indices = detail::finalize_full_join( - left_partials, right_partials, left.num_rows(), right.num_rows(), stream, mr); + std::move(join_indices), left.num_rows(), right.num_rows(), stream, mr); } return join_indices; } diff --git a/cpp/src/join/hash_join/retrieve_impl.cuh b/cpp/src/join/hash_join/retrieve_impl.cuh index 43d33d4159f..2ad60a33303 100644 --- a/cpp/src/join/hash_join/retrieve_impl.cuh +++ b/cpp/src/join/hash_join/retrieve_impl.cuh @@ -189,12 +189,8 @@ hash_join::join_retrieve(cudf::table_view const& left, mr); if constexpr (Join == join_kind::FULL_JOIN) { - std::vector> const left_partials{ - cudf::device_span{join_indices.first->data(), join_indices.first->size()}}; - std::vector> const right_partials{ - cudf::device_span{join_indices.second->data(), join_indices.second->size()}}; return detail::finalize_full_join( - left_partials, right_partials, left.num_rows(), _right.num_rows(), stream, mr); + std::move(join_indices), left.num_rows(), _right.num_rows(), stream, mr); } else { return join_indices; } diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 9e2ba456589..201e3da247d 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -43,24 +43,43 @@ VectorPair get_trivial_left_join_indices(table_view const& left, rmm::device_async_resource_ref mr); /** - * @brief Finalize a full-join result: concatenate per-partition probe indices and append - * the complement (unmatched build-table rows paired with `JoinNoMatch`). + * @brief Finalize a full-join result from a single probe-side `(left, right)` index pair. * - * Given the per-partition `(left, right)` probe index spans produced by a hash/mixed/conditional - * full-join probe pass, this helper builds the combined `(left_indices, right_indices)` result of - * a full outer join. The probe partials are concatenated in order into the head of the output, - * and unmatched build rows (build indices not appearing in any of the `right_partials`) are - * appended to the tail with their left-side index set to `cudf::JoinNoMatch`. + * Takes ownership of `probe_indices`, resizes both vectors to `probe_indices.first->size() + + * build_table_num_rows`, and appends the complement (unmatched build rows paired with + * `JoinNoMatch`) into the tail. The vectors are then resized down to the true output length. * - * Callers that produce a single `(left, right)` vector pair (the non-partitioned case) wrap - * it into 1-element spans. + * Used by the non-partitioned full-join paths (hash/mixed/conditional); consuming the caller's + * buffers in-place avoids a redundant concat memcpy over the probe data. * - * @param left_partials Per-partition probe-side (left) index spans. Must match `right_partials` - * in count and per-entry size. - * @param right_partials Per-partition probe-side (right) index spans. `JoinNoMatch` entries - * and any out-of-range values are ignored for complement computation. - * @param probe_table_num_rows Number of rows in the original probe table. When 0, every build - * row is treated as unmatched (fast path). + * @param probe_indices Probe-side `(left, right)` index vectors (consumed). + * @param probe_table_num_rows Number of rows in the probe table (0 → every build row is + * unmatched, fast path). + * @param build_table_num_rows Number of rows in the build table. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate working storage. + * + * @return `[left_indices, right_indices]` of the complete full-join output. + */ +VectorPair finalize_full_join(VectorPair&& probe_indices, + size_type probe_table_num_rows, + size_type build_table_num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +/** + * @brief Finalize a full-join result from per-partition probe index spans. + * + * Concatenates every `(left_partials[i], right_partials[i])` pair into the head of the output + * and appends the complement (unmatched build rows paired with `JoinNoMatch`) into the tail. + * Internally delegates to the `VectorPair&&` overload, so the mark/compact path is shared. + * + * Used by `cudf::hash_join::full_join_finalize` for partitioned full joins where the partials + * live in separate buffers and must be gathered. + * + * @param left_partials Per-partition probe-side (left) index spans. + * @param right_partials Per-partition probe-side (right) index spans. + * @param probe_table_num_rows Number of rows in the probe table. * @param build_table_num_rows Number of rows in the build table. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned vectors. diff --git a/cpp/src/join/join_utils.cu b/cpp/src/join/join_utils.cu index 548a775e9e4..2c02c6b930f 100644 --- a/cpp/src/join/join_utils.cu +++ b/cpp/src/join/join_utils.cu @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include @@ -19,13 +19,12 @@ #include #include -#include #include #include #include -#include #include #include +#include #include #include @@ -57,13 +56,13 @@ VectorPair get_trivial_left_join_indices(table_view const& left, namespace { -// Predicate: build row `idx` is unmatched iff its bit in the packed uint32 mask is clear. -struct unmatched_bit { - uint32_t const* marks; - __device__ bool operator()(size_type idx) const noexcept - { - return ((marks[idx / 32] >> (idx % 32)) & 1u) == 0u; - } +// Predicate: build row `idx` is unmatched iff its flag slot is zero. +// We use an int32 flag (one per build row) rather than a packed bit or a byte: byte stores +// from a dense 32-wide scatter don't coalesce into full-word transactions, which costs 2–3× +// in the mark kernel for skewed probe/build ratios. +struct unmatched_flag { + size_type const* flags; + __device__ bool operator()(size_type idx) const noexcept { return flags[idx] == 0; } }; // Transform a selected (unmatched) build index into a (JoinNoMatch, idx) pair that is stored @@ -77,6 +76,79 @@ struct to_no_match_pair { } // namespace +VectorPair finalize_full_join(VectorPair&& probe_indices, + size_type probe_table_num_rows, + size_type build_table_num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto [left_out, right_out] = std::move(probe_indices); + CUDF_EXPECTS(left_out->size() == right_out->size(), + "probe left/right index vectors must have equal size", + std::invalid_argument); + auto const probe_total = left_out->size(); + + // Empty-probe fast path: every build row is unmatched. + if (probe_table_num_rows == 0) { + auto const tail = static_cast(build_table_num_rows); + left_out->resize(probe_total + tail, stream); + right_out->resize(probe_total + tail, stream); + thrust::sequence(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), + right_out->begin() + probe_total, + right_out->end(), + 0); + thrust::uninitialized_fill( + rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), + left_out->begin() + probe_total, + left_out->end(), + cudf::JoinNoMatch); + return std::pair(std::move(left_out), std::move(right_out)); + } + + if (build_table_num_rows == 0) { return std::pair(std::move(left_out), std::move(right_out)); } + + // Grow to the upper bound (probe_total + build_table_num_rows); the complement is appended + // into the tail. If the caller pre-reserved this capacity (see the span overload below), + // these resizes don't reallocate. + auto const upper = probe_total + static_cast(build_table_num_rows); + left_out->resize(upper, stream); + right_out->resize(upper, stream); + + // Mark matched build rows in an int32 flag array (one word per build row). Redundant stores + // of the same value are idempotent, so no atomics are needed. Word-sized stores coalesce into + // full 128-byte transactions per warp; byte-sized flags cost ~2–3× here because partial-word + // stores from dense scatters serialize within each 32-bit sector. + auto flags = cudf::detail::make_zeroed_device_uvector_async( + build_table_num_rows, stream, cudf::get_current_device_resource_ref()); + + thrust::scatter_if(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), + cuda::make_constant_iterator(size_type{1}), + cuda::make_constant_iterator(size_type{1}) + probe_total, + right_out->begin(), + right_out->begin(), + flags.begin(), + valid_range{0, build_table_num_rows}); + + // Fused compaction: for each unmatched build row, emit (JoinNoMatch, build_idx) into + // (left_out_tail, right_out_tail) in a single CUB DeviceSelect pass. + auto zip_tail = + thrust::make_zip_iterator(left_out->data() + probe_total, right_out->data() + probe_total); + auto out_iter = thrust::make_transform_output_iterator(zip_tail, to_no_match_pair{}); + + auto const new_end = + cudf::detail::copy_if(cuda::counting_iterator{0}, + cuda::counting_iterator{build_table_num_rows}, + out_iter, + unmatched_flag{flags.data()}, + stream); + + auto const comp_size = static_cast(new_end - out_iter); + left_out->resize(probe_total + comp_size, stream); + right_out->resize(probe_total + comp_size, stream); + + return std::pair(std::move(left_out), std::move(right_out)); +} + VectorPair finalize_full_join( cudf::host_span const> left_partials, cudf::host_span const> right_partials, @@ -97,6 +169,8 @@ VectorPair finalize_full_join( probe_total += left_partials[i].size(); } + // Pre-allocate at the upper bound so the VectorPair overload's resize-up becomes a no-op + // (capacity is already there). auto const upper = probe_total + static_cast(build_table_num_rows); auto left_out = std::make_unique>(upper, stream, mr); auto right_out = std::make_unique>(upper, stream, mr); @@ -125,65 +199,16 @@ VectorPair finalize_full_join( dsts.data(), srcs.data(), sizes.data(), dsts.size(), stream)); } - // Empty-probe fast path: every build row is unmatched. - if (probe_table_num_rows == 0) { - auto const tail = static_cast(build_table_num_rows); - thrust::sequence(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - right_out->begin() + probe_total, - right_out->begin() + probe_total + tail, - 0); - thrust::uninitialized_fill( - rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - left_out->begin() + probe_total, - left_out->begin() + probe_total + tail, - cudf::JoinNoMatch); - left_out->resize(probe_total + tail, stream); - right_out->resize(probe_total + tail, stream); - return std::pair(std::move(left_out), std::move(right_out)); - } - - if (build_table_num_rows == 0) { - left_out->resize(probe_total, stream); - right_out->resize(probe_total, stream); - return std::pair(std::move(left_out), std::move(right_out)); - } - - // Mark matched build rows in a packed uint32_t bitmask. - auto const n_words = cudf::util::div_rounding_up_safe(build_table_num_rows, size_type{32}); - rmm::device_uvector marks(n_words, stream); - CUDF_CUDA_TRY(cudaMemsetAsync(marks.data(), 0, n_words * sizeof(uint32_t), stream.value())); - - auto const right_head = right_out->data(); - thrust::for_each_n(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - cuda::counting_iterator{0}, - probe_total, - [right_head, - marks = marks.data(), - build_rows = build_table_num_rows] __device__(std::size_t i) { - auto const idx = right_head[i]; - if (idx < 0 || idx >= build_rows) return; - cuda::atomic_ref ref(marks[idx / 32]); - ref.fetch_or(1u << (idx % 32), cuda::std::memory_order_relaxed); - }); - - // Fused compaction: for each unmatched build row, emit (JoinNoMatch, build_idx) into - // (left_out_tail, right_out_tail) in a single CUB DeviceSelect pass. - auto zip_tail = - thrust::make_zip_iterator(left_out->data() + probe_total, right_out->data() + probe_total); - auto out_iter = thrust::make_transform_output_iterator(zip_tail, to_no_match_pair{}); + // Shrink the uvectors' logical size to probe_total (capacity stays at upper bound), then + // delegate to the VectorPair overload which resizes back up and appends the complement. + left_out->resize(probe_total, stream); + right_out->resize(probe_total, stream); - auto const new_end = - cudf::detail::copy_if(cuda::counting_iterator{0}, - cuda::counting_iterator{build_table_num_rows}, - out_iter, - unmatched_bit{marks.data()}, - stream); - - auto const comp_size = static_cast(new_end - out_iter); - left_out->resize(probe_total + comp_size, stream); - right_out->resize(probe_total + comp_size, stream); - - return std::pair(std::move(left_out), std::move(right_out)); + return finalize_full_join(std::pair(std::move(left_out), std::move(right_out)), + probe_table_num_rows, + build_table_num_rows, + stream, + mr); } } // namespace detail diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index b89a41fc5f5..79d359c7b01 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -523,12 +523,8 @@ mixed_join( // For full joins, get the indices in the right table that were not joined to // by any row in the left table. if (join_type == join_kind::FULL_JOIN) { - std::vector> const left_partials{ - cudf::device_span{join_indices.first->data(), join_indices.first->size()}}; - std::vector> const right_partials{ - cudf::device_span{join_indices.second->data(), join_indices.second->size()}}; join_indices = detail::finalize_full_join( - left_partials, right_partials, left_num_rows, right_num_rows, stream, mr); + std::move(join_indices), left_num_rows, right_num_rows, stream, mr); } return join_indices; } From 9ada51a1154a104feefee6f1a24a04cb89bc7869 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 28 Apr 2026 22:02:55 +0000 Subject: [PATCH 12/29] Rename full_join_finalize -> finalize_partitioned_full_join; sort CMakeLists --- cpp/CMakeLists.txt | 10 +++++----- cpp/include/cudf/join/hash_join.hpp | 4 ++-- cpp/src/join/hash_join/full_join_finalize.cpp | 2 +- cpp/src/join/join_common_utils.hpp | 2 +- cpp/tests/join/join_tests.cpp | 4 ++-- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e7b91a1ea54..1d4d4062372 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -670,6 +670,9 @@ add_library( src/join/filter_join_indices_kernel_null_primitive.cu src/join/filter_join_indices_kernel_primitive.cu src/join/filtered_join.cu + src/join/hash_join/count_each.cu + src/join/hash_join/count_each_outer.cu + src/join/hash_join/full_join_finalize.cpp src/join/hash_join/full_join_match_context.cpp src/join/hash_join/full_join_retrieve.cu src/join/hash_join/full_join_size.cu @@ -682,15 +685,12 @@ add_library( src/join/hash_join/left_join_retrieve.cu src/join/hash_join/left_join_size.cu src/join/hash_join/match_context.cu - src/join/hash_join/count_each.cu - src/join/hash_join/count_each_outer.cu - src/join/hash_join/retrieve.cu - src/join/hash_join/retrieve_outer.cu src/join/hash_join/partitioned_full_join.cu src/join/hash_join/partitioned_inner_join.cu src/join/hash_join/partitioned_join_retrieve.cu src/join/hash_join/partitioned_left_join.cu - src/join/hash_join/full_join_finalize.cpp + src/join/hash_join/retrieve.cu + src/join/hash_join/retrieve_outer.cu src/join/mark_join.cu src/join/filter_join_indices_jit.cu src/join/join.cu diff --git a/cpp/include/cudf/join/hash_join.hpp b/cpp/include/cudf/join/hash_join.hpp index eeaf5dfaa2d..905a3257ee8 100644 --- a/cpp/include/cudf/join/hash_join.hpp +++ b/cpp/include/cudf/join/hash_join.hpp @@ -362,7 +362,7 @@ class hash_join { * previously created by calling full_join_match_context(). * * @note This method does NOT include unmatched build rows (the complement). After all - * partitions have been processed, pass the collected results to `full_join_finalize()` to + * partitions have been processed, pass the collected results to `finalize_partitioned_full_join()` to * obtain the complete full join output. * * The returned left_indices are relative to the original complete probe table. @@ -401,7 +401,7 @@ class hash_join { */ [[nodiscard]] static std::pair>, std::unique_ptr>> - full_join_finalize(cudf::host_span const> left_partials, + finalize_partitioned_full_join(cudf::host_span const> left_partials, cudf::host_span const> right_partials, size_type probe_table_num_rows, size_type build_table_num_rows, diff --git a/cpp/src/join/hash_join/full_join_finalize.cpp b/cpp/src/join/hash_join/full_join_finalize.cpp index e5f68ef4038..408d9f077d8 100644 --- a/cpp/src/join/hash_join/full_join_finalize.cpp +++ b/cpp/src/join/hash_join/full_join_finalize.cpp @@ -18,7 +18,7 @@ namespace cudf { std::pair>, std::unique_ptr>> -hash_join::full_join_finalize( +hash_join::finalize_partitioned_full_join( cudf::host_span const> left_partials, cudf::host_span const> right_partials, size_type probe_table_num_rows, diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 201e3da247d..461c43c4d10 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -74,7 +74,7 @@ VectorPair finalize_full_join(VectorPair&& probe_indices, * and appends the complement (unmatched build rows paired with `JoinNoMatch`) into the tail. * Internally delegates to the `VectorPair&&` overload, so the mark/compact path is shared. * - * Used by `cudf::hash_join::full_join_finalize` for partitioned full joins where the partials + * Used by `cudf::hash_join::finalize_partitioned_full_join` for partitioned full joins where the partials * live in separate buffers and must be gathered. * * @param left_partials Per-partition probe-side (left) index spans. diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 81fb6b6ed65..9051e216129 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -288,7 +288,7 @@ std::unique_ptr full_join( cudf::device_span{probe_left->data(), probe_left->size()}}; std::vector> right_partials{ cudf::device_span{probe_right->data(), probe_right->size()}}; - return cudf::hash_join::full_join_finalize( + return cudf::hash_join::finalize_partitioned_full_join( left_partials, right_partials, left.num_rows(), right.num_rows(), stream, mr); }, full_input, @@ -3399,7 +3399,7 @@ TEST_F(JoinTest, HashJoinPartitionedFullJoin) } auto [final_left, final_right] = - cudf::hash_join::full_join_finalize(left_partials, + cudf::hash_join::finalize_partitioned_full_join(left_partials, right_partials, t0.select(left_on).num_rows(), t1.select(right_on).num_rows(), From d11bf76cf05224ff38a4507d42178bbd50c89499 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 28 Apr 2026 22:33:40 +0000 Subject: [PATCH 13/29] Address lamarrr review: explicit mode check in benchmark, delete copy ops --- cpp/benchmarks/join/join.cu | 4 +++- cpp/include/cudf/join/join.hpp | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/join/join.cu b/cpp/benchmarks/join/join.cu index 8906bdfae7f..13ee5533130 100644 --- a/cpp/benchmarks/join/join.cu +++ b/cpp/benchmarks/join/join.cu @@ -26,7 +26,7 @@ void nvbench_inner_join(nvbench::state& state, return cudf::inner_join(left_input, right_input, compare_nulls); }; BM_join(state, dtypes, join); - } else { + } else if (mode == "partitioned") { // Partitioned code path: build hash join, compute match context, then retrieve the // entire probe table as a single partition. This exercises the two-phase // count-then-retrieve flow used for chunked probing. @@ -40,6 +40,8 @@ void nvbench_inner_join(nvbench::state& state, return hash_joiner.partitioned_inner_join(part_ctx); }; BM_join(state, dtypes, join); + } else { + CUDF_FAIL("unrecognized mode: " + mode); } } diff --git a/cpp/include/cudf/join/join.hpp b/cpp/include/cudf/join/join.hpp index 67620505422..16ed0ad0e2e 100644 --- a/cpp/include/cudf/join/join.hpp +++ b/cpp/include/cudf/join/join.hpp @@ -87,7 +87,9 @@ struct join_match_context { : _left_table{left_table}, _match_counts{std::move(match_counts)} { } - join_match_context(join_match_context&&) = default; ///< Move constructor + join_match_context(join_match_context const&) = delete; + join_match_context& operator=(join_match_context const&) = delete; + join_match_context(join_match_context&&) = default; ///< Move constructor /** * @brief Move assignment operator * @return Reference to this object From 36f212ed422bbe7ee36d9d75663f320fe4cf1227 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 28 Apr 2026 22:36:55 +0000 Subject: [PATCH 14/29] Use cooperative_groups::invoke_one instead of thread_rank() == 0 --- cpp/src/join/hash_join/count_kernels.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/join/hash_join/count_kernels.cuh b/cpp/src/join/hash_join/count_kernels.cuh index d252e79ff56..92867799752 100644 --- a/cpp/src/join/hash_join/count_kernels.cuh +++ b/cpp/src/join/hash_join/count_kernels.cuh @@ -44,15 +44,15 @@ CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); if constexpr (IsOuter) { auto temp_count = static_cast(ref.count(tile, key)); - if (tile.all(temp_count == 0) and tile.thread_rank() == 0) { ++temp_count; } + if (tile.all(temp_count == 0)) { cooperative_groups::invoke_one(tile, [&]() { ++temp_count; }); } auto const cnt = cooperative_groups::reduce(tile, temp_count, cooperative_groups::plus()); - if (tile.thread_rank() == 0) { output[idx] = cnt; } + cooperative_groups::invoke_one(tile, [&]() { output[idx] = cnt; }); } else { auto const cnt = cooperative_groups::reduce(tile, static_cast(ref.count(tile, key)), cooperative_groups::plus()); - if (tile.thread_rank() == 0) { output[idx] = cnt; } + cooperative_groups::invoke_one(tile, [&]() { output[idx] = cnt; }); } } idx += stride; From 2d670d20a7a157e6df192d9b584ee9b32d7b0e49 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 28 Apr 2026 22:46:01 +0000 Subject: [PATCH 15/29] Rename count_each -> partitioned_count kernel and associated files --- cpp/CMakeLists.txt | 4 ++-- cpp/src/join/hash_join/kernels_common.cuh | 2 +- cpp/src/join/hash_join/match_context.cu | 6 +++--- .../hash_join/{count_each.cu => partitioned_count.cu} | 8 ++++---- .../{count_kernels.cuh => partitioned_count_kernels.cuh} | 8 ++++---- .../{count_kernels.hpp => partitioned_count_kernels.hpp} | 4 ++-- .../{count_each_outer.cu => partitioned_count_outer.cu} | 8 ++++---- cpp/src/join/hash_join/ref_types.cuh | 2 +- 8 files changed, 21 insertions(+), 21 deletions(-) rename cpp/src/join/hash_join/{count_each.cu => partitioned_count.cu} (73%) rename cpp/src/join/hash_join/{count_kernels.cuh => partitioned_count_kernels.cuh} (91%) rename cpp/src/join/hash_join/{count_kernels.hpp => partitioned_count_kernels.hpp} (83%) rename cpp/src/join/hash_join/{count_each_outer.cu => partitioned_count_outer.cu} (73%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1d4d4062372..c36cb3693d1 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -670,8 +670,6 @@ add_library( src/join/filter_join_indices_kernel_null_primitive.cu src/join/filter_join_indices_kernel_primitive.cu src/join/filtered_join.cu - src/join/hash_join/count_each.cu - src/join/hash_join/count_each_outer.cu src/join/hash_join/full_join_finalize.cpp src/join/hash_join/full_join_match_context.cpp src/join/hash_join/full_join_retrieve.cu @@ -685,6 +683,8 @@ add_library( src/join/hash_join/left_join_retrieve.cu src/join/hash_join/left_join_size.cu src/join/hash_join/match_context.cu + src/join/hash_join/partitioned_count.cu + src/join/hash_join/partitioned_count_outer.cu src/join/hash_join/partitioned_full_join.cu src/join/hash_join/partitioned_inner_join.cu src/join/hash_join/partitioned_join_retrieve.cu diff --git a/cpp/src/join/hash_join/kernels_common.cuh b/cpp/src/join/hash_join/kernels_common.cuh index 3f0740a4160..36f54ffdca7 100644 --- a/cpp/src/join/hash_join/kernels_common.cuh +++ b/cpp/src/join/hash_join/kernels_common.cuh @@ -5,7 +5,7 @@ // Ported from cuco's open_addressing kernels and ref_impl to give cudf direct // control over hash-join probe kernel launches. The device-side probing logic -// is identical to cuco's static_multiset::count / count_each / retrieve / +// is identical to cuco's static_multiset::count / partitioned_count / retrieve / // retrieve_outer. We keep the cuco ref type for hash-table access (storage, // probing scheme, predicate) and only replace the host-side launch. diff --git a/cpp/src/join/hash_join/match_context.cu b/cpp/src/join/hash_join/match_context.cu index 6404de843de..eb5f933e164 100644 --- a/cpp/src/join/hash_join/match_context.cu +++ b/cpp/src/join/hash_join/match_context.cu @@ -4,7 +4,7 @@ */ #include "common.cuh" -#include "count_kernels.hpp" +#include "partitioned_count_kernels.hpp" #include "dispatch.cuh" #include "join/join_common_utils.cuh" @@ -63,10 +63,10 @@ std::unique_ptr> make_join_match_counts( .rebind_key_eq(equality) .rebind_hash_function(hash_table.hash_function()); if (join == join_kind::INNER_JOIN) { - launch_count_each(probe_keys.data(), n, match_counts->begin(), ref, stream); + launch_partitioned_count(probe_keys.data(), n, match_counts->begin(), ref, stream); } else { // IsOuter=true handles the clamp (zero → 1) for LEFT/FULL joins internally. - launch_count_each(probe_keys.data(), n, match_counts->begin(), ref, stream); + launch_partitioned_count(probe_keys.data(), n, match_counts->begin(), ref, stream); } }; diff --git a/cpp/src/join/hash_join/count_each.cu b/cpp/src/join/hash_join/partitioned_count.cu similarity index 73% rename from cpp/src/join/hash_join/count_each.cu rename to cpp/src/join/hash_join/partitioned_count.cu index db4024f44a0..b430661dc56 100644 --- a/cpp/src/join/hash_join/count_each.cu +++ b/cpp/src/join/hash_join/partitioned_count.cu @@ -3,21 +3,21 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "count_kernels.cuh" +#include "partitioned_count_kernels.cuh" #include "ref_types.cuh" namespace cudf::detail { -template void launch_count_each(probe_key_type const*, +template void launch_partitioned_count(probe_key_type const*, cuda::std::int64_t, size_type*, primitive_count_ref_t, rmm::cuda_stream_view); -template void launch_count_each( +template void launch_partitioned_count( probe_key_type const*, cuda::std::int64_t, size_type*, nested_count_ref_t, rmm::cuda_stream_view); -template void launch_count_each( +template void launch_partitioned_count( probe_key_type const*, cuda::std::int64_t, size_type*, flat_count_ref_t, rmm::cuda_stream_view); } // namespace cudf::detail diff --git a/cpp/src/join/hash_join/count_kernels.cuh b/cpp/src/join/hash_join/partitioned_count_kernels.cuh similarity index 91% rename from cpp/src/join/hash_join/count_kernels.cuh rename to cpp/src/join/hash_join/partitioned_count_kernels.cuh index 92867799752..4826042da6d 100644 --- a/cpp/src/join/hash_join/count_kernels.cuh +++ b/cpp/src/join/hash_join/partitioned_count_kernels.cuh @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -// Ported from cuco's open_addressing count_each kernel. +// Ported from cuco's open_addressing partitioned_count kernel. #pragma once @@ -20,7 +20,7 @@ namespace cudf::detail { template CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) - count_each_kernel(probe_key_type const* __restrict__ keys, + partitioned_count_kernel(probe_key_type const* __restrict__ keys, cuda::std::int64_t n, size_type* __restrict__ output, Ref ref) @@ -60,7 +60,7 @@ CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) } template -void launch_count_each(probe_key_type const* keys, +void launch_partitioned_count(probe_key_type const* keys, cuda::std::int64_t n, size_type* output, Ref ref, @@ -71,7 +71,7 @@ void launch_count_each(probe_key_type const* keys, auto const config = grid_1d{static_cast(n * DEFAULT_JOIN_CG_SIZE), DEFAULT_JOIN_BLOCK_SIZE}; - count_each_kernel + partitioned_count_kernel <<>>(keys, n, output, ref); } diff --git a/cpp/src/join/hash_join/count_kernels.hpp b/cpp/src/join/hash_join/partitioned_count_kernels.hpp similarity index 83% rename from cpp/src/join/hash_join/count_kernels.hpp rename to cpp/src/join/hash_join/partitioned_count_kernels.hpp index 9ea1e58d0fb..c4eb83f2e05 100644 --- a/cpp/src/join/hash_join/count_kernels.hpp +++ b/cpp/src/join/hash_join/partitioned_count_kernels.hpp @@ -13,9 +13,9 @@ namespace cudf::detail { -/// Launch the count_each kernel. +/// Launch the partitioned_count kernel. template -void launch_count_each(probe_key_type const* keys, +void launch_partitioned_count(probe_key_type const* keys, cuda::std::int64_t n, size_type* output, Ref ref, diff --git a/cpp/src/join/hash_join/count_each_outer.cu b/cpp/src/join/hash_join/partitioned_count_outer.cu similarity index 73% rename from cpp/src/join/hash_join/count_each_outer.cu rename to cpp/src/join/hash_join/partitioned_count_outer.cu index 1dd029b08cd..13831d79a44 100644 --- a/cpp/src/join/hash_join/count_each_outer.cu +++ b/cpp/src/join/hash_join/partitioned_count_outer.cu @@ -3,21 +3,21 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "count_kernels.cuh" +#include "partitioned_count_kernels.cuh" #include "ref_types.cuh" namespace cudf::detail { -template void launch_count_each(probe_key_type const*, +template void launch_partitioned_count(probe_key_type const*, cuda::std::int64_t, size_type*, primitive_count_ref_t, rmm::cuda_stream_view); -template void launch_count_each( +template void launch_partitioned_count( probe_key_type const*, cuda::std::int64_t, size_type*, nested_count_ref_t, rmm::cuda_stream_view); -template void launch_count_each( +template void launch_partitioned_count( probe_key_type const*, cuda::std::int64_t, size_type*, flat_count_ref_t, rmm::cuda_stream_view); } // namespace cudf::detail diff --git a/cpp/src/join/hash_join/ref_types.cuh b/cpp/src/join/hash_join/ref_types.cuh index b05601d930b..bd2fb424064 100644 --- a/cpp/src/join/hash_join/ref_types.cuh +++ b/cpp/src/join/hash_join/ref_types.cuh @@ -26,7 +26,7 @@ using nested_equality_t = pair_equal>>; -// --- Count ref types (used by count_each kernel) --- +// --- Count ref types (used by partitioned_count kernel) --- template using count_ref_t = From 444d121701a5a50354db224cc9cf0a0d8369152c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 28 Apr 2026 23:14:39 +0000 Subject: [PATCH 16/29] Rename retrieve -> partitioned_retrieve kernel and associated files; add kernel doc --- cpp/CMakeLists.txt | 4 +-- cpp/src/join/hash_join/kernels_common.cuh | 2 +- .../hash_join/partitioned_join_retrieve.cu | 8 +++--- .../{retrieve.cu => partitioned_retrieve.cu} | 8 +++--- ...s.cuh => partitioned_retrieve_kernels.cuh} | 27 ++++++++++++++++--- ...s.hpp => partitioned_retrieve_kernels.hpp} | 2 +- ...outer.cu => partitioned_retrieve_outer.cu} | 8 +++--- 7 files changed, 40 insertions(+), 19 deletions(-) rename cpp/src/join/hash_join/{retrieve.cu => partitioned_retrieve.cu} (86%) rename cpp/src/join/hash_join/{retrieve_kernels.cuh => partitioned_retrieve_kernels.cuh} (85%) rename cpp/src/join/hash_join/{retrieve_kernels.hpp => partitioned_retrieve_kernels.hpp} (95%) rename cpp/src/join/hash_join/{retrieve_outer.cu => partitioned_retrieve_outer.cu} (86%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c36cb3693d1..3b14e3a75fc 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -689,8 +689,8 @@ add_library( src/join/hash_join/partitioned_inner_join.cu src/join/hash_join/partitioned_join_retrieve.cu src/join/hash_join/partitioned_left_join.cu - src/join/hash_join/retrieve.cu - src/join/hash_join/retrieve_outer.cu + src/join/hash_join/partitioned_retrieve.cu + src/join/hash_join/partitioned_retrieve_outer.cu src/join/mark_join.cu src/join/filter_join_indices_jit.cu src/join/join.cu diff --git a/cpp/src/join/hash_join/kernels_common.cuh b/cpp/src/join/hash_join/kernels_common.cuh index 36f54ffdca7..c4ac7a4dc1f 100644 --- a/cpp/src/join/hash_join/kernels_common.cuh +++ b/cpp/src/join/hash_join/kernels_common.cuh @@ -6,7 +6,7 @@ // Ported from cuco's open_addressing kernels and ref_impl to give cudf direct // control over hash-join probe kernel launches. The device-side probing logic // is identical to cuco's static_multiset::count / partitioned_count / retrieve / -// retrieve_outer. We keep the cuco ref type for hash-table access (storage, +// partitioned_retrieve_outer. We keep the cuco ref type for hash-table access (storage, // probing scheme, predicate) and only replace the host-side launch. #pragma once diff --git a/cpp/src/join/hash_join/partitioned_join_retrieve.cu b/cpp/src/join/hash_join/partitioned_join_retrieve.cu index 3b573f6c2f2..dd67e3367ef 100644 --- a/cpp/src/join/hash_join/partitioned_join_retrieve.cu +++ b/cpp/src/join/hash_join/partitioned_join_retrieve.cu @@ -7,7 +7,7 @@ #include "dispatch.cuh" #include "join/join_common_utils.cuh" #include "join/join_common_utils.hpp" -#include "retrieve_kernels.hpp" +#include "partitioned_retrieve_kernels.hpp" #include #include @@ -100,7 +100,7 @@ hash_join::partitioned_join_retrieve(join_kind join, // For FULL_JOIN, probe with LEFT_JOIN semantics (no complement here) bool const is_outer = (join != join_kind::INNER_JOIN); - // launch_retrieve computes output size from match counts via exclusive scan + // launch_partitioned_retrieve computes output size from match counts via exclusive scan // (total = last_offset + last_count), allocates output buffers, and launches the kernel. auto const* partition_counts = match_ctx._match_counts->data() + left_start_idx; auto const n = static_cast(partition_size); @@ -123,10 +123,10 @@ hash_join::partitioned_join_retrieve(join_kind join, .rebind_hash_function(_impl->_hash_table.hash_function()); if (is_outer) { - join_indices = launch_retrieve( + join_indices = launch_partitioned_retrieve( probe_keys.data(), n, partition_counts, ref, left_start_idx, stream, mr); } else { - join_indices = launch_retrieve( + join_indices = launch_partitioned_retrieve( probe_keys.data(), n, partition_counts, ref, left_start_idx, stream, mr); } }; diff --git a/cpp/src/join/hash_join/retrieve.cu b/cpp/src/join/hash_join/partitioned_retrieve.cu similarity index 86% rename from cpp/src/join/hash_join/retrieve.cu rename to cpp/src/join/hash_join/partitioned_retrieve.cu index fa446f5d697..b62f8a15e4b 100644 --- a/cpp/src/join/hash_join/retrieve.cu +++ b/cpp/src/join/hash_join/partitioned_retrieve.cu @@ -4,13 +4,13 @@ */ #include "ref_types.cuh" -#include "retrieve_kernels.cuh" +#include "partitioned_retrieve_kernels.cuh" namespace cudf::detail { template std::pair>, std::unique_ptr>> -launch_retrieve(probe_key_type const*, +launch_partitioned_retrieve(probe_key_type const*, cuda::std::int64_t, size_type const*, primitive_count_ref_t, @@ -20,7 +20,7 @@ launch_retrieve(probe_key_type const*, template std::pair>, std::unique_ptr>> -launch_retrieve(probe_key_type const*, +launch_partitioned_retrieve(probe_key_type const*, cuda::std::int64_t, size_type const*, nested_count_ref_t, @@ -30,7 +30,7 @@ launch_retrieve(probe_key_type const*, template std::pair>, std::unique_ptr>> -launch_retrieve(probe_key_type const*, +launch_partitioned_retrieve(probe_key_type const*, cuda::std::int64_t, size_type const*, flat_count_ref_t, diff --git a/cpp/src/join/hash_join/retrieve_kernels.cuh b/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh similarity index 85% rename from cpp/src/join/hash_join/retrieve_kernels.cuh rename to cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh index 6db9165ee45..bc160c9d569 100644 --- a/cpp/src/join/hash_join/retrieve_kernels.cuh +++ b/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh @@ -41,9 +41,30 @@ __device__ __forceinline__ int count_lower_set_bits(unsigned int mask, int pos) namespace cudf::detail { +/** + * @brief Retrieve matching build-side rows for each probe key. + * + * Each probing tile (@p cg_size threads) walks the hash table for one probe key, + * collecting matches via warp ballot. Matches are staged in a per-flushing-tile (warp) + * shared-memory buffer instead of being written directly to global memory. When the buffer + * nears capacity, the flushing tile claims a contiguous range in the global output arrays + * via a single atomic and flushes with coalesced writes, amortising atomic overhead across + * many matches. If @p IsOuter is true, probe rows with no matches emit a + * `(left_index, JoinNoMatch)` pair. + * + * @tparam IsOuter If true, unmatched probe rows emit a null-padded output row + * @tparam Ref cuco open-addressing reference type (carries hash, equality, storage) + * @param input_probe Packed probe keys: `.first` = hash, `.second` = probe row index + * @param n Number of probe keys + * @param left_offset Added to each probe row index to produce an absolute left index + * @param left_output Output buffer for left (probe-side) row indices + * @param right_output Output buffer for right (build-side) row indices + * @param output_counter Global atomic counter tracking total pairs written so far + * @param ref cuco hash-table reference for probing + */ template CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) - retrieve_kernel(probe_key_type const* __restrict__ input_probe, + partitioned_retrieve_kernel(probe_key_type const* __restrict__ input_probe, cuda::std::int64_t n, size_type left_offset, size_type* __restrict__ left_output, @@ -200,7 +221,7 @@ CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) template std::pair>, std::unique_ptr>> -launch_retrieve(probe_key_type const* keys, +launch_partitioned_retrieve(probe_key_type const* keys, cuda::std::int64_t n, size_type const* match_counts, Ref ref, @@ -235,7 +256,7 @@ launch_retrieve(probe_key_type const* keys, auto constexpr tiles_in_block = DEFAULT_JOIN_BLOCK_SIZE / Ref::cg_size; auto const num_blocks = static_cast((n + tiles_in_block - 1) / tiles_in_block); - retrieve_kernel<<>>( + partitioned_retrieve_kernel<<>>( keys, n, left_offset, left_indices->data(), right_indices->data(), output_counter.data(), ref); return std::pair(std::move(left_indices), std::move(right_indices)); diff --git a/cpp/src/join/hash_join/retrieve_kernels.hpp b/cpp/src/join/hash_join/partitioned_retrieve_kernels.hpp similarity index 95% rename from cpp/src/join/hash_join/retrieve_kernels.hpp rename to cpp/src/join/hash_join/partitioned_retrieve_kernels.hpp index 01431b97bca..95982784594 100644 --- a/cpp/src/join/hash_join/retrieve_kernels.hpp +++ b/cpp/src/join/hash_join/partitioned_retrieve_kernels.hpp @@ -31,7 +31,7 @@ namespace cudf::detail { template std::pair>, std::unique_ptr>> -launch_retrieve(probe_key_type const* keys, +launch_partitioned_retrieve(probe_key_type const* keys, cuda::std::int64_t n, size_type const* match_counts, Ref ref, diff --git a/cpp/src/join/hash_join/retrieve_outer.cu b/cpp/src/join/hash_join/partitioned_retrieve_outer.cu similarity index 86% rename from cpp/src/join/hash_join/retrieve_outer.cu rename to cpp/src/join/hash_join/partitioned_retrieve_outer.cu index 07d20c98c00..8f691bbded0 100644 --- a/cpp/src/join/hash_join/retrieve_outer.cu +++ b/cpp/src/join/hash_join/partitioned_retrieve_outer.cu @@ -4,13 +4,13 @@ */ #include "ref_types.cuh" -#include "retrieve_kernels.cuh" +#include "partitioned_retrieve_kernels.cuh" namespace cudf::detail { template std::pair>, std::unique_ptr>> -launch_retrieve(probe_key_type const*, +launch_partitioned_retrieve(probe_key_type const*, cuda::std::int64_t, size_type const*, primitive_count_ref_t, @@ -20,7 +20,7 @@ launch_retrieve(probe_key_type const*, template std::pair>, std::unique_ptr>> -launch_retrieve(probe_key_type const*, +launch_partitioned_retrieve(probe_key_type const*, cuda::std::int64_t, size_type const*, nested_count_ref_t, @@ -30,7 +30,7 @@ launch_retrieve(probe_key_type const*, template std::pair>, std::unique_ptr>> -launch_retrieve(probe_key_type const*, +launch_partitioned_retrieve(probe_key_type const*, cuda::std::int64_t, size_type const*, flat_count_ref_t, From 04bcf07878b74cf0a26811e47ef5a7fb16ac6a76 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 28 Apr 2026 23:25:40 +0000 Subject: [PATCH 17/29] Rename full_join_finalize.cpp -> partitioned_full_join_finalize.cpp --- cpp/CMakeLists.txt | 2 +- ...ull_join_finalize.cpp => partitioned_full_join_finalize.cpp} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cpp/src/join/hash_join/{full_join_finalize.cpp => partitioned_full_join_finalize.cpp} (100%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3b14e3a75fc..e4bca14c5e8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -670,7 +670,6 @@ add_library( src/join/filter_join_indices_kernel_null_primitive.cu src/join/filter_join_indices_kernel_primitive.cu src/join/filtered_join.cu - src/join/hash_join/full_join_finalize.cpp src/join/hash_join/full_join_match_context.cpp src/join/hash_join/full_join_retrieve.cu src/join/hash_join/full_join_size.cu @@ -686,6 +685,7 @@ add_library( src/join/hash_join/partitioned_count.cu src/join/hash_join/partitioned_count_outer.cu src/join/hash_join/partitioned_full_join.cu + src/join/hash_join/partitioned_full_join_finalize.cpp src/join/hash_join/partitioned_inner_join.cu src/join/hash_join/partitioned_join_retrieve.cu src/join/hash_join/partitioned_left_join.cu diff --git a/cpp/src/join/hash_join/full_join_finalize.cpp b/cpp/src/join/hash_join/partitioned_full_join_finalize.cpp similarity index 100% rename from cpp/src/join/hash_join/full_join_finalize.cpp rename to cpp/src/join/hash_join/partitioned_full_join_finalize.cpp From f656c0b06736090e4bc8d50132febe6202c37f0f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 28 Apr 2026 23:27:28 +0000 Subject: [PATCH 18/29] Rename to finalize_partitioned_full_join.cpp to match API name --- cpp/CMakeLists.txt | 2 +- ...ull_join_finalize.cpp => finalize_partitioned_full_join.cpp} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cpp/src/join/hash_join/{partitioned_full_join_finalize.cpp => finalize_partitioned_full_join.cpp} (100%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e4bca14c5e8..d1f0bf83b52 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -670,6 +670,7 @@ add_library( src/join/filter_join_indices_kernel_null_primitive.cu src/join/filter_join_indices_kernel_primitive.cu src/join/filtered_join.cu + src/join/hash_join/finalize_partitioned_full_join.cpp src/join/hash_join/full_join_match_context.cpp src/join/hash_join/full_join_retrieve.cu src/join/hash_join/full_join_size.cu @@ -685,7 +686,6 @@ add_library( src/join/hash_join/partitioned_count.cu src/join/hash_join/partitioned_count_outer.cu src/join/hash_join/partitioned_full_join.cu - src/join/hash_join/partitioned_full_join_finalize.cpp src/join/hash_join/partitioned_inner_join.cu src/join/hash_join/partitioned_join_retrieve.cu src/join/hash_join/partitioned_left_join.cu diff --git a/cpp/src/join/hash_join/partitioned_full_join_finalize.cpp b/cpp/src/join/hash_join/finalize_partitioned_full_join.cpp similarity index 100% rename from cpp/src/join/hash_join/partitioned_full_join_finalize.cpp rename to cpp/src/join/hash_join/finalize_partitioned_full_join.cpp From 3e6c606bca9b0a1c08289c1716bb34d39abed41e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 28 Apr 2026 23:32:38 +0000 Subject: [PATCH 19/29] Clean up stale comments: remove ported language, fix exclusive scan -> reduce --- cpp/src/join/hash_join/kernels_common.cuh | 7 ++----- cpp/src/join/hash_join/partitioned_count_kernels.cuh | 1 - cpp/src/join/hash_join/partitioned_join_retrieve.cu | 2 +- cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh | 3 --- cpp/src/join/hash_join/partitioned_retrieve_kernels.hpp | 5 ++--- 5 files changed, 5 insertions(+), 13 deletions(-) diff --git a/cpp/src/join/hash_join/kernels_common.cuh b/cpp/src/join/hash_join/kernels_common.cuh index c4ac7a4dc1f..cd74a5bde4e 100644 --- a/cpp/src/join/hash_join/kernels_common.cuh +++ b/cpp/src/join/hash_join/kernels_common.cuh @@ -3,11 +3,8 @@ * SPDX-License-Identifier: Apache-2.0 */ -// Ported from cuco's open_addressing kernels and ref_impl to give cudf direct -// control over hash-join probe kernel launches. The device-side probing logic -// is identical to cuco's static_multiset::count / partitioned_count / retrieve / -// partitioned_retrieve_outer. We keep the cuco ref type for hash-table access (storage, -// probing scheme, predicate) and only replace the host-side launch. +// Custom hash-join probe kernels that give cudf direct control over kernel launches. +// Uses the cuco ref type for hash-table access (storage, probing scheme, predicate). #pragma once diff --git a/cpp/src/join/hash_join/partitioned_count_kernels.cuh b/cpp/src/join/hash_join/partitioned_count_kernels.cuh index 4826042da6d..503d985551a 100644 --- a/cpp/src/join/hash_join/partitioned_count_kernels.cuh +++ b/cpp/src/join/hash_join/partitioned_count_kernels.cuh @@ -3,7 +3,6 @@ * SPDX-License-Identifier: Apache-2.0 */ -// Ported from cuco's open_addressing partitioned_count kernel. #pragma once diff --git a/cpp/src/join/hash_join/partitioned_join_retrieve.cu b/cpp/src/join/hash_join/partitioned_join_retrieve.cu index dd67e3367ef..b03d728b2e3 100644 --- a/cpp/src/join/hash_join/partitioned_join_retrieve.cu +++ b/cpp/src/join/hash_join/partitioned_join_retrieve.cu @@ -100,7 +100,7 @@ hash_join::partitioned_join_retrieve(join_kind join, // For FULL_JOIN, probe with LEFT_JOIN semantics (no complement here) bool const is_outer = (join != join_kind::INNER_JOIN); - // launch_partitioned_retrieve computes output size from match counts via exclusive scan + // launch_partitioned_retrieve reduces match counts to compute output size // (total = last_offset + last_count), allocates output buffers, and launches the kernel. auto const* partition_counts = match_ctx._match_counts->data() + left_start_idx; auto const n = static_cast(partition_size); diff --git a/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh b/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh index bc160c9d569..f3293dba0f0 100644 --- a/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh +++ b/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh @@ -3,9 +3,6 @@ * SPDX-License-Identifier: Apache-2.0 */ -// Hash join retrieve kernel ported from cuco's open_addressing retrieve. -// Uses a shared-memory buffer per flushing tile (warp) to coalesce global -// output writes and amortize the global atomic counter across many matches. #pragma once diff --git a/cpp/src/join/hash_join/partitioned_retrieve_kernels.hpp b/cpp/src/join/hash_join/partitioned_retrieve_kernels.hpp index 95982784594..865e6415cd8 100644 --- a/cpp/src/join/hash_join/partitioned_retrieve_kernels.hpp +++ b/cpp/src/join/hash_join/partitioned_retrieve_kernels.hpp @@ -20,9 +20,8 @@ namespace cudf::detail { /** * @brief Probes the hash table for each key and writes matching index pairs. * - * Internally computes per-row output offsets via exclusive scan on match_counts, - * derives the total output size, allocates output buffers, and launches the - * retrieve kernel. `left_offset` is added to each stored probe-row index when + * Reduces match_counts to derive the total output size, allocates output buffers, + * and launches the retrieve kernel. `left_offset` is added to each stored probe-row index when * writing to `left_indices`, so callers can produce indices in the full probe * table's coordinate space directly from a slice-local `keys` array. * From ed8ad9321de29f986f8cb3f90015ae7968e977e7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 28 Apr 2026 23:34:31 +0000 Subject: [PATCH 20/29] Add doc comment to partitioned_count_kernel --- .../hash_join/partitioned_count_kernels.cuh | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/cpp/src/join/hash_join/partitioned_count_kernels.cuh b/cpp/src/join/hash_join/partitioned_count_kernels.cuh index 503d985551a..58a96574f0a 100644 --- a/cpp/src/join/hash_join/partitioned_count_kernels.cuh +++ b/cpp/src/join/hash_join/partitioned_count_kernels.cuh @@ -17,6 +17,26 @@ namespace cudf::detail { +/** + * @brief Count matching build-side rows for each probe key. + * + * Each probing tile (@p cg_size threads) calls `ref.count()` for one probe key + * and reduces the per-lane counts across the tile with a warp reduce. The result + * is written to @p output by a single elected thread via `invoke_one`. If + * @p IsOuter is true, keys with zero matches are recorded as 1 so every probe + * row contributes at least one output row in the subsequent retrieve pass. + * + * This is the first phase of the two-phase partitioned join: count then retrieve. + * The output array is consumed by `launch_partitioned_retrieve` to pre-allocate + * the output index buffers. + * + * @tparam IsOuter If true, zero-match keys produce a count of 1 + * @tparam Ref cuco open-addressing reference type (carries hash, equality, storage) + * @param keys Packed probe keys: `.first` = hash, `.second` = probe row index + * @param n Number of probe keys + * @param output Per-key match count output (one entry per probe key) + * @param ref cuco hash-table reference for counting + */ template CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) partitioned_count_kernel(probe_key_type const* __restrict__ keys, From bb86c185144a787b773b2a65e88ad5ddde7a19b8 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 28 Apr 2026 23:46:19 +0000 Subject: [PATCH 21/29] Formatting --- cpp/include/cudf/join/hash_join.hpp | 17 +++++---- cpp/src/join/hash_join/match_context.cu | 2 +- cpp/src/join/hash_join/partitioned_count.cu | 8 ++-- .../hash_join/partitioned_count_kernels.cuh | 19 +++++----- .../hash_join/partitioned_count_kernels.hpp | 8 ++-- .../join/hash_join/partitioned_count_outer.cu | 8 ++-- .../join/hash_join/partitioned_retrieve.cu | 38 +++++++++---------- .../partitioned_retrieve_kernels.cuh | 25 ++++++------ .../partitioned_retrieve_kernels.hpp | 12 +++--- .../hash_join/partitioned_retrieve_outer.cu | 38 +++++++++---------- cpp/src/join/join_common_utils.hpp | 4 +- cpp/tests/join/join_tests.cpp | 10 ++--- 12 files changed, 95 insertions(+), 94 deletions(-) diff --git a/cpp/include/cudf/join/hash_join.hpp b/cpp/include/cudf/join/hash_join.hpp index 905a3257ee8..93acf635f24 100644 --- a/cpp/include/cudf/join/hash_join.hpp +++ b/cpp/include/cudf/join/hash_join.hpp @@ -362,8 +362,8 @@ class hash_join { * previously created by calling full_join_match_context(). * * @note This method does NOT include unmatched build rows (the complement). After all - * partitions have been processed, pass the collected results to `finalize_partitioned_full_join()` to - * obtain the complete full join output. + * partitions have been processed, pass the collected results to + * `finalize_partitioned_full_join()` to obtain the complete full join output. * * The returned left_indices are relative to the original complete probe table. * @@ -401,12 +401,13 @@ class hash_join { */ [[nodiscard]] static std::pair>, std::unique_ptr>> - finalize_partitioned_full_join(cudf::host_span const> left_partials, - cudf::host_span const> right_partials, - size_type probe_table_num_rows, - size_type build_table_num_rows, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + finalize_partitioned_full_join( + cudf::host_span const> left_partials, + cudf::host_span const> right_partials, + size_type probe_table_num_rows, + size_type build_table_num_rows, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); private: std::unique_ptr _impl; diff --git a/cpp/src/join/hash_join/match_context.cu b/cpp/src/join/hash_join/match_context.cu index eb5f933e164..a2aed529d3b 100644 --- a/cpp/src/join/hash_join/match_context.cu +++ b/cpp/src/join/hash_join/match_context.cu @@ -4,9 +4,9 @@ */ #include "common.cuh" -#include "partitioned_count_kernels.hpp" #include "dispatch.cuh" #include "join/join_common_utils.cuh" +#include "partitioned_count_kernels.hpp" #include diff --git a/cpp/src/join/hash_join/partitioned_count.cu b/cpp/src/join/hash_join/partitioned_count.cu index b430661dc56..176370690be 100644 --- a/cpp/src/join/hash_join/partitioned_count.cu +++ b/cpp/src/join/hash_join/partitioned_count.cu @@ -9,10 +9,10 @@ namespace cudf::detail { template void launch_partitioned_count(probe_key_type const*, - cuda::std::int64_t, - size_type*, - primitive_count_ref_t, - rmm::cuda_stream_view); + cuda::std::int64_t, + size_type*, + primitive_count_ref_t, + rmm::cuda_stream_view); template void launch_partitioned_count( probe_key_type const*, cuda::std::int64_t, size_type*, nested_count_ref_t, rmm::cuda_stream_view); diff --git a/cpp/src/join/hash_join/partitioned_count_kernels.cuh b/cpp/src/join/hash_join/partitioned_count_kernels.cuh index 58a96574f0a..2c981c2bc6d 100644 --- a/cpp/src/join/hash_join/partitioned_count_kernels.cuh +++ b/cpp/src/join/hash_join/partitioned_count_kernels.cuh @@ -3,7 +3,6 @@ * SPDX-License-Identifier: Apache-2.0 */ - #pragma once #include "kernels_common.cuh" @@ -40,9 +39,9 @@ namespace cudf::detail { template CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) partitioned_count_kernel(probe_key_type const* __restrict__ keys, - cuda::std::int64_t n, - size_type* __restrict__ output, - Ref ref) + cuda::std::int64_t n, + size_type* __restrict__ output, + Ref ref) { auto constexpr cg_size = DEFAULT_JOIN_CG_SIZE; @@ -63,7 +62,9 @@ CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); if constexpr (IsOuter) { auto temp_count = static_cast(ref.count(tile, key)); - if (tile.all(temp_count == 0)) { cooperative_groups::invoke_one(tile, [&]() { ++temp_count; }); } + if (tile.all(temp_count == 0)) { + cooperative_groups::invoke_one(tile, [&]() { ++temp_count; }); + } auto const cnt = cooperative_groups::reduce(tile, temp_count, cooperative_groups::plus()); cooperative_groups::invoke_one(tile, [&]() { output[idx] = cnt; }); @@ -80,10 +81,10 @@ CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) template void launch_partitioned_count(probe_key_type const* keys, - cuda::std::int64_t n, - size_type* output, - Ref ref, - rmm::cuda_stream_view stream) + cuda::std::int64_t n, + size_type* output, + Ref ref, + rmm::cuda_stream_view stream) { if (n == 0) { return; } diff --git a/cpp/src/join/hash_join/partitioned_count_kernels.hpp b/cpp/src/join/hash_join/partitioned_count_kernels.hpp index c4eb83f2e05..f83f611e86b 100644 --- a/cpp/src/join/hash_join/partitioned_count_kernels.hpp +++ b/cpp/src/join/hash_join/partitioned_count_kernels.hpp @@ -16,9 +16,9 @@ namespace cudf::detail { /// Launch the partitioned_count kernel. template void launch_partitioned_count(probe_key_type const* keys, - cuda::std::int64_t n, - size_type* output, - Ref ref, - rmm::cuda_stream_view stream); + cuda::std::int64_t n, + size_type* output, + Ref ref, + rmm::cuda_stream_view stream); } // namespace cudf::detail diff --git a/cpp/src/join/hash_join/partitioned_count_outer.cu b/cpp/src/join/hash_join/partitioned_count_outer.cu index 13831d79a44..60299f34ea3 100644 --- a/cpp/src/join/hash_join/partitioned_count_outer.cu +++ b/cpp/src/join/hash_join/partitioned_count_outer.cu @@ -9,10 +9,10 @@ namespace cudf::detail { template void launch_partitioned_count(probe_key_type const*, - cuda::std::int64_t, - size_type*, - primitive_count_ref_t, - rmm::cuda_stream_view); + cuda::std::int64_t, + size_type*, + primitive_count_ref_t, + rmm::cuda_stream_view); template void launch_partitioned_count( probe_key_type const*, cuda::std::int64_t, size_type*, nested_count_ref_t, rmm::cuda_stream_view); diff --git a/cpp/src/join/hash_join/partitioned_retrieve.cu b/cpp/src/join/hash_join/partitioned_retrieve.cu index b62f8a15e4b..d95c57f20d6 100644 --- a/cpp/src/join/hash_join/partitioned_retrieve.cu +++ b/cpp/src/join/hash_join/partitioned_retrieve.cu @@ -3,39 +3,39 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "ref_types.cuh" #include "partitioned_retrieve_kernels.cuh" +#include "ref_types.cuh" namespace cudf::detail { template std::pair>, std::unique_ptr>> launch_partitioned_retrieve(probe_key_type const*, - cuda::std::int64_t, - size_type const*, - primitive_count_ref_t, - size_type, - rmm::cuda_stream_view, - rmm::device_async_resource_ref); + cuda::std::int64_t, + size_type const*, + primitive_count_ref_t, + size_type, + rmm::cuda_stream_view, + rmm::device_async_resource_ref); template std::pair>, std::unique_ptr>> launch_partitioned_retrieve(probe_key_type const*, - cuda::std::int64_t, - size_type const*, - nested_count_ref_t, - size_type, - rmm::cuda_stream_view, - rmm::device_async_resource_ref); + cuda::std::int64_t, + size_type const*, + nested_count_ref_t, + size_type, + rmm::cuda_stream_view, + rmm::device_async_resource_ref); template std::pair>, std::unique_ptr>> launch_partitioned_retrieve(probe_key_type const*, - cuda::std::int64_t, - size_type const*, - flat_count_ref_t, - size_type, - rmm::cuda_stream_view, - rmm::device_async_resource_ref); + cuda::std::int64_t, + size_type const*, + flat_count_ref_t, + size_type, + rmm::cuda_stream_view, + rmm::device_async_resource_ref); } // namespace cudf::detail diff --git a/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh b/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh index f3293dba0f0..0c7f9dc9773 100644 --- a/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh +++ b/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh @@ -3,7 +3,6 @@ * SPDX-License-Identifier: Apache-2.0 */ - #pragma once #include "kernels_common.cuh" @@ -62,12 +61,12 @@ namespace cudf::detail { template CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) partitioned_retrieve_kernel(probe_key_type const* __restrict__ input_probe, - cuda::std::int64_t n, - size_type left_offset, - size_type* __restrict__ left_output, - size_type* __restrict__ right_output, - size_type* __restrict__ output_counter, - Ref ref) + cuda::std::int64_t n, + size_type left_offset, + size_type* __restrict__ left_output, + size_type* __restrict__ right_output, + size_type* __restrict__ output_counter, + Ref ref) { namespace cg = cooperative_groups; @@ -219,12 +218,12 @@ template std::pair>, std::unique_ptr>> launch_partitioned_retrieve(probe_key_type const* keys, - cuda::std::int64_t n, - size_type const* match_counts, - Ref ref, - size_type left_offset, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) + cuda::std::int64_t n, + size_type const* match_counts, + Ref ref, + size_type left_offset, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { if (n == 0) { return std::pair(std::make_unique>(0, stream, mr), diff --git a/cpp/src/join/hash_join/partitioned_retrieve_kernels.hpp b/cpp/src/join/hash_join/partitioned_retrieve_kernels.hpp index 865e6415cd8..875b776b64e 100644 --- a/cpp/src/join/hash_join/partitioned_retrieve_kernels.hpp +++ b/cpp/src/join/hash_join/partitioned_retrieve_kernels.hpp @@ -31,11 +31,11 @@ template std::pair>, std::unique_ptr>> launch_partitioned_retrieve(probe_key_type const* keys, - cuda::std::int64_t n, - size_type const* match_counts, - Ref ref, - size_type left_offset, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); + cuda::std::int64_t n, + size_type const* match_counts, + Ref ref, + size_type left_offset, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); } // namespace cudf::detail diff --git a/cpp/src/join/hash_join/partitioned_retrieve_outer.cu b/cpp/src/join/hash_join/partitioned_retrieve_outer.cu index 8f691bbded0..4e8fde1dc4d 100644 --- a/cpp/src/join/hash_join/partitioned_retrieve_outer.cu +++ b/cpp/src/join/hash_join/partitioned_retrieve_outer.cu @@ -3,39 +3,39 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "ref_types.cuh" #include "partitioned_retrieve_kernels.cuh" +#include "ref_types.cuh" namespace cudf::detail { template std::pair>, std::unique_ptr>> launch_partitioned_retrieve(probe_key_type const*, - cuda::std::int64_t, - size_type const*, - primitive_count_ref_t, - size_type, - rmm::cuda_stream_view, - rmm::device_async_resource_ref); + cuda::std::int64_t, + size_type const*, + primitive_count_ref_t, + size_type, + rmm::cuda_stream_view, + rmm::device_async_resource_ref); template std::pair>, std::unique_ptr>> launch_partitioned_retrieve(probe_key_type const*, - cuda::std::int64_t, - size_type const*, - nested_count_ref_t, - size_type, - rmm::cuda_stream_view, - rmm::device_async_resource_ref); + cuda::std::int64_t, + size_type const*, + nested_count_ref_t, + size_type, + rmm::cuda_stream_view, + rmm::device_async_resource_ref); template std::pair>, std::unique_ptr>> launch_partitioned_retrieve(probe_key_type const*, - cuda::std::int64_t, - size_type const*, - flat_count_ref_t, - size_type, - rmm::cuda_stream_view, - rmm::device_async_resource_ref); + cuda::std::int64_t, + size_type const*, + flat_count_ref_t, + size_type, + rmm::cuda_stream_view, + rmm::device_async_resource_ref); } // namespace cudf::detail diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 461c43c4d10..6af7047fbeb 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -74,8 +74,8 @@ VectorPair finalize_full_join(VectorPair&& probe_indices, * and appends the complement (unmatched build rows paired with `JoinNoMatch`) into the tail. * Internally delegates to the `VectorPair&&` overload, so the mark/compact path is shared. * - * Used by `cudf::hash_join::finalize_partitioned_full_join` for partitioned full joins where the partials - * live in separate buffers and must be gathered. + * Used by `cudf::hash_join::finalize_partitioned_full_join` for partitioned full joins where the + * partials live in separate buffers and must be gathered. * * @param left_partials Per-partition probe-side (left) index spans. * @param right_partials Per-partition probe-side (right) index spans. diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 9051e216129..9bdbe705541 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -3400,11 +3400,11 @@ TEST_F(JoinTest, HashJoinPartitionedFullJoin) auto [final_left, final_right] = cudf::hash_join::finalize_partitioned_full_join(left_partials, - right_partials, - t0.select(left_on).num_rows(), - t1.select(right_on).num_rows(), - stream, - mr); + right_partials, + t0.select(left_on).num_rows(), + t1.select(right_on).num_rows(), + stream, + mr); auto left_col = cudf::column_view{cudf::device_span{*final_left}}; auto right_col = cudf::column_view{cudf::device_span{*final_right}}; From 43d0e822202047a19724da7f38bf57e908a1f8a5 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 4 May 2026 23:34:00 +0000 Subject: [PATCH 22/29] Address shrshi review nits and rename cnt -> match_count --- .../hash_join/partitioned_count_kernels.cuh | 26 ++++++++++--------- .../partitioned_retrieve_kernels.cuh | 6 +---- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/cpp/src/join/hash_join/partitioned_count_kernels.cuh b/cpp/src/join/hash_join/partitioned_count_kernels.cuh index 2c981c2bc6d..75e61b4a4f9 100644 --- a/cpp/src/join/hash_join/partitioned_count_kernels.cuh +++ b/cpp/src/join/hash_join/partitioned_count_kernels.cuh @@ -51,28 +51,30 @@ CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) while (idx < n) { auto const key = keys[idx]; if constexpr (cg_size == 1) { - auto const cnt = ref.count(key); + auto const match_count = ref.count(key); if constexpr (IsOuter) { - output[idx] = (cnt == 0) ? size_type{1} : cnt; + output[idx] = (match_count == 0) ? size_type{1} : match_count; } else { - output[idx] = cnt; + output[idx] = match_count; } } else { auto const tile = cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); if constexpr (IsOuter) { - auto temp_count = static_cast(ref.count(tile, key)); + auto const temp_count = static_cast(ref.count(tile, key)); if (tile.all(temp_count == 0)) { - cooperative_groups::invoke_one(tile, [&]() { ++temp_count; }); + cooperative_groups::invoke_one(tile, [&]() { output[idx] = size_type{1}; }); + } else { + auto const match_count = + cooperative_groups::reduce(tile, temp_count, cooperative_groups::plus()); + cooperative_groups::invoke_one(tile, [&]() { output[idx] = match_count; }); } - auto const cnt = - cooperative_groups::reduce(tile, temp_count, cooperative_groups::plus()); - cooperative_groups::invoke_one(tile, [&]() { output[idx] = cnt; }); } else { - auto const cnt = cooperative_groups::reduce(tile, - static_cast(ref.count(tile, key)), - cooperative_groups::plus()); - cooperative_groups::invoke_one(tile, [&]() { output[idx] = cnt; }); + auto const match_count = + cooperative_groups::reduce(tile, + static_cast(ref.count(tile, key)), + cooperative_groups::plus()); + cooperative_groups::invoke_one(tile, [&]() { output[idx] = match_count; }); } } idx += stride; diff --git a/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh b/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh index 0c7f9dc9773..93e563f19bb 100644 --- a/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh +++ b/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh @@ -23,7 +23,7 @@ #include #include -namespace { +namespace cudf::detail { /** * @brief Count the number of set bits below a given position in a bitmask. @@ -33,10 +33,6 @@ __device__ __forceinline__ int count_lower_set_bits(unsigned int mask, int pos) return cuda::std::popcount(mask & ((1u << pos) - 1)); } -} // namespace - -namespace cudf::detail { - /** * @brief Retrieve matching build-side rows for each probe key. * From 7be83f3cfb61144ac8dac745b3cbd7326a92504b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 8 May 2026 23:45:52 +0000 Subject: [PATCH 23/29] Replace raw int64_t with thread_index_type --- cpp/src/join/hash_join/match_context.cu | 2 +- cpp/src/join/hash_join/partitioned_count.cu | 6 +++--- .../hash_join/partitioned_count_kernels.cuh | 4 ++-- .../hash_join/partitioned_count_kernels.hpp | 4 +--- .../join/hash_join/partitioned_count_outer.cu | 6 +++--- .../hash_join/partitioned_join_retrieve.cu | 2 +- .../join/hash_join/partitioned_retrieve.cu | 6 +++--- .../partitioned_retrieve_kernels.cuh | 20 +++++++++---------- .../partitioned_retrieve_kernels.hpp | 2 +- .../hash_join/partitioned_retrieve_outer.cu | 6 +++--- 10 files changed, 27 insertions(+), 31 deletions(-) diff --git a/cpp/src/join/hash_join/match_context.cu b/cpp/src/join/hash_join/match_context.cu index a2aed529d3b..af14faecb8b 100644 --- a/cpp/src/join/hash_join/match_context.cu +++ b/cpp/src/join/hash_join/match_context.cu @@ -51,7 +51,7 @@ std::unique_ptr> make_join_match_counts( auto count_matches = [&](auto equality, auto d_hasher) { // Precompute probe keys: {hash(row_idx), row_idx} for each probe row. - auto const n = static_cast(probe_table_num_rows); + auto const n = static_cast(probe_table_num_rows); rmm::device_uvector probe_keys(n, stream); thrust::transform(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), cuda::counting_iterator(0), diff --git a/cpp/src/join/hash_join/partitioned_count.cu b/cpp/src/join/hash_join/partitioned_count.cu index 176370690be..67af2bf0d05 100644 --- a/cpp/src/join/hash_join/partitioned_count.cu +++ b/cpp/src/join/hash_join/partitioned_count.cu @@ -9,15 +9,15 @@ namespace cudf::detail { template void launch_partitioned_count(probe_key_type const*, - cuda::std::int64_t, + thread_index_type, size_type*, primitive_count_ref_t, rmm::cuda_stream_view); template void launch_partitioned_count( - probe_key_type const*, cuda::std::int64_t, size_type*, nested_count_ref_t, rmm::cuda_stream_view); + probe_key_type const*, thread_index_type, size_type*, nested_count_ref_t, rmm::cuda_stream_view); template void launch_partitioned_count( - probe_key_type const*, cuda::std::int64_t, size_type*, flat_count_ref_t, rmm::cuda_stream_view); + probe_key_type const*, thread_index_type, size_type*, flat_count_ref_t, rmm::cuda_stream_view); } // namespace cudf::detail diff --git a/cpp/src/join/hash_join/partitioned_count_kernels.cuh b/cpp/src/join/hash_join/partitioned_count_kernels.cuh index 75e61b4a4f9..bcdc462c3b0 100644 --- a/cpp/src/join/hash_join/partitioned_count_kernels.cuh +++ b/cpp/src/join/hash_join/partitioned_count_kernels.cuh @@ -39,7 +39,7 @@ namespace cudf::detail { template CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) partitioned_count_kernel(probe_key_type const* __restrict__ keys, - cuda::std::int64_t n, + thread_index_type n, size_type* __restrict__ output, Ref ref) { @@ -83,7 +83,7 @@ CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) template void launch_partitioned_count(probe_key_type const* keys, - cuda::std::int64_t n, + thread_index_type n, size_type* output, Ref ref, rmm::cuda_stream_view stream) diff --git a/cpp/src/join/hash_join/partitioned_count_kernels.hpp b/cpp/src/join/hash_join/partitioned_count_kernels.hpp index f83f611e86b..d6bb30b7f00 100644 --- a/cpp/src/join/hash_join/partitioned_count_kernels.hpp +++ b/cpp/src/join/hash_join/partitioned_count_kernels.hpp @@ -9,14 +9,12 @@ #include -#include - namespace cudf::detail { /// Launch the partitioned_count kernel. template void launch_partitioned_count(probe_key_type const* keys, - cuda::std::int64_t n, + thread_index_type n, size_type* output, Ref ref, rmm::cuda_stream_view stream); diff --git a/cpp/src/join/hash_join/partitioned_count_outer.cu b/cpp/src/join/hash_join/partitioned_count_outer.cu index 60299f34ea3..3f4204cb1fa 100644 --- a/cpp/src/join/hash_join/partitioned_count_outer.cu +++ b/cpp/src/join/hash_join/partitioned_count_outer.cu @@ -9,15 +9,15 @@ namespace cudf::detail { template void launch_partitioned_count(probe_key_type const*, - cuda::std::int64_t, + thread_index_type, size_type*, primitive_count_ref_t, rmm::cuda_stream_view); template void launch_partitioned_count( - probe_key_type const*, cuda::std::int64_t, size_type*, nested_count_ref_t, rmm::cuda_stream_view); + probe_key_type const*, thread_index_type, size_type*, nested_count_ref_t, rmm::cuda_stream_view); template void launch_partitioned_count( - probe_key_type const*, cuda::std::int64_t, size_type*, flat_count_ref_t, rmm::cuda_stream_view); + probe_key_type const*, thread_index_type, size_type*, flat_count_ref_t, rmm::cuda_stream_view); } // namespace cudf::detail diff --git a/cpp/src/join/hash_join/partitioned_join_retrieve.cu b/cpp/src/join/hash_join/partitioned_join_retrieve.cu index b03d728b2e3..16556e5f403 100644 --- a/cpp/src/join/hash_join/partitioned_join_retrieve.cu +++ b/cpp/src/join/hash_join/partitioned_join_retrieve.cu @@ -103,7 +103,7 @@ hash_join::partitioned_join_retrieve(join_kind join, // launch_partitioned_retrieve reduces match counts to compute output size // (total = last_offset + last_count), allocates output buffers, and launches the kernel. auto const* partition_counts = match_ctx._match_counts->data() + left_start_idx; - auto const n = static_cast(partition_size); + auto const n = static_cast(partition_size); std::pair>, std::unique_ptr>> diff --git a/cpp/src/join/hash_join/partitioned_retrieve.cu b/cpp/src/join/hash_join/partitioned_retrieve.cu index d95c57f20d6..efb200f4c89 100644 --- a/cpp/src/join/hash_join/partitioned_retrieve.cu +++ b/cpp/src/join/hash_join/partitioned_retrieve.cu @@ -11,7 +11,7 @@ namespace cudf::detail { template std::pair>, std::unique_ptr>> launch_partitioned_retrieve(probe_key_type const*, - cuda::std::int64_t, + thread_index_type, size_type const*, primitive_count_ref_t, size_type, @@ -21,7 +21,7 @@ launch_partitioned_retrieve(probe_key_type const*, template std::pair>, std::unique_ptr>> launch_partitioned_retrieve(probe_key_type const*, - cuda::std::int64_t, + thread_index_type, size_type const*, nested_count_ref_t, size_type, @@ -31,7 +31,7 @@ launch_partitioned_retrieve(probe_key_type const*, template std::pair>, std::unique_ptr>> launch_partitioned_retrieve(probe_key_type const*, - cuda::std::int64_t, + thread_index_type, size_type const*, flat_count_ref_t, size_type, diff --git a/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh b/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh index 93e563f19bb..53b08e2e4b6 100644 --- a/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh +++ b/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh @@ -57,7 +57,7 @@ __device__ __forceinline__ int count_lower_set_bits(unsigned int mask, int pos) template CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) partitioned_retrieve_kernel(probe_key_type const* __restrict__ input_probe, - cuda::std::int64_t n, + thread_index_type n, size_type left_offset, size_type* __restrict__ left_output, size_type* __restrict__ right_output, @@ -95,22 +95,20 @@ CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) auto atomic_counter = cuda::atomic_ref{*output_counter}; auto flush_buffers = [&](auto const& tile) { - size_type offset = 0; - auto const count = counters[flushing_tile_id]; - auto const rank = tile.thread_rank(); - if (rank == 0) { - offset = atomic_counter.fetch_add(static_cast(count), cuda::memory_order_relaxed); - } - offset = tile.shfl(offset, 0); + auto const count = counters[flushing_tile_id]; + auto const offset = cg::invoke_one_broadcast(tile, [&]() { + return atomic_counter.fetch_add(static_cast(count), cuda::memory_order_relaxed); + }); + auto const rank = tile.thread_rank(); for (int i = rank; i < count; i += tile.size()) { left_output[offset + i] = buffers[flushing_tile_id][i].first; right_output[offset + i] = buffers[flushing_tile_id][i].second; } }; - auto const grid_stride_tiles = static_cast(gridDim.x) * tiles_in_block; + auto const grid_stride_tiles = static_cast(gridDim.x) * tiles_in_block; auto idx = - static_cast(blockIdx.x) * tiles_in_block + probing_tile.meta_group_rank(); + static_cast(blockIdx.x) * tiles_in_block + probing_tile.meta_group_rank(); while (flushing_tile.any(idx < n)) { bool const active = idx < n; @@ -214,7 +212,7 @@ template std::pair>, std::unique_ptr>> launch_partitioned_retrieve(probe_key_type const* keys, - cuda::std::int64_t n, + thread_index_type n, size_type const* match_counts, Ref ref, size_type left_offset, diff --git a/cpp/src/join/hash_join/partitioned_retrieve_kernels.hpp b/cpp/src/join/hash_join/partitioned_retrieve_kernels.hpp index 875b776b64e..edd8816924a 100644 --- a/cpp/src/join/hash_join/partitioned_retrieve_kernels.hpp +++ b/cpp/src/join/hash_join/partitioned_retrieve_kernels.hpp @@ -31,7 +31,7 @@ template std::pair>, std::unique_ptr>> launch_partitioned_retrieve(probe_key_type const* keys, - cuda::std::int64_t n, + thread_index_type n, size_type const* match_counts, Ref ref, size_type left_offset, diff --git a/cpp/src/join/hash_join/partitioned_retrieve_outer.cu b/cpp/src/join/hash_join/partitioned_retrieve_outer.cu index 4e8fde1dc4d..68b8f591ada 100644 --- a/cpp/src/join/hash_join/partitioned_retrieve_outer.cu +++ b/cpp/src/join/hash_join/partitioned_retrieve_outer.cu @@ -11,7 +11,7 @@ namespace cudf::detail { template std::pair>, std::unique_ptr>> launch_partitioned_retrieve(probe_key_type const*, - cuda::std::int64_t, + thread_index_type, size_type const*, primitive_count_ref_t, size_type, @@ -21,7 +21,7 @@ launch_partitioned_retrieve(probe_key_type const*, template std::pair>, std::unique_ptr>> launch_partitioned_retrieve(probe_key_type const*, - cuda::std::int64_t, + thread_index_type, size_type const*, nested_count_ref_t, size_type, @@ -31,7 +31,7 @@ launch_partitioned_retrieve(probe_key_type const*, template std::pair>, std::unique_ptr>> launch_partitioned_retrieve(probe_key_type const*, - cuda::std::int64_t, + thread_index_type, size_type const*, flat_count_ref_t, size_type, From e567072b18167a6cd49b00c783d31955c724e12c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sat, 9 May 2026 00:02:51 +0000 Subject: [PATCH 24/29] Use left/right naming --- cpp/include/cudf/join/hash_join.hpp | 14 +++++----- .../finalize_partitioned_full_join.cpp | 6 ++-- .../hash_join/partitioned_join_retrieve.cu | 28 +++++++++---------- .../partitioned_retrieve_kernels.cuh | 6 ++-- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/cpp/include/cudf/join/hash_join.hpp b/cpp/include/cudf/join/hash_join.hpp index 93acf635f24..65a3a7c5062 100644 --- a/cpp/include/cudf/join/hash_join.hpp +++ b/cpp/include/cudf/join/hash_join.hpp @@ -381,18 +381,18 @@ class hash_join { rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; /** - * @brief Finalizes a partitioned full join by concatenating all per-partition probe results - * and appending the unmatched build rows (the complement). + * @brief Finalizes a partitioned full join by concatenating all per-partition results + * and appending the unmatched right rows (the complement). * * Call this method after calling `partitioned_full_join()` for every partition. It combines - * the per-partition probe indices with the unmatched build row indices (a global property + * the per-partition indices with the unmatched right row indices (a global property * across all partitions) and returns a single `(left_indices, right_indices)` pair equivalent * to the output of `full_join()`. * * @param left_partials Per-partition `left_indices` views produced by `partitioned_full_join()` * @param right_partials Per-partition `right_indices` views produced by `partitioned_full_join()` - * @param probe_table_num_rows Total number of rows in the original probe table - * @param build_table_num_rows Total number of rows in the build table + * @param left_table_num_rows Total number of rows in the original left table + * @param right_table_num_rows Total number of rows in the right table * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the result device memory * @@ -404,8 +404,8 @@ class hash_join { finalize_partitioned_full_join( cudf::host_span const> left_partials, cudf::host_span const> right_partials, - size_type probe_table_num_rows, - size_type build_table_num_rows, + size_type left_table_num_rows, + size_type right_table_num_rows, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); diff --git a/cpp/src/join/hash_join/finalize_partitioned_full_join.cpp b/cpp/src/join/hash_join/finalize_partitioned_full_join.cpp index 408d9f077d8..cc85bc5c74d 100644 --- a/cpp/src/join/hash_join/finalize_partitioned_full_join.cpp +++ b/cpp/src/join/hash_join/finalize_partitioned_full_join.cpp @@ -21,13 +21,13 @@ std::pair>, hash_join::finalize_partitioned_full_join( cudf::host_span const> left_partials, cudf::host_span const> right_partials, - size_type probe_table_num_rows, - size_type build_table_num_rows, + size_type left_table_num_rows, + size_type right_table_num_rows, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { return cudf::detail::finalize_full_join( - left_partials, right_partials, probe_table_num_rows, build_table_num_rows, stream, mr); + left_partials, right_partials, left_table_num_rows, right_table_num_rows, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/hash_join/partitioned_join_retrieve.cu b/cpp/src/join/hash_join/partitioned_join_retrieve.cu index 16556e5f403..78f0f2afed6 100644 --- a/cpp/src/join/hash_join/partitioned_join_retrieve.cu +++ b/cpp/src/join/hash_join/partitioned_join_retrieve.cu @@ -88,14 +88,14 @@ hash_join::partitioned_join_retrieve(join_kind join, } } - // Slice the probe table to the partition range - auto const probe_partition_view = + // Slice the left table to the partition range + auto const left_partition_view = cudf::slice(match_ctx._left_table, {left_start_idx, left_end_idx})[0]; - validate_hash_join_probe(_build, probe_partition_view, _has_nulls); + validate_hash_join_probe(_right, left_partition_view, _has_nulls); - auto const preprocessed_probe = - cudf::detail::row::equality::preprocessed_table::create(probe_partition_view, stream); + auto const preprocessed_left = + cudf::detail::row::equality::preprocessed_table::create(left_partition_view, stream); // For FULL_JOIN, probe with LEFT_JOIN semantics (no complement here) bool const is_outer = (join != join_kind::INNER_JOIN); @@ -110,12 +110,12 @@ hash_join::partitioned_join_retrieve(join_kind join, join_indices; auto retrieve_partition = [&](auto equality, auto d_hasher) { - // Precompute probe keys for this partition slice. - rmm::device_uvector probe_keys(n, stream); + // Precompute left keys for this partition slice. + rmm::device_uvector left_keys(n, stream); thrust::transform(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), cuda::counting_iterator(0), cuda::counting_iterator(partition_size), - probe_keys.begin(), + left_keys.begin(), pair_fn{d_hasher}); auto const ref = _impl->_hash_table.ref(cuco::op::count) @@ -124,17 +124,17 @@ hash_join::partitioned_join_retrieve(join_kind join, if (is_outer) { join_indices = launch_partitioned_retrieve( - probe_keys.data(), n, partition_counts, ref, left_start_idx, stream, mr); + left_keys.data(), n, partition_counts, ref, left_start_idx, stream, mr); } else { join_indices = launch_partitioned_retrieve( - probe_keys.data(), n, partition_counts, ref, left_start_idx, stream, mr); + left_keys.data(), n, partition_counts, ref, left_start_idx, stream, mr); } }; - dispatch_join_comparator(_build, - probe_partition_view, - _preprocessed_build, - preprocessed_probe, + dispatch_join_comparator(_right, + left_partition_view, + _preprocessed_right, + preprocessed_left, _has_nulls, _nulls_equal, retrieve_partition); diff --git a/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh b/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh index 53b08e2e4b6..71993087f85 100644 --- a/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh +++ b/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh @@ -46,7 +46,7 @@ __device__ __forceinline__ int count_lower_set_bits(unsigned int mask, int pos) * * @tparam IsOuter If true, unmatched probe rows emit a null-padded output row * @tparam Ref cuco open-addressing reference type (carries hash, equality, storage) - * @param input_probe Packed probe keys: `.first` = hash, `.second` = probe row index + * @param keys Packed left keys: `.first` = hash, `.second` = left row index * @param n Number of probe keys * @param left_offset Added to each probe row index to produce an absolute left index * @param left_output Output buffer for left (probe-side) row indices @@ -56,7 +56,7 @@ __device__ __forceinline__ int count_lower_set_bits(unsigned int mask, int pos) */ template CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) - partitioned_retrieve_kernel(probe_key_type const* __restrict__ input_probe, + partitioned_retrieve_kernel(probe_key_type const* __restrict__ keys, thread_index_type n, size_type left_offset, size_type* __restrict__ left_output, @@ -116,7 +116,7 @@ CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) cg::binary_partition(flushing_tile, active); if (active) { - auto const probe_key = input_probe[idx]; + auto const probe_key = keys[idx]; auto const left_index = probe_key.second + left_offset; auto probing_iter = ref.probing_scheme().template make_iterator( From 55a3633c6d42358837a220f6d1994c1771d06bfd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 11 May 2026 17:41:02 +0000 Subject: [PATCH 25/29] Address CodeRabbit review feedback - Validate join_partition_context bounds in partitioned_join_retrieve - Document throw conditions on new partitioned_*_join APIs - Add CUDF_FUNC_RANGE to new partitioned shims and finalize_*_full_join - Use cudf::detail::device_scalar and direct include - Add sliced + multi-block partitioned probe test --- cpp/include/cudf/join/hash_join.hpp | 12 +++ .../finalize_partitioned_full_join.cpp | 2 + cpp/src/join/hash_join/hash_join.cu | 3 + .../hash_join/partitioned_join_retrieve.cu | 12 +++ .../partitioned_retrieve_kernels.cuh | 4 +- cpp/src/join/hash_join/ref_types.cuh | 2 + cpp/tests/join/join_tests.cpp | 76 +++++++++++++++++++ 7 files changed, 109 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/join/hash_join.hpp b/cpp/include/cudf/join/hash_join.hpp index 65a3a7c5062..d765d5d807d 100644 --- a/cpp/include/cudf/join/hash_join.hpp +++ b/cpp/include/cudf/join/hash_join.hpp @@ -319,6 +319,10 @@ class hash_join { * The returned left_indices are relative to the original complete probe table, not just the * partition, so they can be used directly with the original probe table. * + * @throw std::invalid_argument If `context.left_table_context` is null, if its + * `_match_counts` is null, or if `[left_start_idx, left_end_idx)` is outside the bounds + * of the left table. + * * @param context The partition context containing match information and partition bounds * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the join indices' device memory @@ -341,6 +345,10 @@ class hash_join { * * The returned left_indices are relative to the original complete probe table. * + * @throw std::invalid_argument If `context.left_table_context` is null, if its + * `_match_counts` is null, or if `[left_start_idx, left_end_idx)` is outside the bounds + * of the left table. + * * @param context The partition context containing match information and partition bounds * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the join indices' device memory @@ -367,6 +375,10 @@ class hash_join { * * The returned left_indices are relative to the original complete probe table. * + * @throw std::invalid_argument If `context.left_table_context` is null, if its + * `_match_counts` is null, or if `[left_start_idx, left_end_idx)` is outside the bounds + * of the left table. + * * @param context The partition context containing match information and partition bounds * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the join indices' device memory diff --git a/cpp/src/join/hash_join/finalize_partitioned_full_join.cpp b/cpp/src/join/hash_join/finalize_partitioned_full_join.cpp index cc85bc5c74d..c5e36c5be74 100644 --- a/cpp/src/join/hash_join/finalize_partitioned_full_join.cpp +++ b/cpp/src/join/hash_join/finalize_partitioned_full_join.cpp @@ -5,6 +5,7 @@ #include "join/join_common_utils.hpp" +#include #include #include #include @@ -26,6 +27,7 @@ hash_join::finalize_partitioned_full_join( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_FUNC_RANGE(); return cudf::detail::finalize_full_join( left_partials, right_partials, left_table_num_rows, right_table_num_rows, stream, mr); } diff --git a/cpp/src/join/hash_join/hash_join.cu b/cpp/src/join/hash_join/hash_join.cu index 26f25001c4c..163b558eb18 100644 --- a/cpp/src/join/hash_join/hash_join.cu +++ b/cpp/src/join/hash_join/hash_join.cu @@ -265,6 +265,7 @@ hash_join::partitioned_inner_join(cudf::join_partition_context const& context, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { + CUDF_FUNC_RANGE(); return _impl->partitioned_inner_join(context, stream, mr); } @@ -274,6 +275,7 @@ hash_join::partitioned_left_join(cudf::join_partition_context const& context, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { + CUDF_FUNC_RANGE(); return _impl->partitioned_left_join(context, stream, mr); } @@ -283,6 +285,7 @@ hash_join::partitioned_full_join(cudf::join_partition_context const& context, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { + CUDF_FUNC_RANGE(); return _impl->partitioned_full_join(context, stream, mr); } diff --git a/cpp/src/join/hash_join/partitioned_join_retrieve.cu b/cpp/src/join/hash_join/partitioned_join_retrieve.cu index 78f0f2afed6..77dcae1e932 100644 --- a/cpp/src/join/hash_join/partitioned_join_retrieve.cu +++ b/cpp/src/join/hash_join/partitioned_join_retrieve.cu @@ -66,10 +66,22 @@ hash_join::partitioned_join_retrieve(join_kind join, join == join_kind::INNER_JOIN || join == join_kind::LEFT_JOIN || join == join_kind::FULL_JOIN, "Unsupported join kind for partitioned retrieve"); + CUDF_EXPECTS(context.left_table_context != nullptr, + "join_partition_context is missing left_table_context", + std::invalid_argument); + auto const& match_ctx = *context.left_table_context; auto const left_start_idx = context.left_start_idx; auto const left_end_idx = context.left_end_idx; + CUDF_EXPECTS(match_ctx._match_counts != nullptr, + "join_match_context is missing match counts", + std::invalid_argument); + CUDF_EXPECTS(left_start_idx >= 0 && left_end_idx >= left_start_idx && + left_end_idx <= match_ctx._left_table.num_rows(), + "Invalid partition bounds", + std::invalid_argument); + // Empty partition if (left_start_idx >= left_end_idx) { return std::pair(std::make_unique>(0, stream, mr), diff --git a/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh b/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh index 71993087f85..742c8f8663d 100644 --- a/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh +++ b/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh @@ -7,12 +7,12 @@ #include "kernels_common.cuh" +#include #include #include #include #include -#include #include #include @@ -241,7 +241,7 @@ launch_partitioned_retrieve(probe_key_type const* keys, auto right_indices = std::make_unique>(total_output, stream, mr); // Global atomic counter claimed in bulk by each flushing-tile buffer flush. - rmm::device_scalar output_counter(size_type{0}, stream); + cudf::detail::device_scalar output_counter(size_type{0}, stream); auto constexpr tiles_in_block = DEFAULT_JOIN_BLOCK_SIZE / Ref::cg_size; auto const num_blocks = static_cast((n + tiles_in_block - 1) / tiles_in_block); diff --git a/cpp/src/join/hash_join/ref_types.cuh b/cpp/src/join/hash_join/ref_types.cuh index bd2fb424064..a1511c56dcb 100644 --- a/cpp/src/join/hash_join/ref_types.cuh +++ b/cpp/src/join/hash_join/ref_types.cuh @@ -14,6 +14,8 @@ #include +#include + namespace cudf::detail { // --- Equality types from the 3 dispatch paths --- diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 9bdbe705541..76414d9b3bd 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -3489,4 +3489,80 @@ TEST_F(JoinTest, HashJoinPartitionedWholeTable) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*expected_sort, *result_sort); } +// Exercises both a sliced (non-zero offset) probe view and a partition size large enough +// to span multiple kernel blocks. +TEST_F(JoinTest, HashJoinPartitionedSlicedMultiBlock) +{ + auto constexpr probe_full_rows = 4000; + auto constexpr probe_offset = 1234; + auto constexpr probe_rows = 2500; + auto constexpr build_rows = 300; + + std::vector probe_vals(probe_full_rows); + for (cudf::size_type i = 0; i < probe_full_rows; ++i) { + probe_vals[i] = i % 200; + } + std::vector build_vals(build_rows); + for (cudf::size_type i = 0; i < build_rows; ++i) { + build_vals[i] = i; + } + + column_wrapper probe_col(probe_vals.begin(), probe_vals.end()); + column_wrapper build_col(build_vals.begin(), build_vals.end()); + + CVector cols_probe, cols_build; + cols_probe.push_back(probe_col.release()); + cols_build.push_back(build_col.release()); + Table probe_full(std::move(cols_probe)); + Table build(std::move(cols_build)); + + auto const stream = cudf::get_default_stream(); + auto const mr = cudf::get_current_device_resource_ref(); + + // Sliced probe view with non-zero offset + auto const probe_view = + cudf::slice(probe_full.view(), {probe_offset, probe_offset + probe_rows})[0]; + + // Reference: full inner join on the sliced probe + auto expected = inner_join(cudf::table_view{probe_view}, build, {0}, {0}); + auto expected_order = cudf::sorted_order(expected->view()); + auto expected_sort = cudf::gather(expected->view(), *expected_order); + + cudf::hash_join hash_joiner(build.select({0}), cudf::null_equality::EQUAL, stream); + auto match_ctx = hash_joiner.inner_join_match_context(probe_view, stream, mr); + auto part_ctx = cudf::join_partition_context{ + std::make_unique(std::move(match_ctx)), 0, 0}; + + // Two partitions covering the sliced probe; each is large enough to span multiple GPU blocks. + auto const mid = probe_rows / 2; + std::vector> const partitions = {{0, mid}, + {mid, probe_rows}}; + + std::vector> partials; + std::vector partial_views; + for (auto [s, e] : partitions) { + part_ctx.left_start_idx = s; + part_ctx.left_end_idx = e; + auto const [left_idx, right_idx] = hash_joiner.partitioned_inner_join(part_ctx, stream, mr); + auto left_col_view = cudf::column_view{cudf::device_span{*left_idx}}; + auto right_col_view = cudf::column_view{cudf::device_span{*right_idx}}; + auto left_res = cudf::gather( + cudf::table_view{probe_view}, left_col_view, cudf::out_of_bounds_policy::DONT_CHECK); + auto right_res = cudf::gather(build, right_col_view, cudf::out_of_bounds_policy::DONT_CHECK); + auto joined = left_res->release(); + auto right_c = right_res->release(); + joined.insert(joined.end(), + std::make_move_iterator(right_c.begin()), + std::make_move_iterator(right_c.end())); + partials.push_back(std::make_unique(std::move(joined))); + partial_views.push_back(partials.back()->view()); + } + + auto concat = cudf::concatenate(partial_views, stream, mr); + auto concat_order = cudf::sorted_order(concat->view()); + auto concat_sort = cudf::gather(concat->view(), *concat_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*expected_sort, *concat_sort); +} + CUDF_TEST_PROGRAM_MAIN() From f2c9fff2ab46a6510c53ab86a2d8a8cd7518d8f1 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 19 May 2026 22:33:47 +0000 Subject: [PATCH 26/29] Fix probe_table_num_rows reference in match_context.cu --- cpp/src/join/hash_join/match_context.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/join/hash_join/match_context.cu b/cpp/src/join/hash_join/match_context.cu index af14faecb8b..bcdde2825c0 100644 --- a/cpp/src/join/hash_join/match_context.cu +++ b/cpp/src/join/hash_join/match_context.cu @@ -51,11 +51,11 @@ std::unique_ptr> make_join_match_counts( auto count_matches = [&](auto equality, auto d_hasher) { // Precompute probe keys: {hash(row_idx), row_idx} for each probe row. - auto const n = static_cast(probe_table_num_rows); + auto const n = static_cast(left_table_num_rows); rmm::device_uvector probe_keys(n, stream); thrust::transform(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), cuda::counting_iterator(0), - cuda::counting_iterator(probe_table_num_rows), + cuda::counting_iterator(left_table_num_rows), probe_keys.begin(), pair_fn{d_hasher}); From 3a47653e3efe697f19ba9c7a1fb748286c15ff84 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 19 May 2026 22:40:00 +0000 Subject: [PATCH 27/29] Use right/left naming for table nouns in finalize_full_join and tests --- cpp/src/join/hash_join/match_context.cu | 10 +-- cpp/src/join/join_common_utils.hpp | 38 ++++++------ cpp/src/join/join_utils.cu | 82 ++++++++++++------------- cpp/tests/join/join_tests.cpp | 65 ++++++++++---------- 4 files changed, 97 insertions(+), 98 deletions(-) diff --git a/cpp/src/join/hash_join/match_context.cu b/cpp/src/join/hash_join/match_context.cu index bcdde2825c0..5fc2dd5ba9c 100644 --- a/cpp/src/join/hash_join/match_context.cu +++ b/cpp/src/join/hash_join/match_context.cu @@ -50,23 +50,23 @@ std::unique_ptr> make_join_match_counts( auto const left_table_num_rows = left.num_rows(); auto count_matches = [&](auto equality, auto d_hasher) { - // Precompute probe keys: {hash(row_idx), row_idx} for each probe row. + // Precompute left keys: {hash(row_idx), row_idx} for each left row. auto const n = static_cast(left_table_num_rows); - rmm::device_uvector probe_keys(n, stream); + rmm::device_uvector left_keys(n, stream); thrust::transform(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), cuda::counting_iterator(0), cuda::counting_iterator(left_table_num_rows), - probe_keys.begin(), + left_keys.begin(), pair_fn{d_hasher}); auto const ref = hash_table.ref(cuco::op::count) .rebind_key_eq(equality) .rebind_hash_function(hash_table.hash_function()); if (join == join_kind::INNER_JOIN) { - launch_partitioned_count(probe_keys.data(), n, match_counts->begin(), ref, stream); + launch_partitioned_count(left_keys.data(), n, match_counts->begin(), ref, stream); } else { // IsOuter=true handles the clamp (zero → 1) for LEFT/FULL joins internally. - launch_partitioned_count(probe_keys.data(), n, match_counts->begin(), ref, stream); + launch_partitioned_count(left_keys.data(), n, match_counts->begin(), ref, stream); } }; diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 6af7047fbeb..94ba17d5245 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -43,44 +43,44 @@ VectorPair get_trivial_left_join_indices(table_view const& left, rmm::device_async_resource_ref mr); /** - * @brief Finalize a full-join result from a single probe-side `(left, right)` index pair. + * @brief Finalize a full-join result from a single `(left, right)` index pair. * - * Takes ownership of `probe_indices`, resizes both vectors to `probe_indices.first->size() + - * build_table_num_rows`, and appends the complement (unmatched build rows paired with + * Takes ownership of `indices`, resizes both vectors to `indices.first->size() + + * right_table_num_rows`, and appends the complement (unmatched right rows paired with * `JoinNoMatch`) into the tail. The vectors are then resized down to the true output length. * * Used by the non-partitioned full-join paths (hash/mixed/conditional); consuming the caller's - * buffers in-place avoids a redundant concat memcpy over the probe data. + * buffers in-place avoids a redundant concat memcpy over the left-side data. * - * @param probe_indices Probe-side `(left, right)` index vectors (consumed). - * @param probe_table_num_rows Number of rows in the probe table (0 → every build row is - * unmatched, fast path). - * @param build_table_num_rows Number of rows in the build table. + * @param indices `(left, right)` index vectors (consumed). + * @param left_table_num_rows Number of rows in the left table (0 → every right row is + * unmatched, fast path). + * @param right_table_num_rows Number of rows in the right table. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate working storage. * * @return `[left_indices, right_indices]` of the complete full-join output. */ -VectorPair finalize_full_join(VectorPair&& probe_indices, - size_type probe_table_num_rows, - size_type build_table_num_rows, +VectorPair finalize_full_join(VectorPair&& indices, + size_type left_table_num_rows, + size_type right_table_num_rows, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @brief Finalize a full-join result from per-partition probe index spans. + * @brief Finalize a full-join result from per-partition index spans. * * Concatenates every `(left_partials[i], right_partials[i])` pair into the head of the output - * and appends the complement (unmatched build rows paired with `JoinNoMatch`) into the tail. + * and appends the complement (unmatched right rows paired with `JoinNoMatch`) into the tail. * Internally delegates to the `VectorPair&&` overload, so the mark/compact path is shared. * * Used by `cudf::hash_join::finalize_partitioned_full_join` for partitioned full joins where the * partials live in separate buffers and must be gathered. * - * @param left_partials Per-partition probe-side (left) index spans. - * @param right_partials Per-partition probe-side (right) index spans. - * @param probe_table_num_rows Number of rows in the probe table. - * @param build_table_num_rows Number of rows in the build table. + * @param left_partials Per-partition left index spans. + * @param right_partials Per-partition right index spans. + * @param left_table_num_rows Number of rows in the left table. + * @param right_table_num_rows Number of rows in the right table. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned vectors. * @@ -89,8 +89,8 @@ VectorPair finalize_full_join(VectorPair&& probe_indices, VectorPair finalize_full_join( cudf::host_span const> left_partials, cudf::host_span const> right_partials, - size_type probe_table_num_rows, - size_type build_table_num_rows, + size_type left_table_num_rows, + size_type right_table_num_rows, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/src/join/join_utils.cu b/cpp/src/join/join_utils.cu index 2c02c6b930f..16e0a070bc6 100644 --- a/cpp/src/join/join_utils.cu +++ b/cpp/src/join/join_utils.cu @@ -56,16 +56,16 @@ VectorPair get_trivial_left_join_indices(table_view const& left, namespace { -// Predicate: build row `idx` is unmatched iff its flag slot is zero. -// We use an int32 flag (one per build row) rather than a packed bit or a byte: byte stores +// Predicate: right row `idx` is unmatched iff its flag slot is zero. +// We use an int32 flag (one per right row) rather than a packed bit or a byte: byte stores // from a dense 32-wide scatter don't coalesce into full-word transactions, which costs 2–3× -// in the mark kernel for skewed probe/build ratios. +// in the mark kernel for skewed left/right ratios. struct unmatched_flag { size_type const* flags; __device__ bool operator()(size_type idx) const noexcept { return flags[idx] == 0; } }; -// Transform a selected (unmatched) build index into a (JoinNoMatch, idx) pair that is stored +// Transform a selected (unmatched) right index into a (JoinNoMatch, idx) pair that is stored // through a zip iterator over (left_out_tail, right_out_tail). struct to_no_match_pair { __device__ cuda::std::tuple operator()(size_type idx) const noexcept @@ -76,75 +76,75 @@ struct to_no_match_pair { } // namespace -VectorPair finalize_full_join(VectorPair&& probe_indices, - size_type probe_table_num_rows, - size_type build_table_num_rows, +VectorPair finalize_full_join(VectorPair&& indices, + size_type left_table_num_rows, + size_type right_table_num_rows, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto [left_out, right_out] = std::move(probe_indices); + auto [left_out, right_out] = std::move(indices); CUDF_EXPECTS(left_out->size() == right_out->size(), - "probe left/right index vectors must have equal size", + "left/right index vectors must have equal size", std::invalid_argument); - auto const probe_total = left_out->size(); + auto const match_total = left_out->size(); - // Empty-probe fast path: every build row is unmatched. - if (probe_table_num_rows == 0) { - auto const tail = static_cast(build_table_num_rows); - left_out->resize(probe_total + tail, stream); - right_out->resize(probe_total + tail, stream); + // Empty-left fast path: every right row is unmatched. + if (left_table_num_rows == 0) { + auto const tail = static_cast(right_table_num_rows); + left_out->resize(match_total + tail, stream); + right_out->resize(match_total + tail, stream); thrust::sequence(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - right_out->begin() + probe_total, + right_out->begin() + match_total, right_out->end(), 0); thrust::uninitialized_fill( rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - left_out->begin() + probe_total, + left_out->begin() + match_total, left_out->end(), cudf::JoinNoMatch); return std::pair(std::move(left_out), std::move(right_out)); } - if (build_table_num_rows == 0) { return std::pair(std::move(left_out), std::move(right_out)); } + if (right_table_num_rows == 0) { return std::pair(std::move(left_out), std::move(right_out)); } - // Grow to the upper bound (probe_total + build_table_num_rows); the complement is appended + // Grow to the upper bound (match_total + right_table_num_rows); the complement is appended // into the tail. If the caller pre-reserved this capacity (see the span overload below), // these resizes don't reallocate. - auto const upper = probe_total + static_cast(build_table_num_rows); + auto const upper = match_total + static_cast(right_table_num_rows); left_out->resize(upper, stream); right_out->resize(upper, stream); - // Mark matched build rows in an int32 flag array (one word per build row). Redundant stores + // Mark matched right rows in an int32 flag array (one word per right row). Redundant stores // of the same value are idempotent, so no atomics are needed. Word-sized stores coalesce into // full 128-byte transactions per warp; byte-sized flags cost ~2–3× here because partial-word // stores from dense scatters serialize within each 32-bit sector. auto flags = cudf::detail::make_zeroed_device_uvector_async( - build_table_num_rows, stream, cudf::get_current_device_resource_ref()); + right_table_num_rows, stream, cudf::get_current_device_resource_ref()); thrust::scatter_if(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), cuda::make_constant_iterator(size_type{1}), - cuda::make_constant_iterator(size_type{1}) + probe_total, + cuda::make_constant_iterator(size_type{1}) + match_total, right_out->begin(), right_out->begin(), flags.begin(), - valid_range{0, build_table_num_rows}); + valid_range{0, right_table_num_rows}); - // Fused compaction: for each unmatched build row, emit (JoinNoMatch, build_idx) into + // Fused compaction: for each unmatched right row, emit (JoinNoMatch, right_idx) into // (left_out_tail, right_out_tail) in a single CUB DeviceSelect pass. auto zip_tail = - thrust::make_zip_iterator(left_out->data() + probe_total, right_out->data() + probe_total); + thrust::make_zip_iterator(left_out->data() + match_total, right_out->data() + match_total); auto out_iter = thrust::make_transform_output_iterator(zip_tail, to_no_match_pair{}); auto const new_end = cudf::detail::copy_if(cuda::counting_iterator{0}, - cuda::counting_iterator{build_table_num_rows}, + cuda::counting_iterator{right_table_num_rows}, out_iter, unmatched_flag{flags.data()}, stream); auto const comp_size = static_cast(new_end - out_iter); - left_out->resize(probe_total + comp_size, stream); - right_out->resize(probe_total + comp_size, stream); + left_out->resize(match_total + comp_size, stream); + right_out->resize(match_total + comp_size, stream); return std::pair(std::move(left_out), std::move(right_out)); } @@ -152,8 +152,8 @@ VectorPair finalize_full_join(VectorPair&& probe_indices, VectorPair finalize_full_join( cudf::host_span const> left_partials, cudf::host_span const> right_partials, - size_type probe_table_num_rows, - size_type build_table_num_rows, + size_type left_table_num_rows, + size_type right_table_num_rows, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -161,22 +161,22 @@ VectorPair finalize_full_join( "left_partials and right_partials must have the same length", std::invalid_argument); - std::size_t probe_total = 0; + std::size_t match_total = 0; for (std::size_t i = 0; i < left_partials.size(); ++i) { CUDF_EXPECTS(left_partials[i].size() == right_partials[i].size(), "matching partials must have equal left/right sizes", std::invalid_argument); - probe_total += left_partials[i].size(); + match_total += left_partials[i].size(); } // Pre-allocate at the upper bound so the VectorPair overload's resize-up becomes a no-op // (capacity is already there). - auto const upper = probe_total + static_cast(build_table_num_rows); + auto const upper = match_total + static_cast(right_table_num_rows); auto left_out = std::make_unique>(upper, stream, mr); auto right_out = std::make_unique>(upper, stream, mr); - // Concatenate every probe partial into the head of the output via one batched memcpy. - if (probe_total > 0) { + // Concatenate every partial into the head of the output via one batched memcpy. + if (match_total > 0) { auto const n = left_partials.size(); std::vector dsts; std::vector srcs; @@ -199,14 +199,14 @@ VectorPair finalize_full_join( dsts.data(), srcs.data(), sizes.data(), dsts.size(), stream)); } - // Shrink the uvectors' logical size to probe_total (capacity stays at upper bound), then + // Shrink the uvectors' logical size to match_total (capacity stays at upper bound), then // delegate to the VectorPair overload which resizes back up and appends the complement. - left_out->resize(probe_total, stream); - right_out->resize(probe_total, stream); + left_out->resize(match_total, stream); + right_out->resize(match_total, stream); return finalize_full_join(std::pair(std::move(left_out), std::move(right_out)), - probe_table_num_rows, - build_table_num_rows, + left_table_num_rows, + right_table_num_rows, stream, mr); } diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 76414d9b3bd..231a52085e7 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -282,12 +282,12 @@ std::unique_ptr full_join( auto match_ctx = hash_joiner.full_join_match_context(left, stream, mr); auto part_ctx = cudf::join_partition_context{ std::make_unique(std::move(match_ctx)), 0, left.num_rows()}; - auto [probe_left, probe_right] = hash_joiner.partitioned_full_join(part_ctx, stream, mr); + auto [left_idx, right_idx] = hash_joiner.partitioned_full_join(part_ctx, stream, mr); std::vector> left_partials{ - cudf::device_span{probe_left->data(), probe_left->size()}}; + cudf::device_span{left_idx->data(), left_idx->size()}}; std::vector> right_partials{ - cudf::device_span{probe_right->data(), probe_right->size()}}; + cudf::device_span{right_idx->data(), right_idx->size()}}; return cudf::hash_join::finalize_partitioned_full_join( left_partials, right_partials, left.num_rows(), right.num_rows(), stream, mr); }, @@ -3489,54 +3489,53 @@ TEST_F(JoinTest, HashJoinPartitionedWholeTable) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*expected_sort, *result_sort); } -// Exercises both a sliced (non-zero offset) probe view and a partition size large enough +// Exercises both a sliced (non-zero offset) left view and a partition size large enough // to span multiple kernel blocks. TEST_F(JoinTest, HashJoinPartitionedSlicedMultiBlock) { - auto constexpr probe_full_rows = 4000; - auto constexpr probe_offset = 1234; - auto constexpr probe_rows = 2500; - auto constexpr build_rows = 300; + auto constexpr left_full_rows = 4000; + auto constexpr left_offset = 1234; + auto constexpr left_rows = 2500; + auto constexpr right_rows = 300; - std::vector probe_vals(probe_full_rows); - for (cudf::size_type i = 0; i < probe_full_rows; ++i) { - probe_vals[i] = i % 200; + std::vector left_vals(left_full_rows); + for (cudf::size_type i = 0; i < left_full_rows; ++i) { + left_vals[i] = i % 200; } - std::vector build_vals(build_rows); - for (cudf::size_type i = 0; i < build_rows; ++i) { - build_vals[i] = i; + std::vector right_vals(right_rows); + for (cudf::size_type i = 0; i < right_rows; ++i) { + right_vals[i] = i; } - column_wrapper probe_col(probe_vals.begin(), probe_vals.end()); - column_wrapper build_col(build_vals.begin(), build_vals.end()); + column_wrapper left_col(left_vals.begin(), left_vals.end()); + column_wrapper right_col(right_vals.begin(), right_vals.end()); - CVector cols_probe, cols_build; - cols_probe.push_back(probe_col.release()); - cols_build.push_back(build_col.release()); - Table probe_full(std::move(cols_probe)); - Table build(std::move(cols_build)); + CVector cols_left, cols_right; + cols_left.push_back(left_col.release()); + cols_right.push_back(right_col.release()); + Table left_full(std::move(cols_left)); + Table right(std::move(cols_right)); auto const stream = cudf::get_default_stream(); auto const mr = cudf::get_current_device_resource_ref(); - // Sliced probe view with non-zero offset - auto const probe_view = - cudf::slice(probe_full.view(), {probe_offset, probe_offset + probe_rows})[0]; + // Sliced left view with non-zero offset + auto const left_view = cudf::slice(left_full.view(), {left_offset, left_offset + left_rows})[0]; - // Reference: full inner join on the sliced probe - auto expected = inner_join(cudf::table_view{probe_view}, build, {0}, {0}); + // Reference: full inner join on the sliced left + auto expected = inner_join(cudf::table_view{left_view}, right, {0}, {0}); auto expected_order = cudf::sorted_order(expected->view()); auto expected_sort = cudf::gather(expected->view(), *expected_order); - cudf::hash_join hash_joiner(build.select({0}), cudf::null_equality::EQUAL, stream); - auto match_ctx = hash_joiner.inner_join_match_context(probe_view, stream, mr); + cudf::hash_join hash_joiner(right.select({0}), cudf::null_equality::EQUAL, stream); + auto match_ctx = hash_joiner.inner_join_match_context(left_view, stream, mr); auto part_ctx = cudf::join_partition_context{ std::make_unique(std::move(match_ctx)), 0, 0}; - // Two partitions covering the sliced probe; each is large enough to span multiple GPU blocks. - auto const mid = probe_rows / 2; + // Two partitions covering the sliced left; each is large enough to span multiple GPU blocks. + auto const mid = left_rows / 2; std::vector> const partitions = {{0, mid}, - {mid, probe_rows}}; + {mid, left_rows}}; std::vector> partials; std::vector partial_views; @@ -3547,8 +3546,8 @@ TEST_F(JoinTest, HashJoinPartitionedSlicedMultiBlock) auto left_col_view = cudf::column_view{cudf::device_span{*left_idx}}; auto right_col_view = cudf::column_view{cudf::device_span{*right_idx}}; auto left_res = cudf::gather( - cudf::table_view{probe_view}, left_col_view, cudf::out_of_bounds_policy::DONT_CHECK); - auto right_res = cudf::gather(build, right_col_view, cudf::out_of_bounds_policy::DONT_CHECK); + cudf::table_view{left_view}, left_col_view, cudf::out_of_bounds_policy::DONT_CHECK); + auto right_res = cudf::gather(right, right_col_view, cudf::out_of_bounds_policy::DONT_CHECK); auto joined = left_res->release(); auto right_c = right_res->release(); joined.insert(joined.end(), From 87765a3d989e575dc71ca11e13b2432d717e8205 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 20 May 2026 17:14:20 +0000 Subject: [PATCH 28/29] Simplify partitioned_count_kernel: drop tile.all short-circuit, fuse IsOuter writers --- .../hash_join/partitioned_count_kernels.cuh | 22 +++++++------------ 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/cpp/src/join/hash_join/partitioned_count_kernels.cuh b/cpp/src/join/hash_join/partitioned_count_kernels.cuh index bcdc462c3b0..f9d3809a0a4 100644 --- a/cpp/src/join/hash_join/partitioned_count_kernels.cuh +++ b/cpp/src/join/hash_join/partitioned_count_kernels.cuh @@ -60,22 +60,16 @@ CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) } else { auto const tile = cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); - if constexpr (IsOuter) { - auto const temp_count = static_cast(ref.count(tile, key)); - if (tile.all(temp_count == 0)) { - cooperative_groups::invoke_one(tile, [&]() { output[idx] = size_type{1}; }); + auto const temp_count = static_cast(ref.count(tile, key)); + auto const match_count = + cooperative_groups::reduce(tile, temp_count, cooperative_groups::plus()); + cooperative_groups::invoke_one(tile, [&]() { + if constexpr (IsOuter) { + output[idx] = (match_count == 0) ? size_type{1} : match_count; } else { - auto const match_count = - cooperative_groups::reduce(tile, temp_count, cooperative_groups::plus()); - cooperative_groups::invoke_one(tile, [&]() { output[idx] = match_count; }); + output[idx] = match_count; } - } else { - auto const match_count = - cooperative_groups::reduce(tile, - static_cast(ref.count(tile, key)), - cooperative_groups::plus()); - cooperative_groups::invoke_one(tile, [&]() { output[idx] = match_count; }); - } + }); } idx += stride; } From 594a5320e84676d373ee2f71e55ba2a6daac2b7a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 20 May 2026 17:20:40 +0000 Subject: [PATCH 29/29] Add tile size check + use cuda::std::distance --- cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh | 2 ++ cpp/src/join/join_utils.cu | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh b/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh index 742c8f8663d..7e822fed12b 100644 --- a/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh +++ b/cpp/src/join/hash_join/partitioned_retrieve_kernels.cuh @@ -70,6 +70,8 @@ CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) auto constexpr bucket_size = Ref::bucket_size; auto constexpr flushing_tile_size = 32; // full warp for coalesced flushes static_assert(flushing_tile_size >= cg_size); + static_assert(flushing_tile_size % cg_size == 0, + "Every probing tile must sit inside a single flushing tile"); static_assert(DEFAULT_JOIN_BLOCK_SIZE % flushing_tile_size == 0); auto constexpr num_flushing_tiles = DEFAULT_JOIN_BLOCK_SIZE / flushing_tile_size; diff --git a/cpp/src/join/join_utils.cu b/cpp/src/join/join_utils.cu index 16e0a070bc6..c345eb6a915 100644 --- a/cpp/src/join/join_utils.cu +++ b/cpp/src/join/join_utils.cu @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -142,7 +143,7 @@ VectorPair finalize_full_join(VectorPair&& indices, unmatched_flag{flags.data()}, stream); - auto const comp_size = static_cast(new_end - out_iter); + auto const comp_size = cuda::std::distance(out_iter, new_end); left_out->resize(match_total + comp_size, stream); right_out->resize(match_total + comp_size, stream);