diff --git a/cudax/include/cuda/experimental/__coop/reduce.cuh b/cudax/include/cuda/experimental/__coop/reduce.cuh index fae409e49c0..3413eae3025 100644 --- a/cudax/include/cuda/experimental/__coop/reduce.cuh +++ b/cudax/include/cuda/experimental/__coop/reduce.cuh @@ -196,9 +196,51 @@ __reduce_impl(this_grid<_Hierarchy> __group, _Tp (&__thread_data)[_Np], _RedFn _ return ::cuda::std::nullopt; } +_CCCL_TEMPLATE(class _Group, class _Tp, ::cuda::std::size_t _Np, class _RedFn) +_CCCL_REQUIRES(::cuda::std::is_same_v + _CCCL_AND ::cuda::std::is_same_v) +[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp> +__reduce_impl(_Group __group, _Tp (&__thread_data)[_Np], _RedFn __red_fn) +{ + constexpr auto __nwarps_in_group = warp.static_count(__group); + static_assert(__nwarps_in_group != ::cuda::std::dynamic_extent, + "cuda::coop::reduce requires the group to have statically known size"); + + using _WarpReduce = ::cub::WarpReduce<_Tp>; + union _Scratch + { + typename _WarpReduce::TempStorage __warp_reduce_[__nwarps_in_group]; + _Tp __partials_[__nwarps_in_group]; + }; + __shared__ _Scratch __scratch; + + const auto __partial = _WarpReduce{__scratch.__warp_reduce_[warp.rank(__group)]}.Reduce(__thread_data, __red_fn); + __group.sync_aligned(); + + this_warp __warp{__group.hierarchy()}; + if (gpu_thread.is_root_rank(__warp)) + { + __scratch.__partials_[warp.rank(__group)] = __partial; + } + __group.sync_aligned(); + + if (warp.is_root_rank(__group)) + { + const auto __value = (gpu_thread.rank(__warp) < __nwarps_in_group) + ? __scratch.__partials_[gpu_thread.rank(__warp)] + : ::cuda::identity_element<_RedFn, _Tp>(); + const auto __result = _WarpReduce{__scratch.__warp_reduce_[0]}.Reduce(__value, __red_fn); + if (gpu_thread.is_root_rank(__warp)) + { + return ::cuda::std::optional{__result}; + } + } + return ::cuda::std::nullopt; +} + template [[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp> -reduce(_Group __group, _Tp (&__thread_data)[_Np], _RedFn&& __red_fn) +reduce(_Group __group, _Tp (&__thread_data)[_Np], _RedFn __red_fn) { static_assert(gpu_thread.static_count(__group) != ::cuda::std::dynamic_extent, "cuda::coop::reduce requires the group to have statically known size"); diff --git a/cudax/include/cuda/experimental/__group/group.cuh b/cudax/include/cuda/experimental/__group/group.cuh index 6025e8e2eee..85bd2ff35f0 100644 --- a/cudax/include/cuda/experimental/__group/group.cuh +++ b/cudax/include/cuda/experimental/__group/group.cuh @@ -99,11 +99,21 @@ class group { _CCCL_ASSERT(__mapping_result.group_rank() < __mapping_result.group_count(), "invalid group rank"); _CCCL_ASSERT(__mapping_result.rank() < __mapping_result.count(), "invalid rank"); - _CCCL_ASSERT( - (__mapping_result.lane_mask() & ::cuda::device::lane_mask::this_lane()) != ::cuda::device::lane_mask::none(), - "invalid lane mask - this lane must be contained in the lane mask"); - _CCCL_ASSERT(::cuda::std::popcount(__mapping_result.lane_mask().value()) <= __mapping_result.count(), - "invalid lane mask - too many lanes are set in the lane mask"); + + if constexpr (::cuda::std::is_same_v<_Unit, thread_level>) + { + _CCCL_ASSERT( + (__mapping_result.lane_mask() & ::cuda::device::lane_mask::this_lane()) != ::cuda::device::lane_mask::none(), + "invalid lane mask - this lane must be contained in the lane mask"); + _CCCL_ASSERT(::cuda::std::popcount(__mapping_result.lane_mask().value()) <= __mapping_result.count(), + "invalid lane mask - too many lanes are set in the lane mask"); + } + else + { + _CCCL_ASSERT(__mapping_result.lane_mask() == ::cuda::device::lane_mask::all(), + "invalid lane mask - must be equal to cuda::device::lane_mask::all() when _Unit is not " + "cuda::thread_level"); + } } return __mapping_result; } diff --git a/cudax/include/cuda/experimental/__group/queries.cuh b/cudax/include/cuda/experimental/__group/queries.cuh index 162f9f1110b..b5bfc92e3e7 100644 --- a/cudax/include/cuda/experimental/__group/queries.cuh +++ b/cudax/include/cuda/experimental/__group/queries.cuh @@ -103,9 +103,9 @@ template } else { - const auto __unit_rank = __rank_query<_Unit, _GroupUnit>::template __call<_Tp>(__group.hierarchy()); - const auto __group_unit_count = ::cuda::experimental::__count_query_group<_Tp, _Unit>(__group); - return static_cast<_Tp>(__group_unit_rank * __group_unit_count + __unit_rank); + const auto __unit_rank = __rank_query<_Unit, _GroupUnit>::template __call<_Tp>(__group.hierarchy()); + const auto __unit_count = __count_query<_Unit, _GroupUnit>::template __call<_Tp>(__group.hierarchy()); + return static_cast<_Tp>(__group_unit_rank * __unit_count + __unit_rank); } } diff --git a/cudax/include/cuda/experimental/__group/synchronizer/barrier_synchronizer.cuh b/cudax/include/cuda/experimental/__group/synchronizer/barrier_synchronizer.cuh index 45bc0494f51..4db16fb931c 100644 --- a/cudax/include/cuda/experimental/__group/synchronizer/barrier_synchronizer.cuh +++ b/cudax/include/cuda/experimental/__group/synchronizer/barrier_synchronizer.cuh @@ -123,9 +123,18 @@ public: _CCCL_ASSERT(__mapping_result.group_count() <= __barriers_.size(), "invalid number of barriers passed"); } - if (__mapping_result.is_valid() && __mapping_result.rank() == 0) + ::cuda::std::size_t __nthread_in_unit = 1; + ::cuda::std::size_t __thread_rank_in_unit = 0; + if constexpr (!::cuda::std::is_same_v) { - init(&__barriers_[__mapping_result.group_rank()], static_cast<::cuda::std::ptrdiff_t>(__mapping_result.count())); + __nthread_in_unit = gpu_thread.count(_Unit{}, __parent.hierarchy()); + __thread_rank_in_unit = gpu_thread.rank(_Unit{}, __parent.hierarchy()); + } + + if (__mapping_result.is_valid() && __mapping_result.rank() == 0 && __thread_rank_in_unit == 0) + { + init(&__barriers_[__mapping_result.group_rank()], + static_cast<::cuda::std::ptrdiff_t>(__mapping_result.count() * __nthread_in_unit)); } // todo(dabayer): How we can expose making this aligned? diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index c74701b26bb..6428b4d36a0 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -196,6 +196,9 @@ cudax_add_catch2_test(test_target coop.reduce.this_cluster cudax_add_catch2_test(test_target coop.reduce.this_grid coop/reduce/this_grid.cu ) +cudax_add_catch2_test(test_target coop.reduce.warps_within_block + coop/reduce/warps_within_block.cu +) if (cudax_ENABLE_CUFILE) cudax_add_catch2_test(test_target cufile.driver_attributes diff --git a/cudax/test/coop/reduce/warps_within_block.cu b/cudax/test/coop/reduce/warps_within_block.cu new file mode 100644 index 00000000000..c00537bb9c0 --- /dev/null +++ b/cudax/test/coop/reduce/warps_within_block.cu @@ -0,0 +1,196 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include + +constexpr int nwarps_in_group = 3; +constexpr int warp_size = 32; + +/*********************************************************************************************************************** + * Thread Reduce Wrapper Kernels + **********************************************************************************************************************/ + +struct ReduceKernel +{ + template + __device__ void operator()( + Config config, + cuda::std::integral_constant, + const T* __restrict__ d_in, + T* __restrict__ d_out, + RedOp red_op) + { + cudax::this_block block{config}; + + using Barriers = cuda::barrier[1]; + __shared__ cuda::std::aligned_storage_t barriers_storage; + auto& barriers = reinterpret_cast(barriers_storage); + + cudax::group group{ + cuda::warp, block, cudax::group_by{}, cudax::barrier_synchronizer{barriers}}; + + // All threads that are not part of the groups should exit early. + if (!cuda::gpu_thread.is_part_of(group)) + { + return; + } + + T thread_data[NumItems]; + for (int i = 0; i < NumItems; ++i) + { + thread_data[i] = d_in[cuda::gpu_thread.rank_as(group) + i * cuda::gpu_thread.count_as(group)]; + } + const auto result = cudax::coop::reduce(group, thread_data, red_op); + + REQUIRE(result.has_value() == cuda::gpu_thread.is_root_rank(group)); + if (cuda::gpu_thread.is_root_rank(group)) + { + *d_out = result.value(); + } + } +}; + +/*********************************************************************************************************************** + * Type list definition + **********************************************************************************************************************/ + +using integral_type_list = + c2h::type_list; + +using fp_type_list = c2h::type_list; + +using operator_integral_list = + c2h::type_list, + cuda::std::multiplies<>, + cuda::std::bit_and<>, + cuda::std::bit_or<>, + cuda::std::bit_xor<>, + cuda::minimum<>, + cuda::maximum<>>; + +using operator_fp_list = c2h::type_list, cuda::std::multiplies<>, cuda::minimum<>, cuda::maximum<>>; + +/*********************************************************************************************************************** + * Verify results and kernel launch + **********************************************************************************************************************/ + +template +void verify_results(const T& expected_data, const T& test_results) +{ + if constexpr (cuda::std::is_floating_point_v) + { + REQUIRE_THAT(expected_data, Catch::Matchers::WithinRel(test_results, T{0.05})); + } + else + { + REQUIRE(expected_data == test_results); + } +} + +template +void run_thread_reduce_kernel( + cuda::stream_ref stream, int num_items, const c2h::device_vector& in, c2h::device_vector& out, RedOp red_op) +{ + const auto config = cuda::make_config(cuda::grid_dims<1>(), cuda::block_dims<(nwarps_in_group + 2) * warp_size>()); + const auto in_ptr = thrust::raw_pointer_cast(in.data()); + const auto out_ptr = thrust::raw_pointer_cast(out.data()); + const ReduceKernel kernel{}; + + switch (num_items) + { + case 1: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 2: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 3: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 4: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + default: + FAIL("Unsupported number of items"); + } + stream.sync(); +} + +constexpr int max_size = 4; +constexpr int num_seeds = 10; + +/*********************************************************************************************************************** + * Test cases + **********************************************************************************************************************/ + +_CCCL_DIAG_SUPPRESS_MSVC(4244) // warning C4244: '=': conversion from 'int' to '_Tp', possible loss of data + +C2H_TEST("reduce/warps_within_block Integral Type Tests", + "[reduce][warps_within_block]", + integral_type_list, + operator_integral_list) +{ + using value_t = c2h::get<0, TestType>; + using op_t = c2h::get<1, TestType>; + constexpr auto reduce_op = op_t{}; + constexpr auto operator_identity = cuda::identity_element(); + CAPTURE(c2h::type_name(), max_size, c2h::type_name()); + c2h::device_vector d_in(max_size * nwarps_in_group * warp_size); + c2h::device_vector d_out(1); + c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits::min()); + c2h::host_vector h_in = d_in; + cuda::stream stream{cuda::devices[0]}; + for (int num_items = 1; num_items <= max_size; ++num_items) + { + auto reference_result = cuda::std::accumulate( + h_in.begin(), h_in.begin() + num_items * nwarps_in_group * warp_size, operator_identity, reduce_op); + run_thread_reduce_kernel(stream, num_items, d_in, d_out, reduce_op); + verify_results(reference_result, c2h::host_vector(d_out)[0]); + } +} + +C2H_TEST( + "reduce/warps_within_block Floating-Point Type Tests", "[reduce][warps_within_block]", fp_type_list, operator_fp_list) +{ + using value_t = c2h::get<0, TestType>; + using op_t = c2h::get<1, TestType>; + constexpr auto reduce_op = op_t{}; + const auto operator_identity = cuda::identity_element(); + CAPTURE(c2h::type_name(), max_size, c2h::type_name()); + c2h::device_vector d_in(max_size * nwarps_in_group * warp_size); + c2h::device_vector d_out(1); + c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits::min()); + c2h::host_vector h_in = d_in; + cuda::stream stream{cuda::devices[0]}; + for (int num_items = 1; num_items <= max_size; ++num_items) + { + auto reference_result = cuda::std::accumulate( + h_in.begin(), h_in.begin() + num_items * nwarps_in_group * warp_size, operator_identity, reduce_op); + run_thread_reduce_kernel(stream, num_items, d_in, d_out, reduce_op); + verify_results(reference_result, c2h::host_vector(d_out)[0]); + } +}