Skip to content

Commit a7140fb

Browse files
committed
[cudax] Implement cudax::coop::reduce for warp groups within a block
1 parent bfb2df6 commit a7140fb

6 files changed

Lines changed: 271 additions & 11 deletions

File tree

cudax/include/cuda/experimental/__coop/reduce.cuh

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,9 +196,51 @@ __reduce_impl(this_grid<_Hierarchy> __group, _Tp (&__thread_data)[_Np], _RedFn _
196196
return ::cuda::std::nullopt;
197197
}
198198

199+
_CCCL_TEMPLATE(class _Group, class _Tp, ::cuda::std::size_t _Np, class _RedFn)
200+
_CCCL_REQUIRES(::cuda::std::is_same_v<warp_level, typename _Group::unit_type>
201+
_CCCL_AND ::cuda::std::is_same_v<block_level, typename _Group::level_type>)
202+
[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp>
203+
__reduce_impl(_Group __group, _Tp (&__thread_data)[_Np], _RedFn __red_fn)
204+
{
205+
constexpr auto __nwarps_in_group = warp.static_count(__group);
206+
static_assert(__nwarps_in_group != ::cuda::std::dynamic_extent,
207+
"cuda::coop::reduce requires the group to have statically known size");
208+
209+
using _WarpReduce = ::cub::WarpReduce<_Tp>;
210+
union _Scratch
211+
{
212+
typename _WarpReduce::TempStorage __warp_reduce_[__nwarps_in_group];
213+
_Tp __partials_[__nwarps_in_group];
214+
};
215+
__shared__ _Scratch __scratch;
216+
217+
const auto __partial = _WarpReduce{__scratch.__warp_reduce_[warp.rank(__group)]}.Reduce(__thread_data, __red_fn);
218+
__group.sync_aligned();
219+
220+
this_warp __warp{__group.hierarchy()};
221+
if (gpu_thread.is_root_rank(__warp))
222+
{
223+
__scratch.__partials_[warp.rank(__group)] = __partial;
224+
}
225+
__group.sync_aligned();
226+
227+
if (warp.is_root_rank(__group))
228+
{
229+
const auto __value = (gpu_thread.rank(__warp) < __nwarps_in_group)
230+
? __scratch.__partials_[gpu_thread.rank(__warp)]
231+
: ::cuda::identity_element<_RedFn, _Tp>();
232+
const auto __result = _WarpReduce{__scratch.__warp_reduce_[0]}.Reduce(__value, __red_fn);
233+
if (gpu_thread.is_root_rank(__warp))
234+
{
235+
return ::cuda::std::optional{__result};
236+
}
237+
}
238+
return ::cuda::std::nullopt;
239+
}
240+
199241
template <class _Group, class _Tp, ::cuda::std::size_t _Np, class _RedFn>
200242
[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp>
201-
reduce(_Group __group, _Tp (&__thread_data)[_Np], _RedFn&& __red_fn)
243+
reduce(_Group __group, _Tp (&__thread_data)[_Np], _RedFn __red_fn)
202244
{
203245
static_assert(gpu_thread.static_count(__group) != ::cuda::std::dynamic_extent,
204246
"cuda::coop::reduce requires the group to have statically known size");

cudax/include/cuda/experimental/__group/group.cuh

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -99,11 +99,21 @@ class group
9999
{
100100
_CCCL_ASSERT(__mapping_result.group_rank() < __mapping_result.group_count(), "invalid group rank");
101101
_CCCL_ASSERT(__mapping_result.rank() < __mapping_result.count(), "invalid rank");
102-
_CCCL_ASSERT(
103-
(__mapping_result.lane_mask() & ::cuda::device::lane_mask::this_lane()) != ::cuda::device::lane_mask::none(),
104-
"invalid lane mask - this lane must be contained in the lane mask");
105-
_CCCL_ASSERT(::cuda::std::popcount(__mapping_result.lane_mask().value()) <= __mapping_result.count(),
106-
"invalid lane mask - too many lanes are set in the lane mask");
102+
103+
if constexpr (::cuda::std::is_same_v<_Unit, thread_level>)
104+
{
105+
_CCCL_ASSERT(
106+
(__mapping_result.lane_mask() & ::cuda::device::lane_mask::this_lane()) != ::cuda::device::lane_mask::none(),
107+
"invalid lane mask - this lane must be contained in the lane mask");
108+
_CCCL_ASSERT(::cuda::std::popcount(__mapping_result.lane_mask().value()) <= __mapping_result.count(),
109+
"invalid lane mask - too many lanes are set in the lane mask");
110+
}
111+
else
112+
{
113+
_CCCL_ASSERT(__mapping_result.lane_mask() == ::cuda::device::lane_mask::all(),
114+
"invalid lane mask - must be equal to cuda::device::lane_mask::all() when _Unit is not "
115+
"cuda::thread_level");
116+
}
107117
}
108118
return __mapping_result;
109119
}

cudax/include/cuda/experimental/__group/queries.cuh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,9 @@ template <class _Tp, class _Unit, class _Group>
103103
}
104104
else
105105
{
106-
const auto __unit_rank = __rank_query<_Unit, _GroupUnit>::template __call<_Tp>(__group.hierarchy());
107-
const auto __group_unit_count = ::cuda::experimental::__count_query_group<_Tp, _Unit>(__group);
108-
return static_cast<_Tp>(__group_unit_rank * __group_unit_count + __unit_rank);
106+
const auto __unit_rank = __rank_query<_Unit, _GroupUnit>::template __call<_Tp>(__group.hierarchy());
107+
const auto __unit_count = __count_query<_Unit, _GroupUnit>::template __call<_Tp>(__group.hierarchy());
108+
return static_cast<_Tp>(__group_unit_rank * __unit_count + __unit_rank);
109109
}
110110
}
111111

cudax/include/cuda/experimental/__group/synchronizer/barrier_synchronizer.cuh

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,9 +123,18 @@ public:
123123
_CCCL_ASSERT(__mapping_result.group_count() <= __barriers_.size(), "invalid number of barriers passed");
124124
}
125125

126-
if (__mapping_result.is_valid() && __mapping_result.rank() == 0)
126+
::cuda::std::size_t __nthread_in_unit = 1;
127+
::cuda::std::size_t __thread_rank_in_unit = 0;
128+
if constexpr (!::cuda::std::is_same_v<thread_level, _Unit>)
127129
{
128-
init(&__barriers_[__mapping_result.group_rank()], static_cast<::cuda::std::ptrdiff_t>(__mapping_result.count()));
130+
__nthread_in_unit = gpu_thread.count(_Unit{}, __parent.hierarchy());
131+
__thread_rank_in_unit = gpu_thread.rank(_Unit{}, __parent.hierarchy());
132+
}
133+
134+
if (__mapping_result.is_valid() && __mapping_result.rank() == 0 && __thread_rank_in_unit == 0)
135+
{
136+
init(&__barriers_[__mapping_result.group_rank()],
137+
static_cast<::cuda::std::ptrdiff_t>(__mapping_result.count() * __nthread_in_unit));
129138
}
130139

131140
// todo(dabayer): How we can expose making this aligned?

cudax/test/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,9 @@ cudax_add_catch2_test(test_target coop.reduce.this_cluster
196196
cudax_add_catch2_test(test_target coop.reduce.this_grid
197197
coop/reduce/this_grid.cu
198198
)
199+
cudax_add_catch2_test(test_target coop.reduce.warps_within_block
200+
coop/reduce/warps_within_block.cu
201+
)
199202

200203
if (cudax_ENABLE_CUFILE)
201204
cudax_add_catch2_test(test_target cufile.driver_attributes
Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of CUDA Experimental in CUDA C++ Core Libraries,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include <cuda/atomic>
12+
#include <cuda/devices>
13+
#include <cuda/functional>
14+
#include <cuda/hierarchy>
15+
#include <cuda/launch>
16+
#include <cuda/std/algorithm>
17+
#include <cuda/std/type_traits>
18+
#include <cuda/stream>
19+
20+
#include <cuda/experimental/coop.cuh>
21+
#include <cuda/experimental/group.cuh>
22+
23+
#include <testing.cuh>
24+
25+
#include <c2h/catch2_test_helper.h>
26+
#include <c2h/extended_types.h>
27+
#include <c2h/generators.h>
28+
#include <catch2/matchers/catch_matchers_floating_point.hpp>
29+
30+
constexpr int nwarps_in_group = 3;
31+
constexpr int warp_size = 32;
32+
33+
/***********************************************************************************************************************
34+
* Thread Reduce Wrapper Kernels
35+
**********************************************************************************************************************/
36+
37+
struct ReduceKernel
38+
{
39+
template <class Config, int NumItems, class T, class RedOp>
40+
__device__ void operator()(
41+
Config config,
42+
cuda::std::integral_constant<int, NumItems>,
43+
const T* __restrict__ d_in,
44+
T* __restrict__ d_out,
45+
RedOp red_op)
46+
{
47+
cudax::this_block block{config};
48+
49+
using Barriers = cuda::barrier<cuda::thread_scope_block>[1];
50+
__shared__ cuda::std::aligned_storage_t<sizeof(Barriers), alignof(Barriers)> barriers_storage;
51+
auto& barriers = reinterpret_cast<Barriers&>(barriers_storage);
52+
53+
cudax::group group{
54+
cuda::warp, block, cudax::group_by<nwarps_in_group, false>{}, cudax::barrier_synchronizer{barriers}};
55+
56+
// All threads that are not part of the groups should exit early.
57+
if (!cuda::gpu_thread.is_part_of(group))
58+
{
59+
return;
60+
}
61+
62+
T thread_data[NumItems];
63+
for (int i = 0; i < NumItems; ++i)
64+
{
65+
thread_data[i] = d_in[cuda::gpu_thread.rank_as<int>(group) + i * cuda::gpu_thread.count_as<int>(group)];
66+
}
67+
const auto result = cudax::coop::reduce(group, thread_data, red_op);
68+
69+
REQUIRE(result.has_value() == cuda::gpu_thread.is_root_rank(group));
70+
if (cuda::gpu_thread.is_root_rank(group))
71+
{
72+
*d_out = result.value();
73+
}
74+
}
75+
};
76+
77+
/***********************************************************************************************************************
78+
* Type list definition
79+
**********************************************************************************************************************/
80+
81+
using integral_type_list =
82+
c2h::type_list<cuda::std::int8_t, cuda::std::int16_t, cuda::std::uint16_t, cuda::std::int32_t, cuda::std::int64_t>;
83+
84+
using fp_type_list = c2h::type_list<float, double>;
85+
86+
using operator_integral_list =
87+
c2h::type_list<cuda::std::plus<>,
88+
cuda::std::multiplies<>,
89+
cuda::std::bit_and<>,
90+
cuda::std::bit_or<>,
91+
cuda::std::bit_xor<>,
92+
cuda::minimum<>,
93+
cuda::maximum<>>;
94+
95+
using operator_fp_list = c2h::type_list<cuda::std::plus<>, cuda::std::multiplies<>, cuda::minimum<>, cuda::maximum<>>;
96+
97+
/***********************************************************************************************************************
98+
* Verify results and kernel launch
99+
**********************************************************************************************************************/
100+
101+
template <class T>
102+
void verify_results(const T& expected_data, const T& test_results)
103+
{
104+
if constexpr (cuda::std::is_floating_point_v<T>)
105+
{
106+
REQUIRE_THAT(expected_data, Catch::Matchers::WithinRel(test_results, T{0.05}));
107+
}
108+
else
109+
{
110+
REQUIRE(expected_data == test_results);
111+
}
112+
}
113+
114+
template <class T, class RedOp>
115+
void run_thread_reduce_kernel(
116+
cuda::stream_ref stream, int num_items, const c2h::device_vector<T>& in, c2h::device_vector<T>& out, RedOp red_op)
117+
{
118+
const auto config = cuda::make_config(cuda::grid_dims<1>(), cuda::block_dims<(nwarps_in_group + 2) * warp_size>());
119+
const auto in_ptr = thrust::raw_pointer_cast(in.data());
120+
const auto out_ptr = thrust::raw_pointer_cast(out.data());
121+
const ReduceKernel kernel{};
122+
123+
switch (num_items)
124+
{
125+
case 1:
126+
cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 1>{}, in_ptr, out_ptr, red_op);
127+
break;
128+
case 2:
129+
cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 2>{}, in_ptr, out_ptr, red_op);
130+
break;
131+
case 3:
132+
cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 3>{}, in_ptr, out_ptr, red_op);
133+
break;
134+
case 4:
135+
cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 4>{}, in_ptr, out_ptr, red_op);
136+
break;
137+
default:
138+
FAIL("Unsupported number of items");
139+
}
140+
stream.sync();
141+
}
142+
143+
constexpr int max_size = 4;
144+
constexpr int num_seeds = 10;
145+
146+
/***********************************************************************************************************************
147+
* Test cases
148+
**********************************************************************************************************************/
149+
150+
_CCCL_DIAG_SUPPRESS_MSVC(4244) // warning C4244: '=': conversion from 'int' to '_Tp', possible loss of data
151+
152+
C2H_TEST("reduce/warps_within_block Integral Type Tests",
153+
"[reduce][warps_within_block]",
154+
integral_type_list,
155+
operator_integral_list)
156+
{
157+
using value_t = c2h::get<0, TestType>;
158+
using op_t = c2h::get<1, TestType>;
159+
constexpr auto reduce_op = op_t{};
160+
constexpr auto operator_identity = cuda::identity_element<op_t, value_t>();
161+
CAPTURE(c2h::type_name<value_t>(), max_size, c2h::type_name<decltype(reduce_op)>());
162+
c2h::device_vector<value_t> d_in(max_size * nwarps_in_group * warp_size);
163+
c2h::device_vector<value_t> d_out(1);
164+
c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits<value_t>::min());
165+
c2h::host_vector<value_t> h_in = d_in;
166+
cuda::stream stream{cuda::devices[0]};
167+
for (int num_items = 1; num_items <= max_size; ++num_items)
168+
{
169+
auto reference_result = cuda::std::accumulate(
170+
h_in.begin(), h_in.begin() + num_items * nwarps_in_group * warp_size, operator_identity, reduce_op);
171+
run_thread_reduce_kernel(stream, num_items, d_in, d_out, reduce_op);
172+
verify_results(reference_result, c2h::host_vector<value_t>(d_out)[0]);
173+
}
174+
}
175+
176+
C2H_TEST(
177+
"reduce/warps_within_block Floating-Point Type Tests", "[reduce][warps_within_block]", fp_type_list, operator_fp_list)
178+
{
179+
using value_t = c2h::get<0, TestType>;
180+
using op_t = c2h::get<1, TestType>;
181+
constexpr auto reduce_op = op_t{};
182+
const auto operator_identity = cuda::identity_element<op_t, value_t>();
183+
CAPTURE(c2h::type_name<value_t>(), max_size, c2h::type_name<decltype(reduce_op)>());
184+
c2h::device_vector<value_t> d_in(max_size * nwarps_in_group * warp_size);
185+
c2h::device_vector<value_t> d_out(1);
186+
c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits<value_t>::min());
187+
c2h::host_vector<value_t> h_in = d_in;
188+
cuda::stream stream{cuda::devices[0]};
189+
for (int num_items = 1; num_items <= max_size; ++num_items)
190+
{
191+
auto reference_result = cuda::std::accumulate(
192+
h_in.begin(), h_in.begin() + num_items * nwarps_in_group * warp_size, operator_identity, reduce_op);
193+
run_thread_reduce_kernel(stream, num_items, d_in, d_out, reduce_op);
194+
verify_results(reference_result, c2h::host_vector<value_t>(d_out)[0]);
195+
}
196+
}

0 commit comments

Comments
 (0)