Skip to content

Commit 3aebfa2

Browse files
committed
[cudax] Implement cudax::coop::reduce for warp groups within a block
1 parent 2f7cb8b commit 3aebfa2

5 files changed

Lines changed: 246 additions & 6 deletions

File tree

cudax/include/cuda/experimental/__coop/reduce.cuh

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,9 +196,51 @@ __reduce_impl(this_grid<_Hierarchy> __group, _Tp (&__thread_data)[_Np], _RedFn _
196196
return ::cuda::std::nullopt;
197197
}
198198

199+
_CCCL_TEMPLATE(class _Group, class _Tp, ::cuda::std::size_t _Np, class _RedFn)
200+
_CCCL_REQUIRES(::cuda::std::is_same_v<warp_level, typename _Group::unit_type>
201+
_CCCL_AND ::cuda::std::is_same_v<block_level, typename _Group::level_type>)
202+
[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp>
203+
__reduce_impl(_Group __group, _Tp (&__thread_data)[_Np], _RedFn __red_fn)
204+
{
205+
constexpr auto __nwarps_in_group = warp.static_count(__group);
206+
static_assert(__nwarps_in_group != ::cuda::std::dynamic_extent,
207+
"cuda::coop::reduce requires the group to have statically known size");
208+
209+
using _WarpReduce = ::cub::WarpReduce<_Tp>;
210+
union _Scratch
211+
{
212+
typename _WarpReduce::TempStorage __warp_reduce_[__nwarps_in_group];
213+
_Tp __partials_[__nwarps_in_group];
214+
};
215+
__shared__ _Scratch __scratch;
216+
217+
const auto __partial = _WarpReduce{__scratch.__warp_reduce_[warp.rank(__group)]}.Reduce(__thread_data, __red_fn);
218+
__group.sync_aligned();
219+
220+
this_warp __warp{__group.hierarchy()};
221+
if (gpu_thread.is_root_rank(__warp))
222+
{
223+
__scratch.__partials_[warp.rank(__group)] = __partial;
224+
}
225+
__group.sync_aligned();
226+
227+
if (warp.is_root_rank(__group))
228+
{
229+
const auto __value = (gpu_thread.rank(__warp) < __nwarps_in_group)
230+
? __scratch.__partials_[gpu_thread.rank(__warp)]
231+
: ::cuda::identity_element<_RedFn, _Tp>();
232+
const auto __result = _WarpReduce{__scratch.__warp_reduce_[0]}.Reduce(__value, __red_fn);
233+
if (gpu_thread.is_root_rank(__warp))
234+
{
235+
return ::cuda::std::optional{__result};
236+
}
237+
}
238+
return ::cuda::std::nullopt;
239+
}
240+
199241
template <class _Group, class _Tp, ::cuda::std::size_t _Np, class _RedFn>
200242
[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp>
201-
reduce(_Group __group, _Tp (&__thread_data)[_Np], _RedFn&& __red_fn)
243+
reduce(_Group __group, _Tp (&__thread_data)[_Np], _RedFn __red_fn)
202244
{
203245
static_assert(gpu_thread.static_count(__group) != ::cuda::std::dynamic_extent,
204246
"cuda::coop::reduce requires the group to have statically known size");

cudax/include/cuda/experimental/__group/queries.cuh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,9 @@ template <class _Tp, class _Unit, class _Group>
103103
}
104104
else
105105
{
106-
const auto __unit_rank = __rank_query<_Unit, _GroupUnit>::template __call<_Tp>(__group.hierarchy());
107-
const auto __group_unit_count = ::cuda::experimental::__count_query_group<_Tp, _Unit>(__group);
108-
return static_cast<_Tp>(__group_unit_rank * __group_unit_count + __unit_rank);
106+
const auto __unit_rank = __rank_query<_Unit, _GroupUnit>::template __call<_Tp>(__group.hierarchy());
107+
const auto __unit_count = __count_query<_Unit, _GroupUnit>::template __call<_Tp>(__group.hierarchy());
108+
return static_cast<_Tp>(__group_unit_rank * __unit_count + __unit_rank);
109109
}
110110
}
111111

cudax/include/cuda/experimental/__group/synchronizer/barrier_synchronizer.cuh

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,9 +123,12 @@ public:
123123
_CCCL_ASSERT(__mapping_result.group_count() <= __barriers_.size(), "invalid number of barriers passed");
124124
}
125125

126-
if (__mapping_result.is_valid() && __mapping_result.rank() == 0)
126+
if (__mapping_result.is_valid() && __mapping_result.rank() == 0
127+
&& gpu_thread.rank(_Unit{}, __parent.hierarchy()) == 0)
127128
{
128-
init(&__barriers_[__mapping_result.group_rank()], static_cast<::cuda::std::ptrdiff_t>(__mapping_result.count()));
129+
init(&__barriers_[__mapping_result.group_rank()],
130+
static_cast<::cuda::std::ptrdiff_t>(
131+
__mapping_result.count() * gpu_thread.count(_Unit{}, __parent.hierarchy())));
129132
}
130133

131134
// todo(dabayer): How we can expose making this aligned?

cudax/test/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,9 @@ cudax_add_catch2_test(test_target coop.reduce.this_cluster
193193
cudax_add_catch2_test(test_target coop.reduce.this_grid
194194
coop/reduce/this_grid.cu
195195
)
196+
cudax_add_catch2_test(test_target coop.reduce.warps_within_block
197+
coop/reduce/warps_within_block.cu
198+
)
196199

197200
if (cudax_ENABLE_CUFILE)
198201
cudax_add_catch2_test(test_target cufile.driver_attributes
Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of CUDA Experimental in CUDA C++ Core Libraries,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include <cuda/atomic>
12+
#include <cuda/devices>
13+
#include <cuda/functional>
14+
#include <cuda/hierarchy>
15+
#include <cuda/launch>
16+
#include <cuda/std/algorithm>
17+
#include <cuda/std/type_traits>
18+
#include <cuda/stream>
19+
20+
#include <cuda/experimental/coop.cuh>
21+
#include <cuda/experimental/group.cuh>
22+
23+
#include <testing.cuh>
24+
25+
#include <c2h/catch2_test_helper.h>
26+
#include <c2h/extended_types.h>
27+
#include <c2h/generators.h>
28+
#include <catch2/matchers/catch_matchers_floating_point.hpp>
29+
30+
constexpr int nwarps_in_group = 3;
31+
constexpr int warp_size = 32;
32+
33+
/***********************************************************************************************************************
34+
* Thread Reduce Wrapper Kernels
35+
**********************************************************************************************************************/
36+
37+
struct ReduceKernel
38+
{
39+
template <class Config, int NumItems, class T, class RedOp>
40+
__device__ void operator()(
41+
Config config,
42+
cuda::std::integral_constant<int, NumItems>,
43+
const T* __restrict__ d_in,
44+
T* __restrict__ d_out,
45+
RedOp red_op)
46+
{
47+
cudax::this_block block{config};
48+
49+
using Barriers = cuda::barrier<cuda::thread_scope_block>[1];
50+
__shared__ cuda::std::aligned_storage_t<sizeof(Barriers), alignof(Barriers)> barriers_storage;
51+
auto& barriers = reinterpret_cast<Barriers&>(barriers_storage);
52+
53+
cudax::group group{
54+
cuda::warp, block, cudax::group_by<nwarps_in_group, false>{}, cudax::barrier_synchronizer{barriers}};
55+
56+
// All threads that are not part of the groups should exit early.
57+
if (!cuda::gpu_thread.is_part_of(group))
58+
{
59+
return;
60+
}
61+
62+
T thread_data[NumItems];
63+
for (int i = 0; i < NumItems; ++i)
64+
{
65+
thread_data[i] = d_in[cuda::gpu_thread.rank_as<int>(group) + i * cuda::gpu_thread.count_as<int>(group)];
66+
}
67+
const auto result = cudax::coop::reduce(group, thread_data, red_op);
68+
69+
REQUIRE(result.has_value() == cuda::gpu_thread.is_root_rank(group));
70+
if (cuda::gpu_thread.is_root_rank(group))
71+
{
72+
*d_out = result.value();
73+
}
74+
}
75+
};
76+
77+
/***********************************************************************************************************************
78+
* Type list definition
79+
**********************************************************************************************************************/
80+
81+
using integral_type_list =
82+
c2h::type_list<cuda::std::int8_t, cuda::std::int16_t, cuda::std::uint16_t, cuda::std::int32_t, cuda::std::int64_t>;
83+
84+
using fp_type_list = c2h::type_list<float, double>;
85+
86+
using operator_integral_list =
87+
c2h::type_list<cuda::std::plus<>,
88+
cuda::std::multiplies<>,
89+
cuda::std::bit_and<>,
90+
cuda::std::bit_or<>,
91+
cuda::std::bit_xor<>,
92+
cuda::minimum<>,
93+
cuda::maximum<>>;
94+
95+
using operator_fp_list = c2h::type_list<cuda::std::plus<>, cuda::std::multiplies<>, cuda::minimum<>, cuda::maximum<>>;
96+
97+
/***********************************************************************************************************************
98+
* Verify results and kernel launch
99+
**********************************************************************************************************************/
100+
101+
template <class T>
102+
void verify_results(const T& expected_data, const T& test_results)
103+
{
104+
if constexpr (cuda::std::is_floating_point_v<T>)
105+
{
106+
REQUIRE_THAT(expected_data, Catch::Matchers::WithinRel(test_results, T{0.05}));
107+
}
108+
else
109+
{
110+
REQUIRE(expected_data == test_results);
111+
}
112+
}
113+
114+
template <class T, class RedOp>
115+
void run_thread_reduce_kernel(
116+
cuda::stream_ref stream, int num_items, const c2h::device_vector<T>& in, c2h::device_vector<T>& out, RedOp red_op)
117+
{
118+
const auto config = cuda::make_config(cuda::grid_dims<1>(), cuda::block_dims<(nwarps_in_group + 2) * warp_size>());
119+
const auto in_ptr = thrust::raw_pointer_cast(in.data());
120+
const auto out_ptr = thrust::raw_pointer_cast(out.data());
121+
const ReduceKernel kernel{};
122+
123+
switch (num_items)
124+
{
125+
case 1:
126+
cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 1>{}, in_ptr, out_ptr, red_op);
127+
break;
128+
case 2:
129+
cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 2>{}, in_ptr, out_ptr, red_op);
130+
break;
131+
case 3:
132+
cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 3>{}, in_ptr, out_ptr, red_op);
133+
break;
134+
case 4:
135+
cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 4>{}, in_ptr, out_ptr, red_op);
136+
break;
137+
default:
138+
FAIL("Unsupported number of items");
139+
}
140+
stream.sync();
141+
}
142+
143+
constexpr int max_size = 4;
144+
constexpr int num_seeds = 10;
145+
146+
/***********************************************************************************************************************
147+
* Test cases
148+
**********************************************************************************************************************/
149+
150+
_CCCL_DIAG_SUPPRESS_MSVC(4244) // warning C4244: '=': conversion from 'int' to '_Tp', possible loss of data
151+
152+
C2H_TEST("reduce/this_warp Integral Type Tests", "[reduce][this_warp]", integral_type_list, operator_integral_list)
153+
{
154+
using value_t = c2h::get<0, TestType>;
155+
using op_t = c2h::get<1, TestType>;
156+
constexpr auto reduce_op = op_t{};
157+
constexpr auto operator_identity = cuda::identity_element<op_t, value_t>();
158+
CAPTURE(c2h::type_name<value_t>(), max_size, c2h::type_name<decltype(reduce_op)>());
159+
c2h::device_vector<value_t> d_in(max_size * nwarps_in_group * warp_size);
160+
c2h::device_vector<value_t> d_out(1);
161+
c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits<value_t>::min());
162+
c2h::host_vector<value_t> h_in = d_in;
163+
cuda::stream stream{cuda::devices[0]};
164+
for (int num_items = 1; num_items <= max_size; ++num_items)
165+
{
166+
auto reference_result = cuda::std::accumulate(
167+
h_in.begin(), h_in.begin() + num_items * nwarps_in_group * warp_size, operator_identity, reduce_op);
168+
run_thread_reduce_kernel(stream, num_items, d_in, d_out, reduce_op);
169+
verify_results(reference_result, c2h::host_vector<value_t>(d_out)[0]);
170+
}
171+
}
172+
173+
C2H_TEST("reduce/this_warp Floating-Point Type Tests", "[reduce][this_warp]", fp_type_list, operator_fp_list)
174+
{
175+
using value_t = c2h::get<0, TestType>;
176+
using op_t = c2h::get<1, TestType>;
177+
constexpr auto reduce_op = op_t{};
178+
const auto operator_identity = cuda::identity_element<op_t, value_t>();
179+
CAPTURE(c2h::type_name<value_t>(), max_size, c2h::type_name<decltype(reduce_op)>());
180+
c2h::device_vector<value_t> d_in(max_size * nwarps_in_group * warp_size);
181+
c2h::device_vector<value_t> d_out(1);
182+
c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits<value_t>::min());
183+
c2h::host_vector<value_t> h_in = d_in;
184+
cuda::stream stream{cuda::devices[0]};
185+
for (int num_items = 1; num_items <= max_size; ++num_items)
186+
{
187+
auto reference_result = cuda::std::accumulate(
188+
h_in.begin(), h_in.begin() + num_items * nwarps_in_group * warp_size, operator_identity, reduce_op);
189+
run_thread_reduce_kernel(stream, num_items, d_in, d_out, reduce_op);
190+
verify_results(reference_result, c2h::host_vector<value_t>(d_out)[0]);
191+
}
192+
}

0 commit comments

Comments
 (0)