Skip to content

Commit fbb9e24

Browse files
authored
[cudax] Implement cudax::coop::reduce for cudax::this_grid (#9203)
1 parent 6b7ae38 commit fbb9e24

3 files changed

Lines changed: 261 additions & 0 deletions

File tree

cudax/include/cuda/experimental/__coop/reduce.cuh

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,11 @@
2525
#include <cub/thread/thread_reduce.cuh>
2626
#include <cub/warp/warp_reduce.cuh>
2727

28+
#include <cuda/__cmath/ceil_div.h>
2829
#include <cuda/__functional/operator_properties.h>
2930
#include <cuda/std/__cstddef/types.h>
3031
#include <cuda/std/__functional/operations.h>
32+
#include <cuda/std/array>
3133
#include <cuda/std/optional>
3234

3335
#include <cuda/experimental/group.cuh>
@@ -36,6 +38,9 @@
3638

3739
#if !defined(_CCCL_DOXYGEN_INVOKED)
3840

41+
// todo(dabayer): We share the temporary storage in shared/global memory for all reduce invocations. This is a temporary
42+
// state before we make it a parameter.
43+
3944
namespace cuda::experimental::coop
4045
{
4146
template <class _Hierarchy, class _Tp, ::cuda::std::size_t _Np, class _RedFn>
@@ -146,6 +151,51 @@ __reduce_impl(this_cluster<_Hierarchy> __group, _Tp (&__thread_data)[_Np], _RedF
146151
}
147152
}
148153

154+
template <class _Tp, ::cuda::std::size_t _Np>
155+
_CCCL_DEVICE ::cuda::std::array<_Tp, _Np> __reduce_grid_partials;
156+
157+
template <class _Hierarchy, class _Tp, cuda::std::size_t _Np, class _RedFn>
158+
[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp>
159+
__reduce_impl(this_grid<_Hierarchy> __group, _Tp (&__thread_data)[_Np], _RedFn __red_fn)
160+
{
161+
using _GridExts = decltype(cluster.extents(grid, __group.hierarchy()));
162+
static_assert(_GridExts::rank_dynamic() == 0,
163+
"cuda::coop::reduce requires the grid level to have all static extents.");
164+
165+
constexpr auto __nclusters_in_grid =
166+
_GridExts::static_extent(0) * _GridExts::static_extent(1) * _GridExts::static_extent(2);
167+
168+
this_cluster __cluster{__group.hierarchy()};
169+
const auto __partial = ::cuda::experimental::coop::__reduce_impl(__cluster, __thread_data, __red_fn);
170+
171+
if (gpu_thread.is_root_rank(__cluster))
172+
{
173+
__reduce_grid_partials<_Tp, __nclusters_in_grid>[cluster.rank(__group)] = __partial.value();
174+
}
175+
__group.sync_aligned();
176+
177+
if (block.is_root_rank(__group))
178+
{
179+
this_block __block{__group.hierarchy()};
180+
181+
constexpr auto __npartials_per_thread = ::cuda::ceil_div(__nclusters_in_grid, gpu_thread.static_count(__block));
182+
_Tp __thread_partials[__npartials_per_thread];
183+
const auto __offset = gpu_thread.rank(__block) * __npartials_per_thread;
184+
185+
// todo(dabayer): This is not the most efficient way to load values, it doesn't take into account element size and
186+
// reads N consecutive elements by 1 thread.
187+
for (unsigned __i = 0; __i < __npartials_per_thread; ++__i)
188+
{
189+
__thread_partials[__i] =
190+
(__offset + __i < __nclusters_in_grid)
191+
? __reduce_grid_partials<_Tp, __nclusters_in_grid>[__offset + __i]
192+
: ::cuda::identity_element<_RedFn, _Tp>();
193+
}
194+
return ::cuda::experimental::coop::__reduce_impl(__block, __thread_partials, __red_fn);
195+
}
196+
return ::cuda::std::nullopt;
197+
}
198+
149199
template <class _Group, class _Tp, ::cuda::std::size_t _Np, class _RedFn>
150200
[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp>
151201
reduce(_Group __group, _Tp (&__thread_data)[_Np], _RedFn&& __red_fn)

cudax/test/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,9 @@ cudax_add_catch2_test(test_target coop.reduce.this_block
190190
cudax_add_catch2_test(test_target coop.reduce.this_cluster
191191
coop/reduce/this_cluster.cu
192192
)
193+
cudax_add_catch2_test(test_target coop.reduce.this_grid
194+
coop/reduce/this_grid.cu
195+
)
193196

194197
if (cudax_ENABLE_CUFILE)
195198
cudax_add_catch2_test(test_target cufile.driver_attributes
Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of CUDA Experimental in CUDA C++ Core Libraries,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include <cuda/devices>
12+
#include <cuda/functional>
13+
#include <cuda/hierarchy>
14+
#include <cuda/launch>
15+
#include <cuda/std/algorithm>
16+
#include <cuda/std/type_traits>
17+
#include <cuda/stream>
18+
19+
#include <cuda/experimental/coop.cuh>
20+
#include <cuda/experimental/group.cuh>
21+
22+
#include <testing.cuh>
23+
24+
#include <c2h/catch2_test_helper.h>
25+
#include <c2h/extended_types.h>
26+
#include <c2h/generators.h>
27+
#include <catch2/matchers/catch_matchers_floating_point.hpp>
28+
29+
constexpr int cluster_size = 2;
30+
constexpr int block_size = 128;
31+
32+
/***********************************************************************************************************************
33+
* Thread Reduce Wrapper Kernels
34+
**********************************************************************************************************************/
35+
36+
struct ReduceKernel
37+
{
38+
template <class Config, int NumItems, class T, class RedOp>
39+
__device__ void operator()(
40+
Config config,
41+
cuda::std::integral_constant<int, NumItems>,
42+
const T* __restrict__ d_in,
43+
T* __restrict__ d_out,
44+
RedOp red_op)
45+
{
46+
cudax::this_grid grid{config};
47+
48+
T thread_data[NumItems];
49+
for (int i = 0; i < NumItems; ++i)
50+
{
51+
thread_data[i] = d_in[cuda::gpu_thread.rank_as<int>(grid) + i * cuda::gpu_thread.count_as<int>(grid)];
52+
}
53+
const auto result = cudax::coop::reduce(grid, thread_data, red_op);
54+
55+
REQUIRE(result.has_value() == cuda::gpu_thread.is_root_rank(grid));
56+
if (cuda::gpu_thread.is_root_rank(grid))
57+
{
58+
*d_out = result.value();
59+
}
60+
}
61+
};
62+
63+
/***********************************************************************************************************************
64+
* Type list definition
65+
**********************************************************************************************************************/
66+
67+
using integral_type_list =
68+
c2h::type_list<cuda::std::int8_t, cuda::std::int16_t, cuda::std::uint16_t, cuda::std::int32_t, cuda::std::int64_t>;
69+
70+
using fp_type_list = c2h::type_list<float, double>;
71+
72+
using operator_integral_list =
73+
c2h::type_list<cuda::std::plus<>,
74+
cuda::std::multiplies<>,
75+
cuda::std::bit_and<>,
76+
cuda::std::bit_or<>,
77+
cuda::std::bit_xor<>,
78+
cuda::minimum<>,
79+
cuda::maximum<>>;
80+
81+
using operator_fp_list = c2h::type_list<cuda::std::plus<>, cuda::std::multiplies<>, cuda::minimum<>, cuda::maximum<>>;
82+
83+
using grid_size_list = c2h::enum_type_list<int, 1, 12, 32>;
84+
85+
/***********************************************************************************************************************
86+
* Verify results and kernel launch`
87+
**********************************************************************************************************************/
88+
89+
template <class T>
90+
void verify_results(const T& expected_data, const T& test_results)
91+
{
92+
if constexpr (cuda::std::is_floating_point_v<T>)
93+
{
94+
REQUIRE_THAT(expected_data, Catch::Matchers::WithinRel(test_results, T{0.05}));
95+
}
96+
else
97+
{
98+
REQUIRE(expected_data == test_results);
99+
}
100+
}
101+
102+
template <int GridSize, class T, class RedOp>
103+
void run_reduce_kernel(
104+
cuda::stream_ref stream,
105+
cuda::std::integral_constant<int, GridSize>,
106+
int num_items,
107+
const c2h::device_vector<T>& in,
108+
c2h::device_vector<T>& out,
109+
RedOp red_op)
110+
{
111+
const auto config = cuda::make_config(
112+
cuda::grid_dims<GridSize>(),
113+
cuda::cluster_dims<cluster_size>(),
114+
cuda::block_dims<block_size>(),
115+
cuda::cooperative_launch{});
116+
const auto in_ptr = thrust::raw_pointer_cast(in.data());
117+
const auto out_ptr = thrust::raw_pointer_cast(out.data());
118+
const ReduceKernel kernel{};
119+
120+
switch (num_items)
121+
{
122+
case 1:
123+
cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 1>{}, in_ptr, out_ptr, red_op);
124+
break;
125+
case 4:
126+
cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 4>{}, in_ptr, out_ptr, red_op);
127+
break;
128+
default:
129+
FAIL("Unsupported number of items");
130+
}
131+
stream.sync();
132+
}
133+
134+
constexpr int max_size = 4;
135+
constexpr int num_seeds = 10;
136+
137+
/***********************************************************************************************************************
138+
* Test cases
139+
**********************************************************************************************************************/
140+
141+
_CCCL_DIAG_SUPPRESS_MSVC(4244) // warning C4244: '=': conversion from 'int' to '_Tp', possible loss of data
142+
143+
C2H_TEST("reduce/this_grid Integral Type Tests",
144+
"[reduce][this_grid]",
145+
integral_type_list,
146+
operator_integral_list,
147+
grid_size_list)
148+
{
149+
const auto device = cuda::devices[0];
150+
if (cuda::device_attributes::compute_capability_major(device) < 9)
151+
{
152+
return;
153+
}
154+
155+
using value_t = c2h::get<0, TestType>;
156+
using op_t = c2h::get<1, TestType>;
157+
using grid_size_t = c2h::get<2, TestType>;
158+
constexpr auto reduce_op = op_t{};
159+
constexpr auto operator_identity = cuda::identity_element<op_t, value_t>();
160+
CAPTURE(c2h::type_name<value_t>(), max_size, c2h::type_name<decltype(reduce_op)>());
161+
c2h::device_vector<value_t> d_in(max_size * grid_size_t::value * cluster_size * block_size);
162+
c2h::device_vector<value_t> d_out(1);
163+
c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits<value_t>::min());
164+
c2h::host_vector<value_t> h_in = d_in;
165+
cuda::stream stream{device};
166+
for (int num_items : {1, 4})
167+
{
168+
auto reference_result = cuda::std::accumulate(
169+
h_in.begin(),
170+
h_in.begin() + num_items * grid_size_t::value * cluster_size * block_size,
171+
operator_identity,
172+
reduce_op);
173+
run_reduce_kernel(stream, grid_size_t{}, num_items, d_in, d_out, reduce_op);
174+
verify_results(reference_result, c2h::host_vector<value_t>(d_out)[0]);
175+
}
176+
}
177+
178+
C2H_TEST(
179+
"reduce/this_grid Floating-Point Type Tests", "[reduce][this_grid]", fp_type_list, operator_fp_list, grid_size_list)
180+
{
181+
const auto device = cuda::devices[0];
182+
if (cuda::device_attributes::compute_capability_major(device) < 9)
183+
{
184+
return;
185+
}
186+
187+
using value_t = c2h::get<0, TestType>;
188+
using op_t = c2h::get<1, TestType>;
189+
using grid_size_t = c2h::get<2, TestType>;
190+
constexpr auto reduce_op = op_t{};
191+
const auto operator_identity = cuda::identity_element<op_t, value_t>();
192+
CAPTURE(c2h::type_name<value_t>(), max_size, c2h::type_name<decltype(reduce_op)>());
193+
c2h::device_vector<value_t> d_in(max_size * grid_size_t::value * cluster_size * block_size);
194+
c2h::device_vector<value_t> d_out(1);
195+
c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits<value_t>::min());
196+
c2h::host_vector<value_t> h_in = d_in;
197+
cuda::stream stream{device};
198+
for (int num_items : {1, 4})
199+
{
200+
auto reference_result = cuda::std::accumulate(
201+
h_in.begin(),
202+
h_in.begin() + num_items * grid_size_t::value * cluster_size * block_size,
203+
operator_identity,
204+
reduce_op);
205+
run_reduce_kernel(stream, grid_size_t{}, num_items, d_in, d_out, reduce_op);
206+
verify_results(reference_result, c2h::host_vector<value_t>(d_out)[0]);
207+
}
208+
}

0 commit comments

Comments
 (0)