[cudax] Implement cudax::coop::reduce for cudax::this_grid (#9203)

davebayer · web-flow · commit fbb9e242aa39 · 2026-06-04T06:59:08.000Z
diff --git a/cudax/include/cuda/experimental/__coop/reduce.cuh b/cudax/include/cuda/experimental/__coop/reduce.cuh
@@ -25,9 +25,11 @@
 #include <cub/thread/thread_reduce.cuh>
 #include <cub/warp/warp_reduce.cuh>
 
+#include <cuda/__cmath/ceil_div.h>
 #include <cuda/__functional/operator_properties.h>
 #include <cuda/std/__cstddef/types.h>
 #include <cuda/std/__functional/operations.h>
+#include <cuda/std/array>
 #include <cuda/std/optional>
 
 #include <cuda/experimental/group.cuh>
@@ -36,6 +38,9 @@
 
 #if !defined(_CCCL_DOXYGEN_INVOKED)
 
+// todo(dabayer): We share the temporary storage in shared/global memory for all reduce invocations. This is a temporary
+// state before we make it a parameter.
+
 namespace cuda::experimental::coop
 {
 template <class _Hierarchy, class _Tp, ::cuda::std::size_t _Np, class _RedFn>
@@ -146,6 +151,51 @@ __reduce_impl(this_cluster<_Hierarchy> __group, _Tp (&__thread_data)[_Np], _RedF
   }
 }
 
+// Scratch buffer holding one partial result per cluster for the grid-level reduce below.
+//
+// There is exactly one instance per (_Tp, _Np) pair, so every grid-level reduce with the same value type and the
+// same number of clusters uses the same storage. The reduce implementation serializes back-to-back invocations within
+// one kernel; kernels running concurrently with the same (_Tp, _Np) still alias this buffer until the storage becomes
+// a parameter.
+template <class _Tp, ::cuda::std::size_t _Np>
+_CCCL_DEVICE ::cuda::std::array<_Tp, _Np> __reduce_grid_partials;
+
+// Reduces the values in __thread_data across all threads of the grid.
+//
+// Steps:
+//   1. Every cluster reduces its own data; the cluster's root thread publishes the partial to the scratch buffer.
+//   2. Grid barrier, so all partials are written before they are read.
+//   3. The root block loads the partials into per-thread arrays, padding with the operator's identity element.
+//   4. Grid barrier, so no thread can leave this call (and overwrite the scratch buffer in a subsequent reduce)
+//      before the root block has finished reading it.
+//   5. The root block reduces the loaded partials with the block-level implementation.
+//
+// Requirements visible here: the grid level must have fully static extents, and
+// ::cuda::identity_element<_RedFn, _Tp>() must be well-formed.
+//
+// Returns the optional produced by the block-level reduce for threads of the root block, and ::cuda::std::nullopt
+// for every other thread. Must be called by all threads of the grid because it contains grid-wide barriers.
+template <class _Hierarchy, class _Tp, ::cuda::std::size_t _Np, class _RedFn>
+[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp>
+__reduce_impl(this_grid<_Hierarchy> __group, _Tp (&__thread_data)[_Np], _RedFn __red_fn)
+{
+  using _GridExts = decltype(cluster.extents(grid, __group.hierarchy()));
+  static_assert(_GridExts::rank_dynamic() == 0,
+                "cuda::coop::reduce requires the grid level to have all static extents.");
+
+  // The number of clusters sizes the scratch buffer, so it has to be a compile-time constant.
+  constexpr auto __nclusters_in_grid =
+    _GridExts::static_extent(0) * _GridExts::static_extent(1) * _GridExts::static_extent(2);
+
+  this_cluster __cluster{__group.hierarchy()};
+  const auto __partial = ::cuda::experimental::coop::__reduce_impl(__cluster, __thread_data, __red_fn);
+
+  // Only the cluster's root thread publishes the cluster-wide partial.
+  if (gpu_thread.is_root_rank(__cluster))
+  {
+    __reduce_grid_partials<_Tp, __nclusters_in_grid>[cluster.rank(__group)] = __partial.value();
+  }
+  __group.sync_aligned();
+
+  this_block __block{__group.hierarchy()};
+  const auto __is_root_block = block.is_root_rank(__group);
+
+  constexpr auto __npartials_per_thread = ::cuda::ceil_div(__nclusters_in_grid, gpu_thread.static_count(__block));
+  // Only populated and consumed by threads of the root block.
+  _Tp __thread_partials[__npartials_per_thread];
+
+  if (__is_root_block)
+  {
+    const auto __offset = gpu_thread.rank(__block) * __npartials_per_thread;
+
+    // todo(dabayer): This is not the most efficient way to load values, it doesn't take into account element size and
+    // reads N consecutive elements by 1 thread.
+    for (unsigned __i = 0; __i < __npartials_per_thread; ++__i)
+    {
+      // Slots past the last cluster are filled with the identity so they do not affect the result.
+      __thread_partials[__i] =
+        (__offset + __i < __nclusters_in_grid)
+          ? __reduce_grid_partials<_Tp, __nclusters_in_grid>[__offset + __i]
+          : ::cuda::identity_element<_RedFn, _Tp>();
+    }
+  }
+
+  // Without this second barrier, non-root blocks return right after the first one and may start another grid-level
+  // reduce that overwrites their slot in the shared scratch buffer while the root block is still reading it.
+  __group.sync_aligned();
+
+  if (__is_root_block)
+  {
+    return ::cuda::experimental::coop::__reduce_impl(__block, __thread_partials, __red_fn);
+  }
+  return ::cuda::std::nullopt;
+}
+
 template <class _Group, class _Tp, ::cuda::std::size_t _Np, class _RedFn>
 [[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp>
 reduce(_Group __group, _Tp (&__thread_data)[_Np], _RedFn&& __red_fn)
diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt
@@ -190,6 +190,9 @@ cudax_add_catch2_test(test_target coop.reduce.this_block
 cudax_add_catch2_test(test_target coop.reduce.this_cluster
     coop/reduce/this_cluster.cu
 )
+cudax_add_catch2_test(test_target coop.reduce.this_grid
+    coop/reduce/this_grid.cu
+)
 
 if (cudax_ENABLE_CUFILE)
   cudax_add_catch2_test(test_target cufile.driver_attributes
diff --git a/cudax/test/coop/reduce/this_grid.cu b/cudax/test/coop/reduce/this_grid.cu
@@ -0,0 +1,208 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/devices>
+#include <cuda/functional>
+#include <cuda/hierarchy>
+#include <cuda/launch>
+#include <cuda/std/algorithm>
+#include <cuda/std/type_traits>
+#include <cuda/stream>
+
+#include <cuda/experimental/coop.cuh>
+#include <cuda/experimental/group.cuh>
+
+#include <testing.cuh>
+
+#include <c2h/catch2_test_helper.h>
+#include <c2h/extended_types.h>
+#include <c2h/generators.h>
+#include <catch2/matchers/catch_matchers_floating_point.hpp>
+
+// Launch shape shared by all tests in this file: passed to cuda::cluster_dims and cuda::block_dims respectively.
+constexpr int cluster_size = 2;
+constexpr int block_size   = 128;
+
+/***********************************************************************************************************************
+ * Grid Reduce Wrapper Kernel
+ **********************************************************************************************************************/
+
+// Kernel functor: every thread loads NumItems values from d_in, the whole grid reduces them with
+// cudax::coop::reduce, and the grid's root thread stores the result to d_out[0].
+struct ReduceKernel
+{
+  // config  - launch configuration from which the grid group is constructed
+  // (tag)   - unnamed integral_constant carrying NumItems, the number of values loaded per thread
+  // d_in    - input values; indices [0, NumItems * number of threads in the grid) are read
+  // d_out   - receives the reduction result; written by the grid's root thread only
+  // red_op  - binary reduction operator forwarded to cudax::coop::reduce
+  template <class Config, int NumItems, class T, class RedOp>
+  __device__ void operator()(
+    Config config,
+    cuda::std::integral_constant<int, NumItems>,
+    const T* __restrict__ d_in,
+    T* __restrict__ d_out,
+    RedOp red_op)
+  {
+    cudax::this_grid grid{config};
+
+    T thread_data[NumItems];
+    for (int i = 0; i < NumItems; ++i)
+    {
+      // Item i of a thread sits at (thread rank in grid) + i * (threads in grid), so for a fixed i consecutive
+      // thread ranks read consecutive elements.
+      thread_data[i] = d_in[cuda::gpu_thread.rank_as<int>(grid) + i * cuda::gpu_thread.count_as<int>(grid)];
+    }
+    const auto result = cudax::coop::reduce(grid, thread_data, red_op);
+
+    // The test expects a value on the grid's root thread and on no other thread.
+    REQUIRE(result.has_value() == cuda::gpu_thread.is_root_rank(grid));
+    if (cuda::gpu_thread.is_root_rank(grid))
+    {
+      *d_out = result.value();
+    }
+  }
+};
+
+/***********************************************************************************************************************
+ * Type list definition
+ **********************************************************************************************************************/
+
+// Value types exercised by the integral test case.
+using integral_type_list =
+  c2h::type_list<cuda::std::int8_t, cuda::std::int16_t, cuda::std::uint16_t, cuda::std::int32_t, cuda::std::int64_t>;
+
+// Value types exercised by the floating-point test case.
+using fp_type_list = c2h::type_list<float, double>;
+
+// Reduction operators paired with the integral types.
+using operator_integral_list =
+  c2h::type_list<cuda::std::plus<>,
+                 cuda::std::multiplies<>,
+                 cuda::std::bit_and<>,
+                 cuda::std::bit_or<>,
+                 cuda::std::bit_xor<>,
+                 cuda::minimum<>,
+                 cuda::maximum<>>;
+
+// Reduction operators paired with the floating-point types (the bitwise operators are left out).
+using operator_fp_list = c2h::type_list<cuda::std::plus<>, cuda::std::multiplies<>, cuda::minimum<>, cuda::maximum<>>;
+
+// Compile-time values passed to cuda::grid_dims in run_reduce_kernel.
+using grid_size_list = c2h::enum_type_list<int, 1, 12, 32>;
+
+/***********************************************************************************************************************
+ * Verify results and kernel launch
+ **********************************************************************************************************************/
+
+// Compares the host reference against the value produced on the device.
+// Non-floating-point types must match exactly; floating-point types must agree within a relative tolerance of 0.05.
+template <class T>
+void verify_results(const T& expected_data, const T& test_results)
+{
+  if constexpr (!cuda::std::is_floating_point_v<T>)
+  {
+    REQUIRE(expected_data == test_results);
+  }
+  else
+  {
+    REQUIRE_THAT(expected_data, Catch::Matchers::WithinRel(test_results, T{0.05}));
+  }
+}
+
+// Launches ReduceKernel cooperatively on `stream` with GridSize x cluster_size x block_size dimensions and waits for
+// it to finish. `num_items` selects the compile-time items-per-thread instantiation; only 1 and 4 are supported and
+// any other value fails the test.
+template <int GridSize, class T, class RedOp>
+void run_reduce_kernel(
+  cuda::stream_ref stream,
+  cuda::std::integral_constant<int, GridSize>,
+  int num_items,
+  const c2h::device_vector<T>& in,
+  c2h::device_vector<T>& out,
+  RedOp red_op)
+{
+  const ReduceKernel kernel{};
+  const auto config = cuda::make_config(
+    cuda::grid_dims<GridSize>(),
+    cuda::cluster_dims<cluster_size>(),
+    cuda::block_dims<block_size>(),
+    cuda::cooperative_launch{});
+  const auto d_src = thrust::raw_pointer_cast(in.data());
+  const auto d_dst = thrust::raw_pointer_cast(out.data());
+
+  // Map the runtime item count onto the matching compile-time instantiation.
+  if (num_items == 1)
+  {
+    cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 1>{}, d_src, d_dst, red_op);
+  }
+  else if (num_items == 4)
+  {
+    cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 4>{}, d_src, d_dst, red_op);
+  }
+  else
+  {
+    FAIL("Unsupported number of items");
+  }
+  stream.sync();
+}
+
+// max_size scales the input buffer (max_size elements per launched thread); it matches the largest items-per-thread
+// value the test cases run. num_seeds is the argument handed to C2H_SEED.
+constexpr int max_size  = 4;
+constexpr int num_seeds = 10;
+
+/***********************************************************************************************************************
+ * Test cases
+ **********************************************************************************************************************/
+
+_CCCL_DIAG_SUPPRESS_MSVC(4244) // warning C4244: '=': conversion from 'int' to '_Tp', possible loss of data
+
+// Grid-wide reduce over integral types: for each (value type, operator, grid size) combination, compares the device
+// result against a host-side accumulate over the same input prefix.
+C2H_TEST("reduce/this_grid Integral Type Tests",
+         "[reduce][this_grid]",
+         integral_type_list,
+         operator_integral_list,
+         grid_size_list)
+{
+  // The test silently does nothing on devices with compute capability major < 9
+  // (presumably because the launch uses thread block clusters - confirm).
+  const auto device = cuda::devices[0];
+  if (cuda::device_attributes::compute_capability_major(device) < 9)
+  {
+    return;
+  }
+
+  using value_t                    = c2h::get<0, TestType>;
+  using op_t                       = c2h::get<1, TestType>;
+  using grid_size_t                = c2h::get<2, TestType>;
+  constexpr auto reduce_op         = op_t{};
+  constexpr auto operator_identity = cuda::identity_element<op_t, value_t>();
+  CAPTURE(c2h::type_name<value_t>(), max_size, c2h::type_name<decltype(reduce_op)>());
+  // Input holds max_size elements per launched thread; each run below consumes a prefix of it.
+  c2h::device_vector<value_t> d_in(max_size * grid_size_t::value * cluster_size * block_size);
+  c2h::device_vector<value_t> d_out(1);
+  c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits<value_t>::min());
+  c2h::host_vector<value_t> h_in = d_in;
+  cuda::stream stream{device};
+  for (int num_items : {1, 4})
+  {
+    // Host reference over the first num_items * (threads in grid) elements, which is exactly the index range
+    // ReduceKernel reads for this num_items.
+    auto reference_result = cuda::std::accumulate(
+      h_in.begin(),
+      h_in.begin() + num_items * grid_size_t::value * cluster_size * block_size,
+      operator_identity,
+      reduce_op);
+    run_reduce_kernel(stream, grid_size_t{}, num_items, d_in, d_out, reduce_op);
+    verify_results(reference_result, c2h::host_vector<value_t>(d_out)[0]);
+  }
+}
+
+// Grid-wide reduce over floating-point types: same flow as the integral test case, but verify_results compares with
+// a relative tolerance instead of exact equality.
+C2H_TEST(
+  "reduce/this_grid Floating-Point Type Tests", "[reduce][this_grid]", fp_type_list, operator_fp_list, grid_size_list)
+{
+  // The test silently does nothing on devices with compute capability major < 9
+  // (presumably because the launch uses thread block clusters - confirm).
+  const auto device = cuda::devices[0];
+  if (cuda::device_attributes::compute_capability_major(device) < 9)
+  {
+    return;
+  }
+
+  using value_t                = c2h::get<0, TestType>;
+  using op_t                   = c2h::get<1, TestType>;
+  using grid_size_t            = c2h::get<2, TestType>;
+  constexpr auto reduce_op     = op_t{};
+  const auto operator_identity = cuda::identity_element<op_t, value_t>();
+  CAPTURE(c2h::type_name<value_t>(), max_size, c2h::type_name<decltype(reduce_op)>());
+  // Input holds max_size elements per launched thread; each run below consumes a prefix of it.
+  c2h::device_vector<value_t> d_in(max_size * grid_size_t::value * cluster_size * block_size);
+  c2h::device_vector<value_t> d_out(1);
+  c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits<value_t>::min());
+  c2h::host_vector<value_t> h_in = d_in;
+  cuda::stream stream{device};
+  for (int num_items : {1, 4})
+  {
+    // Host reference over the first num_items * (threads in grid) elements, which is exactly the index range
+    // ReduceKernel reads for this num_items.
+    auto reference_result = cuda::std::accumulate(
+      h_in.begin(),
+      h_in.begin() + num_items * grid_size_t::value * cluster_size * block_size,
+      operator_identity,
+      reduce_op);
+    run_reduce_kernel(stream, grid_size_t{}, num_items, d_in, d_out, reduce_op);
+    verify_results(reference_result, c2h::host_vector<value_t>(d_out)[0]);
+  }
+}

Original file line number	Diff line number	Diff line change
`@@ -190,6 +190,9 @@ cudax_add_catch2_test(test_target coop.reduce.this_block`
`190`	`190`	`cudax_add_catch2_test(test_target coop.reduce.this_cluster`
`191`	`191`	`coop/reduce/this_cluster.cu`
`192`	`192`	`)`
	`193`	`+cudax_add_catch2_test(test_target coop.reduce.this_grid`
	`194`	`+ coop/reduce/this_grid.cu`
	`195`	`+)`
`193`	`196`
`194`	`197`	`if (cudax_ENABLE_CUFILE)`
`195`	`198`	`cudax_add_catch2_test(test_target cufile.driver_attributes`