diff --git a/cudax/include/cuda/experimental/__group/concepts.cuh b/cudax/include/cuda/experimental/__group/concepts.cuh index c5d5b8c6b94..9eb58cd50ec 100644 --- a/cudax/include/cuda/experimental/__group/concepts.cuh +++ b/cudax/include/cuda/experimental/__group/concepts.cuh @@ -33,7 +33,7 @@ namespace cuda::experimental { template -_CCCL_CONCEPT group = _CCCL_REQUIRES_EXPR((_Group), _Group&& __g, const _Group&& __cg)( +_CCCL_CONCEPT is_group = _CCCL_REQUIRES_EXPR((_Group), _Group&& __g, const _Group&& __cg)( typename(typename _Group::unit_type), requires(__is_hierarchy_level_v), typename(typename _Group::level_type), diff --git a/cudax/include/cuda/experimental/__group/fwd.cuh b/cudax/include/cuda/experimental/__group/fwd.cuh index b8a613bc0b3..5cec9b4e3d6 100644 --- a/cudax/include/cuda/experimental/__group/fwd.cuh +++ b/cudax/include/cuda/experimental/__group/fwd.cuh @@ -44,7 +44,7 @@ using __implicit_hierarchy_t = hierarchy_level_desc>, hierarchy_level_desc>>; -// this groups +// groups template class __this_group_base; @@ -60,16 +60,8 @@ class this_cluster; template class this_grid; -// other groups - -template -class thread_group; -template -class warp_group; -template -class block_group; -template -class cluster_group; +template +class group; // mappings diff --git a/cudax/include/cuda/experimental/__group/group.cuh b/cudax/include/cuda/experimental/__group/group.cuh index 36c079b3568..d40f813ca94 100644 --- a/cudax/include/cuda/experimental/__group/group.cuh +++ b/cudax/include/cuda/experimental/__group/group.cuh @@ -44,11 +44,10 @@ namespace cuda::experimental { -// todo(dabayer): Make groups be based on another group, not level. -template -class thread_group +template +class group { - using _MappingResult = __group_mapping_result_t<_Mapping, thread_level, _Level, _Hierarchy>; + using _MappingResult = __group_mapping_result_t<_Mapping, _Unit, _Level, _Hierarchy>; static_assert(__group_mapping_result<_MappingResult>); _Hierarchy __hier_; @@ -57,7 +56,7 @@ class thread_group _Synchronizer __synchronizer_; public: - using unit_type = thread_level; + using unit_type = _Unit; using level_type = _Level; using mapping_type = _Mapping; using __mapping_result_type = _MappingResult; @@ -67,11 +66,11 @@ public: // todo(dabayer): Do we want default behaviour like this, or do we want some kind of cuda::auto_sync_mechanism{} tag? _CCCL_TEMPLATE(class _HierarchyLike) _CCCL_REQUIRES(::cuda::std::is_same_v<_Hierarchy, __hierarchy_type_of<_HierarchyLike>>) - _CCCL_DEVICE_API explicit thread_group( - const _Level&, const _Mapping& __mapping, const _HierarchyLike& __hier_like) noexcept + _CCCL_DEVICE_API explicit group( + const _Unit&, const _Level&, const _Mapping& __mapping, const _HierarchyLike& __hier_like) noexcept : __hier_{::cuda::__unpack_hierarchy_if_needed(__hier_like)} , __mapping_{__mapping} - , __mapping_result_{__mapping_.map(thread_level{}, _Level{}, ::cuda::__unpack_hierarchy_if_needed(__hier_like))} + , __mapping_result_{__mapping_.map(_Unit{}, _Level{}, ::cuda::__unpack_hierarchy_if_needed(__hier_like))} , __synchronizer_{__mapping_result_} { ::cuda::experimental::__check_mapping_result(__mapping_result_); @@ -80,14 +79,15 @@ public: _CCCL_TEMPLATE(class _Synchronizer2 = _Synchronizer, class _MappingResult2 = _MappingResult, class _HierarchyLike) _CCCL_REQUIRES(__is_barrier_synchronizer<_Synchronizer2> _CCCL_AND ::cuda::std::is_same_v<_Hierarchy, __hierarchy_type_of<_HierarchyLike>>) - _CCCL_DEVICE_API explicit thread_group( + _CCCL_DEVICE_API explicit group( + const _Unit&, const _Level&, const _Mapping& __mapping, const _HierarchyLike& __hier_like, ::cuda::std::span __barriers) noexcept : __hier_{::cuda::__unpack_hierarchy_if_needed(__hier_like)} , __mapping_{__mapping} - , __mapping_result_{__mapping_.map(thread_level{}, _Level{}, ::cuda::__unpack_hierarchy_if_needed(__hier_like))} + , __mapping_result_{__mapping_.map(_Unit{}, _Level{}, ::cuda::__unpack_hierarchy_if_needed(__hier_like))} , __synchronizer_{__mapping_result_, __barriers} { ::cuda::experimental::__check_mapping_result(__mapping_result_); @@ -130,24 +130,28 @@ public: } }; -_CCCL_TEMPLATE(class _Level, ::cuda::std::size_t _Np, class _HierarchyLike) -_CCCL_REQUIRES(__is_hierarchy_level_v<_Level> _CCCL_AND __is_or_has_hierarchy_member_v<_HierarchyLike>) -_CCCL_HOST_DEVICE thread_group(const _Level&, const group_by<_Np>&, const _HierarchyLike&) - -> thread_group<_Level, - group_by<_Np>, - __hierarchy_type_of<_HierarchyLike>, - __synchronizer_select_t>>; - -_CCCL_TEMPLATE(class _Level, +_CCCL_TEMPLATE(class _Unit, class _Level, ::cuda::std::size_t _Np, class _HierarchyLike) +_CCCL_REQUIRES(__is_hierarchy_level_v<_Unit> _CCCL_AND __is_hierarchy_level_v<_Level> _CCCL_AND + __is_or_has_hierarchy_member_v<_HierarchyLike>) +_CCCL_DEVICE group(const _Unit&, const _Level&, const group_by<_Np>&, const _HierarchyLike&) + -> group<_Unit, + _Level, + group_by<_Np>, + __hierarchy_type_of<_HierarchyLike>, + __synchronizer_select_t<_Unit, _Level, group_by<_Np>>>; + +_CCCL_TEMPLATE(class _Unit, + class _Level, ::cuda::std::size_t _Np, class _HierarchyLike, class _SyncParam, - class _Synchronizer = __barrier_synchronizer>) + class _Synchronizer = __barrier_synchronizer<_Unit, _Level, group_by<_Np>>) _CCCL_REQUIRES( - __is_hierarchy_level_v<_Level> _CCCL_AND __is_or_has_hierarchy_member_v<_HierarchyLike> - _CCCL_AND ::cuda::std::is_constructible_v<::cuda::std::span, _SyncParam>) -_CCCL_HOST_DEVICE thread_group(const _Level&, const group_by<_Np>&, const _HierarchyLike&, _SyncParam&&) - -> thread_group<_Level, group_by<_Np>, __hierarchy_type_of<_HierarchyLike>, _Synchronizer>; + __is_hierarchy_level_v<_Unit> _CCCL_AND __is_hierarchy_level_v<_Level> _CCCL_AND + __is_or_has_hierarchy_member_v<_HierarchyLike> + _CCCL_AND ::cuda::std::is_constructible_v<::cuda::std::span, _SyncParam>) +_CCCL_DEVICE group(const _Unit&, const _Level&, const group_by<_Np>&, const _HierarchyLike&, _SyncParam&&) + -> group<_Unit, _Level, group_by<_Np>, __hierarchy_type_of<_HierarchyLike>, _Synchronizer>; } // namespace cuda::experimental #endif // !_CCCL_DOXYGEN_INVOKED diff --git a/cudax/test/group/group.cu b/cudax/test/group/group.cu index 5bbd71c18b8..8684f7d91ac 100644 --- a/cudax/test/group/group.cu +++ b/cudax/test/group/group.cu @@ -33,7 +33,7 @@ template __device__ void test_common_properties(const Hierarchy&, Group& group) { // Assert that Group satisfies the group concept. - static_assert(cudax::group); + static_assert(cudax::is_group); // Test types static_assert(cuda::std::is_same_v); @@ -413,7 +413,7 @@ __device__ void test_this_group(const Config& config) } template -__device__ void test_queries(const cudax::thread_group, Hierarchy, Sync>& group) +__device__ void test_queries(const cudax::group, Hierarchy, Sync>& group) { // todo(dabayer): These queries end up in `error: expression must have a constant value`, when group is taken by // reference. Can we find a solution that works without copying the group? @@ -428,18 +428,21 @@ __device__ void test_queries(const cudax::thread_group CUDAX_REQUIRE(cuda::gpu_thread.is_part_of(group)); } -template class GroupTempl, class Level, class Config, cuda::std::size_t N> +template __device__ void test_group_by_group(const Config& config) { // Test statically known group size { using Mapping = cudax::group_by; - GroupTempl group{Level{}, Mapping{}, config}; + cudax::group group{Unit{}, Level{}, Mapping{}, config}; static_assert( - cuda::std::is_same_v< - GroupTempl>, - decltype(group)>); + cuda::std::is_same_v>, + decltype(group)>); test_common_properties(config.hierarchy(), group); test_queries(group); @@ -460,10 +463,11 @@ __device__ void test_group_by_group(const Config& config) auto& barriers = reinterpret_cast(barrier_storage); - GroupTempl group{Level{}, Mapping{}, config, barriers}; + cudax::group group{Unit{}, Level{}, Mapping{}, config, barriers}; static_assert( cuda::std::is_same_v< - GroupTempl>, + cudax:: + group>, decltype(group)>); test_common_properties(config.hierarchy(), group); @@ -475,9 +479,9 @@ __device__ void test_group_by_group(const Config& config) // { // using Mapping = cudax::group_by; - // GroupTempl group{Level{}, Mapping{static_cast(N)}, config}; + // cudax::group group{Unit{}, Level{}, Mapping{static_cast(N)}, config}; // static_assert( - // cuda::std::is_same_v>, decltype(group)>); // test_common_properties(config.hierarchy(), group); @@ -485,19 +489,19 @@ __device__ void test_group_by_group(const Config& config) // } } -template class GroupTempl, class Level, class Config> +template __device__ void test_group_by_group(const Config& config) { // powers of 2 - test_group_by_group(config); - test_group_by_group(config); - test_group_by_group(config); - test_group_by_group(config); + test_group_by_group(config); + test_group_by_group(config); + test_group_by_group(config); + test_group_by_group(config); if constexpr (cuda::std::is_same_v) { - test_group_by_group(config); - test_group_by_group(config); + test_group_by_group(config); + test_group_by_group(config); } } @@ -515,13 +519,13 @@ struct TestKernel test_this_group(config); // todo: allow this once hierarchy is queryable for missing levels - test_group_by_group(config); - test_group_by_group(config); + test_group_by_group(config); + test_group_by_group(config); if constexpr (Hierarchy::has_level(cuda::cluster)) { - test_group_by_group(config); + test_group_by_group(config); } - test_group_by_group(config); + test_group_by_group(config); } }; diff --git a/libcudacxx/include/cuda/__hierarchy/hierarchy_level_base.h b/libcudacxx/include/cuda/__hierarchy/hierarchy_level_base.h index 1012fbda97e..67f4a9161cf 100644 --- a/libcudacxx/include/cuda/__hierarchy/hierarchy_level_base.h +++ b/libcudacxx/include/cuda/__hierarchy/hierarchy_level_base.h @@ -174,49 +174,49 @@ struct hierarchy_level_base # if _CCCL_CUDA_COMPILATION() _CCCL_TEMPLATE(class _Group) - _CCCL_REQUIRES(::cuda::experimental::group<_Group>) + _CCCL_REQUIRES(::cuda::experimental::is_group<_Group>) [[nodiscard]] _CCCL_API static constexpr ::cuda::std::size_t static_count(const _Group&) noexcept { return ::cuda::experimental::__static_count_query_group<_Level, _Group>(); } _CCCL_TEMPLATE(class _Group) - _CCCL_REQUIRES(::cuda::experimental::group<_Group>) + _CCCL_REQUIRES(::cuda::experimental::is_group<_Group>) [[nodiscard]] _CCCL_API static constexpr auto count(const _Group& __group) noexcept { return count_as<__default_1d_query_type>(__group); } _CCCL_TEMPLATE(class _Group) - _CCCL_REQUIRES(::cuda::experimental::group<_Group>) + _CCCL_REQUIRES(::cuda::experimental::is_group<_Group>) [[nodiscard]] _CCCL_API static auto rank(const _Group& __group) noexcept { return rank_as<__default_1d_query_type>(__group); } _CCCL_TEMPLATE(class _Tp, class _Group) - _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND ::cuda::experimental::group<_Group>) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND ::cuda::experimental::is_group<_Group>) [[nodiscard]] _CCCL_API static constexpr _Tp count_as(const _Group& __group) noexcept { return ::cuda::experimental::__count_query_group<_Tp, _Level>(__group); } _CCCL_TEMPLATE(class _Tp, class _Group) - _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND ::cuda::experimental::group<_Group>) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND ::cuda::experimental::is_group<_Group>) [[nodiscard]] _CCCL_API static _Tp rank_as(const _Group& __group) noexcept { return ::cuda::experimental::__rank_query_group<_Tp, _Level>(__group); } _CCCL_TEMPLATE(class _Group) - _CCCL_REQUIRES(::cuda::experimental::group<_Group>) + _CCCL_REQUIRES(::cuda::experimental::is_group<_Group>) [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_root_rank(const _Group& __group) noexcept { return _Level::rank(__group) == 0; } _CCCL_TEMPLATE(class _Group) - _CCCL_REQUIRES(::cuda::experimental::group<_Group>) + _CCCL_REQUIRES(::cuda::experimental::is_group<_Group>) [[nodiscard]] _CCCL_API static constexpr bool is_part_of(const _Group& __group) noexcept { // todo: static_assert that the _Level <= _Group::unit_type