Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cudax/include/cuda/experimental/__group/concepts.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
namespace cuda::experimental
{
template <class _Group>
_CCCL_CONCEPT group = _CCCL_REQUIRES_EXPR((_Group), _Group&& __g, const _Group&& __cg)(
_CCCL_CONCEPT is_group = _CCCL_REQUIRES_EXPR((_Group), _Group&& __g, const _Group&& __cg)(
typename(typename _Group::unit_type),
requires(__is_hierarchy_level_v<typename _Group::unit_type>),
typename(typename _Group::level_type),
Expand Down
14 changes: 3 additions & 11 deletions cudax/include/cuda/experimental/__group/fwd.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ using __implicit_hierarchy_t =
hierarchy_level_desc<cluster_level, ::cuda::std::dims<3, unsigned>>,
hierarchy_level_desc<block_level, ::cuda::std::dims<3, unsigned>>>;

// this groups
// groups

template <class _Level, class _Hierarchy>
class __this_group_base;
Expand All @@ -60,16 +60,8 @@ class this_cluster;
template <class _Hierarchy>
class this_grid;

// other groups

template <class _Level, class _Mapping, class _Hierarchy, class _Synchronizer>
class thread_group;
template <class _Level, class _Mapping, class _Hierarchy, class _Synchronizer>
class warp_group;
template <class _Level, class _Mapping, class _Hierarchy, class _Synchronizer>
class block_group;
template <class _Level, class _Mapping, class _Hierarchy, class _Synchronizer>
class cluster_group;
template <class _Unit, class _Level, class _Mapping, class _Hierarchy, class _Synchronizer>
class group;

// mappings

Expand Down
52 changes: 28 additions & 24 deletions cudax/include/cuda/experimental/__group/group.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,10 @@

namespace cuda::experimental
{
// todo(dabayer): Make groups be based on another group, not level.
template <class _Level, class _Mapping, class _Hierarchy, class _Synchronizer>
class thread_group
template <class _Unit, class _Level, class _Mapping, class _Hierarchy, class _Synchronizer>
class group
{
using _MappingResult = __group_mapping_result_t<_Mapping, thread_level, _Level, _Hierarchy>;
using _MappingResult = __group_mapping_result_t<_Mapping, _Unit, _Level, _Hierarchy>;
static_assert(__group_mapping_result<_MappingResult>);

_Hierarchy __hier_;
Expand All @@ -57,7 +56,7 @@ class thread_group
_Synchronizer __synchronizer_;

public:
using unit_type = thread_level;
using unit_type = _Unit;
using level_type = _Level;
using mapping_type = _Mapping;
using __mapping_result_type = _MappingResult;
Expand All @@ -67,11 +66,11 @@ public:
// todo(dabayer): Do we want default behaviour like this, or do we want some kind of cuda::auto_sync_mechanism{} tag?
_CCCL_TEMPLATE(class _HierarchyLike)
_CCCL_REQUIRES(::cuda::std::is_same_v<_Hierarchy, __hierarchy_type_of<_HierarchyLike>>)
_CCCL_DEVICE_API explicit thread_group(
const _Level&, const _Mapping& __mapping, const _HierarchyLike& __hier_like) noexcept
_CCCL_DEVICE_API explicit group(
const _Unit&, const _Level&, const _Mapping& __mapping, const _HierarchyLike& __hier_like) noexcept
: __hier_{::cuda::__unpack_hierarchy_if_needed(__hier_like)}
, __mapping_{__mapping}
, __mapping_result_{__mapping_.map(thread_level{}, _Level{}, ::cuda::__unpack_hierarchy_if_needed(__hier_like))}
, __mapping_result_{__mapping_.map(_Unit{}, _Level{}, ::cuda::__unpack_hierarchy_if_needed(__hier_like))}
, __synchronizer_{__mapping_result_}
{
::cuda::experimental::__check_mapping_result(__mapping_result_);
Expand All @@ -80,14 +79,15 @@ public:
_CCCL_TEMPLATE(class _Synchronizer2 = _Synchronizer, class _MappingResult2 = _MappingResult, class _HierarchyLike)
_CCCL_REQUIRES(__is_barrier_synchronizer<_Synchronizer2>
_CCCL_AND ::cuda::std::is_same_v<_Hierarchy, __hierarchy_type_of<_HierarchyLike>>)
_CCCL_DEVICE_API explicit thread_group(
_CCCL_DEVICE_API explicit group(
const _Unit&,
const _Level&,
const _Mapping& __mapping,
const _HierarchyLike& __hier_like,
::cuda::std::span<typename _Synchronizer2::__barrier_type, _MappingResult::static_group_count()> __barriers) noexcept
: __hier_{::cuda::__unpack_hierarchy_if_needed(__hier_like)}
, __mapping_{__mapping}
, __mapping_result_{__mapping_.map(thread_level{}, _Level{}, ::cuda::__unpack_hierarchy_if_needed(__hier_like))}
, __mapping_result_{__mapping_.map(_Unit{}, _Level{}, ::cuda::__unpack_hierarchy_if_needed(__hier_like))}
, __synchronizer_{__mapping_result_, __barriers}
{
::cuda::experimental::__check_mapping_result(__mapping_result_);
Expand Down Expand Up @@ -130,24 +130,28 @@ public:
}
};

_CCCL_TEMPLATE(class _Level, ::cuda::std::size_t _Np, class _HierarchyLike)
_CCCL_REQUIRES(__is_hierarchy_level_v<_Level> _CCCL_AND __is_or_has_hierarchy_member_v<_HierarchyLike>)
_CCCL_HOST_DEVICE thread_group(const _Level&, const group_by<_Np>&, const _HierarchyLike&)
-> thread_group<_Level,
group_by<_Np>,
__hierarchy_type_of<_HierarchyLike>,
__synchronizer_select_t<thread_level, _Level, group_by<_Np>>>;

_CCCL_TEMPLATE(class _Level,
_CCCL_TEMPLATE(class _Unit, class _Level, ::cuda::std::size_t _Np, class _HierarchyLike)
_CCCL_REQUIRES(__is_hierarchy_level_v<_Unit> _CCCL_AND __is_hierarchy_level_v<_Level> _CCCL_AND
__is_or_has_hierarchy_member_v<_HierarchyLike>)
_CCCL_DEVICE group(const _Unit&, const _Level&, const group_by<_Np>&, const _HierarchyLike&)
-> group<_Unit,
_Level,
group_by<_Np>,
__hierarchy_type_of<_HierarchyLike>,
__synchronizer_select_t<_Unit, _Level, group_by<_Np>>>;

_CCCL_TEMPLATE(class _Unit,
class _Level,
::cuda::std::size_t _Np,
class _HierarchyLike,
class _SyncParam,
class _Synchronizer = __barrier_synchronizer<thread_level, _Level, group_by<_Np>>)
class _Synchronizer = __barrier_synchronizer<_Unit, _Level, group_by<_Np>>)
_CCCL_REQUIRES(
__is_hierarchy_level_v<_Level> _CCCL_AND __is_or_has_hierarchy_member_v<_HierarchyLike>
_CCCL_AND ::cuda::std::is_constructible_v<::cuda::std::span<typename _Synchronizer::__barrier_type>, _SyncParam>)
_CCCL_HOST_DEVICE thread_group(const _Level&, const group_by<_Np>&, const _HierarchyLike&, _SyncParam&&)
-> thread_group<_Level, group_by<_Np>, __hierarchy_type_of<_HierarchyLike>, _Synchronizer>;
__is_hierarchy_level_v<_Unit> _CCCL_AND __is_hierarchy_level_v<_Level> _CCCL_AND
__is_or_has_hierarchy_member_v<_HierarchyLike>
_CCCL_AND ::cuda::std::is_constructible_v<::cuda::std::span<typename _Synchronizer::__barrier_type>, _SyncParam>)
_CCCL_DEVICE group(const _Unit&, const _Level&, const group_by<_Np>&, const _HierarchyLike&, _SyncParam&&)
-> group<_Unit, _Level, group_by<_Np>, __hierarchy_type_of<_HierarchyLike>, _Synchronizer>;
} // namespace cuda::experimental

#endif // !_CCCL_DOXYGEN_INVOKED
Expand Down
48 changes: 26 additions & 22 deletions cudax/test/group/group.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ template <class Unit, class Level, class Hierarchy, class Group>
__device__ void test_common_properties(const Hierarchy&, Group& group)
{
// Assert that Group satisfies the group concept.
static_assert(cudax::group<Group>);
static_assert(cudax::is_group<Group>);

// Test types
static_assert(cuda::std::is_same_v<Unit, typename Group::unit_type>);
Expand Down Expand Up @@ -413,7 +413,7 @@ __device__ void test_this_group(const Config& config)
}

template <class Level, cuda::std::size_t N, class Hierarchy, class Sync>
__device__ void test_queries(const cudax::thread_group<Level, cudax::group_by<N>, Hierarchy, Sync>& group)
__device__ void test_queries(const cudax::group<cuda::thread_level, Level, cudax::group_by<N>, Hierarchy, Sync>& group)
{
// todo(dabayer): These queries end up in `error: expression must have a constant value`, when group is taken by
// reference. Can we find a solution that works without copying the group?
Expand All @@ -428,18 +428,21 @@ __device__ void test_queries(const cudax::thread_group<Level, cudax::group_by<N>
CUDAX_REQUIRE(cuda::gpu_thread.is_part_of(group));
}

template <class Unit, template <class...> class GroupTempl, class Level, class Config, cuda::std::size_t N>
template <class Unit, class Level, class Config, cuda::std::size_t N>
__device__ void test_group_by_group(const Config& config)
{
// Test statically known group size
{
using Mapping = cudax::group_by<N>;

GroupTempl group{Level{}, Mapping{}, config};
cudax::group group{Unit{}, Level{}, Mapping{}, config};
static_assert(
cuda::std::is_same_v<
GroupTempl<Level, Mapping, typename Config::hierarchy_type, cudax::__synchronizer_select_t<Unit, Level, Mapping>>,
decltype(group)>);
cuda::std::is_same_v<cudax::group<Unit,
Level,
Mapping,
typename Config::hierarchy_type,
cudax::__synchronizer_select_t<Unit, Level, Mapping>>,
decltype(group)>);

test_common_properties<Unit, Level>(config.hierarchy(), group);
test_queries<Level>(group);
Expand All @@ -460,10 +463,11 @@ __device__ void test_group_by_group(const Config& config)

auto& barriers = reinterpret_cast<Barrier(&)[nbarriers]>(barrier_storage);

GroupTempl group{Level{}, Mapping{}, config, barriers};
cudax::group group{Unit{}, Level{}, Mapping{}, config, barriers};
static_assert(
cuda::std::is_same_v<
GroupTempl<Level, Mapping, typename Config::hierarchy_type, cudax::__barrier_synchronizer<Unit, Level, Mapping>>,
cudax::
group<Unit, Level, Mapping, typename Config::hierarchy_type, cudax::__barrier_synchronizer<Unit, Level, Mapping>>,
decltype(group)>);

test_common_properties<Unit, Level>(config.hierarchy(), group);
Expand All @@ -475,29 +479,29 @@ __device__ void test_group_by_group(const Config& config)
// {
// using Mapping = cudax::group_by<cuda::std::dynamic_extent>;

// GroupTempl group{Level{}, Mapping{static_cast<unsigned>(N)}, config};
// cudax::group group{Unit{}, Level{}, Mapping{static_cast<unsigned>(N)}, config};
// static_assert(
// cuda::std::is_same_v<GroupTempl<Level, Mapping, typename Config::hierarchy_type,
// cuda::std::is_same_v<cudax::group<Unit, Level, Mapping, typename Config::hierarchy_type,
// cudax::__syncwarp_synchronizer<Unit, Level, Mapping>>, decltype(group)>);

// test_common_properties<Unit, Level>(config.hierarchy(), group);
// test_queries<Level>(group);
// }
}

template <class Unit, template <class...> class GroupTempl, class Level, class Config>
template <class Unit, class Level, class Config>
__device__ void test_group_by_group(const Config& config)
{
// powers of 2
test_group_by_group<Unit, GroupTempl, Level, Config, 1>(config);
test_group_by_group<Unit, GroupTempl, Level, Config, 4>(config);
test_group_by_group<Unit, GroupTempl, Level, Config, 16>(config);
test_group_by_group<Unit, GroupTempl, Level, Config, 32>(config);
test_group_by_group<Unit, Level, Config, 1>(config);
test_group_by_group<Unit, Level, Config, 4>(config);
test_group_by_group<Unit, Level, Config, 16>(config);
test_group_by_group<Unit, Level, Config, 32>(config);

if constexpr (cuda::std::is_same_v<Level, cuda::block_level>)
{
test_group_by_group<Unit, GroupTempl, Level, Config, 64>(config);
test_group_by_group<Unit, GroupTempl, Level, Config, 128>(config);
test_group_by_group<Unit, Level, Config, 64>(config);
test_group_by_group<Unit, Level, Config, 128>(config);
}
}

Expand All @@ -515,13 +519,13 @@ struct TestKernel
test_this_group<cuda::grid_level, cudax::this_grid>(config);

// todo: allow this once hierarchy is queryable for missing levels
test_group_by_group<cuda::thread_level, cudax::thread_group, cuda::warp_level>(config);
test_group_by_group<cuda::thread_level, cudax::thread_group, cuda::block_level>(config);
test_group_by_group<cuda::thread_level, cuda::warp_level>(config);
test_group_by_group<cuda::thread_level, cuda::block_level>(config);
if constexpr (Hierarchy::has_level(cuda::cluster))
{
test_group_by_group<cuda::thread_level, cudax::thread_group, cuda::cluster_level>(config);
test_group_by_group<cuda::thread_level, cuda::cluster_level>(config);
}
test_group_by_group<cuda::thread_level, cudax::thread_group, cuda::grid_level>(config);
test_group_by_group<cuda::thread_level, cuda::grid_level>(config);
}
};

Expand Down
14 changes: 7 additions & 7 deletions libcudacxx/include/cuda/__hierarchy/hierarchy_level_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,49 +174,49 @@ struct hierarchy_level_base
# if _CCCL_CUDA_COMPILATION()

_CCCL_TEMPLATE(class _Group)
_CCCL_REQUIRES(::cuda::experimental::group<_Group>)
_CCCL_REQUIRES(::cuda::experimental::is_group<_Group>)
[[nodiscard]] _CCCL_API static constexpr ::cuda::std::size_t static_count(const _Group&) noexcept
{
return ::cuda::experimental::__static_count_query_group<_Level, _Group>();
}

_CCCL_TEMPLATE(class _Group)
_CCCL_REQUIRES(::cuda::experimental::group<_Group>)
_CCCL_REQUIRES(::cuda::experimental::is_group<_Group>)
[[nodiscard]] _CCCL_API static constexpr auto count(const _Group& __group) noexcept
{
return count_as<__default_1d_query_type<typename _Group::unit_type>>(__group);
}

_CCCL_TEMPLATE(class _Group)
_CCCL_REQUIRES(::cuda::experimental::group<_Group>)
_CCCL_REQUIRES(::cuda::experimental::is_group<_Group>)
[[nodiscard]] _CCCL_API static auto rank(const _Group& __group) noexcept
{
return rank_as<__default_1d_query_type<typename _Group::unit_type>>(__group);
}

_CCCL_TEMPLATE(class _Tp, class _Group)
_CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND ::cuda::experimental::group<_Group>)
_CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND ::cuda::experimental::is_group<_Group>)
[[nodiscard]] _CCCL_API static constexpr _Tp count_as(const _Group& __group) noexcept
{
return ::cuda::experimental::__count_query_group<_Tp, _Level>(__group);
}

_CCCL_TEMPLATE(class _Tp, class _Group)
_CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND ::cuda::experimental::group<_Group>)
_CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND ::cuda::experimental::is_group<_Group>)
[[nodiscard]] _CCCL_API static _Tp rank_as(const _Group& __group) noexcept
{
return ::cuda::experimental::__rank_query_group<_Tp, _Level>(__group);
}

_CCCL_TEMPLATE(class _Group)
_CCCL_REQUIRES(::cuda::experimental::group<_Group>)
_CCCL_REQUIRES(::cuda::experimental::is_group<_Group>)
[[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_root_rank(const _Group& __group) noexcept
{
return _Level::rank(__group) == 0;
}

_CCCL_TEMPLATE(class _Group)
_CCCL_REQUIRES(::cuda::experimental::group<_Group>)
_CCCL_REQUIRES(::cuda::experimental::is_group<_Group>)
[[nodiscard]] _CCCL_API static constexpr bool is_part_of(const _Group& __group) noexcept
{
// todo: static_assert that the _Level <= _Group::unit_type
Expand Down
Loading