NVIDIA · davebayer · Apr 20, 2026 · Apr 18, 2026
@@ -33,7 +33,7 @@
 namespace cuda::experimental
 {
 template <class _Group>
-_CCCL_CONCEPT group = _CCCL_REQUIRES_EXPR((_Group), _Group&& __g, const _Group&& __cg)(
+_CCCL_CONCEPT is_group = _CCCL_REQUIRES_EXPR((_Group), _Group&& __g, const _Group&& __cg)(
   typename(typename _Group::unit_type),
   requires(__is_hierarchy_level_v<typename _Group::unit_type>),
   typename(typename _Group::level_type),

@@ -44,7 +44,7 @@ using __implicit_hierarchy_t =
             hierarchy_level_desc<cluster_level, ::cuda::std::dims<3, unsigned>>,
             hierarchy_level_desc<block_level, ::cuda::std::dims<3, unsigned>>>;
 
-// this groups
+// groups
 
 template <class _Level, class _Hierarchy>
 class __this_group_base;
@@ -60,16 +60,8 @@ class this_cluster;
 template <class _Hierarchy>
 class this_grid;
 
-// other groups
-
-template <class _Level, class _Mapping, class _Hierarchy, class _Synchronizer>
-class thread_group;
-template <class _Level, class _Mapping, class _Hierarchy, class _Synchronizer>
-class warp_group;
-template <class _Level, class _Mapping, class _Hierarchy, class _Synchronizer>
-class block_group;
-template <class _Level, class _Mapping, class _Hierarchy, class _Synchronizer>
-class cluster_group;
+template <class _Unit, class _Level, class _Mapping, class _Hierarchy, class _Synchronizer>
+class group;
 
 // mappings
 

@@ -44,11 +44,10 @@
 
 namespace cuda::experimental
 {
-// todo(dabayer): Make groups be based on another group, not level.
-template <class _Level, class _Mapping, class _Hierarchy, class _Synchronizer>
-class thread_group
+template <class _Unit, class _Level, class _Mapping, class _Hierarchy, class _Synchronizer>
+class group
 {
-  using _MappingResult = __group_mapping_result_t<_Mapping, thread_level, _Level, _Hierarchy>;
+  using _MappingResult = __group_mapping_result_t<_Mapping, _Unit, _Level, _Hierarchy>;
   static_assert(__group_mapping_result<_MappingResult>);
 
   _Hierarchy __hier_;
@@ -57,7 +56,7 @@ class thread_group
   _Synchronizer __synchronizer_;
 
 public:
-  using unit_type             = thread_level;
+  using unit_type             = _Unit;
   using level_type            = _Level;
   using mapping_type          = _Mapping;
   using __mapping_result_type = _MappingResult;
@@ -67,11 +66,11 @@ public:
   // todo(dabayer): Do we want default behaviour like this, or do we want some kind of cuda::auto_sync_mechanism{} tag?
   _CCCL_TEMPLATE(class _HierarchyLike)
   _CCCL_REQUIRES(::cuda::std::is_same_v<_Hierarchy, __hierarchy_type_of<_HierarchyLike>>)
-  _CCCL_DEVICE_API explicit thread_group(
-    const _Level&, const _Mapping& __mapping, const _HierarchyLike& __hier_like) noexcept
+  _CCCL_DEVICE_API explicit group(
+    const _Unit&, const _Level&, const _Mapping& __mapping, const _HierarchyLike& __hier_like) noexcept
       : __hier_{::cuda::__unpack_hierarchy_if_needed(__hier_like)}
       , __mapping_{__mapping}
-      , __mapping_result_{__mapping_.map(thread_level{}, _Level{}, ::cuda::__unpack_hierarchy_if_needed(__hier_like))}
+      , __mapping_result_{__mapping_.map(_Unit{}, _Level{}, ::cuda::__unpack_hierarchy_if_needed(__hier_like))}
       , __synchronizer_{__mapping_result_}
   {
     ::cuda::experimental::__check_mapping_result(__mapping_result_);
@@ -80,14 +79,15 @@ public:
   _CCCL_TEMPLATE(class _Synchronizer2 = _Synchronizer, class _MappingResult2 = _MappingResult, class _HierarchyLike)
   _CCCL_REQUIRES(__is_barrier_synchronizer<_Synchronizer2>
                    _CCCL_AND ::cuda::std::is_same_v<_Hierarchy, __hierarchy_type_of<_HierarchyLike>>)
-  _CCCL_DEVICE_API explicit thread_group(
+  _CCCL_DEVICE_API explicit group(
+    const _Unit&,
     const _Level&,
     const _Mapping& __mapping,
     const _HierarchyLike& __hier_like,
     ::cuda::std::span<typename _Synchronizer2::__barrier_type, _MappingResult::static_group_count()> __barriers) noexcept
       : __hier_{::cuda::__unpack_hierarchy_if_needed(__hier_like)}
       , __mapping_{__mapping}
-      , __mapping_result_{__mapping_.map(thread_level{}, _Level{}, ::cuda::__unpack_hierarchy_if_needed(__hier_like))}
+      , __mapping_result_{__mapping_.map(_Unit{}, _Level{}, ::cuda::__unpack_hierarchy_if_needed(__hier_like))}
       , __synchronizer_{__mapping_result_, __barriers}
   {
     ::cuda::experimental::__check_mapping_result(__mapping_result_);
@@ -130,24 +130,28 @@ public:
   }
 };
 
-_CCCL_TEMPLATE(class _Level, ::cuda::std::size_t _Np, class _HierarchyLike)
-_CCCL_REQUIRES(__is_hierarchy_level_v<_Level> _CCCL_AND __is_or_has_hierarchy_member_v<_HierarchyLike>)
-_CCCL_HOST_DEVICE thread_group(const _Level&, const group_by<_Np>&, const _HierarchyLike&)
-  -> thread_group<_Level,
-                  group_by<_Np>,
-                  __hierarchy_type_of<_HierarchyLike>,
-                  __synchronizer_select_t<thread_level, _Level, group_by<_Np>>>;
-
-_CCCL_TEMPLATE(class _Level,
+_CCCL_TEMPLATE(class _Unit, class _Level, ::cuda::std::size_t _Np, class _HierarchyLike)
+_CCCL_REQUIRES(__is_hierarchy_level_v<_Unit> _CCCL_AND __is_hierarchy_level_v<_Level> _CCCL_AND
+                 __is_or_has_hierarchy_member_v<_HierarchyLike>)
+_CCCL_DEVICE group(const _Unit&, const _Level&, const group_by<_Np>&, const _HierarchyLike&)
+  -> group<_Unit,
+           _Level,
+           group_by<_Np>,
+           __hierarchy_type_of<_HierarchyLike>,
+           __synchronizer_select_t<_Unit, _Level, group_by<_Np>>>;
+
+_CCCL_TEMPLATE(class _Unit,
+               class _Level,
                ::cuda::std::size_t _Np,
                class _HierarchyLike,
                class _SyncParam,
-               class _Synchronizer = __barrier_synchronizer<thread_level, _Level, group_by<_Np>>)
+               class _Synchronizer = __barrier_synchronizer<_Unit, _Level, group_by<_Np>>)
 _CCCL_REQUIRES(
-  __is_hierarchy_level_v<_Level> _CCCL_AND __is_or_has_hierarchy_member_v<_HierarchyLike>
-    _CCCL_AND ::cuda::std::is_constructible_v<::cuda::std::span<typename _Synchronizer::__barrier_type>, _SyncParam>)
-_CCCL_HOST_DEVICE thread_group(const _Level&, const group_by<_Np>&, const _HierarchyLike&, _SyncParam&&)
-  -> thread_group<_Level, group_by<_Np>, __hierarchy_type_of<_HierarchyLike>, _Synchronizer>;
+  __is_hierarchy_level_v<_Unit> _CCCL_AND __is_hierarchy_level_v<_Level> _CCCL_AND
+    __is_or_has_hierarchy_member_v<_HierarchyLike>
+      _CCCL_AND ::cuda::std::is_constructible_v<::cuda::std::span<typename _Synchronizer::__barrier_type>, _SyncParam>)
+_CCCL_DEVICE group(const _Unit&, const _Level&, const group_by<_Np>&, const _HierarchyLike&, _SyncParam&&)
+  -> group<_Unit, _Level, group_by<_Np>, __hierarchy_type_of<_HierarchyLike>, _Synchronizer>;
 } // namespace cuda::experimental
 
 #endif // !_CCCL_DOXYGEN_INVOKED

@@ -33,7 +33,7 @@ template <class Unit, class Level, class Hierarchy, class Group>
 __device__ void test_common_properties(const Hierarchy&, Group& group)
 {
   // Assert that Group satisfies the group concept.
-  static_assert(cudax::group<Group>);
+  static_assert(cudax::is_group<Group>);
 
   // Test types
   static_assert(cuda::std::is_same_v<Unit, typename Group::unit_type>);
@@ -413,7 +413,7 @@ __device__ void test_this_group(const Config& config)
 }
 
 template <class Level, cuda::std::size_t N, class Hierarchy, class Sync>
-__device__ void test_queries(const cudax::thread_group<Level, cudax::group_by<N>, Hierarchy, Sync>& group)
+__device__ void test_queries(const cudax::group<cuda::thread_level, Level, cudax::group_by<N>, Hierarchy, Sync>& group)
 {
   // todo(dabayer): These queries end up in `error: expression must have a constant value`, when group is taken by
   // reference. Can we find a solution that works without copying the group?
@@ -428,18 +428,21 @@ __device__ void test_queries(const cudax::thread_group<Level, cudax::group_by<N>
   CUDAX_REQUIRE(cuda::gpu_thread.is_part_of(group));
 }
 
-template <class Unit, template <class...> class GroupTempl, class Level, class Config, cuda::std::size_t N>
+template <class Unit, class Level, class Config, cuda::std::size_t N>
 __device__ void test_group_by_group(const Config& config)
 {
   // Test statically known group size
   {
     using Mapping = cudax::group_by<N>;
 
-    GroupTempl group{Level{}, Mapping{}, config};
+    cudax::group group{Unit{}, Level{}, Mapping{}, config};
     static_assert(
-      cuda::std::is_same_v<
-        GroupTempl<Level, Mapping, typename Config::hierarchy_type, cudax::__synchronizer_select_t<Unit, Level, Mapping>>,
-        decltype(group)>);
+      cuda::std::is_same_v<cudax::group<Unit,
+                                        Level,
+                                        Mapping,
+                                        typename Config::hierarchy_type,
+                                        cudax::__synchronizer_select_t<Unit, Level, Mapping>>,
+                           decltype(group)>);
 
     test_common_properties<Unit, Level>(config.hierarchy(), group);
     test_queries<Level>(group);
@@ -460,10 +463,11 @@ __device__ void test_group_by_group(const Config& config)
 
     auto& barriers = reinterpret_cast<Barrier(&)[nbarriers]>(barrier_storage);
 
-    GroupTempl group{Level{}, Mapping{}, config, barriers};
+    cudax::group group{Unit{}, Level{}, Mapping{}, config, barriers};
     static_assert(
       cuda::std::is_same_v<
-        GroupTempl<Level, Mapping, typename Config::hierarchy_type, cudax::__barrier_synchronizer<Unit, Level, Mapping>>,
+        cudax::
+          group<Unit, Level, Mapping, typename Config::hierarchy_type, cudax::__barrier_synchronizer<Unit, Level, Mapping>>,
         decltype(group)>);
 
     test_common_properties<Unit, Level>(config.hierarchy(), group);
@@ -475,29 +479,29 @@ __device__ void test_group_by_group(const Config& config)
   // {
   //   using Mapping = cudax::group_by<cuda::std::dynamic_extent>;
 
-  //   GroupTempl group{Level{}, Mapping{static_cast<unsigned>(N)}, config};
+  //   cudax::group group{Unit{}, Level{}, Mapping{static_cast<unsigned>(N)}, config};
   //   static_assert(
-  //     cuda::std::is_same_v<GroupTempl<Level, Mapping, typename Config::hierarchy_type,
+  //     cuda::std::is_same_v<cudax::group<Unit, Level, Mapping, typename Config::hierarchy_type,
   //     cudax::__syncwarp_synchronizer<Unit, Level, Mapping>>, decltype(group)>);
 
   //   test_common_properties<Unit, Level>(config.hierarchy(), group);
   //   test_queries<Level>(group);
   // }
 }
 
-template <class Unit, template <class...> class GroupTempl, class Level, class Config>
+template <class Unit, class Level, class Config>
 __device__ void test_group_by_group(const Config& config)
 {
   // powers of 2
-  test_group_by_group<Unit, GroupTempl, Level, Config, 1>(config);
-  test_group_by_group<Unit, GroupTempl, Level, Config, 4>(config);
-  test_group_by_group<Unit, GroupTempl, Level, Config, 16>(config);
-  test_group_by_group<Unit, GroupTempl, Level, Config, 32>(config);
+  test_group_by_group<Unit, Level, Config, 1>(config);
+  test_group_by_group<Unit, Level, Config, 4>(config);
+  test_group_by_group<Unit, Level, Config, 16>(config);
+  test_group_by_group<Unit, Level, Config, 32>(config);
 
   if constexpr (cuda::std::is_same_v<Level, cuda::block_level>)
   {
-    test_group_by_group<Unit, GroupTempl, Level, Config, 64>(config);
-    test_group_by_group<Unit, GroupTempl, Level, Config, 128>(config);
+    test_group_by_group<Unit, Level, Config, 64>(config);
+    test_group_by_group<Unit, Level, Config, 128>(config);
   }
 }
 
@@ -515,13 +519,13 @@ struct TestKernel
     test_this_group<cuda::grid_level, cudax::this_grid>(config);
 
     // todo: allow this once hierarchy is queryable for missing levels
-    test_group_by_group<cuda::thread_level, cudax::thread_group, cuda::warp_level>(config);
-    test_group_by_group<cuda::thread_level, cudax::thread_group, cuda::block_level>(config);
+    test_group_by_group<cuda::thread_level, cuda::warp_level>(config);
+    test_group_by_group<cuda::thread_level, cuda::block_level>(config);
     if constexpr (Hierarchy::has_level(cuda::cluster))
     {
-      test_group_by_group<cuda::thread_level, cudax::thread_group, cuda::cluster_level>(config);
+      test_group_by_group<cuda::thread_level, cuda::cluster_level>(config);
     }
-    test_group_by_group<cuda::thread_level, cudax::thread_group, cuda::grid_level>(config);
+    test_group_by_group<cuda::thread_level, cuda::grid_level>(config);
   }
 };
 

@@ -174,49 +174,49 @@ struct hierarchy_level_base
 #    if _CCCL_CUDA_COMPILATION()
 
   _CCCL_TEMPLATE(class _Group)
-  _CCCL_REQUIRES(::cuda::experimental::group<_Group>)
+  _CCCL_REQUIRES(::cuda::experimental::is_group<_Group>)
   [[nodiscard]] _CCCL_API static constexpr ::cuda::std::size_t static_count(const _Group&) noexcept
   {
     return ::cuda::experimental::__static_count_query_group<_Level, _Group>();
   }
 
   _CCCL_TEMPLATE(class _Group)
-  _CCCL_REQUIRES(::cuda::experimental::group<_Group>)
+  _CCCL_REQUIRES(::cuda::experimental::is_group<_Group>)
   [[nodiscard]] _CCCL_API static constexpr auto count(const _Group& __group) noexcept
   {
     return count_as<__default_1d_query_type<typename _Group::unit_type>>(__group);
   }
 
   _CCCL_TEMPLATE(class _Group)
-  _CCCL_REQUIRES(::cuda::experimental::group<_Group>)
+  _CCCL_REQUIRES(::cuda::experimental::is_group<_Group>)
   [[nodiscard]] _CCCL_API static auto rank(const _Group& __group) noexcept
   {
     return rank_as<__default_1d_query_type<typename _Group::unit_type>>(__group);
   }
 
   _CCCL_TEMPLATE(class _Tp, class _Group)
-  _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND ::cuda::experimental::group<_Group>)
+  _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND ::cuda::experimental::is_group<_Group>)
   [[nodiscard]] _CCCL_API static constexpr _Tp count_as(const _Group& __group) noexcept
   {
     return ::cuda::experimental::__count_query_group<_Tp, _Level>(__group);
   }
 
   _CCCL_TEMPLATE(class _Tp, class _Group)
-  _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND ::cuda::experimental::group<_Group>)
+  _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND ::cuda::experimental::is_group<_Group>)
   [[nodiscard]] _CCCL_API static _Tp rank_as(const _Group& __group) noexcept
   {
     return ::cuda::experimental::__rank_query_group<_Tp, _Level>(__group);
   }
 
   _CCCL_TEMPLATE(class _Group)
-  _CCCL_REQUIRES(::cuda::experimental::group<_Group>)
+  _CCCL_REQUIRES(::cuda::experimental::is_group<_Group>)
   [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_root_rank(const _Group& __group) noexcept
   {
     return _Level::rank(__group) == 0;
   }
 
   _CCCL_TEMPLATE(class _Group)
-  _CCCL_REQUIRES(::cuda::experimental::group<_Group>)
+  _CCCL_REQUIRES(::cuda::experimental::is_group<_Group>)
   [[nodiscard]] _CCCL_API static constexpr bool is_part_of(const _Group& __group) noexcept
   {
     // todo: static_assert that the _Level <= _Group::unit_type