Skip to content

Commit ee9f95b

Browse files
authored
cudax::fill_bytes(mdspan) (#9193)
1 parent 75c7b14 commit ee9f95b

8 files changed

Lines changed: 835 additions & 0 deletions

File tree

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of CUDA Experimental in CUDA C++ Core Libraries,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef __CUDAX___FILL_BYTES_FILL_BYTES_MDSPAN_H
12+
#define __CUDAX___FILL_BYTES_FILL_BYTES_MDSPAN_H
13+
14+
#include <cuda/std/detail/__config>
15+
16+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17+
# pragma GCC system_header
18+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19+
# pragma clang system_header
20+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21+
# pragma system_header
22+
#endif // no system header
23+
24+
#if !_CCCL_COMPILER(NVRTC)
25+
26+
# include <cuda/__driver/driver_api.h>
27+
# include <cuda/__mdspan/host_device_mdspan.h>
28+
# include <cuda/__mdspan/traits.h>
29+
# include <cuda/__stream/stream_ref.h>
30+
# include <cuda/std/__cstddef/types.h>
31+
# include <cuda/std/__exception/exception_macros.h>
32+
# include <cuda/std/__host_stdlib/stdexcept>
33+
# include <cuda/std/__mdspan/default_accessor.h>
34+
# include <cuda/std/__mdspan/mdspan.h>
35+
# include <cuda/std/__memory/is_sufficiently_aligned.h>
36+
# include <cuda/std/__type_traits/has_unique_object_representation.h>
37+
# include <cuda/std/__type_traits/is_const.h>
38+
# include <cuda/std/__type_traits/is_convertible.h>
39+
# include <cuda/std/__type_traits/is_trivially_copyable.h>
40+
# include <cuda/std/__type_traits/remove_cvref.h>
41+
42+
# include <cuda/experimental/__copy_bytes/mdspan_to_raw_tensor.cuh>
43+
# include <cuda/experimental/__copy_bytes/memcpy_batch_tiles.cuh>
44+
# include <cuda/experimental/__copy_bytes/simplify_paired.cuh>
45+
# include <cuda/experimental/__copy_bytes/tensor_query.cuh>
46+
# include <cuda/experimental/__fill_bytes/fill_bytes_mdspan_utils.cuh>
47+
48+
# include <cuda/std/__cccl/prologue.h>
49+
50+
namespace cuda::experimental
51+
{
52+
template <typename _ByteT>
53+
inline constexpr bool __can_fill_bytes_value_v =
54+
::cuda::std::is_trivially_copyable_v<_ByteT> && ::cuda::std::has_unique_object_representations_v<_ByteT>
55+
&& (sizeof(_ByteT) == 1 || sizeof(_ByteT) == 2 || sizeof(_ByteT) == 4);
56+
57+
// __half, __nv_bfloat16 don't have a unique object representation
58+
# if _CCCL_HAS_NVFP16()
59+
template <>
60+
inline constexpr bool __can_fill_bytes_value_v<::__half> = false;
61+
# endif // _CCCL_HAS_NVFP16()
62+
63+
# if _CCCL_HAS_NVBF16()
64+
template <>
65+
inline constexpr bool __can_fill_bytes_value_v<::__nv_bfloat16> = false;
66+
# endif // _CCCL_HAS_NVBF16()
67+
68+
template <typename _Tp, typename _ByteT>
69+
_CCCL_HOST_API void __fill_bytes_tile(
70+
_Tp* __ptr, const ::cuda::std::size_t __tile_bytes, const _ByteT __byte_value, const ::cuda::stream_ref __stream)
71+
{
72+
_CCCL_ASSERT(__tile_bytes % sizeof(_ByteT) == 0,
73+
"cudax::fill_bytes: destination byte size must be a multiple of the fill value size");
74+
_CCCL_ASSERT(::cuda::std::is_sufficiently_aligned<sizeof(_ByteT)>(static_cast<void*>(__ptr)),
75+
"cudax::fill_bytes: destination tile must be sufficiently aligned");
76+
::cuda::__driver::__memsetAsync(__ptr, __byte_value, __tile_bytes / sizeof(_ByteT), __stream.get());
77+
}
78+
79+
/***********************************************************************************************************************
80+
* Public API
81+
**********************************************************************************************************************/
82+
83+
//! @rst
84+
//! .. _cudax-fill-bytes:
85+
//!
86+
//! Asynchronous mdspan byte fill
87+
//! -----------------------------
88+
//!
89+
//! ``fill_bytes`` asynchronously fills the mapped elements of a device ``mdspan`` with a repeated byte pattern on the
90+
//! given CUDA stream. The pattern is the object representation of a 1-, 2-, or 4-byte value. This is a byte
91+
//! operation: it does not assign ``__byte_value`` as an object of the destination element type. For strided layouts,
92+
//! only bytes belonging to mapped destination elements are filled; padding bytes outside the mapping are left
93+
//! unchanged.
94+
//!
95+
//! The operation is enqueued on ``__stream`` and may complete after ``fill_bytes`` returns. Synchronize the stream, or
96+
//! otherwise order dependent work on the same stream, before observing the filled data.
97+
//!
98+
//! - Destination element and fill value types must be trivially copyable.
99+
//! - The fill value type must have unique object representations and size 1, 2, or 4.
100+
//! - The destination element type must not be ``const``.
101+
//! - The destination element size must be a multiple of the fill value size.
102+
//! - The destination element alignment must be at least the fill value size.
103+
//! - Layout policies must be one of the predefined ``cuda::std`` layout policies
104+
//! (``layout_right``, ``layout_left``, ``layout_stride``) or ``cuda::layout_stride_relaxed``.
105+
//! - Accessor policies must be convertible to ``cuda::std::default_accessor``.
106+
//! - The destination must not have an interleaved stride order.
107+
//! - Zero-size mdspans are no-ops and do not require a non-null data handle.
108+
//!
109+
//! Integer literals use their usual type. For example, ``0`` is an ``int`` and requests a 4-byte pattern fill; use
110+
//! ``cuda::std::uint8_t{0}`` or ``cuda::std::byte{0}`` for a byte pattern fill. The implementation is optimized to
111+
//! maximize the contiguous memory regions to fill.
112+
//!
113+
//! .. literalinclude:: ../../../../test/fill_bytes/fill_bytes_mdspan_example.cu
114+
//! :language: c++
115+
//! :dedent:
116+
//! :start-after: example-begin fill-bytes-mdspan
117+
//! :end-before: example-end fill-bytes-mdspan
118+
//!
119+
//! @endrst
120+
//! @brief Asynchronously fills a device mdspan with a 1-, 2-, or 4-byte pattern.
121+
//!
122+
//! Validates the public preconditions, then dispatches asynchronous memset operations over the mapped destination
123+
//! elements.
124+
//!
125+
//! @param[out] __mdspan Destination device mdspan
126+
//! @param[in] __byte_value Value pattern to fill into the destination
127+
//! @param[in] __stream CUDA stream for the asynchronous fill
128+
//! @throws std::invalid_argument if ``__stream`` is the null stream, or if a non-empty destination has a null data
129+
//! handle, is insufficiently aligned, or has interleaved stride order.
130+
template <typename _Tp, typename _Extents, typename _Layout, typename _Accessor, typename _ByteT>
131+
_CCCL_HOST_API void fill_bytes(::cuda::device_mdspan<_Tp, _Extents, _Layout, _Accessor> __mdspan,
132+
const _ByteT __byte_value,
133+
const ::cuda::stream_ref __stream)
134+
{
135+
using __mdspan_t = ::cuda::std::mdspan<_Tp, _Extents, _Layout, _Accessor>;
136+
using __value_t = ::cuda::std::remove_cvref_t<_ByteT>;
137+
using __accessor_t = ::cuda::std::default_accessor<_Tp>;
138+
using __extent_t = typename _Extents::index_type;
139+
using __stride_t = __mdspan_stride_t<_Layout, typename __mdspan_t::mapping_type>;
140+
141+
static_assert(!::cuda::std::is_const_v<_Tp>, "cudax::fill_bytes: element type must not be const");
142+
static_assert(::cuda::std::is_trivially_copyable_v<_Tp>,
143+
"cudax::fill_bytes: element type must be trivially copyable");
144+
static_assert(__can_fill_bytes_value_v<__value_t>,
145+
"cudax::fill_bytes: fill value type must be trivially copyable with unique object representations and "
146+
"have size 1, 2, or 4");
147+
static_assert(sizeof(_Tp) % sizeof(__value_t) == 0,
148+
"cudax::fill_bytes: element size must be a multiple of the fill value size");
149+
static_assert(alignof(_Tp) >= sizeof(__value_t),
150+
"cudax::fill_bytes: element alignment must be at least the fill value size");
151+
static_assert(::cuda::__is_cuda_mdspan_layout_v<_Layout>,
152+
"cudax::fill_bytes: LayoutPolicy must be a predefined layout policy");
153+
static_assert(::cuda::std::is_convertible_v<_Accessor, __accessor_t>,
154+
"cudax::fill_bytes: AccessorPolicy must be convertible to cuda::std::default_accessor");
155+
156+
if (__stream.get() == nullptr)
157+
{
158+
_CCCL_THROW(::std::invalid_argument, "cudax::fill_bytes: stream must not be nullptr");
159+
}
160+
161+
const auto __tensor_size = __mdspan.size();
162+
if (__tensor_size == 0)
163+
{
164+
return;
165+
}
166+
if (__mdspan.data_handle() == nullptr)
167+
{
168+
_CCCL_THROW(::std::invalid_argument, "cudax::fill_bytes: mdspan data handle must not be nullptr");
169+
}
170+
if (!::cuda::std::is_sufficiently_aligned<alignof(_Tp)>(__mdspan.data_handle()))
171+
{
172+
_CCCL_THROW(::std::invalid_argument, "cudax::fill_bytes: destination mdspan must be sufficiently aligned");
173+
}
174+
if (::cuda::experimental::__has_interleaved_stride_order(__mdspan))
175+
{
176+
_CCCL_THROW(::std::invalid_argument,
177+
"cudax::fill_bytes: destination mdspan must not have interleaved stride order");
178+
}
179+
if (__tensor_size == 1) // rank == 0 also falls into this case
180+
{
181+
auto* __data_ptr = __mdspan.data_handle();
182+
if constexpr (::cuda::__is_layout_stride_relaxed_v<_Layout>)
183+
{
184+
__data_ptr += __mdspan.mapping().offset();
185+
}
186+
::cuda::experimental::__fill_bytes_tile(__data_ptr, sizeof(_Tp), __byte_value, __stream);
187+
return;
188+
}
189+
190+
constexpr auto __rank = _Extents::rank();
191+
if constexpr (__rank > 0)
192+
{
193+
const auto __raw_tensor = ::cuda::experimental::__to_raw_tensor<__extent_t, __stride_t, __rank>(__mdspan);
194+
auto __simplified = ::cuda::experimental::__sort_by_stride(__raw_tensor);
195+
::cuda::experimental::__flip_negative_strides_single(__simplified);
196+
::cuda::experimental::__coalesce_single(__simplified);
197+
198+
const bool __stride1 = (__simplified.__strides[0] == 1);
199+
const auto __tile_size = __stride1 ? __simplified.__extents[0] : __extent_t{1};
200+
const auto __final_tensor = (__tile_size > 1) ? __simplified : ::cuda::experimental::__reverse_modes(__simplified);
201+
const auto __num_tiles = static_cast<::cuda::std::size_t>(__tensor_size / __tile_size);
202+
const auto __tile_bytes = static_cast<::cuda::std::size_t>(__tile_size) * sizeof(_Tp);
203+
_CCCL_ASSERT(__tensor_size % __tile_size == 0, "cudax::fill_bytes: tensor size must be divisible by tile size");
204+
205+
__tile_iterator_linearized<__extent_t, __stride_t, _Tp, __rank> __tiles_iterator(__final_tensor, __tile_size);
206+
for (::cuda::std::size_t __tile_idx = 0; __tile_idx < __num_tiles; ++__tile_idx)
207+
{
208+
auto* const __tile_ptr = __tiles_iterator(static_cast<__extent_t>(__tile_idx));
209+
::cuda::experimental::__fill_bytes_tile(__tile_ptr, __tile_bytes, __byte_value, __stream);
210+
}
211+
}
212+
}
213+
} // namespace cuda::experimental
214+
215+
# include <cuda/std/__cccl/epilogue.h>
216+
217+
#endif // !_CCCL_COMPILER(NVRTC)
218+
#endif // __CUDAX___FILL_BYTES_FILL_BYTES_MDSPAN_H
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of CUDA Experimental in CUDA C++ Core Libraries,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef __CUDAX___FILL_BYTES_FILL_BYTES_MDSPAN_UTILS_H
12+
#define __CUDAX___FILL_BYTES_FILL_BYTES_MDSPAN_UTILS_H
13+
14+
#include <cuda/std/detail/__config>
15+
16+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17+
# pragma GCC system_header
18+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19+
# pragma clang system_header
20+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21+
# pragma system_header
22+
#endif // no system header
23+
24+
#if !_CCCL_COMPILER(NVRTC)
25+
26+
# include <cuda/std/__cstddef/types.h>
27+
# include <cuda/std/__type_traits/is_signed.h>
28+
29+
# include <cuda/experimental/__copy_bytes/types.cuh>
30+
31+
# include <cuda/std/__cccl/prologue.h>
32+
33+
namespace cuda::experimental
34+
{
35+
//! @brief Flips negative-stride modes to equivalent positive-stride modes.
36+
//!
37+
//! @param[in,out] __tensor Raw tensor to flip negative strides for
38+
template <typename _ExtentT, typename _StrideT, typename _Tp, ::cuda::std::size_t _MaxRank>
39+
_CCCL_HOST_API void
40+
__flip_negative_strides_single([[maybe_unused]] __raw_tensor<_ExtentT, _StrideT, _Tp, _MaxRank>& __tensor) noexcept
41+
{
42+
if constexpr (::cuda::std::is_signed_v<_StrideT>)
43+
{
44+
using __raw_tensor_t = __raw_tensor<_ExtentT, _StrideT, _Tp, _MaxRank>;
45+
using __rank_t = typename __raw_tensor_t::__rank_t;
46+
for (__rank_t __i = 0; __i < __tensor.__rank; ++__i)
47+
{
48+
if (__tensor.__strides[__i] < 0)
49+
{
50+
const auto __extent = static_cast<_StrideT>(__tensor.__extents[__i] - 1);
51+
__tensor.__data += __extent * __tensor.__strides[__i];
52+
_CCCL_ASSERT(__tensor.__strides[__i] != ::cuda::std::numeric_limits<_StrideT>::min(),
53+
"cudax::flip_negative_strides_single: stride is at min value");
54+
__tensor.__strides[__i] = -__tensor.__strides[__i];
55+
}
56+
}
57+
}
58+
}
59+
60+
//! @brief Merges adjacent modes that are contiguous in the destination tensor.
61+
//!
62+
//! @param[in,out] __tensor Raw tensor to coalesce modes for
63+
template <typename _ExtentT, typename _StrideT, typename _Tp, ::cuda::std::size_t _MaxRank>
64+
_CCCL_HOST_API void __coalesce_single(__raw_tensor<_ExtentT, _StrideT, _Tp, _MaxRank>& __tensor) noexcept
65+
{
66+
if (__tensor.__rank <= 1)
67+
{
68+
return;
69+
}
70+
using __raw_tensor_t = __raw_tensor<_ExtentT, _StrideT, _Tp, _MaxRank>;
71+
using __rank_t = typename __raw_tensor_t::__rank_t;
72+
__rank_t __out_rank = 1;
73+
for (__rank_t __i = 1; __i < __tensor.__rank; ++__i)
74+
{
75+
const auto __prev_extent = static_cast<_StrideT>(__tensor.__extents[__out_rank - 1]);
76+
if (__prev_extent * __tensor.__strides[__out_rank - 1] == __tensor.__strides[__i])
77+
{
78+
__tensor.__extents[__out_rank - 1] *= __tensor.__extents[__i];
79+
continue;
80+
}
81+
__tensor.__extents[__out_rank] = __tensor.__extents[__i];
82+
__tensor.__strides[__out_rank] = __tensor.__strides[__i];
83+
++__out_rank;
84+
}
85+
for (__rank_t __i = __out_rank; __i < _MaxRank; ++__i)
86+
{
87+
__tensor.__extents[__i] = _ExtentT{1};
88+
}
89+
__tensor.__rank = __out_rank;
90+
}
91+
} // namespace cuda::experimental
92+
93+
# include <cuda/std/__cccl/epilogue.h>
94+
95+
#endif // !_CCCL_COMPILER(NVRTC)
96+
#endif // __CUDAX___FILL_BYTES_FILL_BYTES_MDSPAN_UTILS_H
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of CUDA Experimental in CUDA C++ Core Libraries,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef __CUDAX_FILL_BYTES_CUH
12+
#define __CUDAX_FILL_BYTES_CUH
13+
14+
#include <cuda/std/detail/__config>
15+
16+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17+
# pragma GCC system_header
18+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19+
# pragma clang system_header
20+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21+
# pragma system_header
22+
#endif // no system header
23+
24+
#include <cuda/experimental/__fill_bytes/fill_bytes_mdspan.cuh>
25+
26+
#endif // __CUDAX_FILL_BYTES_CUH

cudax/test/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,11 @@ cudax_add_catch2_test(test_target copy_bytes
9090
copy_bytes/mdspan_d2h_h2d_relaxed.cu
9191
)
9292

93+
cudax_add_catch2_test(test_target fill_bytes
94+
fill_bytes/fill_bytes_mdspan.cu
95+
fill_bytes/fill_bytes_mdspan_example.cu
96+
)
97+
9398
cudax_add_catch2_test(test_target copy
9499
copy/copy.cu
95100
copy/copy_edge_cases.cu

0 commit comments

Comments
 (0)