|
| 1 | +//===----------------------------------------------------------------------===// |
| 2 | +// |
| 3 | +// Part of CUDA Experimental in CUDA C++ Core Libraries, |
| 4 | +// under the Apache License v2.0 with LLVM Exceptions. |
| 5 | +// See https://llvm.org/LICENSE.txt for license information. |
| 6 | +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 7 | +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. |
| 8 | +// |
| 9 | +//===----------------------------------------------------------------------===// |
| 10 | + |
| 11 | +#ifndef __CUDAX___FILL_BYTES_FILL_BYTES_MDSPAN_H |
| 12 | +#define __CUDAX___FILL_BYTES_FILL_BYTES_MDSPAN_H |
| 13 | + |
| 14 | +#include <cuda/std/detail/__config> |
| 15 | + |
| 16 | +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) |
| 17 | +# pragma GCC system_header |
| 18 | +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) |
| 19 | +# pragma clang system_header |
| 20 | +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) |
| 21 | +# pragma system_header |
| 22 | +#endif // no system header |
| 23 | + |
| 24 | +#if !_CCCL_COMPILER(NVRTC) |
| 25 | + |
| 26 | +# include <cuda/__driver/driver_api.h> |
| 27 | +# include <cuda/__mdspan/host_device_mdspan.h> |
| 28 | +# include <cuda/__mdspan/traits.h> |
| 29 | +# include <cuda/__stream/stream_ref.h> |
| 30 | +# include <cuda/std/__cstddef/types.h> |
| 31 | +# include <cuda/std/__exception/exception_macros.h> |
| 32 | +# include <cuda/std/__host_stdlib/stdexcept> |
| 33 | +# include <cuda/std/__mdspan/default_accessor.h> |
| 34 | +# include <cuda/std/__mdspan/mdspan.h> |
| 35 | +# include <cuda/std/__memory/is_sufficiently_aligned.h> |
| 36 | +# include <cuda/std/__type_traits/has_unique_object_representation.h> |
| 37 | +# include <cuda/std/__type_traits/is_const.h> |
| 38 | +# include <cuda/std/__type_traits/is_convertible.h> |
| 39 | +# include <cuda/std/__type_traits/is_trivially_copyable.h> |
| 40 | +# include <cuda/std/__type_traits/remove_cvref.h> |
| 41 | + |
| 42 | +# include <cuda/experimental/__copy_bytes/mdspan_to_raw_tensor.cuh> |
| 43 | +# include <cuda/experimental/__copy_bytes/memcpy_batch_tiles.cuh> |
| 44 | +# include <cuda/experimental/__copy_bytes/simplify_paired.cuh> |
| 45 | +# include <cuda/experimental/__copy_bytes/tensor_query.cuh> |
| 46 | +# include <cuda/experimental/__fill_bytes/fill_bytes_mdspan_utils.cuh> |
| 47 | + |
| 48 | +# include <cuda/std/__cccl/prologue.h> |
| 49 | + |
| 50 | +namespace cuda::experimental |
| 51 | +{ |
| 52 | +template <typename _ByteT> |
| 53 | +inline constexpr bool __can_fill_bytes_value_v = |
| 54 | + ::cuda::std::is_trivially_copyable_v<_ByteT> && ::cuda::std::has_unique_object_representations_v<_ByteT> |
| 55 | + && (sizeof(_ByteT) == 1 || sizeof(_ByteT) == 2 || sizeof(_ByteT) == 4); |
| 56 | + |
| 57 | +// __half, __nv_bfloat16 don't have a unique object representation |
| 58 | +# if _CCCL_HAS_NVFP16() |
| 59 | +template <> |
| 60 | +inline constexpr bool __can_fill_bytes_value_v<::__half> = false; |
| 61 | +# endif // _CCCL_HAS_NVFP16() |
| 62 | + |
| 63 | +# if _CCCL_HAS_NVBF16() |
| 64 | +template <> |
| 65 | +inline constexpr bool __can_fill_bytes_value_v<::__nv_bfloat16> = false; |
| 66 | +# endif // _CCCL_HAS_NVBF16() |
| 67 | + |
| 68 | +template <typename _Tp, typename _ByteT> |
| 69 | +_CCCL_HOST_API void __fill_bytes_tile( |
| 70 | + _Tp* __ptr, const ::cuda::std::size_t __tile_bytes, const _ByteT __byte_value, const ::cuda::stream_ref __stream) |
| 71 | +{ |
| 72 | + _CCCL_ASSERT(__tile_bytes % sizeof(_ByteT) == 0, |
| 73 | + "cudax::fill_bytes: destination byte size must be a multiple of the fill value size"); |
| 74 | + _CCCL_ASSERT(::cuda::std::is_sufficiently_aligned<sizeof(_ByteT)>(static_cast<void*>(__ptr)), |
| 75 | + "cudax::fill_bytes: destination tile must be sufficiently aligned"); |
| 76 | + ::cuda::__driver::__memsetAsync(__ptr, __byte_value, __tile_bytes / sizeof(_ByteT), __stream.get()); |
| 77 | +} |
| 78 | + |
| 79 | +/*********************************************************************************************************************** |
| 80 | + * Public API |
| 81 | + **********************************************************************************************************************/ |
| 82 | + |
| 83 | +//! @rst |
| 84 | +//! .. _cudax-fill-bytes: |
| 85 | +//! |
| 86 | +//! Asynchronous mdspan byte fill |
| 87 | +//! ----------------------------- |
| 88 | +//! |
| 89 | +//! ``fill_bytes`` asynchronously fills the mapped elements of a device ``mdspan`` with a repeated byte pattern on the |
| 90 | +//! given CUDA stream. The pattern is the object representation of a 1-, 2-, or 4-byte value. This is a byte |
| 91 | +//! operation: it does not assign ``__byte_value`` as an object of the destination element type. For strided layouts, |
| 92 | +//! only bytes belonging to mapped destination elements are filled; padding bytes outside the mapping are left |
| 93 | +//! unchanged. |
| 94 | +//! |
| 95 | +//! The operation is enqueued on ``__stream`` and may complete after ``fill_bytes`` returns. Synchronize the stream, or |
| 96 | +//! otherwise order dependent work on the same stream, before observing the filled data. |
| 97 | +//! |
| 98 | +//! - Destination element and fill value types must be trivially copyable. |
| 99 | +//! - The fill value type must have unique object representations and size 1, 2, or 4. |
| 100 | +//! - The destination element type must not be ``const``. |
| 101 | +//! - The destination element size must be a multiple of the fill value size. |
| 102 | +//! - The destination element alignment must be at least the fill value size. |
| 103 | +//! - Layout policies must be one of the predefined ``cuda::std`` layout policies |
| 104 | +//! (``layout_right``, ``layout_left``, ``layout_stride``) or ``cuda::layout_stride_relaxed``. |
| 105 | +//! - Accessor policies must be convertible to ``cuda::std::default_accessor``. |
| 106 | +//! - The destination must not have an interleaved stride order. |
| 107 | +//! - Zero-size mdspans are no-ops and do not require a non-null data handle. |
| 108 | +//! |
| 109 | +//! Integer literals use their usual type. For example, ``0`` is an ``int`` and requests a 4-byte pattern fill; use |
| 110 | +//! ``cuda::std::uint8_t{0}`` or ``cuda::std::byte{0}`` for a byte pattern fill. The implementation is optimized to |
| 111 | +//! maximize the contiguous memory regions to fill. |
| 112 | +//! |
| 113 | +//! .. literalinclude:: ../../../../test/fill_bytes/fill_bytes_mdspan_example.cu |
| 114 | +//! :language: c++ |
| 115 | +//! :dedent: |
| 116 | +//! :start-after: example-begin fill-bytes-mdspan |
| 117 | +//! :end-before: example-end fill-bytes-mdspan |
| 118 | +//! |
| 119 | +//! @endrst |
| 120 | +//! @brief Asynchronously fills a device mdspan with a 1-, 2-, or 4-byte pattern. |
| 121 | +//! |
| 122 | +//! Validates the public preconditions, then dispatches asynchronous memset operations over the mapped destination |
| 123 | +//! elements. |
| 124 | +//! |
| 125 | +//! @param[out] __mdspan Destination device mdspan |
| 126 | +//! @param[in] __byte_value Value pattern to fill into the destination |
| 127 | +//! @param[in] __stream CUDA stream for the asynchronous fill |
| 128 | +//! @throws std::invalid_argument if ``__stream`` is the null stream, or if a non-empty destination has a null data |
| 129 | +//! handle, is insufficiently aligned, or has interleaved stride order. |
| 130 | +template <typename _Tp, typename _Extents, typename _Layout, typename _Accessor, typename _ByteT> |
| 131 | +_CCCL_HOST_API void fill_bytes(::cuda::device_mdspan<_Tp, _Extents, _Layout, _Accessor> __mdspan, |
| 132 | + const _ByteT __byte_value, |
| 133 | + const ::cuda::stream_ref __stream) |
| 134 | +{ |
| 135 | + using __mdspan_t = ::cuda::std::mdspan<_Tp, _Extents, _Layout, _Accessor>; |
| 136 | + using __value_t = ::cuda::std::remove_cvref_t<_ByteT>; |
| 137 | + using __accessor_t = ::cuda::std::default_accessor<_Tp>; |
| 138 | + using __extent_t = typename _Extents::index_type; |
| 139 | + using __stride_t = __mdspan_stride_t<_Layout, typename __mdspan_t::mapping_type>; |
| 140 | + |
| 141 | + static_assert(!::cuda::std::is_const_v<_Tp>, "cudax::fill_bytes: element type must not be const"); |
| 142 | + static_assert(::cuda::std::is_trivially_copyable_v<_Tp>, |
| 143 | + "cudax::fill_bytes: element type must be trivially copyable"); |
| 144 | + static_assert(__can_fill_bytes_value_v<__value_t>, |
| 145 | + "cudax::fill_bytes: fill value type must be trivially copyable with unique object representations and " |
| 146 | + "have size 1, 2, or 4"); |
| 147 | + static_assert(sizeof(_Tp) % sizeof(__value_t) == 0, |
| 148 | + "cudax::fill_bytes: element size must be a multiple of the fill value size"); |
| 149 | + static_assert(alignof(_Tp) >= sizeof(__value_t), |
| 150 | + "cudax::fill_bytes: element alignment must be at least the fill value size"); |
| 151 | + static_assert(::cuda::__is_cuda_mdspan_layout_v<_Layout>, |
| 152 | + "cudax::fill_bytes: LayoutPolicy must be a predefined layout policy"); |
| 153 | + static_assert(::cuda::std::is_convertible_v<_Accessor, __accessor_t>, |
| 154 | + "cudax::fill_bytes: AccessorPolicy must be convertible to cuda::std::default_accessor"); |
| 155 | + |
| 156 | + if (__stream.get() == nullptr) |
| 157 | + { |
| 158 | + _CCCL_THROW(::std::invalid_argument, "cudax::fill_bytes: stream must not be nullptr"); |
| 159 | + } |
| 160 | + |
| 161 | + const auto __tensor_size = __mdspan.size(); |
| 162 | + if (__tensor_size == 0) |
| 163 | + { |
| 164 | + return; |
| 165 | + } |
| 166 | + if (__mdspan.data_handle() == nullptr) |
| 167 | + { |
| 168 | + _CCCL_THROW(::std::invalid_argument, "cudax::fill_bytes: mdspan data handle must not be nullptr"); |
| 169 | + } |
| 170 | + if (!::cuda::std::is_sufficiently_aligned<alignof(_Tp)>(__mdspan.data_handle())) |
| 171 | + { |
| 172 | + _CCCL_THROW(::std::invalid_argument, "cudax::fill_bytes: destination mdspan must be sufficiently aligned"); |
| 173 | + } |
| 174 | + if (::cuda::experimental::__has_interleaved_stride_order(__mdspan)) |
| 175 | + { |
| 176 | + _CCCL_THROW(::std::invalid_argument, |
| 177 | + "cudax::fill_bytes: destination mdspan must not have interleaved stride order"); |
| 178 | + } |
| 179 | + if (__tensor_size == 1) // rank == 0 also falls into this case |
| 180 | + { |
| 181 | + auto* __data_ptr = __mdspan.data_handle(); |
| 182 | + if constexpr (::cuda::__is_layout_stride_relaxed_v<_Layout>) |
| 183 | + { |
| 184 | + __data_ptr += __mdspan.mapping().offset(); |
| 185 | + } |
| 186 | + ::cuda::experimental::__fill_bytes_tile(__data_ptr, sizeof(_Tp), __byte_value, __stream); |
| 187 | + return; |
| 188 | + } |
| 189 | + |
| 190 | + constexpr auto __rank = _Extents::rank(); |
| 191 | + if constexpr (__rank > 0) |
| 192 | + { |
| 193 | + const auto __raw_tensor = ::cuda::experimental::__to_raw_tensor<__extent_t, __stride_t, __rank>(__mdspan); |
| 194 | + auto __simplified = ::cuda::experimental::__sort_by_stride(__raw_tensor); |
| 195 | + ::cuda::experimental::__flip_negative_strides_single(__simplified); |
| 196 | + ::cuda::experimental::__coalesce_single(__simplified); |
| 197 | + |
| 198 | + const bool __stride1 = (__simplified.__strides[0] == 1); |
| 199 | + const auto __tile_size = __stride1 ? __simplified.__extents[0] : __extent_t{1}; |
| 200 | + const auto __final_tensor = (__tile_size > 1) ? __simplified : ::cuda::experimental::__reverse_modes(__simplified); |
| 201 | + const auto __num_tiles = static_cast<::cuda::std::size_t>(__tensor_size / __tile_size); |
| 202 | + const auto __tile_bytes = static_cast<::cuda::std::size_t>(__tile_size) * sizeof(_Tp); |
| 203 | + _CCCL_ASSERT(__tensor_size % __tile_size == 0, "cudax::fill_bytes: tensor size must be divisible by tile size"); |
| 204 | + |
| 205 | + __tile_iterator_linearized<__extent_t, __stride_t, _Tp, __rank> __tiles_iterator(__final_tensor, __tile_size); |
| 206 | + for (::cuda::std::size_t __tile_idx = 0; __tile_idx < __num_tiles; ++__tile_idx) |
| 207 | + { |
| 208 | + auto* const __tile_ptr = __tiles_iterator(static_cast<__extent_t>(__tile_idx)); |
| 209 | + ::cuda::experimental::__fill_bytes_tile(__tile_ptr, __tile_bytes, __byte_value, __stream); |
| 210 | + } |
| 211 | + } |
| 212 | +} |
| 213 | +} // namespace cuda::experimental |
| 214 | + |
| 215 | +# include <cuda/std/__cccl/epilogue.h> |
| 216 | + |
| 217 | +#endif // !_CCCL_COMPILER(NVRTC) |
| 218 | +#endif // __CUDAX___FILL_BYTES_FILL_BYTES_MDSPAN_H |
0 commit comments