diff --git a/.clang-tidy b/.clang-tidy index 1c2737ab8fc..24f7205393d 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -36,7 +36,6 @@ Checks: - '-modernize-type-traits' - '-modernize-use-nullptr' - '-modernize-return-braced-init-list' - - '-modernize-use-auto' - '-modernize-use-equals-delete' - '-modernize-pass-by-value' - '-modernize-loop-convert' diff --git a/c/experimental/stf/test/test_cuda_kernel.cu b/c/experimental/stf/test/test_cuda_kernel.cu index 446792886a0..52fbb1be988 100644 --- a/c/experimental/stf/test/test_cuda_kernel.cu +++ b/c/experimental/stf/test/test_cuda_kernel.cu @@ -68,8 +68,8 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") stf_cuda_kernel_add_dep(k, lX, STF_READ); stf_cuda_kernel_add_dep(k, lY, STF_RW); stf_cuda_kernel_start(k); - double* dX = (double*) stf_cuda_kernel_get_arg(k, 0); - double* dY = (double*) stf_cuda_kernel_get_arg(k, 1); + auto* dX = (double*) stf_cuda_kernel_get_arg(k, 0); + auto* dY = (double*) stf_cuda_kernel_get_arg(k, 1); const void* args[4] = {&N, &alpha, &dX, &dY}; cudaError_t err = stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args); REQUIRE(err == cudaSuccess); diff --git a/c/experimental/stf/test/test_host_launch.cu b/c/experimental/stf/test/test_host_launch.cu index 1782fef304c..a848bb31e8b 100644 --- a/c/experimental/stf/test/test_host_launch.cu +++ b/c/experimental/stf/test/test_host_launch.cu @@ -84,7 +84,7 @@ C2H_TEST("host_launch with stream context", "[host_launch]") stf_task_set_symbol(t, "fill"); stf_task_add_dep(t, lData, STF_WRITE); stf_task_start(t); - double* dData = (double*) stf_task_get(t, 0); + auto* dData = (double*) stf_task_get(t, 0); fill_kernel<<<2, 128, 0, (cudaStream_t) stf_task_get_custream(t)>>>((int) N, dData, 42.0); stf_task_end(t); stf_task_destroy(t); @@ -134,8 +134,8 @@ C2H_TEST("host_launch with graph context", "[host_launch]") stf_task_add_dep(t, lData, STF_WRITE); stf_task_enable_capture(t); stf_task_start(t); - double* dData = (double*) stf_task_get(t, 0); - cudaStream_t stream = (cudaStream_t) stf_task_get_custream(t); + auto* dData = (double*) stf_task_get(t, 0); + auto stream = (cudaStream_t) stf_task_get_custream(t); fill_kernel<<<2, 128, 0, stream>>>((int) N, dData, 42.0); stf_task_end(t); stf_task_destroy(t); diff --git a/c/experimental/stf/test/test_logical_data_with_place.cu b/c/experimental/stf/test/test_logical_data_with_place.cu index 517d5937ef1..b6e17b0c71d 100644 --- a/c/experimental/stf/test/test_logical_data_with_place.cu +++ b/c/experimental/stf/test/test_logical_data_with_place.cu @@ -75,7 +75,7 @@ C2H_TEST("stf_logical_data_with_place - host place (pinned memory)", "[logical_d cudaError_t err = cudaMallocHost(&A_raw, N * sizeof(float)); REQUIRE(err == cudaSuccess); std::unique_ptr A_owner(A_raw, cudaFreeHost); - float* A = static_cast(A_owner.get()); + auto* A = static_cast(A_owner.get()); for (size_t i = 0; i < N; ++i) { A[i] = static_cast(i); @@ -114,7 +114,7 @@ C2H_TEST("stf_logical_data_with_place - device place (data on current device)", cudaError_t err = cudaMalloc(&d_raw, N * sizeof(float)); REQUIRE(err == cudaSuccess); std::unique_ptr d_data_owner(d_raw, cudaFree); - float* d_data = static_cast(d_data_owner.get()); + auto* d_data = static_cast(d_data_owner.get()); std::vector h_init(N); for (size_t i = 0; i < N; ++i) @@ -135,7 +135,7 @@ C2H_TEST("stf_logical_data_with_place - device place (data on current device)", stf_cuda_kernel_set_symbol(k, "scale_inplace"); stf_cuda_kernel_add_dep(k, lD, STF_RW); stf_cuda_kernel_start(k); - float* arg_ptr = 
static_cast(stf_cuda_kernel_get_arg(k, 0)); + auto* arg_ptr = static_cast(stf_cuda_kernel_get_arg(k, 0)); REQUIRE(arg_ptr == d_data); int n = static_cast(N); const void* args[3] = {&n, &arg_ptr, &factor}; diff --git a/c/experimental/stf/test/test_places.cpp b/c/experimental/stf/test/test_places.cpp index 62386212d6e..8c413429647 100644 --- a/c/experimental/stf/test/test_places.cpp +++ b/c/experimental/stf/test/test_places.cpp @@ -28,7 +28,7 @@ static stf_pos4 blocked_mapper_1d(stf_pos4 data_coords, stf_dim4 data_dims, stf_ { part_size = 1; } - int64_t c = static_cast(data_coords.x); + auto c = static_cast(data_coords.x); int64_t place_x = c / static_cast(part_size); if (place_x >= static_cast(nplaces)) { diff --git a/c2h/include/c2h/bfloat16.cuh b/c2h/include/c2h/bfloat16.cuh index 4a83a2146e6..27418d3329a 100644 --- a/c2h/include/c2h/bfloat16.cuh +++ b/c2h/include/c2h/bfloat16.cuh @@ -106,9 +106,9 @@ struct bfloat16_t /// Cast to float __host__ __device__ __forceinline__ operator float() const { - float f = 0; - uint32_t* p = reinterpret_cast(&f); - *p = uint32_t(__x) << 16; + float f = 0; + auto* p = reinterpret_cast(&f); + *p = uint32_t(__x) << 16; return f; } diff --git a/cub/benchmarks/bench/histogram/even.cu b/cub/benchmarks/bench/histogram/even.cu index 96498ad8dd2..915180e0e0b 100644 --- a/cub/benchmarks/bench/histogram/even.cu +++ b/cub/benchmarks/bench/histogram/even.cu @@ -58,7 +58,7 @@ static void even(nvbench::state& state, nvbench::type_list(num_bins, elements); + const auto upper_level = get_upper_level(num_bins, elements); thrust::device_vector input = generate(elements, entropy, lower_level, upper_level); thrust::device_vector hist(num_bins); @@ -70,7 +70,7 @@ static void even(nvbench::state& state, nvbench::type_list is_byte_sample; - OffsetT num_row_pixels = static_cast(elements); + auto num_row_pixels = static_cast(elements); OffsetT num_rows = 1; OffsetT row_stride_samples = num_row_pixels; diff --git a/cub/benchmarks/bench/histogram/multi/even.cu b/cub/benchmarks/bench/histogram/multi/even.cu index 8ff340388c2..1960051d528 100644 --- a/cub/benchmarks/bench/histogram/multi/even.cu +++ b/cub/benchmarks/bench/histogram/multi/even.cu @@ -60,7 +60,7 @@ static void even(nvbench::state& state, nvbench::type_list(num_bins, elements); + const auto upper_level_r = get_upper_level(num_bins, elements); const SampleT lower_level_g = lower_level_r; const SampleT upper_level_g = upper_level_r; const SampleT lower_level_b = lower_level_g; @@ -80,7 +80,7 @@ static void even(nvbench::state& state, nvbench::type_list is_byte_sample; - OffsetT num_row_pixels = static_cast(elements); + auto num_row_pixels = static_cast(elements); OffsetT num_rows = 1; OffsetT row_stride_samples = num_row_pixels; diff --git a/cub/benchmarks/bench/histogram/multi/range.cu b/cub/benchmarks/bench/histogram/multi/range.cu index 9cfb2f07994..66501c74dc9 100644 --- a/cub/benchmarks/bench/histogram/multi/range.cu +++ b/cub/benchmarks/bench/histogram/multi/range.cu @@ -52,7 +52,7 @@ static void range(nvbench::state& state, nvbench::type_list(num_bins, elements); + const auto upper_level = get_upper_level(num_bins, elements); SampleT step = (upper_level - lower_level) / num_bins; thrust::device_vector levels_r(num_bins + 1); @@ -80,7 +80,7 @@ static void range(nvbench::state& state, nvbench::type_list is_byte_sample; - OffsetT num_row_pixels = static_cast(elements); + auto num_row_pixels = static_cast(elements); OffsetT num_rows = 1; OffsetT row_stride_samples = num_row_pixels; diff --git 
a/cub/benchmarks/bench/histogram/range.cu b/cub/benchmarks/bench/histogram/range.cu index 5e83868fc28..33f2e25b87c 100644 --- a/cub/benchmarks/bench/histogram/range.cu +++ b/cub/benchmarks/bench/histogram/range.cu @@ -50,7 +50,7 @@ static void range(nvbench::state& state, nvbench::type_list(num_bins) + 1; const SampleT lower_level = 0; - const SampleT upper_level = get_upper_level(num_bins, elements); + const auto upper_level = get_upper_level(num_bins, elements); SampleT step = (upper_level - lower_level) / num_bins; thrust::device_vector levels(num_bins + 1); @@ -69,7 +69,7 @@ static void range(nvbench::state& state, nvbench::type_list is_byte_sample; - OffsetT num_row_pixels = static_cast(elements); + auto num_row_pixels = static_cast(elements); OffsetT num_rows = 1; OffsetT row_stride_samples = num_row_pixels; diff --git a/cub/benchmarks/bench/reduce/by_key.cu b/cub/benchmarks/bench/reduce/by_key.cu index c91feb9904e..e491fe916d2 100644 --- a/cub/benchmarks/bench/reduce/by_key.cu +++ b/cub/benchmarks/bench/reduce/by_key.cu @@ -57,7 +57,7 @@ static void reduce(nvbench::state& state, nvbench::type_list(elements); + const auto num_items = static_cast(elements); auto dispatch_on_stream = [&](cudaStream_t stream) { return cub::detail::reduce_by_key::dispatch( diff --git a/cub/benchmarks/bench/run_length_encode/encode.cu b/cub/benchmarks/bench/run_length_encode/encode.cu index f5ce392b536..25944fedc7e 100644 --- a/cub/benchmarks/bench/run_length_encode/encode.cu +++ b/cub/benchmarks/bench/run_length_encode/encode.cu @@ -66,7 +66,7 @@ static void rle(nvbench::state& state, nvbench::type_list(elements); + const auto num_items = static_cast(elements); auto dispatch_on_stream = [&](cudaStream_t stream) { return cub::detail::reduce_by_key::dispatch_streaming( diff --git a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu index fc902393c9d..6c5c08c323b 100644 --- a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu +++ b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu @@ -64,7 +64,7 @@ static void rle(nvbench::state& state, nvbench::type_list(elements); + const auto num_items = static_cast(elements); auto dispatch_on_stream = [&](cudaStream_t stream) { cub::detail::rle::dispatch( diff --git a/cub/cub/agent/agent_batch_memcpy.cuh b/cub/cub/agent/agent_batch_memcpy.cuh index 5ac2cf3bad7..5bf8999f72c 100644 --- a/cub/cub/agent/agent_batch_memcpy.cuh +++ b/cub/cub/agent/agent_batch_memcpy.cuh @@ -98,7 +98,7 @@ template _CCCL_FORCEINLINE _CCCL_DEVICE void LoadVector(const char* ptr, VectorT& data_out) { const uint32_t offset = reinterpret_cast(ptr) % 4U; - const uint32_t* aligned_ptr = reinterpret_cast(ptr - offset); + auto* aligned_ptr = reinterpret_cast(ptr - offset); constexpr uint32_t bits_per_byte = 8U; const uint32_t bit_shift = offset * bits_per_byte; @@ -175,7 +175,7 @@ GetAlignedPtrs(const void* in_begin, void* out_begin, ByteOffsetT num_bytes) uint32_t out_start_aligned = ::cuda::round_up(in_offset_req + alignment_offset, out_datatype_size); // Compute the beginning of the aligned ranges (output and input pointers) - VectorT* out_aligned_begin = reinterpret_cast(out_chars_aligned + out_start_aligned); + auto* out_aligned_begin = reinterpret_cast(out_chars_aligned + out_start_aligned); const char* in_aligned_begin = in_ptr + (reinterpret_cast(out_aligned_begin) - out_ptr); // If the aligned range is not aligned for the input pointer, we load up to (in_datatype_size-1) @@ -197,7 +197,7 @@ 
GetAlignedPtrs(const void* in_begin, void* out_begin, ByteOffsetT num_bytes) out_end_aligned = (num_bytes - in_end_padding_req + alignment_offset) / out_datatype_size * out_datatype_size; } - VectorT* out_aligned_end = reinterpret_cast(out_chars_aligned + out_end_aligned); + auto* out_aligned_end = reinterpret_cast(out_chars_aligned + out_end_aligned); const char* in_aligned_end = in_ptr + (reinterpret_cast(out_aligned_end) - out_ptr); return {out_aligned_begin, out_aligned_end, in_aligned_begin, in_aligned_end}; diff --git a/cub/cub/agent/agent_reduce.cuh b/cub/cub/agent/agent_reduce.cuh index 16963e15262..5b99bccfc27 100644 --- a/cub/cub/agent/agent_reduce.cuh +++ b/cub/cub/agent/agent_reduce.cuh @@ -311,7 +311,7 @@ struct AgentReduceImpl // Load items as vector items InputT input_items[ITEMS_PER_THREAD]; - VectorT* vec_items = reinterpret_cast(input_items); + auto* vec_items = reinterpret_cast(input_items); // Alias items as an array of VectorT and load it in striped fashion static constexpr int words = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH; diff --git a/cub/cub/agent/agent_topk.cuh b/cub/cub/agent/agent_topk.cuh index 21422d4c10a..34a9d823dc3 100644 --- a/cub/cub/agent/agent_topk.cuh +++ b/cub/cub/agent/agent_topk.cuh @@ -613,8 +613,8 @@ struct AgentTopK counter->k = k - prev; // The number of candidates in the next pass - counter->len = cur - prev; - const unsigned int bucket = static_cast(bin_idx); + counter->len = cur - prev; + const auto bucket = static_cast(bin_idx); // Update the "splitter" key by adding the radix digit of the k-th item bin of this pass set_kth_key_bits(counter->kth_key_bits, pass, bucket); } diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh index c29bfe2d3cf..f361e23b136 100644 --- a/cub/cub/agent/single_pass_scan_operators.cuh +++ b/cub/cub/agent/single_pass_scan_operators.cuh @@ -741,8 +741,8 @@ struct ScanTileState { int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; - TxnWord val = TxnWord(); - TileDescriptor* descriptor = reinterpret_cast(&val); + TxnWord val = TxnWord(); + auto* descriptor = reinterpret_cast(&val); if (tile_idx < num_tiles) { @@ -1150,9 +1150,9 @@ struct ReduceByKeyScanTileState */ _CCCL_DEVICE _CCCL_FORCEINLINE void InitializeStatus(int num_tiles) { - int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; - TxnWord val = TxnWord(); - TileDescriptor* descriptor = reinterpret_cast(&val); + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + TxnWord val = TxnWord(); + auto* descriptor = reinterpret_cast(&val); if (tile_idx < num_tiles) { diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh index dee105ed3b1..4b8acd7a538 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh @@ -43,7 +43,7 @@ MergePath(KeyIt1 keys1, KeyIt2 keys2, OffsetT keys1_count, OffsetT keys2_count, while (keys1_begin < keys1_end) { - const OffsetT mid = cub::MidPoint(keys1_begin, keys1_end); + const auto mid = cub::MidPoint(keys1_begin, keys1_end); // pull copies of the keys before calling binary_pred so proxy references are unwrapped const detail::it_value_t key1 = keys1[mid]; const detail::it_value_t key2 = keys2[diag - 1 - mid]; diff --git a/cub/cub/block/block_radix_sort.cuh b/cub/cub/block/block_radix_sort.cuh index 23a5e25f6fc..b8d0c6e3bd7 100644 --- a/cub/cub/block/block_radix_sort.cuh +++ b/cub/cub/block/block_radix_sort.cuh @@ -387,7 +387,7 @@ private: ::cuda::std::bool_constant is_keys_only, DecomposerT decomposer = {}) { - 
bit_ordered_type(&unsigned_keys)[ItemsPerThread] = reinterpret_cast(keys); + auto(&unsigned_keys)[ItemsPerThread] = reinterpret_cast(keys); _CCCL_PRAGMA_UNROLL_FULL() for (int KEY = 0; KEY < ItemsPerThread; KEY++) @@ -466,7 +466,7 @@ public: ::cuda::std::bool_constant is_keys_only, DecomposerT decomposer = {}) { - bit_ordered_type(&unsigned_keys)[ItemsPerThread] = reinterpret_cast(keys); + auto(&unsigned_keys)[ItemsPerThread] = reinterpret_cast(keys); _CCCL_PRAGMA_UNROLL_FULL() for (int KEY = 0; KEY < ItemsPerThread; KEY++) diff --git a/cub/cub/block/block_run_length_decode.cuh b/cub/cub/block/block_run_length_decode.cuh index 1ce6cbcb73a..7cca1c1a359 100644 --- a/cub/cub/block/block_run_length_decode.cuh +++ b/cub/cub/block/block_run_length_decode.cuh @@ -277,8 +277,8 @@ private: _CCCL_PRAGMA_UNROLL_FULL() for (int i = 0; i <= Log2::VALUE; i++) { - OffsetT mid = cub::MidPoint(lower_bound, upper_bound); - mid = (::cuda::std::min) (mid, num_items - 1); + auto mid = cub::MidPoint(lower_bound, upper_bound); + mid = (::cuda::std::min) (mid, num_items - 1); if (val < input[mid]) { diff --git a/cub/cub/block/block_store.cuh b/cub/cub/block/block_store.cuh index d55367c4a6c..8527ca7c243 100644 --- a/cub/cub/block/block_store.cuh +++ b/cub/cub/block/block_store.cuh @@ -174,7 +174,7 @@ StoreDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ItemsPerTh if (reinterpret_cast(block_ptr) % (alignof(Vector)) == 0) { // Alias global pointer - Vector* block_ptr_vectors = reinterpret_cast(const_cast(block_ptr)); + auto* block_ptr_vectors = reinterpret_cast(const_cast(block_ptr)); // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling) Vector raw_vector[VECTORS_PER_THREAD]; diff --git a/cub/cub/block/specializations/block_reduce_raking.cuh b/cub/cub/block/specializations/block_reduce_raking.cuh index 2b740da8f97..688dabcf46d 100644 --- a/cub/cub/block/specializations/block_reduce_raking.cuh +++ b/cub/cub/block/specializations/block_reduce_raking.cuh @@ -195,7 +195,7 @@ struct BlockReduceRaking // sync before re-using shmem (warp_storage/raking_grid are aliased) static_assert(RAKING_THREADS <= warp_threads, "RAKING_THREADS must be <= warp size."); - unsigned int mask = static_cast((1ull << RAKING_THREADS) - 1); + auto mask = static_cast((1ull << RAKING_THREADS) - 1); __syncwarp(mask); partial = WarpReduce(temp_storage.warp_storage) diff --git a/cub/cub/block/specializations/block_topk_air.cuh b/cub/cub/block/specializations/block_topk_air.cuh index 43c4d2a59d6..2fd3b59278c 100644 --- a/cub/cub/block/specializations/block_topk_air.cuh +++ b/cub/cub/block/specializations/block_topk_air.cuh @@ -242,10 +242,10 @@ private: for (int pass = 0; pass < num_passes; ++pass) { // Bit-range & mask of the current pass - const int pass_end_bit = end_bit - pass * RadixBits; - const int pass_begin_bit = (::cuda::std::max) (pass_end_bit - RadixBits, begin_bit); - const int pass_bits = pass_end_bit - pass_begin_bit; - const bit_ordered_type pass_mask = ::cuda::bitmask(pass_begin_bit, pass_bits); + const int pass_end_bit = end_bit - pass * RadixBits; + const int pass_begin_bit = (::cuda::std::max) (pass_end_bit - RadixBits, begin_bit); + const int pass_bits = pass_end_bit - pass_begin_bit; + const auto pass_mask = ::cuda::bitmask(pass_begin_bit, pass_bits); // Zero-initialize histograms for the current pass init_histograms(); @@ -329,8 +329,8 @@ private: // Get bit-twiddled sortkeys. 
For float keys, track which were -0.0 (normalized to +0.0 for ranking) so we can // restore -0.0 in the output via a bitvector; no extra key buffer. - bit_ordered_type(&unsigned_keys)[ItemsPerThread] = reinterpret_cast(keys); - constexpr int flip_back_num_words = ::cuda::ceil_div(items_per_thread, 32); + auto& unsigned_keys = reinterpret_cast(keys); + constexpr int flip_back_num_words = ::cuda::ceil_div(items_per_thread, 32); [[maybe_unused]] ::cuda::std::uint32_t flip_back_bits[flip_back_num_words] = {}; if constexpr (::cuda::is_floating_point_v) { diff --git a/cub/cub/detail/rfa.cuh b/cub/cub/detail/rfa.cuh index 9527729fc82..b5d516b5f4f 100644 --- a/cub/cub/detail/rfa.cuh +++ b/cub/cub/detail/rfa.cuh @@ -104,7 +104,7 @@ private: /// Return a binned floating-point bin [[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE static ftype binned_bins(int index) { - ftype* bins = get_shared_bin_array(); + auto* bins = get_shared_bin_array(); return bins[index]; } diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index 8e5ebbcb6eb..16ada67f864 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -103,9 +103,9 @@ __launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).large using InputBufferT = it_value_t; using OutputBufferT = it_value_t; - constexpr uint32_t BLOCK_THREADS = static_cast(policy.block_threads); - constexpr uint32_t ITEMS_PER_THREAD = static_cast(policy.bytes_per_thread); - constexpr BufferSizeT TILE_SIZE = static_cast(BLOCK_THREADS * ITEMS_PER_THREAD); + constexpr auto BLOCK_THREADS = static_cast(policy.block_threads); + constexpr auto ITEMS_PER_THREAD = static_cast(policy.bytes_per_thread); + constexpr auto TILE_SIZE = static_cast(BLOCK_THREADS * ITEMS_PER_THREAD); BufferOffsetT num_blev_buffers = buffer_offset_tile.LoadValid(last_tile_offset); diff --git a/cub/cub/device/dispatch/dispatch_fixed_size_segmented_reduce.cuh b/cub/cub/device/dispatch/dispatch_fixed_size_segmented_reduce.cuh index 3c6001bd3bd..d89f717b637 100644 --- a/cub/cub/device/dispatch/dispatch_fixed_size_segmented_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_fixed_size_segmented_reduce.cuh @@ -304,7 +304,7 @@ struct DispatchFixedSizeSegmentedReduce } // Alias the allocation for the privatized per-block reductions - AccumT* d_block_reductions = static_cast(allocations[0]); + auto* d_block_reductions = static_cast(allocations[0]); for (::cuda::std::int64_t invocation_index = 0; invocation_index < num_invocations; invocation_index++) { diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh index 8cd0a08ff60..ec6c6fb0c9c 100644 --- a/cub/cub/device/dispatch/dispatch_merge.cuh +++ b/cub/cub/device/dispatch/dispatch_merge.cuh @@ -109,7 +109,7 @@ _CCCL_KERNEL_ATTRIBUTES void device_partition_merge_path_kernel( ValueIt3, Offset, CompareOp>::type::items_per_tile; - const Offset diagonal_idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + const auto diagonal_idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); if (diagonal_idx < num_diagonals) { const Offset diagonal_num = (::cuda::std::min) (diagonal_idx * items_per_tile, keys1_count + keys2_count); diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index 786f8380f25..75d084719dc 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -556,8 
+556,8 @@ private: // for testing purposes, one portion is <= 2**28 elements const PortionOffsetT PORTION_SIZE = ((1 << 28) - 1) / ONESWEEP_TILE_ITEMS * ONESWEEP_TILE_ITEMS; int num_passes = ::cuda::ceil_div(end_bit - begin_bit, RADIX_BITS); - OffsetT num_portions = static_cast(::cuda::ceil_div(num_items, PORTION_SIZE)); - PortionOffsetT max_num_blocks = ::cuda::ceil_div( + auto num_portions = static_cast(::cuda::ceil_div(num_items, PORTION_SIZE)); + auto max_num_blocks = ::cuda::ceil_div( static_cast(::cuda::std::min(num_items, static_cast(PORTION_SIZE))), ONESWEEP_TILE_ITEMS); size_t value_size = KEYS_ONLY ? 0 : kernel_source.ValueSize(); @@ -587,11 +587,11 @@ private: return cudaSuccess; } - OffsetT* d_bins = (OffsetT*) allocations[0]; - AtomicOffsetT* d_lookback = (AtomicOffsetT*) allocations[1]; - KeyT* d_keys_tmp2 = (KeyT*) allocations[2]; - ValueT* d_values_tmp2 = (ValueT*) allocations[3]; - AtomicOffsetT* d_ctrs = (AtomicOffsetT*) allocations[4]; + auto* d_bins = (OffsetT*) allocations[0]; + auto* d_lookback = (AtomicOffsetT*) allocations[1]; + auto* d_keys_tmp2 = (KeyT*) allocations[2]; + auto* d_values_tmp2 = (ValueT*) allocations[3]; + auto* d_ctrs = (AtomicOffsetT*) allocations[4]; // initialization if (const auto error = @@ -688,7 +688,7 @@ private: int num_bits = ::cuda::std::min(end_bit - current_bit, RADIX_BITS); for (OffsetT portion = 0; portion < num_portions; ++portion) { - PortionOffsetT portion_num_items = static_cast( + auto portion_num_items = static_cast( ::cuda::std::min(num_items - portion * PORTION_SIZE, static_cast(PORTION_SIZE))); PortionOffsetT num_blocks = ::cuda::ceil_div(portion_num_items, ONESWEEP_TILE_ITEMS); @@ -902,7 +902,7 @@ private: int alt_end_bit = ::cuda::std::min(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits)); // Alias the temporary storage allocations - OffsetT* d_spine = static_cast(allocations[0]); + auto* d_spine = static_cast(allocations[0]); DoubleBuffer d_keys_remaining_passes( (is_overwrite_okay || is_num_passes_odd) ? 
d_keys.Alternate() : static_cast(allocations[1]), diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 1856a8ad1f7..ceca1c48800 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -376,7 +376,7 @@ struct DispatchReduce } // Alias the allocation for the privatized per-block reductions - AccumT* d_block_reductions = static_cast(allocations[0]); + auto* d_block_reductions = static_cast(allocations[0]); // Get grid size for device_reduce_sweep_kernel int reduce_grid_size = even_share.grid_size; @@ -663,7 +663,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_passes( } // Alias the allocation for the privatized per-block reductions - AccumT* d_block_reductions = static_cast(allocations[0]); + auto* d_block_reductions = static_cast(allocations[0]); // Get grid size for device_reduce_sweep_kernel const int reduce_grid_size = even_share.grid_size; diff --git a/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh b/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh index 0c0ef14a3c0..51318bf9889 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh @@ -221,7 +221,7 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t invok } // Alias the allocation for the privatized per-block reductions - DeterministicAccumT* d_block_reductions = static_cast(allocations[0]); + auto* d_block_reductions = static_cast(allocations[0]); auto d_chunk_block_reductions = d_block_reductions; for (int chunk_index = 0; chunk_index < num_chunks; chunk_index++) diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index f27b2fd1e5a..09d9e718ade 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -749,7 +749,7 @@ private: cub::detail::temporary_storage::alias& group_sizes, WrappedPolicyT wrapped_policy) { - constexpr global_segment_offset_t num_segments_per_invocation_limit = + constexpr auto num_segments_per_invocation_limit = static_cast(::cuda::std::numeric_limits::max()); // We repeatedly invoke the partitioning and sorting kernels until all segments are processed. 
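Note: every hunk in this diff follows the same pattern. The first hunk removes '-modernize-use-auto' from the suppressed checks in .clang-tidy, which enables clang-tidy's modernize-use-auto rule; the remaining hunks rewrite declarations whose type is already spelled on the right-hand side of a cast or new-expression to use auto / auto*. A minimal standalone sketch of the pattern, for illustration only (the names sum_words, bytes, words, num_words are hypothetical and not taken from this diff):

  #include <cstddef>
  #include <cstdint>

  std::uint64_t sum_words(const char* bytes, std::size_t n)
  {
    // Before: const std::uint32_t* words = reinterpret_cast<const std::uint32_t*>(bytes);
    // After: the pointee type is deduced from the cast, so it is spelled only once.
    auto* words = reinterpret_cast<const std::uint32_t*>(bytes);

    // Before: std::size_t num_words = n / sizeof(std::uint32_t);
    auto num_words = n / sizeof(std::uint32_t);

    std::uint64_t sum = 0;
    for (std::size_t i = 0; i < num_words; ++i)
    {
      // Assumes bytes is suitably aligned for std::uint32_t, as in the diff's own aliasing code.
      sum += words[i];
    }
    return sum;
  }
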
diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 3fa0cbd6e33..c8d642f58db 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -668,7 +668,7 @@ struct DispatchSelectIf // Initialize the streaming context with the temporary storage for double-buffering the previously selected items // and the total number (across all partitions) of items - num_total_items_t* tmp_num_selected_out = reinterpret_cast(allocations[2]); + auto* tmp_num_selected_out = reinterpret_cast(allocations[2]); streaming_context_t streaming_context{ tmp_num_selected_out, (tmp_num_selected_out + 1), num_items, (num_partitions <= 1)}; @@ -959,7 +959,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch_policy( return cudaSuccess; } - OffsetT* tmp_num_selected_out = reinterpret_cast(allocations[2]); + auto* tmp_num_selected_out = reinterpret_cast(allocations[2]); streaming_context_t streaming_context{ tmp_num_selected_out, (tmp_num_selected_out + 1), num_items, (num_partitions <= 1)}; diff --git a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh index 9c87aed6546..bdc2bba48f9 100644 --- a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh @@ -277,7 +277,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch_streaming_arg_reduce } // Pointer to the double-buffer of global accumulators, which aggregate cross-partition results - global_accum_t* const d_global_aggregates = static_cast(allocations[1]); + auto* const d_global_aggregates = static_cast(allocations[1]); accumulating_out_op = accumulating_transform_output_op_t{ true, diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh index b1dbcee0c8a..958b1e192a4 100644 --- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -243,7 +243,7 @@ struct DispatchThreeWayPartitionIf // Initialize the streaming context with the temporary storage for double-buffering the previously selected items // and the total number (across all partitions) of items - OffsetT* tmp_num_selected_out = static_cast(allocations[1]); + auto* tmp_num_selected_out = static_cast(allocations[1]); streaming_context_t streaming_context{ tmp_num_selected_out, (tmp_num_selected_out + num_counters_per_pass), (num_partitions <= 1)}; diff --git a/cub/cub/device/dispatch/dispatch_topk.cuh b/cub/cub/device/dispatch/dispatch_topk.cuh index 878ad58a231..721d5d3bce1 100644 --- a/cub/cub/device/dispatch/dispatch_topk.cuh +++ b/cub/cub/device/dispatch/dispatch_topk.cuh @@ -597,8 +597,8 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( #endif // CUB_DEBUG_LOG // Initialize address variables - counter_t* counter = static_cast(allocations[0]); - OffsetT* histogram = static_cast(allocations[1]); + auto* counter = static_cast(allocations[0]); + auto* histogram = static_cast(allocations[1]); // Pass 0: dedicated histogram-only kernel over the full input { diff --git a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh index 542032819b7..5b84092f897 100644 --- a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh @@ -267,7 +267,7 @@ struct Transforms template _CCCL_HOST_DEVICE 
_CCCL_FORCEINLINE void BinSelect(SampleT sample, int& bin, bool valid) const { - const CommonT common_sample = static_cast(sample); + const auto common_sample = static_cast(sample); if (valid && this->SampleIsValid(common_sample, m_max, m_min)) { diff --git a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh index 5ea003b55a6..d9d4f075936 100644 --- a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh @@ -158,7 +158,7 @@ _CCCL_KERNEL_ATTRIBUTES void DeviceMergeSortPartitionKernel( _CCCL_GRID_CONSTANT const OffsetT target_merged_tiles_number, _CCCL_GRID_CONSTANT const int items_per_tile) { - const OffsetT partition_idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + const auto partition_idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); if (partition_idx < num_partitions) { AgentPartition{ diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce_deterministic.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce_deterministic.cuh index 4d7c372311c..0616e31ee88 100644 --- a/cub/cub/device/dispatch/kernels/kernel_reduce_deterministic.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_reduce_deterministic.cuh @@ -77,7 +77,7 @@ _CCCL_KERNEL_ATTRIBUTES __launch_bounds__(int( constexpr int bin_length = AccumT::max_index + AccumT::max_fold; const int tid = block_threads * blockIdx.x + threadIdx.x; - ftype* shared_bins = detail::rfa::get_shared_bin_array(); + auto* shared_bins = detail::rfa::get_shared_bin_array(); _CCCL_PRAGMA_UNROLL_FULL() for (int index = threadIdx.x; index < bin_length; index += block_threads) @@ -217,7 +217,7 @@ _CCCL_KERNEL_ATTRIBUTES __launch_bounds__( using float_type = typename AccumT::ftype; constexpr int bin_length = AccumT::max_index + AccumT::max_fold; - float_type* shared_bins = detail::rfa::get_shared_bin_array(); + auto* shared_bins = detail::rfa::get_shared_bin_array(); _CCCL_PRAGMA_UNROLL_FULL() for (int index = threadIdx.x; index < bin_length; index += block_threads) diff --git a/cub/cub/thread/thread_load.cuh b/cub/cub/thread/thread_load.cuh index 70cc4cefa2a..ea9a1ab87b1 100644 --- a/cub/cub/thread/thread_load.cuh +++ b/cub/cub/thread/thread_load.cuh @@ -292,7 +292,7 @@ ThreadLoadVolatilePointer(const T* ptr, ::cuda::std::false_type /*is_primitive*/ constexpr int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); T retval; - VolatileWord* words = reinterpret_cast(&retval); + auto* words = reinterpret_cast(&retval); UnrolledCopy(reinterpret_cast(ptr), words); return retval; } diff --git a/cub/cub/util_allocator.cuh b/cub/cub/util_allocator.cuh index a65bcb9a8a3..c3764a4bf14 100644 --- a/cub/cub/util_allocator.cuh +++ b/cub/cub/util_allocator.cuh @@ -432,7 +432,7 @@ struct CachingDeviceAllocator } // Iterate through the range of cached blocks on the same device in the same bin - CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); + auto block_itr = cached_blocks.lower_bound(search_key); while ((block_itr != cached_blocks.end()) && (block_itr->device == device) && (block_itr->bin == search_key.bin)) { // To prevent races with reusing blocks returned by the host but still @@ -525,7 +525,7 @@ struct CachingDeviceAllocator // Iterate the range of free blocks on the same device BlockDescriptor free_key(device); - CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key); + auto block_itr = cached_blocks.lower_bound(free_key); while ((block_itr != cached_blocks.end()) && (block_itr->device == 
device)) { @@ -683,7 +683,7 @@ struct CachingDeviceAllocator // Find corresponding block descriptor bool recached = false; BlockDescriptor search_key(d_ptr, device); - BusyBlocks::iterator block_itr = live_blocks.find(search_key); + auto block_itr = live_blocks.find(search_key); if (block_itr != live_blocks.end()) { // Remove from live blocks @@ -811,7 +811,7 @@ struct CachingDeviceAllocator while (!cached_blocks.empty()) { // Get first block - CachedBlocks::iterator begin = cached_blocks.begin(); + auto begin = cached_blocks.begin(); // Get entry-point device ordinal if necessary if (entrypoint_device == INVALID_DEVICE_ORDINAL) diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index ecff44b8fc0..aa5b54fc2a0 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -218,8 +218,8 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleUp(T input, int src_offset, int first_th constexpr int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); T output; - ShuffleWord* output_alias = reinterpret_cast(&output); - ShuffleWord* input_alias = reinterpret_cast(&input); + auto* output_alias = reinterpret_cast(&output); + auto* input_alias = reinterpret_cast(&input); unsigned int shuffle_word; shuffle_word = SHFL_UP_SYNC((unsigned int) input_alias[0], src_offset, first_thread | SHFL_C, member_mask); @@ -296,8 +296,8 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleDown(T input, int src_offset, int last_t constexpr int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); T output; - ShuffleWord* output_alias = reinterpret_cast(&output); - ShuffleWord* input_alias = reinterpret_cast(&input); + auto* output_alias = reinterpret_cast(&output); + auto* input_alias = reinterpret_cast(&input); unsigned int shuffle_word; shuffle_word = SHFL_DOWN_SYNC((unsigned int) input_alias[0], src_offset, last_thread | SHFL_C, member_mask); @@ -370,8 +370,8 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleIndex(T input, int src_lane, unsigned in constexpr int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); T output; - ShuffleWord* output_alias = reinterpret_cast(&output); - ShuffleWord* input_alias = reinterpret_cast(&input); + auto* output_alias = reinterpret_cast(&output); + auto* input_alias = reinterpret_cast(&input); unsigned int shuffle_word; shuffle_word = __shfl_sync(member_mask, (unsigned int) input_alias[0], src_lane, LOGICAL_WARP_THREADS); diff --git a/cub/examples/block/example_block_radix_sort.cu b/cub/examples/block/example_block_radix_sort.cu index 614a38a8e52..39e97f61187 100644 --- a/cub/examples/block/example_block_radix_sort.cu +++ b/cub/examples/block/example_block_radix_sort.cu @@ -140,9 +140,9 @@ void Test() constexpr int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD; // Allocate host arrays - Key* h_in = new Key[TILE_SIZE * g_grid_size]; - Key* h_reference = new Key[TILE_SIZE * g_grid_size]; - clock_t* h_elapsed = new clock_t[g_grid_size]; + auto* h_in = new Key[TILE_SIZE * g_grid_size]; + auto* h_reference = new Key[TILE_SIZE * g_grid_size]; + auto* h_elapsed = new clock_t[g_grid_size]; // Initialize problem and reference output on host Initialize(h_in, h_reference, TILE_SIZE * g_grid_size, TILE_SIZE); diff --git a/cub/examples/device/example_device_partition_flagged.cu b/cub/examples/device/example_device_partition_flagged.cu index e91fadedb09..a68aa31eeeb 100644 --- a/cub/examples/device/example_device_partition_flagged.cu +++ b/cub/examples/device/example_device_partition_flagged.cu @@ -134,9 +134,9 @@ int main(int argc, char** argv) 
CubDebugExit(args.DeviceInit()); // Allocate host arrays - int* h_in = new int[num_items]; - int* h_reference = new int[num_items]; - unsigned char* h_flags = new unsigned char[num_items]; + auto* h_in = new int[num_items]; + auto* h_reference = new int[num_items]; + auto* h_flags = new unsigned char[num_items]; // Initialize problem and solution Initialize(h_in, h_flags, num_items, max_segment); diff --git a/cub/examples/device/example_device_partition_if.cu b/cub/examples/device/example_device_partition_if.cu index 41c01523675..f149e8f4027 100644 --- a/cub/examples/device/example_device_partition_if.cu +++ b/cub/examples/device/example_device_partition_if.cu @@ -150,7 +150,7 @@ int main(int argc, char** argv) // DevicePartition a pivot index unsigned int pivot_index; - unsigned int max_int = (unsigned int) -1; + auto max_int = (unsigned int) -1; RandomBits(pivot_index); pivot_index = (unsigned int) ((float(pivot_index) * (float(num_items - 1) / float(max_int)))); printf("Pivot idx: %d\n", pivot_index); diff --git a/cub/examples/device/example_device_radix_sort.cu b/cub/examples/device/example_device_radix_sort.cu index c8d64c46b42..417b71767c1 100644 --- a/cub/examples/device/example_device_radix_sort.cu +++ b/cub/examples/device/example_device_radix_sort.cu @@ -127,8 +127,8 @@ int main(int argc, char** argv) fflush(stdout); // Allocate host arrays - float* h_keys = new float[num_items]; - float* h_reference_keys = new float[num_items]; + auto* h_keys = new float[num_items]; + auto* h_reference_keys = new float[num_items]; int* h_values = new int[num_items]; int* h_reference_values = new int[num_items]; diff --git a/cub/examples/device/example_device_select_flagged.cu b/cub/examples/device/example_device_select_flagged.cu index 251ba65e570..70a23b8cdc2 100644 --- a/cub/examples/device/example_device_select_flagged.cu +++ b/cub/examples/device/example_device_select_flagged.cu @@ -134,9 +134,9 @@ int main(int argc, char** argv) CubDebugExit(args.DeviceInit()); // Allocate host arrays - int* h_in = new int[num_items]; - int* h_reference = new int[num_items]; - unsigned char* h_flags = new unsigned char[num_items]; + int* h_in = new int[num_items]; + int* h_reference = new int[num_items]; + auto* h_flags = new unsigned char[num_items]; // Initialize problem and solution Initialize(h_in, h_flags, num_items, max_segment); diff --git a/cub/examples/device/example_device_select_if.cu b/cub/examples/device/example_device_select_if.cu index ed282c5afde..d238e6f4d47 100644 --- a/cub/examples/device/example_device_select_if.cu +++ b/cub/examples/device/example_device_select_if.cu @@ -150,7 +150,7 @@ int main(int argc, char** argv) // Select a pivot index unsigned int pivot_index; - unsigned int max_int = (unsigned int) -1; + auto max_int = (unsigned int) -1; RandomBits(pivot_index); pivot_index = (unsigned int) ((float(pivot_index) * (float(num_items - 1) / float(max_int)))); printf("Pivot idx: %d\n", pivot_index); diff --git a/cub/examples/device/example_device_sort_find_non_trivial_runs.cu b/cub/examples/device/example_device_sort_find_non_trivial_runs.cu index aa27d140e4d..0ad839a922d 100644 --- a/cub/examples/device/example_device_sort_find_non_trivial_runs.cu +++ b/cub/examples/device/example_device_sort_find_non_trivial_runs.cu @@ -99,7 +99,7 @@ int Solve(Key* h_keys, Value* h_values, int num_items, int* h_offsets_reference, { // Sort - Pair* h_pairs = new Pair[num_items]; + auto* h_pairs = new Pair[num_items]; for (int i = 0; i < num_items; ++i) { h_pairs[i].key = h_keys[i]; @@ -197,7 +197,7 
@@ int main(int argc, char** argv) // Allocate host arrays (problem and reference solution) Key* h_keys = new Key[num_items]; - Value* h_values = new Value[num_items]; + auto* h_values = new Value[num_items]; int* h_offsets_reference = new int[num_items]; int* h_lengths_reference = new int[num_items]; diff --git a/cub/test/catch2_radix_sort_helper.cuh b/cub/test/catch2_radix_sort_helper.cuh index ecb501a287a..3587966f000 100644 --- a/cub/test/catch2_radix_sort_helper.cuh +++ b/cub/test/catch2_radix_sort_helper.cuh @@ -186,7 +186,7 @@ c2h::host_vector get_striped_keys(const c2h::host_vector& h_keys, in for (std::size_t i = 0; i < h_keys.size(); i++) { - bit_ordered_t key = ::cuda::std::bit_cast(h_keys[i]); + auto key = ::cuda::std::bit_cast(h_keys[i]); if constexpr (::cuda::is_floating_point_v) { @@ -445,9 +445,9 @@ struct radix_offset_scan_op_t template void generate_segment_offsets(c2h::seed_t seed, c2h::device_vector& offsets, std::size_t num_items) { - const std::size_t num_segments = offsets.size() - 1; - const OffsetT expected_segment_length = static_cast(::cuda::ceil_div(num_items, num_segments)); - const OffsetT max_segment_length = (expected_segment_length * 2) + 1; + const std::size_t num_segments = offsets.size() - 1; + const auto expected_segment_length = static_cast(::cuda::ceil_div(num_items, num_segments)); + const OffsetT max_segment_length = (expected_segment_length * 2) + 1; c2h::gen(seed, offsets, OffsetT{0}, max_segment_length); thrust::exclusive_scan( c2h::device_policy, diff --git a/cub/test/catch2_segmented_sort_helper.cuh b/cub/test/catch2_segmented_sort_helper.cuh index d0bd0a6d7c1..c7858c32a43 100644 --- a/cub/test/catch2_segmented_sort_helper.cuh +++ b/cub/test/catch2_segmented_sort_helper.cuh @@ -266,7 +266,7 @@ using unwrap_value_t = typename unwrap_value_t_impl::type; template __host__ __device__ __forceinline__ double compute_conversion_factor(int segment_size, T) { - const double max_value = static_cast(::cuda::std::numeric_limits::max()); + const auto max_value = static_cast(::cuda::std::numeric_limits::max()); return (max_value + 1) / segment_size; } @@ -433,8 +433,8 @@ private: // Compute the original input value corresponding to the current duplicate key. // NOTE: Keys and values are generated using opposing ascending/descending parameters, so the generated input // values are descending when generating ascending input keys for a descending sort. - const int conv_idx = sort_descending ? (segment_size - 1 - in_seg_idx) : in_seg_idx; - const ValueT current_value = static_cast(conv_idx * value_conversion); + const int conv_idx = sort_descending ? 
(segment_size - 1 - in_seg_idx) : in_seg_idx; + const auto current_value = static_cast(conv_idx * value_conversion); if constexpr (STABLE) { // For stable sorts, the output value must appear at an exact offset: diff --git a/cub/test/catch2_test_block_histogram.cu b/cub/test/catch2_test_block_histogram.cu index 6475a2c2568..c9920f1bdfd 100644 --- a/cub/test/catch2_test_block_histogram.cu +++ b/cub/test/catch2_test_block_histogram.cu @@ -78,7 +78,7 @@ C2H_TEST("Block histogram can be computed with uniform input", using params = params_t; using sample_t = typename params::sample_t; - const sample_t uniform_value = static_cast(GENERATE_COPY(take(10, random(0, params::bins - 1)))); + const auto uniform_value = static_cast(GENERATE_COPY(take(10, random(0, params::bins - 1)))); c2h::host_vector h_samples(params::num_samples, uniform_value); c2h::host_vector h_reference(params::bins); @@ -149,7 +149,7 @@ C2H_TEST("Block histogram can be computed with random input", c2h::device_vector d_histogram(params::bins); c2h::device_vector d_samples(params::num_samples); - const sample_t min_bin = static_cast(0); + const auto min_bin = static_cast(0); const sample_t max_bin = static_cast( std::min(static_cast(cuda::std::numeric_limits::max()), static_cast(params::bins - 1))); diff --git a/cub/test/catch2_test_device_find_if.cu b/cub/test/catch2_test_device_find_if.cu index aadf74f0c76..1336393641a 100644 --- a/cub/test/catch2_test_device_find_if.cu +++ b/cub/test/catch2_test_device_find_if.cu @@ -165,7 +165,7 @@ C2H_TEST("Device find_if works with non primitive iterator", "[device][find_if]" constexpr offset_t min_items = 1; constexpr offset_t max_items = 10000000; // 10M items for reasonable test time - input_t val_to_find = static_cast(GENERATE_COPY(take(1, random(min_items, max_items)))); + auto val_to_find = static_cast(GENERATE_COPY(take(1, random(min_items, max_items)))); // Generate the input sizes to test for const offset_t num_items = GENERATE_COPY( take(1, random(min_items, max_items)), diff --git a/cub/test/catch2_test_device_merge_sort.cu b/cub/test/catch2_test_device_merge_sort.cu index 4cf70182cc9..a533754f310 100644 --- a/cub/test/catch2_test_device_merge_sort.cu +++ b/cub/test/catch2_test_device_merge_sort.cu @@ -412,7 +412,7 @@ C2H_TEST("DeviceMergeSort::StableSortPairs works for large inputs", // Clamp 64-bit offset type problem sizes to just slightly larger than 2^32 items auto num_items_ull = std::min(static_cast(cuda::std::numeric_limits::max()) - 1, cuda::std::numeric_limits::max() + static_cast(2000000ULL)); - offset_t num_items = static_cast(num_items_ull); + auto num_items = static_cast(num_items_ull); SECTION("Random") { diff --git a/cub/test/catch2_test_device_partition_flagged.cu b/cub/test/catch2_test_device_partition_flagged.cu index 4725ccafc8e..5b9509b9f4e 100644 --- a/cub/test/catch2_test_device_partition_flagged.cu +++ b/cub/test/catch2_test_device_partition_flagged.cu @@ -390,7 +390,7 @@ try using type = std::int64_t; using offset_t = typename c2h::get<0, TestType>; - const offset_t num_items_max = detail::make_large_offset(); + const auto num_items_max = detail::make_large_offset(); const offset_t num_items_min = num_items_max > 10000 ? 
num_items_max - 10000ULL : offset_t{0}; const offset_t num_items = GENERATE_COPY( values( diff --git a/cub/test/catch2_test_device_partition_if.cu b/cub/test/catch2_test_device_partition_if.cu index f346aedeab7..ac6fc1f5a23 100644 --- a/cub/test/catch2_test_device_partition_if.cu +++ b/cub/test/catch2_test_device_partition_if.cu @@ -303,7 +303,7 @@ try using type = std::int64_t; using offset_t = typename c2h::get<0, TestType>; - const offset_t num_items_max = detail::make_large_offset(); + const auto num_items_max = detail::make_large_offset(); const offset_t num_items_min = num_items_max > 10000 ? num_items_max - 10000ULL : offset_t{0}; const offset_t num_items = GENERATE_COPY( values( diff --git a/cub/test/catch2_test_device_radix_sort_keys.cu b/cub/test/catch2_test_device_radix_sort_keys.cu index b02fccf3eb4..f1febc7bdd0 100644 --- a/cub/test/catch2_test_device_radix_sort_keys.cu +++ b/cub/test/catch2_test_device_radix_sort_keys.cu @@ -168,8 +168,8 @@ C2H_TEST("DeviceRadixSort::SortKeys: negative zero handling", "[keys][radix][sor using bits_t = typename cub::Traits::UnsignedBits; constexpr std::size_t num_bits = sizeof(key_t) * CHAR_BIT; - const key_t positive_zero = cuda::std::bit_cast(bits_t(0)); - const key_t negative_zero = cuda::std::bit_cast(bits_t(1) << (num_bits - 1)); + const auto positive_zero = cuda::std::bit_cast(bits_t(0)); + const auto negative_zero = cuda::std::bit_cast(bits_t(1) << (num_bits - 1)); constexpr std::size_t max_num_items = 1 << 18; const std::size_t num_items = GENERATE_COPY(take(1, random(max_num_items / 2, max_num_items))); @@ -461,7 +461,7 @@ void do_large_offset_test(std::size_t num_items) double_buffer_sort_t action(is_descending); action.initialize(); - const num_items_t typed_num_items = static_cast(num_items); + const auto typed_num_items = static_cast(num_items); launch(action, arrays.keys_buffer, typed_num_items, begin_bit(), end_bit()); arrays.keys_buffer.selector = action.selector(); diff --git a/cub/test/catch2_test_device_radix_sort_pairs.cu b/cub/test/catch2_test_device_radix_sort_pairs.cu index 05d3493bb56..3f6f9b422cb 100644 --- a/cub/test/catch2_test_device_radix_sort_pairs.cu +++ b/cub/test/catch2_test_device_radix_sort_pairs.cu @@ -146,7 +146,7 @@ void do_large_offset_test(std::size_t num_items) double_buffer_sort_t action(is_descending); action.initialize(); - const num_items_t typed_num_items = static_cast(num_items); + const auto typed_num_items = static_cast(num_items); launch(action, arrays.keys_buffer, arrays.values_buffer, typed_num_items, begin_bit(), end_bit()); TIME(timer.print_elapsed_seconds_and_reset("Device sort")); diff --git a/cub/test/catch2_test_device_reduce.cu b/cub/test/catch2_test_device_reduce.cu index fca7ca9a511..c7e7c844608 100644 --- a/cub/test/catch2_test_device_reduce.cu +++ b/cub/test/catch2_test_device_reduce.cu @@ -137,9 +137,8 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f auto reduction_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); // Prepare verification data - using accum_t = cuda::std::__accumulator_t; - output_t expected_result = - static_cast(compute_single_problem_reference(in_items, reduction_op, accum_t{})); + using accum_t = cuda::std::__accumulator_t; + auto expected_result = static_cast(compute_single_problem_reference(in_items, reduction_op, accum_t{})); // Run test c2h::device_vector out_result(num_segments); @@ -161,7 +160,7 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f using accum_t = 
cuda::std::__accumulator_t; // Prepare verification data - output_t expected_result = static_cast(compute_single_problem_reference(in_items, op_t{}, accum_t{})); + auto expected_result = static_cast(compute_single_problem_reference(in_items, op_t{}, accum_t{})); // Run test c2h::device_vector out_result(num_segments); @@ -219,8 +218,8 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f device_arg_max(unwrap_it(d_in_it), d_extremum_out, d_index_out, num_items); // Verify result - result_t gpu_result = out_result[0]; - output_t gpu_extremum = static_cast(gpu_result.second); // Explicitly rewrap the gpu value + result_t gpu_result = out_result[0]; + auto gpu_extremum = static_cast(gpu_result.second); // Explicitly rewrap the gpu value REQUIRE(expected_result[0] == gpu_extremum); REQUIRE((expected_result - host_items.cbegin()) == gpu_result.first); } @@ -240,8 +239,8 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f device_arg_min(unwrap_it(d_in_it), d_extremum_out, d_index_out, num_items); // Verify result - result_t gpu_result = out_result[0]; - output_t gpu_extremum = static_cast(gpu_result.second); // Explicitly rewrap the gpu value + result_t gpu_result = out_result[0]; + auto gpu_extremum = static_cast(gpu_result.second); // Explicitly rewrap the gpu value REQUIRE(expected_result[0] == gpu_extremum); REQUIRE((expected_result - host_items.cbegin()) == gpu_result.first); } @@ -259,7 +258,7 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f // Verify result for the deprecated interface result_t gpu_result = out_result[0]; - output_t gpu_value = static_cast(gpu_result.value); // Explicitly rewrap the gpu value + auto gpu_value = static_cast(gpu_result.value); // Explicitly rewrap the gpu value REQUIRE(expected_result[0] == gpu_value); REQUIRE((expected_result - host_items.cbegin()) == gpu_result.key); } @@ -277,7 +276,7 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f // Verify result for the deprecated interface result_t gpu_result = out_result[0]; - output_t gpu_value = static_cast(gpu_result.value); // Explicitly rewrap the gpu value + auto gpu_value = static_cast(gpu_result.value); // Explicitly rewrap the gpu value REQUIRE(expected_result[0] == gpu_value); REQUIRE((expected_result - host_items.cbegin()) == gpu_result.key); } @@ -300,8 +299,8 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f device_arg_min(unwrap_it(d_in_it), d_extremum_out, d_index_out, num_items, compare_op); // Verify result - result_t gpu_result = out_result[0]; - output_t gpu_extremum = static_cast(gpu_result.second); // Explicitly rewrap the gpu value + result_t gpu_result = out_result[0]; + auto gpu_extremum = static_cast(gpu_result.second); // Explicitly rewrap the gpu value REQUIRE(expected_result[0] == gpu_extremum); REQUIRE((expected_result - host_items.cbegin()) == gpu_result.first); } @@ -323,8 +322,8 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f device_arg_max(unwrap_it(d_in_it), d_extremum_out, d_index_out, num_items, compare_op); // Verify result - result_t gpu_result = out_result[0]; - output_t gpu_extremum = static_cast(gpu_result.second); // Explicitly rewrap the gpu value + result_t gpu_result = out_result[0]; + auto gpu_extremum = static_cast(gpu_result.second); // Explicitly rewrap the gpu value REQUIRE(expected_result[0] == gpu_extremum); REQUIRE((expected_result - host_items.cbegin()) == 
gpu_result.first); } diff --git a/cub/test/catch2_test_device_reduce_by_key.cu b/cub/test/catch2_test_device_reduce_by_key.cu index 75a1766921c..478c132e405 100644 --- a/cub/test/catch2_test_device_reduce_by_key.cu +++ b/cub/test/catch2_test_device_reduce_by_key.cu @@ -81,7 +81,7 @@ C2H_TEST("Device reduce-by-key works", "[by_key][reduce][device]", full_type_lis C2H_SEED(1), num_items, std::get<0>(seg_size_range), std::get<1>(seg_size_range)); // Get array of keys from segment offsets - const offset_t num_segments = static_cast(segment_offsets.size() - 1); + const auto num_segments = static_cast(segment_offsets.size() - 1); c2h::device_vector segment_keys(num_items); c2h::init_key_segments(segment_offsets, segment_keys); auto d_keys_it = thrust::raw_pointer_cast(segment_keys.data()); diff --git a/cub/test/catch2_test_device_reduce_by_key_iterators.cu b/cub/test/catch2_test_device_reduce_by_key_iterators.cu index 1eb8a34a9ca..4557724696e 100644 --- a/cub/test/catch2_test_device_reduce_by_key_iterators.cu +++ b/cub/test/catch2_test_device_reduce_by_key_iterators.cu @@ -52,7 +52,7 @@ C2H_TEST("Device reduce-by-key works with iterators", "[by_key][reduce][device]" C2H_SEED(1), num_items, std::get<0>(seg_size_range), std::get<1>(seg_size_range)); // Get array of keys from segment offsets - const offset_t num_segments = static_cast(segment_offsets.size() - 1); + const auto num_segments = static_cast(segment_offsets.size() - 1); c2h::device_vector segment_keys(num_items); c2h::init_key_segments(segment_offsets, segment_keys); auto d_keys_it = segment_keys.cbegin(); diff --git a/cub/test/catch2_test_device_reduce_by_key_vsmem.cu b/cub/test/catch2_test_device_reduce_by_key_vsmem.cu index 6c722a7394e..47ba7b342db 100644 --- a/cub/test/catch2_test_device_reduce_by_key_vsmem.cu +++ b/cub/test/catch2_test_device_reduce_by_key_vsmem.cu @@ -38,7 +38,7 @@ C2H_TEST("Device reduce-by-key works with huge keys", "[by_key][reduce][device]" // Generate input segments c2h::device_vector segment_offsets = c2h::gen_uniform_offsets( C2H_SEED(1), num_items, std::get<0>(seg_size_range), std::get<1>(seg_size_range)); - const offset_t num_segments = static_cast(segment_offsets.size() - 1); + const auto num_segments = static_cast(segment_offsets.size() - 1); c2h::device_vector segment_keys(num_items); c2h::init_key_segments(segment_offsets, segment_keys); auto d_keys_it = thrust::raw_pointer_cast(segment_keys.data()); diff --git a/cub/test/catch2_test_device_reduce_large_offsets.cu b/cub/test/catch2_test_device_reduce_large_offsets.cu index 9f691aaf56d..e4600d8bf43 100644 --- a/cub/test/catch2_test_device_reduce_large_offsets.cu +++ b/cub/test/catch2_test_device_reduce_large_offsets.cu @@ -67,7 +67,7 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", o CAPTURE(c2h::type_name()); - const offset_t num_items_max = detail::make_large_offset(); + const auto num_items_max = detail::make_large_offset(); const offset_t num_items_min = num_items_max > 10000 ? 
num_items_max - 10000ULL : offset_t{0}; // Generate the input sizes to test for diff --git a/cub/test/catch2_test_device_rle_custom_policy_hub.cu b/cub/test/catch2_test_device_rle_custom_policy_hub.cu index 98e3cec75c1..d5391a57235 100644 --- a/cub/test/catch2_test_device_rle_custom_policy_hub.cu +++ b/cub/test/catch2_test_device_rle_custom_policy_hub.cu @@ -39,7 +39,7 @@ C2H_TEST("DeviceRleDispatch::Dispatch: custom policy hub", "[device][run_length_ using equal_t = cuda::std::equal_to<>; c2h::device_vector d_in{1, 1, 2, 2, 2, 3, 3, 4, 4}; - const offset_t num_items = static_cast(d_in.size()); + const auto num_items = static_cast(d_in.size()); c2h::device_vector d_offsets(4, thrust::no_init); c2h::device_vector d_lengths(4, thrust::no_init); diff --git a/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu b/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu index df23d5d99d9..7f02e320fe7 100644 --- a/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu +++ b/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu @@ -167,10 +167,10 @@ bool validate_results( const c2h::host_vector& h_out_lengths = out_lengths; const c2h::host_vector& h_out_num_runs = out_num_runs; - const cuda::std::size_t num_runs = static_cast(h_out_num_runs.front()); + const auto num_runs = static_cast(h_out_num_runs.front()); for (cuda::std::size_t run = 0; run < num_runs; ++run) { - const cuda::std::size_t first_index = static_cast(h_out_offsets[run]); + const auto first_index = static_cast(h_out_offsets[run]); const cuda::std::size_t final_index = first_index + static_cast(h_out_lengths[run]); // Ensure we started a new run diff --git a/cub/test/catch2_test_device_scan.cuh b/cub/test/catch2_test_device_scan.cuh index 5aeda0cbcbc..548a2703a2d 100644 --- a/cub/test/catch2_test_device_scan.cuh +++ b/cub/test/catch2_test_device_scan.cuh @@ -42,7 +42,7 @@ void compute_exclusive_scan_reference(InputIt first, InputIt last, OutputIt resu using value_t = cub::detail::it_value_t; using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::it_value_t; - accum_t acc = static_cast(init); + auto acc = static_cast(init); for (; first != last; ++first) { auto v = *first; @@ -57,7 +57,7 @@ void compute_inclusive_scan_reference(InputIt first, InputIt last, OutputIt resu using value_t = cub::detail::it_value_t; using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::it_value_t; - accum_t acc = static_cast(init); + auto acc = static_cast(init); for (; first != last; ++first) { acc = op(acc, *first); @@ -88,9 +88,9 @@ void compute_exclusive_scan_by_key_reference( { for (std::size_t i = 0; i < num_items;) { - accum_t val = static_cast(h_values_it[i]); - result_out_it[i] = init; - accum_t inclusive = static_cast(scan_op(init, val)); + auto val = static_cast(h_values_it[i]); + result_out_it[i] = init; + auto inclusive = static_cast(scan_op(init, val)); ++i; diff --git a/cub/test/catch2_test_device_scan_by_key_large_offsets.cu b/cub/test/catch2_test_device_scan_by_key_large_offsets.cu index 66915bf87f9..22c202e10d5 100644 --- a/cub/test/catch2_test_device_scan_by_key_large_offsets.cu +++ b/cub/test/catch2_test_device_scan_by_key_large_offsets.cu @@ -81,7 +81,7 @@ try using index_t = std::uint64_t; using offset_t = typename c2h::get<0, TestType>; - const offset_t num_items_max = detail::make_large_offset(); + const auto num_items_max = detail::make_large_offset(); const offset_t num_items_min = num_items_max > 10000 ? 
num_items_max - 10000ULL : offset_t{0}; const offset_t num_items = GENERATE_COPY( values( diff --git a/cub/test/catch2_test_device_scan_large_offsets.cu b/cub/test/catch2_test_device_scan_large_offsets.cu index 9b7f4e443b4..2f49ffaa5b5 100644 --- a/cub/test/catch2_test_device_scan_large_offsets.cu +++ b/cub/test/catch2_test_device_scan_large_offsets.cu @@ -81,7 +81,7 @@ try using offset_t = typename c2h::get<0, TestType>; // Clamp 64-bit offset type problem sizes to just slightly larger than 2^32 items - const offset_t num_items_max = detail::make_large_offset(); + const auto num_items_max = detail::make_large_offset(); const offset_t num_items_min = num_items_max > 10000 ? num_items_max - 10000ULL : offset_t{0}; const offset_t num_items = GENERATE_COPY( values( @@ -124,7 +124,7 @@ try using offset_t = typename c2h::get<0, TestType>; // Clamp 64-bit offset type problem sizes to just slightly larger than 2^32 items - const offset_t num_items_max = detail::make_large_offset(); + const auto num_items_max = detail::make_large_offset(); const offset_t num_items_min = num_items_max > 10000 ? num_items_max - 10000ULL : offset_t{0}; const offset_t num_items = GENERATE_COPY( values( diff --git a/cub/test/catch2_test_device_segmented_reduce.cu b/cub/test/catch2_test_device_segmented_reduce.cu index 33aa1d9a54b..0e6a8396d4a 100644 --- a/cub/test/catch2_test_device_segmented_reduce.cu +++ b/cub/test/catch2_test_device_segmented_reduce.cu @@ -92,8 +92,8 @@ C2H_TEST("Device reduce works with all device interfaces", "[segmented][reduce][ // Generate input segments c2h::device_vector segment_offsets = c2h::gen_uniform_offsets( C2H_SEED(1), num_items, std::get<0>(seg_size_range), std::get<1>(seg_size_range)); - const offset_t num_segments = static_cast(segment_offsets.size() - 1); - auto d_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); + const auto num_segments = static_cast(segment_offsets.size() - 1); + auto d_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); // Generate input data c2h::device_vector in_items(num_items); @@ -266,7 +266,7 @@ C2H_TEST("Device fixed size segmented reduce works with all device interfaces", auto d_out_it = thrust::raw_pointer_cast(out_result.data()); using init_t = cub::detail::it_value_t; - init_t init = static_cast(*unwrap_it(&default_constant)); + auto init = static_cast(*unwrap_it(&default_constant)); device_segmented_reduce(unwrap_it(d_in_it), unwrap_it(d_out_it), num_segments, segment_size, reduction_op, init); // Verify result REQUIRE(expected_result == out_result); diff --git a/cub/test/catch2_test_device_segmented_reduce_iterators.cu b/cub/test/catch2_test_device_segmented_reduce_iterators.cu index f8d89293d5b..497ba00d617 100644 --- a/cub/test/catch2_test_device_segmented_reduce_iterators.cu +++ b/cub/test/catch2_test_device_segmented_reduce_iterators.cu @@ -51,8 +51,8 @@ C2H_TEST("Device segmented reduce works with fancy input iterators", "[reduce][d // Generate input segments c2h::device_vector segment_offsets = c2h::gen_uniform_offsets( C2H_SEED(1), num_items, std::get<0>(seg_size_range), std::get<1>(seg_size_range)); - const offset_t num_segments = static_cast(segment_offsets.size() - 1); - auto d_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); + const auto num_segments = static_cast(segment_offsets.size() - 1); + auto d_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); // Prepare input data item_t default_constant{}; diff --git a/cub/test/catch2_test_device_segmented_reduce_max_seg_size.cu 
b/cub/test/catch2_test_device_segmented_reduce_max_seg_size.cu index 478205f6451..f5e8576a3d3 100644 --- a/cub/test/catch2_test_device_segmented_reduce_max_seg_size.cu +++ b/cub/test/catch2_test_device_segmented_reduce_max_seg_size.cu @@ -69,8 +69,8 @@ C2H_TEST("Device segmented reduce works with dynamic max segment sizes", // Generate input segments c2h::device_vector segment_offsets = c2h::gen_uniform_offsets(C2H_SEED(1), num_items, seg_size_min, seg_size_max); - const offset_t num_segments = static_cast(segment_offsets.size() - 1); - auto d_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); + const auto num_segments = static_cast(segment_offsets.size() - 1); + auto d_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); // Generate input data c2h::device_vector in_items(num_items); diff --git a/cub/test/catch2_test_device_segmented_scan.cu b/cub/test/catch2_test_device_segmented_scan.cu index 30c2770456f..7cfdb2cee5b 100644 --- a/cub/test/catch2_test_device_segmented_scan.cu +++ b/cub/test/catch2_test_device_segmented_scan.cu @@ -145,8 +145,8 @@ C2H_TEST("Device segmented_scan works with all device interfaces", "[segmented][ // Generate input segments c2h::device_vector d_segment_offsets = c2h::gen_uniform_offsets( C2H_SEED(1), num_items, std::get<0>(seg_size_range), std::get<1>(seg_size_range)); - const offset_t num_segments = static_cast(d_segment_offsets.size() - 1); - auto d_offsets_it = thrust::raw_pointer_cast(d_segment_offsets.data()); + const auto num_segments = static_cast(d_segment_offsets.size() - 1); + auto d_offsets_it = thrust::raw_pointer_cast(d_segment_offsets.data()); INFO("Num segments: " << num_segments); INFO("Types: " << typeid(input_t).name() << " " << typeid(output_t).name() << " " << typeid(offset_t).name()); diff --git a/cub/test/catch2_test_device_segmented_topk_keys.cu b/cub/test/catch2_test_device_segmented_topk_keys.cu index 496ead85b73..0c0e436a85c 100644 --- a/cub/test/catch2_test_device_segmented_topk_keys.cu +++ b/cub/test/catch2_test_device_segmented_topk_keys.cu @@ -212,9 +212,9 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segment constexpr auto max_segment_size = static_max_segment_size; c2h::device_vector segment_offsets = c2h::gen_uniform_offsets(C2H_SEED(3), num_items, min_segment_size, max_segment_size); - const segment_index_t num_segments = static_cast(segment_offsets.size() - 1); - auto segment_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); - auto segment_size_it = cuda::make_transform_iterator( + const auto num_segments = static_cast(segment_offsets.size() - 1); + auto segment_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); + auto segment_size_it = cuda::make_transform_iterator( cuda::make_counting_iterator(segment_index_t{0}), segment_size_op{segment_offsets_it}); // Set the k value diff --git a/cub/test/catch2_test_device_segmented_topk_pairs.cu b/cub/test/catch2_test_device_segmented_topk_pairs.cu index 47a4bd4f950..24a188ead68 100644 --- a/cub/test/catch2_test_device_segmented_topk_pairs.cu +++ b/cub/test/catch2_test_device_segmented_topk_pairs.cu @@ -329,9 +329,9 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small variable-size segmen constexpr auto max_segment_size = static_max_segment_size; c2h::device_vector segment_offsets = c2h::gen_uniform_offsets(C2H_SEED(3), num_items, min_segment_size, max_segment_size); - const segment_index_t num_segments = static_cast(segment_offsets.size() - 1); - auto segment_offsets_it = 
thrust::raw_pointer_cast(segment_offsets.data()); - auto segment_size_it = cuda::make_transform_iterator( + const auto num_segments = static_cast(segment_offsets.size() - 1); + auto segment_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); + auto segment_size_it = cuda::make_transform_iterator( cuda::make_counting_iterator(segment_index_t{0}), segment_size_op{segment_offsets_it}); // Set the k value diff --git a/cub/test/catch2_test_device_three_way_partition.cu b/cub/test/catch2_test_device_three_way_partition.cu index ced8389e96f..a6de1943e92 100644 --- a/cub/test/catch2_test_device_three_way_partition.cu +++ b/cub/test/catch2_test_device_three_way_partition.cu @@ -427,7 +427,7 @@ try { using offset_t = typename c2h::get<0, TestType>; - const offset_t num_items_max = detail::make_large_offset(); + const auto num_items_max = detail::make_large_offset(); const offset_t num_items_min = num_items_max > 10000 ? num_items_max - 10000ULL : offset_t{0}; const offset_t num_items = GENERATE_COPY( values( diff --git a/cub/test/catch2_test_device_topk_keys.cu b/cub/test/catch2_test_device_topk_keys.cu index a0410883d89..7b47493466d 100644 --- a/cub/test/catch2_test_device_topk_keys.cu +++ b/cub/test/catch2_test_device_topk_keys.cu @@ -161,7 +161,7 @@ try using comparator_t = direction_to_comparator_t; // Clamp 64-bit offset type problem sizes to just slightly larger than 2^32 items - const num_items_t num_items_max = detail::make_large_offset(); + const auto num_items_max = detail::make_large_offset(); const num_items_t num_items_min = num_items_max > 10000 ? num_items_max - 10000ULL : num_items_t{0}; const num_items_t num_items = GENERATE_COPY(values({num_items_max, static_cast(num_items_max - 1), num_items_t{1}, num_items_t{3}}), @@ -277,7 +277,7 @@ try using comparator_t = direction_to_comparator_t; // Set input size - const num_items_t num_items = detail::make_large_offset(); + const auto num_items = detail::make_large_offset(); // Set the k value const auto limit_k = static_cast( diff --git a/cub/test/catch2_test_device_topk_pairs.cu b/cub/test/catch2_test_device_topk_pairs.cu index af51baa3858..ea99262ba72 100644 --- a/cub/test/catch2_test_device_topk_pairs.cu +++ b/cub/test/catch2_test_device_topk_pairs.cu @@ -228,7 +228,7 @@ C2H_TEST("DeviceTopK::MaxPairs: Test for large num_items", "[pairs][topk][device // Set input size // Clamp 64-bit offset type problem sizes to just slightly larger than 2^32 items - const num_items_t num_items_max = detail::make_large_offset(); + const auto num_items_max = detail::make_large_offset(); const num_items_t num_items_min = num_items_max > 10000 ? num_items_max - 10000ULL : num_items_t{0}; const num_items_t num_items = GENERATE_COPY(values({num_items_max, static_cast(num_items_max - 1), num_items_t{1}, num_items_t{3}}), diff --git a/cub/test/catch2_test_device_transform.cu b/cub/test/catch2_test_device_transform.cu index 32b29a1d796..e0d5abe4e49 100644 --- a/cub/test/catch2_test_device_transform.cu +++ b/cub/test/catch2_test_device_transform.cu @@ -86,7 +86,7 @@ C2H_TEST("DeviceTransform::Transform with multiple inputs works for large number { using offset_t = c2h::get<0, TestType>; CAPTURE(c2h::type_name()); - const offset_t num_items = detail::make_large_offset(); + const auto num_items = detail::make_large_offset(); auto a_it = cuda::counting_iterator(offset_t{0}); auto b_it = cuda::constant_iterator(offset_t{42}); @@ -119,7 +119,7 @@ try using offset_t = c2h::get<0, TestType>; // make size a few thread blocks below/beyond 4GiB. 
need to make sure I32 num_items stays below 2^31 - constexpr offset_t num_items = static_cast((1ll << 31) + (sizeof(offset_t) == 4 ? -123456 : 123456)); + constexpr auto num_items = static_cast((1ll << 31) + (sizeof(offset_t) == 4 ? -123456 : 123456)); REQUIRE(num_items > 0); c2h::device_vector input(static_cast(num_items), thrust::no_init); diff --git a/cub/test/test_device_batch_copy.cu b/cub/test/test_device_batch_copy.cu index bba9b0eec7c..ff7199bfa61 100644 --- a/cub/test/test_device_batch_copy.cu +++ b/cub/test/test_device_batch_copy.cu @@ -55,7 +55,7 @@ template c2h::host_vector GetShuffledRangeOffsets(const c2h::host_vector& range_sizes, const std::uint_fast32_t seed = 320981U) { - RangeOffsetT num_ranges = static_cast(range_sizes.size()); + auto num_ranges = static_cast(range_sizes.size()); // We're remapping the i-th range to pmt_idxs[i] std::mt19937 rng(seed); @@ -380,12 +380,12 @@ int main(int argc, char** argv) for (const auto& size_range : size_ranges) { // The most granular type being copied. - using AtomicCopyT = int64_t; - RangeSizeT min_range_size = static_cast(cuda::round_up(size_range.first, sizeof(AtomicCopyT))); - RangeSizeT max_range_size = + using AtomicCopyT = int64_t; + auto min_range_size = static_cast(cuda::round_up(size_range.first, sizeof(AtomicCopyT))); + auto max_range_size = static_cast(cuda::round_up(size_range.second, static_cast(sizeof(AtomicCopyT)))); - double average_range_size = (min_range_size + max_range_size) / 2.0; - RangeOffsetT target_num_ranges = static_cast(target_copy_size / average_range_size); + double average_range_size = (min_range_size + max_range_size) / 2.0; + auto target_num_ranges = static_cast(target_copy_size / average_range_size); // Run tests with output ranges being consecutive RunTest( @@ -399,12 +399,12 @@ int main(int argc, char** argv) for (const auto& size_range : size_ranges) { // The most granular type being copied. 
- using AtomicCopyT = cuda::std::tuple; - RangeSizeT min_range_size = static_cast(cuda::round_up(size_range.first, sizeof(AtomicCopyT))); - RangeSizeT max_range_size = + using AtomicCopyT = cuda::std::tuple; + auto min_range_size = static_cast(cuda::round_up(size_range.first, sizeof(AtomicCopyT))); + auto max_range_size = static_cast(cuda::round_up(size_range.second, static_cast(sizeof(AtomicCopyT)))); - double average_range_size = (min_range_size + max_range_size) / 2.0; - RangeOffsetT target_num_ranges = static_cast(target_copy_size / average_range_size); + double average_range_size = (min_range_size + max_range_size) / 2.0; + auto target_num_ranges = static_cast(target_copy_size / average_range_size); // Run tests with output ranges being consecutive RunTest( diff --git a/cub/test/test_util.h b/cub/test/test_util.h index 8ac38867f49..7d1b526c3ad 100644 --- a/cub/test/test_util.h +++ b/cub/test/test_util.h @@ -421,7 +421,7 @@ template T RandomValue(T max) { unsigned int bits; - unsigned int max_int = (unsigned int) -1; + auto max_int = (unsigned int) -1; do { RandomBits(bits); diff --git a/libcudacxx/include/cuda/__launch/host_launch.h b/libcudacxx/include/cuda/__launch/host_launch.h index 794e873ea02..3100a653944 100644 --- a/libcudacxx/include/cuda/__launch/host_launch.h +++ b/libcudacxx/include/cuda/__launch/host_launch.h @@ -100,9 +100,8 @@ _CCCL_HOST_API void host_launch(stream_ref __stream, _Callable __callable, _Args } else { - using _CallbackData = __stream_callback_data<_Callable, _Args...>; - _CallbackData* __callback_data_ptr = - new _CallbackData{::cuda::std::move(__callable), {::cuda::std::move(__args)...}}; + using _CallbackData = __stream_callback_data<_Callable, _Args...>; + auto* __callback_data_ptr = new _CallbackData{::cuda::std::move(__callable), {::cuda::std::move(__args)...}}; // We use the callback here to have it execute even on stream error, because it needs to free the above allocation ::cuda::__driver::__streamAddCallback( diff --git a/libcudacxx/include/cuda/__random/feistel_bijection.h b/libcudacxx/include/cuda/__random/feistel_bijection.h index 91aa3ac2861..8a4be8f1eba 100644 --- a/libcudacxx/include/cuda/__random/feistel_bijection.h +++ b/libcudacxx/include/cuda/__random/feistel_bijection.h @@ -81,14 +81,14 @@ class __feistel_bijection { // Mitchell, Rory, et al. "Bandwidth-optimal random shuffling for GPUs." ACM Transactions on Parallel Computing 9.1 // (2022): 1-20. 
- uint32_t __L = static_cast(__val >> __R_bits_); - uint32_t __R = static_cast(__val & __R_mask_); - for (uint32_t __i = 0; __i < __num_rounds; __i++) + auto __L = static_cast(__val >> __R_bits_); + auto __R = static_cast(__val & __R_mask_); + for (const auto __key : __keys_) { constexpr uint64_t __m0 = 0xD2B74407B1CE6E93; const uint64_t __product = __m0 * __L; - uint32_t __F_k = (__product >> 32) ^ __keys_[__i]; - uint32_t __B_k = static_cast(__product); + uint32_t __F_k = (__product >> 32) ^ __key; + auto __B_k = static_cast(__product); uint32_t __L_prime = __F_k ^ __R; uint32_t __R_prime = (__B_k << (__R_bits_ - __L_bits_)) | __R >> __L_bits_; diff --git a/libcudacxx/include/cuda/std/__algorithm/copy_n.h b/libcudacxx/include/cuda/std/__algorithm/copy_n.h index 8556ec6f9ff..d3b99755b4b 100644 --- a/libcudacxx/include/cuda/std/__algorithm/copy_n.h +++ b/libcudacxx/include/cuda/std/__algorithm/copy_n.h @@ -39,7 +39,7 @@ _CCCL_API inline _CCCL_CONSTEXPR_CXX20 _OutputIterator copy_n(_InputIterator __first, _Size __orig_n, _OutputIterator __result) { using _IntegralSize = decltype(__convert_to_integral(__orig_n)); - _IntegralSize __n = static_cast<_IntegralSize>(__orig_n); + auto __n = static_cast<_IntegralSize>(__orig_n); if (__n > 0) { *__result = *__first; @@ -62,7 +62,7 @@ template (__orig_n); + auto __n = static_cast<_IntegralSize>(__orig_n); return ::cuda::std::copy(__first, __first + __n, __result); } diff --git a/libcudacxx/include/cuda/std/__algorithm/generate_n.h b/libcudacxx/include/cuda/std/__algorithm/generate_n.h index 233c4566d98..54e5896c066 100644 --- a/libcudacxx/include/cuda/std/__algorithm/generate_n.h +++ b/libcudacxx/include/cuda/std/__algorithm/generate_n.h @@ -31,7 +31,7 @@ template _CCCL_API constexpr _OutputIterator generate_n(_OutputIterator __first, _Size __orig_n, _Generator __gen) { using _IntegralSize = decltype(__convert_to_integral(__orig_n)); - _IntegralSize __n = static_cast<_IntegralSize>(__orig_n); + auto __n = static_cast<_IntegralSize>(__orig_n); for (; __n > 0; ++__first, (void) --__n) { *__first = __gen(); diff --git a/libcudacxx/include/cuda/std/__algorithm/search_n.h b/libcudacxx/include/cuda/std/__algorithm/search_n.h index cba94f2ecac..154b4424c51 100644 --- a/libcudacxx/include/cuda/std/__algorithm/search_n.h +++ b/libcudacxx/include/cuda/std/__algorithm/search_n.h @@ -95,7 +95,7 @@ template (__last - __first); + auto __len = static_cast<_Size>(__last - __first); if (__len < __count) { return __last; diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index 7ebca48711f..f816aa91a58 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -55,7 +55,7 @@ _CCCL_DEVICE static void __cuda_atomic_load(const _Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) { constexpr uint64_t __alignmask = (sizeof(uint16_t) - 1); - uint16_t* __aligned = (uint16_t*) ((intptr_t) __ptr & (~__alignmask)); + auto* __aligned = (uint16_t*) ((intptr_t) __ptr & (~__alignmask)); const uint8_t __offset = uint16_t((intptr_t) __ptr & __alignmask) * 8; uint16_t __value = 0; @@ -70,7 +70,7 @@ __cuda_atomic_compare_exchange(_Type* __ptr, _Type& __dst, _Type __cmp, _Type __ { constexpr uint64_t __alignmask = (sizeof(uint32_t) - 1); constexpr uint32_t __sizemask = (1 << (sizeof(_Type) * 8)) - 1; - uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & (~__alignmask)); + 
auto* __aligned = (uint32_t*) ((intptr_t) __ptr & (~__alignmask)); const uint8_t __offset = uint32_t((intptr_t) __ptr & __alignmask) * 8; const uint32_t __valueMask = __sizemask << __offset; const uint32_t __windowMask = ~__valueMask; @@ -115,7 +115,7 @@ _CCCL_DEVICE_API _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, { constexpr uint64_t __alignmask = (sizeof(uint32_t) - 1); constexpr uint32_t __sizemask = (1 << (sizeof(_Type) * 8)) - 1; - uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & (~__alignmask)); + auto* __aligned = (uint32_t*) ((intptr_t) __ptr & (~__alignmask)); const uint8_t __offset = uint8_t((intptr_t) __ptr & __alignmask) * 8; const uint32_t __valueMask = __sizemask << __offset; const uint32_t __windowMask = ~__valueMask; diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h index 479815f4136..ab4b578f7d7 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h @@ -931,8 +931,8 @@ static inline _CCCL_DEVICE void __atomic_load_cuda(const _Type* __ptr, _Type& __ { using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; - const __proxy_t* __ptr_proxy = reinterpret_cast(__ptr); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + const auto* __ptr_proxy = reinterpret_cast(__ptr); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); if (__cuda_load_weak_if_local(__ptr_proxy, __dst_proxy, sizeof(__proxy_t))) {{return;}} __cuda_atomic_bind_load<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_load{__ptr_proxy, __dst_proxy}; __cuda_atomic_load_memory_order_dispatch(__bound_load, __memorder, _Sco{}); @@ -942,8 +942,8 @@ static inline _CCCL_DEVICE void __atomic_load_cuda(const _Type volatile* __ptr, { using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; - const __proxy_t* __ptr_proxy = reinterpret_cast(const_cast<_Type*>(__ptr)); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + const auto* __ptr_proxy = reinterpret_cast(const_cast<_Type*>(__ptr)); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); if (__cuda_load_weak_if_local(__ptr_proxy, __dst_proxy, sizeof(__proxy_t))) {{return;}} __cuda_atomic_bind_load<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_load{__ptr_proxy, __dst_proxy}; __cuda_atomic_load_memory_order_dispatch(__bound_load, __memorder, _Sco{}); @@ -1364,8 +1364,8 @@ static inline _CCCL_DEVICE void __atomic_store_cuda(_Type* __ptr, _Type& __val, { using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); - __proxy_t* __val_proxy = reinterpret_cast<__proxy_t*>(&__val); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + auto* __val_proxy = reinterpret_cast<__proxy_t*>(&__val); if (__cuda_store_weak_if_local(__ptr_proxy, __val_proxy, sizeof(__proxy_t))) {{return;}} __cuda_atomic_bind_store<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_store{__ptr_proxy, __val_proxy}; __cuda_atomic_store_memory_order_dispatch(__bound_store, __memorder, _Sco{}); @@ -1375,8 +1375,8 @@ static inline _CCCL_DEVICE void __atomic_store_cuda(volatile 
_Type* __ptr, _Type { using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); - __proxy_t* __val_proxy = reinterpret_cast<__proxy_t*>(&__val); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + auto* __val_proxy = reinterpret_cast<__proxy_t*>(&__val); if (__cuda_store_weak_if_local(__ptr_proxy, __val_proxy, sizeof(__proxy_t))) {{return;}} __cuda_atomic_bind_store<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_store{__ptr_proxy, __val_proxy}; __cuda_atomic_store_memory_order_dispatch(__bound_store, __memorder, _Sco{}); @@ -1969,9 +1969,9 @@ static inline _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type* __ptr, _Ty { using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); - __proxy_t* __exp_proxy = reinterpret_cast<__proxy_t*>(__exp); - __proxy_t* __des_proxy = reinterpret_cast<__proxy_t*>(&__des); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + auto* __exp_proxy = reinterpret_cast<__proxy_t*>(__exp); + auto* __des_proxy = reinterpret_cast<__proxy_t*>(&__des); bool __res = false; if (__cuda_compare_exchange_weak_if_local(__ptr_proxy, __exp_proxy, __des_proxy, &__res)) {return __res;} __cuda_atomic_bind_compare_exchange<__proxy_t, __proxy_tag, _Sco> __bound_compare_swap{__ptr_proxy, __exp_proxy, __des_proxy}; @@ -1982,9 +1982,9 @@ static inline _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type volatile* _ { using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); - __proxy_t* __exp_proxy = reinterpret_cast<__proxy_t*>(__exp); - __proxy_t* __des_proxy = reinterpret_cast<__proxy_t*>(&__des); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + auto* __exp_proxy = reinterpret_cast<__proxy_t*>(__exp); + auto* __des_proxy = reinterpret_cast<__proxy_t*>(&__des); bool __res = false; if (__cuda_compare_exchange_weak_if_local(__ptr_proxy, __exp_proxy, __des_proxy, &__res)) {return __res;} __cuda_atomic_bind_compare_exchange<__proxy_t, __proxy_tag, _Sco> __bound_compare_swap{__ptr_proxy, __exp_proxy, __des_proxy}; @@ -2576,9 +2576,9 @@ static inline _CCCL_DEVICE void __atomic_exchange_cuda(_Type* __ptr, _Type& __ol { using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); - __proxy_t* __old_proxy = reinterpret_cast<__proxy_t*>(&__old); - __proxy_t* __new_proxy = reinterpret_cast<__proxy_t*>(&__new); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + auto* __old_proxy = reinterpret_cast<__proxy_t*>(&__old); + auto* __new_proxy = reinterpret_cast<__proxy_t*>(&__new); if(__cuda_exchange_weak_if_local(__ptr_proxy, __new_proxy, __old_proxy)) {{return;}} __cuda_atomic_bind_exchange<__proxy_t, __proxy_tag, _Sco> __bound_swap{__ptr_proxy, __old_proxy, __new_proxy}; __cuda_atomic_exchange_memory_order_dispatch(__bound_swap, __memorder, _Sco{}); @@ -2588,9 +2588,9 @@ static inline _CCCL_DEVICE void __atomic_exchange_cuda(_Type volatile* __ptr, _T { using __proxy_t 
= typename __atomic_cuda_deduce_bitwise<_Type>::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); - __proxy_t* __old_proxy = reinterpret_cast<__proxy_t*>(&__old); - __proxy_t* __new_proxy = reinterpret_cast<__proxy_t*>(&__new); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + auto* __old_proxy = reinterpret_cast<__proxy_t*>(&__old); + auto* __new_proxy = reinterpret_cast<__proxy_t*>(&__new); if(__cuda_exchange_weak_if_local(__ptr_proxy, __new_proxy, __old_proxy)) {{return;}} __cuda_atomic_bind_exchange<__proxy_t, __proxy_tag, _Sco> __bound_swap{__ptr_proxy, __old_proxy, __new_proxy}; __cuda_atomic_exchange_memory_order_dispatch(__bound_swap, __memorder, _Sco{}); @@ -3044,9 +3044,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_arithmetic<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_add_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_add<__proxy_t, __proxy_tag, _Sco> __bound_add{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_add, __memorder, _Sco{}); @@ -3060,9 +3060,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_arithmetic<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_add_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_add<__proxy_t, __proxy_tag, _Sco> __bound_add{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_add, __memorder, _Sco{}); @@ -3249,9 +3249,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_and_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_and<__proxy_t, __proxy_tag, _Sco> __bound_and{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_and, __memorder, _Sco{}); @@ -3265,9 +3265,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = 
reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_and_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_and<__proxy_t, __proxy_tag, _Sco> __bound_and{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_and, __memorder, _Sco{}); @@ -3614,9 +3614,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_max_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_max<__proxy_t, __proxy_tag, _Sco> __bound_max{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_max, __memorder, _Sco{}); @@ -3630,9 +3630,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_max_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_max<__proxy_t, __proxy_tag, _Sco> __bound_max{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_max, __memorder, _Sco{}); @@ -3979,9 +3979,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_min_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_min<__proxy_t, __proxy_tag, _Sco> __bound_min{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_min, __memorder, _Sco{}); @@ -3995,9 +3995,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_min_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_min<__proxy_t, __proxy_tag, _Sco> __bound_min{__ptr_proxy, __dst_proxy, __op_proxy}; 
__cuda_atomic_fetch_memory_order_dispatch(__bound_min, __memorder, _Sco{}); @@ -4184,9 +4184,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_or_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_or<__proxy_t, __proxy_tag, _Sco> __bound_or{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_or, __memorder, _Sco{}); @@ -4200,9 +4200,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_or_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_or<__proxy_t, __proxy_tag, _Sco> __bound_or{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_or, __memorder, _Sco{}); @@ -4389,9 +4389,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_xor_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_xor<__proxy_t, __proxy_tag, _Sco> __bound_xor{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_xor, __memorder, _Sco{}); @@ -4405,9 +4405,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_xor_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_xor<__proxy_t, __proxy_tag, _Sco> __bound_xor{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_xor, __memorder, _Sco{}); diff --git a/libcudacxx/include/cuda/std/__atomic/functions/host.h b/libcudacxx/include/cuda/std/__atomic/functions/host.h index 0a20a333ff8..1a6c594aad9 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/host.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/host.h @@ -63,8 +63,7 @@ struct 
_CCCL_ALIGNAS(sizeof(_Tp)) __atomic_alignment_wrapper template __atomic_alignment_wrapper<_Tp>* __atomic_force_align_host(_Tp* __a) { - __atomic_alignment_wrapper<_Tp>* __w = - reinterpret_cast<__atomic_alignment_wrapper<_Tp>*>(const_cast*>(__a)); + auto* __w = reinterpret_cast<__atomic_alignment_wrapper<_Tp>*>(const_cast*>(__a)); return __w; } diff --git a/libcudacxx/include/cuda/std/__complex/exponential_functions.h b/libcudacxx/include/cuda/std/__complex/exponential_functions.h index 655c3cdf062..c7b4fe24051 100644 --- a/libcudacxx/include/cuda/std/__complex/exponential_functions.h +++ b/libcudacxx/include/cuda/std/__complex/exponential_functions.h @@ -201,7 +201,7 @@ _CCCL_API inline complex exp(const complex& __x) __exp_r_ldexp_factor = -151.0f; } - const int32_t __ans_ldexp_factor = static_cast(__exp_r_ldexp_factor); + const auto __ans_ldexp_factor = static_cast(__exp_r_ldexp_factor); // Split this j up into four parts to fit it into four float exponents's. // (Splitting j in 4 better than in 3). @@ -212,8 +212,8 @@ _CCCL_API inline complex exp(const complex& __x) __ans_ldexp_factor_quarter = (__ans_ldexp_factor_quarter + 127) << 23; __ans_ldexp_factor_remainder = (__ans_ldexp_factor_remainder + 127) << 23; - const float __ldexp_factor_1 = ::cuda::std::bit_cast(__ans_ldexp_factor_quarter); - const float __ldexp_factor_2 = ::cuda::std::bit_cast(__ans_ldexp_factor_remainder); + const auto __ldexp_factor_1 = ::cuda::std::bit_cast(__ans_ldexp_factor_quarter); + const auto __ldexp_factor_2 = ::cuda::std::bit_cast(__ans_ldexp_factor_remainder); // Need to order our multiplications to avoid intermediate under/overflow, including when __sin_r is denormal. // Experiment suggests this is (one of) the better ways to do it, there's not that many combinations that work for all @@ -287,7 +287,7 @@ _CCCL_API inline complex exp(const complex& __x) __exp_r_ldexp_factor = -1076.0; } - const int64_t __ans_ldexp_factor = static_cast(__exp_r_ldexp_factor); + const auto __ans_ldexp_factor = static_cast(__exp_r_ldexp_factor); // Split this j up into four parts to fit it into four float exponents's. // (Splitting j in 4 better than in 3). @@ -298,8 +298,8 @@ _CCCL_API inline complex exp(const complex& __x) __ans_ldexp_factor_quarter = (__ans_ldexp_factor_quarter + 1023) << 52; __ans_ldexp_factor_remainder = (__ans_ldexp_factor_remainder + 1023) << 52; - const double __ldexp_factor_1 = ::cuda::std::bit_cast(__ans_ldexp_factor_quarter); - const double __ldexp_factor_2 = ::cuda::std::bit_cast(__ans_ldexp_factor_remainder); + const auto __ldexp_factor_1 = ::cuda::std::bit_cast(__ans_ldexp_factor_quarter); + const auto __ldexp_factor_2 = ::cuda::std::bit_cast(__ans_ldexp_factor_remainder); // Need to order our multiplications to avoid intermediate under/overflow, including when __sin_r is denormal. // Experiment suggests this is (one of) the better ways to do it, there's not that many combinations that work for all diff --git a/libcudacxx/include/cuda/std/__complex/inverse_hyperbolic_functions.h b/libcudacxx/include/cuda/std/__complex/inverse_hyperbolic_functions.h index b72ad2fa576..cd2473f881c 100644 --- a/libcudacxx/include/cuda/std/__complex/inverse_hyperbolic_functions.h +++ b/libcudacxx/include/cuda/std/__complex/inverse_hyperbolic_functions.h @@ -195,7 +195,7 @@ template // but not small enough that the asinh(x) ~ log(2x) estimate does // not break down. 
We are not able to reduce this with a single simple reduction, // so we do a fast/inlined frexp/ldexp: - const int32_t __exp_biased = static_cast(::cuda::std::__fp_get_storage(__max) >> __mant_nbits); + const auto __exp_biased = static_cast(::cuda::std::__fp_get_storage(__max) >> __mant_nbits); // Get a factor such that (__max * __exp_mul_factor) <= __max_allowed_exponent const __uint_t __exp_reduce_factor = @@ -491,7 +491,7 @@ template // but not small enough that the acosh(x) ~ log(2x) estimate does // not break down. We are not able to reduce this with a single simple reduction, // so we do a fast/inlined frexp/ldexp: - const int32_t __exp_biased = static_cast(::cuda::std::__fp_get_storage(__max) >> __mant_nbits); + const auto __exp_biased = static_cast(::cuda::std::__fp_get_storage(__max) >> __mant_nbits); // Get a factor such that (__max * __exp_mul_factor) <= __max_allowed_exponent const __uint_t __exp_reduce_factor = diff --git a/libcudacxx/include/cuda/std/__iterator/advance.h b/libcudacxx/include/cuda/std/__iterator/advance.h index 10ad3a20f0e..ebdc0c1d298 100644 --- a/libcudacxx/include/cuda/std/__iterator/advance.h +++ b/libcudacxx/include/cuda/std/__iterator/advance.h @@ -41,7 +41,7 @@ template ::difference_type; - _Difference __n = static_cast<_Difference>(::cuda::std::__convert_to_integral(__orig_n)); + auto __n = static_cast<_Difference>(::cuda::std::__convert_to_integral(__orig_n)); if constexpr (__has_random_access_traversal<_InputIter>) // To support pointers to incomplete types { __i += __n; diff --git a/libcudacxx/include/cuda/std/__memory/align.h b/libcudacxx/include/cuda/std/__memory/align.h index 678a4a266f3..522ac0f5a9a 100644 --- a/libcudacxx/include/cuda/std/__memory/align.h +++ b/libcudacxx/include/cuda/std/__memory/align.h @@ -46,7 +46,7 @@ _CCCL_API inline void* align(size_t __alignment, size_t __size, void*& __ptr, si char* __char_ptr = static_cast(__ptr); char* __aligned_ptr = reinterpret_cast( // NOLINT(performance-no-int-to-ptr) reinterpret_cast(__char_ptr + (__alignment - 1)) & -__alignment); - const size_t __diff = static_cast(__aligned_ptr - __char_ptr); + const auto __diff = static_cast(__aligned_ptr - __char_ptr); if (__diff > (__space - __size)) { return nullptr; diff --git a/libcudacxx/include/cuda/std/__new/allocate.h b/libcudacxx/include/cuda/std/__new/allocate.h index a52ddeef8ca..bed9a8fe825 100644 --- a/libcudacxx/include/cuda/std/__new/allocate.h +++ b/libcudacxx/include/cuda/std/__new/allocate.h @@ -79,7 +79,7 @@ _CCCL_API inline void* __cccl_allocate(size_t __size, [[maybe_unused]] size_t __ #if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() if (::cuda::std::__is_overaligned_for_new(__align)) { - const align_val_t __align_val = static_cast(__align); + const auto __align_val = static_cast(__align); return ::cuda::std::__cccl_operator_new(__size, __align_val); } #endif // _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() @@ -101,7 +101,7 @@ _CCCL_API inline void __cccl_deallocate(void* __ptr, size_t __size, [[maybe_unus #if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() if (::cuda::std::__is_overaligned_for_new(__align)) { - const align_val_t __align_val = static_cast(__align); + const auto __align_val = static_cast(__align); return ::cuda::std::__do_deallocate_handle_size(__ptr, __size, __align_val); } #endif // _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() @@ -113,7 +113,7 @@ _CCCL_API inline void __cccl_deallocate_unsized(void* __ptr, [[maybe_unused]] si #if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() if (::cuda::std::__is_overaligned_for_new(__align)) { - const align_val_t __align_val = 
static_cast(__align); + const auto __align_val = static_cast(__align); return ::cuda::std::__cccl_operator_delete(__ptr, __align_val); } #endif // _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/managed_memory_resource.cu b/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/managed_memory_resource.cu index f7054e2def5..b006d9b9398 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/managed_memory_resource.cu +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/managed_memory_resource.cu @@ -96,7 +96,7 @@ C2H_CCCLRT_TEST_LIST("managed_memory_resource allocation", "[memory_resource]", return; } #endif // _CCCL_CTK_AT_LEAST(13, 0) - managed_resource res = get_resource(); + auto res = get_resource(); cuda::stream stream{cuda::device_ref{0}}; { // allocate_sync / deallocate_sync @@ -188,9 +188,9 @@ C2H_CCCLRT_TEST_LIST("managed_memory_resource comparison", "[memory_resource]", return; } #endif // _CCCL_CTK_AT_LEAST(13, 0) - managed_resource first = get_resource(); + auto first = get_resource(); { // comparison against a plain managed_memory_resource - managed_resource second = get_resource(); + auto second = get_resource(); CHECK((first == second)); CHECK(!(first != second)); } @@ -212,7 +212,7 @@ C2H_CCCLRT_TEST_LIST("managed_memory_resource comparison", "[memory_resource]", #endif // _CCCL_CTK_AT_LEAST(13, 0) { // comparison against a managed_memory_resource wrapped inside a synchronous_resource_ref - managed_resource second = get_resource(); + auto second = get_resource(); cuda::mr::synchronous_resource_ref<::cuda::mr::device_accessible> second_ref{second}; CHECK((first == second_ref)); CHECK(!(first != second_ref)); diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/memory_pools.cu b/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/memory_pools.cu index 09a00a57b80..48da4546bb2 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/memory_pools.cu +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/memory_pools.cu @@ -138,7 +138,7 @@ C2H_CCCLRT_TEST_LIST("device_memory_pool construction", "[memory_resource]", TES using memory_pool = TestType; SECTION("Construct from device id") { - memory_pool from_device = construct_pool(current_device); + auto from_device = construct_pool(current_device); ::cudaMemPool_t get = from_device.get(); CHECK(get != current_default_pool); @@ -156,7 +156,7 @@ C2H_CCCLRT_TEST_LIST("device_memory_pool construction", "[memory_resource]", TES SECTION("Construct with empty properties") { cuda::memory_pool_properties props{}; - memory_pool from_defaulted_properties = construct_pool(current_device, props); + auto from_defaulted_properties = construct_pool(current_device, props); ::cudaMemPool_t get = from_defaulted_properties.get(); CHECK(get != current_default_pool); @@ -174,7 +174,7 @@ C2H_CCCLRT_TEST_LIST("device_memory_pool construction", "[memory_resource]", TES SECTION("Construct with initial pool size") { cuda::memory_pool_properties props = {20, 42}; - memory_pool with_threshold = construct_pool(current_device, props); + auto with_threshold = construct_pool(current_device, props); ::cudaMemPool_t get = with_threshold.get(); CHECK(get != current_default_pool); @@ -261,7 +261,7 @@ C2H_CCCLRT_TEST_LIST("base_memory_pool construction", "[memory_resource]", TEST_ else # endif // _CCCL_CTK_AT_LEAST(13, 0) { - memory_pool with_max_pool_size = construct_pool(current_device, props); + auto with_max_pool_size = 
construct_pool(current_device, props); ::cudaMemPool_t get = with_max_pool_size.get(); CHECK(get != current_default_pool); @@ -333,9 +333,9 @@ C2H_CCCLRT_TEST_LIST("device_memory_pool comparison", "[memory_resource]", TEST_ } using memory_pool = TestType; - memory_pool first = construct_pool(current_device); + auto first = construct_pool(current_device); { // comparison against a plain device_memory_pool - memory_pool second = construct_pool(current_device); + auto second = construct_pool(current_device); CHECK(first == first); CHECK(first != second); } @@ -357,7 +357,7 @@ C2H_CCCLRT_TEST_LIST("device_memory_pool accessors", "[memory_resource]", TEST_T using memory_resource = typename memory_pool::reference_type; SECTION("device_memory_pool::set_attribute") { - memory_pool pool = construct_pool(current_device); + auto pool = construct_pool(current_device); { // cudaMemPoolReuseFollowEventDependencies // Get the attribute value @@ -511,7 +511,7 @@ C2H_CCCLRT_TEST_LIST("device_memory_pool accessors", "[memory_resource]", TEST_T SECTION("device_memory_pool::trim_to") { - memory_pool pool = construct_pool(current_device); + auto pool = construct_pool(current_device); // prime the pool to a given size memory_resource resource{pool}; @@ -566,8 +566,8 @@ C2H_CCCLRT_TEST_LIST("device_memory_pool accessors", "[memory_resource]", TEST_T SECTION("memory_pool::as_ref") { - memory_pool pool = construct_pool(current_device); - auto ref = pool.as_ref(); + auto pool = construct_pool(current_device); + auto ref = pool.as_ref(); static_assert(!cuda::std::copyable); static_assert(cuda::std::copyable); CHECK(ref == pool); diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/pinned_memory_resource.cu b/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/pinned_memory_resource.cu index 56b5a7e7174..51d5a80e4fd 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/pinned_memory_resource.cu +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/pinned_memory_resource.cu @@ -83,7 +83,7 @@ C2H_CCCLRT_TEST_LIST("pinned_memory_resource allocation", "[memory_resource]", T return; } #endif // _CCCL_CTK_AT_LEAST(12, 9) - pinned_resource res = get_resource(); + auto res = get_resource(); cuda::stream stream{cuda::device_ref{0}}; { // allocate_sync / deallocate_sync @@ -202,15 +202,15 @@ static_assert(cuda::mr::synchronous_resource, ""); C2H_CCCLRT_TEST_LIST("pinned_memory_resource comparison", "[memory_resource]", TEST_TYPES) { using pinned_resource = TestType; - pinned_resource first = get_resource(); + auto first = get_resource(); { // comparison against a plain pinned_memory_resource - pinned_resource second = get_resource(); + auto second = get_resource(); CHECK((first == second)); CHECK(!(first != second)); } { // comparison against a pinned_memory_resource wrapped inside a synchronous_resource_ref - pinned_resource second = get_resource(); + auto second = get_resource(); cuda::mr::synchronous_resource_ref<::cuda::mr::device_accessible> const second_ref{second}; CHECK((first == second_ref)); @@ -221,7 +221,7 @@ C2H_CCCLRT_TEST_LIST("pinned_memory_resource comparison", "[memory_resource]", T if constexpr (cuda::mr::resource) { // comparison against a pinned_memory_resource wrapped inside a resource_ref - pinned_resource second = get_resource(); + auto second = get_resource(); cuda::mr::resource_ref<::cuda::mr::device_accessible> second_ref{second}; CHECK((first == second_ref)); diff --git a/thrust/examples/bucket_sort2d.cu b/thrust/examples/bucket_sort2d.cu index 
25256f83f2a..1568de2f31d 100644 --- a/thrust/examples/bucket_sort2d.cu +++ b/thrust/examples/bucket_sort2d.cu @@ -37,8 +37,8 @@ struct point_to_bucket_index __host__ __device__ unsigned int operator()(const vec2& v) const { // find the raster indices of p's bucket - unsigned int x = static_cast(cuda::std::get<0>(v) * width); - unsigned int y = static_cast(cuda::std::get<1>(v) * height); + auto x = static_cast(cuda::std::get<0>(v) * width); + auto y = static_cast(cuda::std::get<1>(v) * height); // return the bucket's linear index return y * width + x; diff --git a/thrust/testing/copy.cu b/thrust/testing/copy.cu index 73a6892d66e..d96a9488bf4 100644 --- a/thrust/testing/copy.cu +++ b/thrust/testing/copy.cu @@ -30,8 +30,8 @@ void TestCopyFromConstIterator() std::vector v{0, 1, 2, 3, 4}; - std::vector::const_iterator begin = v.begin(); - std::vector::const_iterator end = v.end(); + auto begin = v.begin(); + auto end = v.end(); // copy to host_vector thrust::host_vector h(5, (T) 10); @@ -196,7 +196,7 @@ void TestCopyListTo() ASSERT_EQUAL(l.size(), 5lu); - typename std::list::const_iterator iter = l.begin(); + auto iter = l.begin(); ASSERT_EQUAL(*iter, T(0)); iter++; ASSERT_EQUAL(*iter, T(1)); @@ -466,7 +466,7 @@ void TestCopyIfNonTrivial() std::fill(buffer.begin(), buffer.end(), static_cast(0)); object_with_non_trivial_ctor initialized; - object_with_non_trivial_ctor* uninitialized = reinterpret_cast(buffer.data()); + auto* uninitialized = reinterpret_cast(buffer.data()); object_with_non_trivial_ctor source(42); initialized = source; diff --git a/thrust/testing/copy_n.cu b/thrust/testing/copy_n.cu index 9ee3039a706..35707f654fc 100644 --- a/thrust/testing/copy_n.cu +++ b/thrust/testing/copy_n.cu @@ -18,7 +18,7 @@ void TestCopyNFromConstIterator() std::vector v{0, 1, 2, 3, 4}; - std::vector::const_iterator begin = v.begin(); + auto begin = v.begin(); // copy to host_vector thrust::host_vector h(5, (T) 10); @@ -150,7 +150,7 @@ void TestCopyNListTo() ASSERT_EQUAL(l.size(), 5lu); - typename std::list::const_iterator iter = l.begin(); + auto iter = l.begin(); ASSERT_EQUAL(*iter, T(0)); iter++; ASSERT_EQUAL(*iter, T(1)); diff --git a/thrust/testing/cuda/copy_if.cu b/thrust/testing/cuda/copy_if.cu index b423713e911..45c960bc07b 100644 --- a/thrust/testing/cuda/copy_if.cu +++ b/thrust/testing/cuda/copy_if.cu @@ -306,7 +306,7 @@ void TestCopyIfWithMagnitude(int magnitude) auto selected_out_end = thrust::copy_if(begin, end, copied_out.begin(), mod_n{match_every_nth}); // Ensure number of selected items are correct - offset_t num_selected_out = static_cast(::cuda::std::distance(copied_out.begin(), selected_out_end)); + auto num_selected_out = static_cast(::cuda::std::distance(copied_out.begin(), selected_out_end)); ASSERT_EQUAL(num_selected_out, expected_num_copied); copied_out.resize(expected_num_copied); @@ -343,7 +343,7 @@ void TestCopyIfStencilWithMagnitude(int magnitude) auto selected_out_end = thrust::copy_if(begin, end, stencil, copied_out.begin(), mod_n{match_every_nth}); // Ensure number of selected items are correct - offset_t num_selected_out = static_cast(::cuda::std::distance(copied_out.begin(), selected_out_end)); + auto num_selected_out = static_cast(::cuda::std::distance(copied_out.begin(), selected_out_end)); ASSERT_EQUAL(num_selected_out, expected_num_copied); copied_out.resize(expected_num_copied); diff --git a/thrust/testing/cuda/unique.cu b/thrust/testing/cuda/unique.cu index 04675a91a09..872f04973d4 100644 --- a/thrust/testing/cuda/unique.cu +++ b/thrust/testing/cuda/unique.cu @@ 
-389,8 +389,8 @@ void TestUniqueWithMagnitude(int magnitude) using offset_t = std::int64_t; using equality_op_t = div_n_equality_op; - offset_t run_length_of_equal_items = offset_t{10}; - equality_op_t equality_op = equality_op_t{run_length_of_equal_items}; + auto run_length_of_equal_items = offset_t{10}; + auto equality_op = equality_op_t{run_length_of_equal_items}; // Prepare input offset_t num_items = offset_t{1ull} << magnitude; @@ -403,7 +403,7 @@ void TestUniqueWithMagnitude(int magnitude) auto unique_out_end = thrust::unique_copy(begin, end, unique_out.begin(), equality_op); // Ensure number of selected items are correct - offset_t num_selected_out = static_cast(cuda::std::distance(unique_out.begin(), unique_out_end)); + auto num_selected_out = static_cast(cuda::std::distance(unique_out.begin(), unique_out_end)); ASSERT_EQUAL(num_selected_out, expected_num_unique); unique_out.resize(expected_num_unique); diff --git a/thrust/testing/zip_iterator_reduce_by_key.cu b/thrust/testing/zip_iterator_reduce_by_key.cu index acf066c8112..56b81b467e8 100644 --- a/thrust/testing/zip_iterator_reduce_by_key.cu +++ b/thrust/testing/zip_iterator_reduce_by_key.cu @@ -71,7 +71,7 @@ struct TestZipIteratorReduceByKey // The tests below get miscompiled on Tesla hw for 8b types #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA - if (const CUDATestDriver* driver = dynamic_cast(&UnitTestDriver::s_driver())) + if (const auto* driver = dynamic_cast(&UnitTestDriver::s_driver())) { if (typeid(T) == typeid(unittest::uint8_t) && driver->current_device_architecture() < 200) { diff --git a/thrust/thrust/detail/execute_with_allocator.h b/thrust/thrust/detail/execute_with_allocator.h index 233d1466a19..e05c6e983cd 100644 --- a/thrust/thrust/detail/execute_with_allocator.h +++ b/thrust/thrust/detail/execute_with_allocator.h @@ -37,7 +37,7 @@ get_temporary_buffer(thrust::detail::execute_with_allocator(::cuda::ceil_div(sizeof(T) * n, sizeof(value_type))); + const auto num_elements = static_cast(::cuda::ceil_div(sizeof(T) * n, sizeof(value_type))); void_pointer ptr = alloc_traits::allocate(system.get_allocator(), num_elements); @@ -58,7 +58,7 @@ _CCCL_HOST void return_temporary_buffer( size_type num_elements = ::cuda::ceil_div(sizeof(T) * n, sizeof(value_type)); - pointer to_ptr = thrust::reinterpret_pointer_cast(p); + auto to_ptr = thrust::reinterpret_pointer_cast(p); alloc_traits::deallocate(system.get_allocator(), to_ptr, num_elements); } } // namespace detail diff --git a/thrust/thrust/detail/random_bijection.h b/thrust/thrust/detail/random_bijection.h index 4c38808dc4d..124c5e3e9da 100644 --- a/thrust/thrust/detail/random_bijection.h +++ b/thrust/thrust/detail/random_bijection.h @@ -58,14 +58,14 @@ class feistel_bijection // We cannot use the above because thrust PRNG generators incorrectly implement URBG requirements. // Mitchell, Rory, et al. "Bandwidth-optimal random shuffling for GPUs." ACM Transactions on Parallel Computing 9.1 // (2022): 1-20. 
-    uint32_t L = static_cast<uint32_t>(val >> R_bits);
-    uint32_t R = static_cast<uint32_t>(val & R_mask);
-    for (uint32_t i = 0; i < num_rounds; i++)
+    auto L = static_cast<uint32_t>(val >> R_bits);
+    auto R = static_cast<uint32_t>(val & R_mask);
+    for (const auto k : key)
     {
       constexpr uint64_t m0 = 0xD2B74407B1CE6E93;
       const uint64_t product = m0 * L;
-      uint32_t F_k = (product >> 32) ^ key[i];
-      uint32_t B_k = static_cast<uint32_t>(product);
+      uint32_t F_k = (product >> 32) ^ k;
+      auto B_k = static_cast<uint32_t>(product);
       uint32_t L_prime = F_k ^ R;
       uint32_t R_prime = (B_k << (R_bits - L_bits)) | R >> L_bits;
diff --git a/thrust/thrust/detail/reference.h b/thrust/thrust/detail/reference.h
index 8f6ee12e6b5..142e26ad5a1 100644
--- a/thrust/thrust/detail/reference.h
+++ b/thrust/thrust/detail/reference.h
@@ -439,7 +439,7 @@ class reference
   template
   _CCCL_HOST_DEVICE value_type strip_const_get_value(System const& system) const
   {
-    System& non_const_system = const_cast<System&>(system);
+    auto& non_const_system = const_cast<System&>(system);
     using thrust::system::detail::generic::get_value;
     return get_value(thrust::detail::derived_cast(non_const_system), ptr);
@@ -464,7 +464,7 @@ class reference
   template
   _CCCL_HOST_DEVICE void strip_const_assign_value(System const& system, OtherPointer src) const
   {
-    System& non_const_system = const_cast<System&>(system);
+    auto& non_const_system = const_cast<System&>(system);
     using thrust::system::detail::generic::assign_value;
     assign_value(thrust::detail::derived_cast(non_const_system), ptr, src);
diff --git a/thrust/thrust/detail/temporary_buffer.h b/thrust/thrust/detail/temporary_buffer.h
index dbc61685e33..165712299d2 100644
--- a/thrust/thrust/detail/temporary_buffer.h
+++ b/thrust/thrust/detail/temporary_buffer.h
@@ -43,8 +43,7 @@ _CCCL_HOST_DEVICE ::cuda::std::pair, down_cast_pair(Pair p)
 {
   // XXX should use a hypothetical thrust::static_pointer_cast here
-  thrust::pointer ptr =
-    thrust::pointer(static_cast(thrust::raw_pointer_cast(p.first)));
+  auto ptr = thrust::pointer(static_cast(thrust::raw_pointer_cast(p.first)));
   using result_type = ::cuda::std::pair, typename thrust::pointer::difference_type>;
diff --git a/thrust/thrust/mr/pool.h b/thrust/thrust/mr/pool.h
index 12773e90dac..7d644934120 100644
--- a/thrust/thrust/mr/pool.h
+++ b/thrust/thrust/mr/pool.h
@@ -223,7 +223,7 @@ class unsynchronized_pool_resource final
       chunk_descriptor_ptr alloc = m_allocated;
       m_allocated = thrust::raw_reference_cast(*m_allocated).next;
-      void_ptr p = static_cast(
-        static_cast(static_cast(alloc)) - thrust::raw_reference_cast(*alloc).size);
+      auto p = static_cast(
+        static_cast(static_cast(alloc)) - thrust::raw_reference_cast(*alloc).size);
       m_upstream->do_deallocate(
         p, thrust::raw_reference_cast(*alloc).size + sizeof(chunk_descriptor), m_options.alignment);
@@ -237,7 +237,7 @@ class unsynchronized_pool_resource final
       oversized_block_descriptor desc = thrust::raw_reference_cast(*alloc);
-      void_ptr p = static_cast(static_cast(static_cast(alloc)) - desc.current_size);
+      auto p = static_cast(static_cast(static_cast(alloc)) - desc.current_size);
       m_upstream->do_deallocate(p, desc.size + sizeof(oversized_block_descriptor), desc.alignment);
     }
@@ -334,7 +334,7 @@ class unsynchronized_pool_resource final
     // no fitting cached block found; allocate a new one that's just up to the specs
     void_ptr allocated = m_upstream->do_allocate(bytes + sizeof(oversized_block_descriptor), alignment);
-    oversized_block_descriptor_ptr block =
+    auto block =
       static_cast(static_cast(static_cast(allocated) + bytes));
     oversized_block_descriptor desc;
@@ -397,7 +397,7 @@ class unsynchronized_pool_resource final
     std::size_t chunk_size = block_size * n;
     void_ptr allocated = m_upstream->do_allocate(chunk_size + sizeof(chunk_descriptor), m_options.alignment);
-    chunk_descriptor_ptr chunk =
+    auto chunk =
       static_cast(static_cast(static_cast(allocated) + chunk_size));
     chunk_descriptor chunk_desc;
@@ -408,7 +408,7 @@ class unsynchronized_pool_resource final
     for (std::size_t i = 0; i < n; ++i)
     {
-      block_descriptor_ptr block = static_cast(
-        static_cast(static_cast(allocated) + block_size * i + bytes));
+      auto block = static_cast(
+        static_cast(static_cast(allocated) + block_size * i + bytes));
       block_descriptor block_desc;
@@ -435,8 +435,7 @@ class unsynchronized_pool_resource final
     // the deallocated block is oversized and/or overaligned
     if (n > m_options.largest_block_size || alignment > m_options.alignment)
     {
-      oversized_block_descriptor_ptr block =
-        static_cast(static_cast(static_cast(p) + n));
+      auto block = static_cast(static_cast(static_cast(p) + n));
       oversized_block_descriptor desc = *block;
       assert(desc.current_size == n);
@@ -498,7 +497,7 @@ class unsynchronized_pool_resource final
     n = static_cast(1) << n_log2;
-    block_descriptor_ptr block = static_cast(static_cast(static_cast(p) + n));
+    auto block = static_cast(static_cast(static_cast(p) + n));
     block_descriptor desc;
     desc.next = bucket.free_list;
diff --git a/thrust/thrust/random/detail/normal_distribution_base.h b/thrust/thrust/random/detail/normal_distribution_base.h
index ebceb9b27bc..9df8ade382e 100644
--- a/thrust/thrust/random/detail/normal_distribution_base.h
+++ b/thrust/thrust/random/detail/normal_distribution_base.h
@@ -42,10 +42,10 @@ class normal_distribution_nvcc
     constexpr uint_type urng_range = UniformRandomNumberGenerator::max - UniformRandomNumberGenerator::min;
     // Constants for conversion
-    constexpr RealType S1 = static_cast<RealType>(1. / static_cast(urng_range));
+    constexpr auto S1 = static_cast<RealType>(1. / static_cast(urng_range));
     constexpr RealType S2 = S1 / 2;
-    RealType S3 = static_cast<RealType>(-1.4142135623730950488016887242097); // -sqrt(2)
+    auto S3 = static_cast<RealType>(-1.4142135623730950488016887242097); // -sqrt(2)
     // Get the integer value
     uint_type u = urng() - UniformRandomNumberGenerator::min;
@@ -118,7 +118,7 @@ class normal_distribution_portable
       m_valid = false;
     }
-    const RealType pi = RealType(3.14159265358979323846);
+    const auto pi = RealType(3.14159265358979323846);
     RealType result = m_cached_rho * (m_valid ? cos(RealType(2) * pi * m_r1) : sin(RealType(2) * pi * m_r1));
diff --git a/thrust/thrust/random/detail/uniform_int_distribution.inl b/thrust/thrust/random/detail/uniform_int_distribution.inl
index dddca6aa706..111bd751708 100644
--- a/thrust/thrust/random/detail/uniform_int_distribution.inl
+++ b/thrust/thrust/random/detail/uniform_int_distribution.inl
@@ -54,8 +54,8 @@ uniform_int_distribution::operator()(UniformRandomNumberGenerator& urng
   using float_type = typename thrust::detail::largest_available_float::type;
-  const float_type real_min(static_cast<float_type>(parm.first));
-  const float_type real_max(static_cast<float_type>(parm.second));
+  const auto real_min(static_cast<float_type>(parm.first));
+  const auto real_max(static_cast<float_type>(parm.second));
   // add one to the right end of the interval because it is half-open
   // XXX adding 1.0 to a potentially large floating point number seems like a bad idea
diff --git a/thrust/thrust/random/detail/uniform_real_distribution.inl b/thrust/thrust/random/detail/uniform_real_distribution.inl
index 3f026ecfc89..294ad316911 100644
--- a/thrust/thrust/random/detail/uniform_real_distribution.inl
+++ b/thrust/thrust/random/detail/uniform_real_distribution.inl
@@ -49,7 +49,7 @@ _CCCL_HOST_DEVICE typename uniform_real_distribution::result_type
 uniform_real_distribution::operator()(UniformRandomNumberGenerator& urng, const param_type& parm)
 {
   // call the urng & map its result to [0,1)
-  result_type result = static_cast<result_type>(urng() - UniformRandomNumberGenerator::min);
+  auto result = static_cast<result_type>(urng() - UniformRandomNumberGenerator::min);
   // adding one to the denominator ensures that the interval is half-open at 1.0
   // XXX adding 1.0 to a potentially large floating point number seems like a bad idea
diff --git a/thrust/thrust/system/cuda/detail/copy_if.h b/thrust/thrust/system/cuda/detail/copy_if.h
index 65533cb08c6..52b0f204d28 100644
--- a/thrust/thrust/system/cuda/detail/copy_if.h
+++ b/thrust/thrust/system/cuda/detail/copy_if.h
@@ -123,7 +123,7 @@ struct DispatchCopyIf
     }
     // Memory allocation for the number of selected output items
-    OffsetT* d_num_selected_out = thrust::detail::aligned_reinterpret_cast<OffsetT*>(allocations[1]);
+    auto* d_num_selected_out = thrust::detail::aligned_reinterpret_cast<OffsetT*>(allocations[1]);
     // Run algorithm
     status = cub::
@@ -166,7 +166,7 @@ THRUST_RUNTIME_FUNCTION OutputIt copy_if(
 {
   using size_type = thrust::detail::it_difference_t;
-  size_type num_items = static_cast<size_type>(::cuda::std::distance(first, last));
+  auto num_items = static_cast<size_type>(::cuda::std::distance(first, last));
   cudaError_t status = cudaSuccess;
   size_t temp_storage_bytes = 0;
diff --git a/thrust/thrust/system/cuda/detail/cross_system.h b/thrust/thrust/system/cuda/detail/cross_system.h
index e2f94b123a5..2014c5a2ec5 100644
--- a/thrust/thrust/system/cuda/detail/cross_system.h
+++ b/thrust/thrust/system/cuda/detail/cross_system.h
@@ -200,8 +200,8 @@ _CCCL_HOST_DEVICE auto select_device_system(thrust::cuda::execution_policy
 _CCCL_HOST_DEVICE cross_system select_system(
   execution_policy const& sys1, thrust::cpp::execution_policy const& sys2)
 {
-  thrust::execution_policy& non_const_sys1 = const_cast&>(sys1);
-  thrust::cpp::execution_policy& non_const_sys2 = const_cast&>(sys2);
+  thrust::execution_policy& non_const_sys1 = const_cast&>(sys1);
+  auto& non_const_sys2 = const_cast&>(sys2);
   return cross_system(non_const_sys1, non_const_sys2);
 }
@@ -210,8 +210,8 @@ template
 _CCCL_HOST_DEVICE cross_system
 select_system(thrust::cpp::execution_policy const& sys1, execution_policy const& sys2)
 {
-  thrust::cpp::execution_policy& non_const_sys1 = const_cast&>(sys1);
-  thrust::execution_policy& non_const_sys2 = const_cast&>(sys2);
+  auto& non_const_sys1 = const_cast&>(sys1);
+  thrust::execution_policy& non_const_sys2 = const_cast&>(sys2);
   return cross_system(non_const_sys1, non_const_sys2);
 }
 } // namespace cuda_cub
diff --git a/thrust/thrust/system/cuda/detail/extrema.h b/thrust/thrust/system/cuda/detail/extrema.h
index d2d9f69dc80..fce273173a2 100644
--- a/thrust/thrust/system/cuda/detail/extrema.h
+++ b/thrust/thrust/system/cuda/detail/extrema.h
@@ -347,7 +347,7 @@ element(execution_policy& policy, ItemsIt first, ItemsIt last, BinaryPr
   using InputType = thrust::detail::it_value_t;
   using IndexType = thrust::detail::it_difference_t;
-  IndexType num_items = static_cast<IndexType>(::cuda::std::distance(first, last));
+  auto num_items = static_cast<IndexType>(::cuda::std::distance(first, last));
   using iterator_tuple = ::cuda::std::tuple>;
   using zip_iterator = thrust::zip_iterator;
diff --git a/thrust/thrust/system/cuda/detail/find.h b/thrust/thrust/system/cuda/detail/find.h
index 90952c0cca2..46170484851 100644
--- a/thrust/thrust/system/cuda/detail/find.h
+++ b/thrust/thrust/system/cuda/detail/find.h
@@ -69,8 +69,8 @@ find_if_n_impl(execution_policy& policy, InputIt first, Size num_items,
     thrust::detail::temporary_array tmp(policy, sizeof(adjusted_size_type) + tmp_size);
     // Run find_if.
-    adjusted_size_type* result_ptr = thrust::detail::aligned_reinterpret_cast<adjusted_size_type*>(tmp.data().get());
-    void* tmp_ptr = static_cast<void*>((tmp.data() + sizeof(adjusted_size_type)).get());
+    auto* result_ptr = thrust::detail::aligned_reinterpret_cast<adjusted_size_type*>(tmp.data().get());
+    void* tmp_ptr = static_cast<void*>((tmp.data() + sizeof(adjusted_size_type)).get());
     status = cub::DeviceFind::FindIf(tmp_ptr, tmp_size, first, result_ptr, predicate, num_items_fixed, stream);
     cuda_cub::throw_on_error(status, "find_if: failed to run algorithm");
diff --git a/thrust/thrust/system/cuda/detail/for_each.h b/thrust/thrust/system/cuda/detail/for_each.h
index 6ca50350d17..8655cb14983 100644
--- a/thrust/thrust/system/cuda/detail/for_each.h
+++ b/thrust/thrust/system/cuda/detail/for_each.h
@@ -50,7 +50,7 @@ template
 Input _CCCL_API _CCCL_FORCEINLINE for_each(execution_policy& policy, Input first, Input last, UnaryOp op)
 {
   using size_type = thrust::detail::it_difference_t;
-  size_type count = static_cast<size_type>(::cuda::std::distance(first, last));
+  auto count = static_cast<size_type>(::cuda::std::distance(first, last));
   return THRUST_NS_QUALIFIER::cuda_cub::for_each_n(policy, first, count, op);
 }
diff --git a/thrust/thrust/system/cuda/detail/partition.h b/thrust/thrust/system/cuda/detail/partition.h
index 96e7a536b0c..d270c4de479 100644
--- a/thrust/thrust/system/cuda/detail/partition.h
+++ b/thrust/thrust/system/cuda/detail/partition.h
@@ -100,7 +100,7 @@ struct DispatchPartitionIf
     }
     // Memory allocation for the number of selected output items
-    OffsetT* d_num_selected_out = thrust::detail::aligned_reinterpret_cast<OffsetT*>(allocations[1]);
+    auto* d_num_selected_out = thrust::detail::aligned_reinterpret_cast<OffsetT*>(allocations[1]);
     // Run algorithm
     status = cub::DispatchSelectIf<
@@ -200,7 +200,7 @@ THRUST_RUNTIME_FUNCTION ::cuda::std::pair stable_p
   }
   using output_it_wrapper_t = cub::detail::select::partition_distinct_output_t;
-  std::size_t num_items = static_cast<std::size_t>(::cuda::std::distance(first, last));
+  auto num_items = static_cast<std::size_t>(::cuda::std::distance(first, last));
   std::size_t num_selected =
     partition(policy, first, last, stencil, output_it_wrapper_t{selected_result, rejected_result}, predicate);
   return ::cuda::std::make_pair(selected_result + num_selected, rejected_result + num_items - num_selected);
@@ -216,8 +216,8 @@ THRUST_RUNTIME_FUNCTION InputIt inplace_partition(
   }
   // Element type of the input iterator
-  using value_t = thrust::detail::it_value_t;
-  std::size_t num_items = static_cast<std::size_t>(::cuda::std::distance(first, last));
+  using value_t = thrust::detail::it_value_t;
+  auto num_items = static_cast<std::size_t>(::cuda::std::distance(first, last));
   // Allocate temporary storage, which will serve as the input to the partition
   thrust::detail::temporary_array tmp(policy, num_items);
diff --git a/thrust/thrust/system/cuda/detail/reduce.h b/thrust/thrust/system/cuda/detail/reduce.h
index b8d326e27ca..664c762df2c 100644
--- a/thrust/thrust/system/cuda/detail/reduce.h
+++ b/thrust/thrust/system/cuda/detail/reduce.h
@@ -277,7 +277,7 @@ struct ReduceAgent
       T items[ITEMS_PER_THREAD];
-      Vector* vec_items = reinterpret_cast<Vector*>(items);
+      auto* vec_items = reinterpret_cast<Vector*>(items);
      // Vector Input iterator wrapper type (for applying cache modifier)
       T* d_in_unqualified = const_cast(input_it) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
@@ -700,7 +700,7 @@ _CCCL_HOST_DEVICE T reduce(execution_policy& policy, InputIt first, Inp
 {
   using size_type = thrust::detail::it_difference_t;
   // FIXME: Check for RA iterator.
-  size_type num_items = static_cast<size_type>(::cuda::std::distance(first, last));
+  auto num_items = static_cast<size_type>(::cuda::std::distance(first, last));
   return cuda_cub::reduce_n(policy, first, num_items, init, binary_op);
 }
@@ -724,7 +724,7 @@ reduce_into(execution_policy& policy, InputIt first, InputIt last, Outp
 {
   using size_type = thrust::detail::it_difference_t;
   // FIXME: Check for RA iterator.
-  size_type num_items = static_cast<size_type>(::cuda::std::distance(first, last));
+  auto num_items = static_cast<size_type>(::cuda::std::distance(first, last));
   cuda_cub::reduce_n_into(policy, first, num_items, output, init, binary_op);
 }
diff --git a/thrust/thrust/system/cuda/detail/set_operations.h b/thrust/thrust/system/cuda/detail/set_operations.h
index 6caa5126960..9d5973b99cf 100644
--- a/thrust/thrust/system/cuda/detail/set_operations.h
+++ b/thrust/thrust/system/cuda/detail/set_operations.h
@@ -1059,8 +1059,8 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
   status = tile_state.Init(static_cast(num_tiles), allocations[0], allocation_sizes[0]);
   _CUDA_CUB_RET_IF_FAIL(status);
-  ::cuda::std::pair* partitions = (::cuda::std::pair*) allocations[1];
-  char* vshmem_ptr = vshmem_storage > 0 ? (char*) allocations[2] : nullptr;
+  auto* partitions = (::cuda::std::pair*) allocations[1];
+  char* vshmem_ptr = vshmem_storage > 0 ? (char*) allocations[2] : nullptr;
   init_agent ia(init_plan, num_tiles, stream, "set_op::init_agent");
   ia.launch(tile_state, num_tiles);
@@ -1115,8 +1115,8 @@ THRUST_RUNTIME_FUNCTION ::cuda::std::pair set_oper
 {
   using size_type = thrust::detail::it_difference_t;
-  size_type num_keys1 = static_cast<size_type>(::cuda::std::distance(keys1_first, keys1_last));
-  size_type num_keys2 = static_cast<size_type>(::cuda::std::distance(keys2_first, keys2_last));
+  auto num_keys1 = static_cast<size_type>(::cuda::std::distance(keys1_first, keys1_last));
+  auto num_keys2 = static_cast<size_type>(::cuda::std::distance(keys2_first, keys2_last));
   if (num_keys1 + num_keys2 == 0)
   {
@@ -1163,7 +1163,7 @@ THRUST_RUNTIME_FUNCTION ::cuda::std::pair set_oper
     status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes);
     cuda_cub::throw_on_error(status, "set_operations failed on 2nd alias_storage");
-    std::size_t* d_output_count = thrust::detail::aligned_reinterpret_cast<std::size_t*>(allocations[0]);
+    auto* d_output_count = thrust::detail::aligned_reinterpret_cast<std::size_t*>(allocations[0]);
     THRUST_DOUBLE_INDEX_TYPE_DISPATCH(
       status,
diff --git a/thrust/thrust/system/cuda/detail/sort.h b/thrust/thrust/system/cuda/detail/sort.h
index 8fd01877e8c..1169be16bc5 100644
--- a/thrust/thrust/system/cuda/detail/sort.h
+++ b/thrust/thrust/system/cuda/detail/sort.h
@@ -128,7 +128,7 @@ THRUST_RUNTIME_FUNCTION void merge_sort(
 {
   using size_type = thrust::detail::it_difference_t;
-  size_type count = static_cast<size_type>(::cuda::std::distance(keys_first, keys_last));
+  auto count = static_cast<size_type>(::cuda::std::distance(keys_first, keys_last));
   size_t storage_size = 0;
   cudaStream_t stream = cuda_cub::stream(policy);
diff --git a/thrust/thrust/system/cuda/detail/transform_reduce.h b/thrust/thrust/system/cuda/detail/transform_reduce.h
index 0782fb5dcdf..47ca05d8b91 100644
--- a/thrust/thrust/system/cuda/detail/transform_reduce.h
+++ b/thrust/thrust/system/cuda/detail/transform_reduce.h
@@ -102,8 +102,8 @@ template
 & policy, InputIt first, InputIt last, TransformOp transform_op, T init, ReduceOp reduce_op)
 {
-  using size_type = thrust::detail::it_difference_t;
-  const size_type num_items = static_cast<size_type>(::cuda::std::distance(first, last));
+  using size_type = thrust::detail::it_difference_t;
+  const auto num_items = static_cast<size_type>(::cuda::std::distance(first, last));
   THRUST_CDP_DISPATCH(
     (init = thrust::cuda_cub::detail::transform_reduce_n_impl(policy, first, num_items, transform_op, init, reduce_op);),
diff --git a/thrust/thrust/system/cuda/detail/transform_scan.h b/thrust/thrust/system/cuda/detail/transform_scan.h
index 19cb48c54f6..3e5755cd96e 100644
--- a/thrust/thrust/system/cuda/detail/transform_scan.h
+++ b/thrust/thrust/system/cuda/detail/transform_scan.h
@@ -40,7 +40,7 @@ OutputIt _CCCL_HOST_DEVICE transform_inclusive_scan(
   using value_type = ::cuda::std::remove_cvref_t;
   using size_type = thrust::detail::it_difference_t;
-  size_type num_items = static_cast<size_type>(::cuda::std::distance(first, last));
+  auto num_items = static_cast<size_type>(::cuda::std::distance(first, last));
   using transformed_iterator_t = transform_iterator;
   return cuda_cub::inclusive_scan_n(policy, transformed_iterator_t(first, transform_op), num_items, result, scan_op);
@@ -61,7 +61,7 @@ OutputIt _CCCL_HOST_DEVICE transform_inclusive_scan(
   using value_type = ::cuda::std::remove_cvref_t;
   using size_type = thrust::detail::it_difference_t;
-  size_type num_items = static_cast<size_type>(::cuda::std::distance(first, last));
+  auto num_items = static_cast<size_type>(::cuda::std::distance(first, last));
   using transformed_iterator_t = transform_iterator;
   return cuda_cub::inclusive_scan_n(
@@ -82,7 +82,7 @@ OutputIt _CCCL_HOST_DEVICE transform_exclusive_scan(
   using result_type = ::cuda::std::remove_cvref_t;
   using size_type = thrust::detail::it_difference_t;
-  size_type num_items = static_cast<size_type>(::cuda::std::distance(first, last));
+  auto num_items = static_cast<size_type>(::cuda::std::distance(first, last));
   using transformed_iterator_t = transform_iterator;
   return cuda_cub::exclusive_scan_n(
diff --git a/thrust/thrust/system/cuda/detail/unique.h b/thrust/thrust/system/cuda/detail/unique.h
index e6544b79d01..738346fd409 100644
--- a/thrust/thrust/system/cuda/detail/unique.h
+++ b/thrust/thrust/system/cuda/detail/unique.h
@@ -89,7 +89,7 @@ THRUST_RUNTIME_FUNCTION cudaError_t dispatch_select_unique(
   void* allocations[2] = {nullptr, nullptr};
   // The flag iterator is not used for unique, so we set it to nullptr.
-  flag_iterator_t flag_it = static_cast<flag_iterator_t>(nullptr);
+  auto flag_it = static_cast<flag_iterator_t>(nullptr);
   // Query algorithm memory requirements
   status = cub::DispatchSelectIf<
@@ -128,7 +128,7 @@ THRUST_RUNTIME_FUNCTION cudaError_t dispatch_select_unique(
   }
   // Memory allocation for the number of selected output items
-  OffsetT* d_num_selected_out = thrust::detail::aligned_reinterpret_cast<OffsetT*>(allocations[1]);
+  auto* d_num_selected_out = thrust::detail::aligned_reinterpret_cast<OffsetT*>(allocations[1]);
   // Run algorithm
   status = cub::DispatchSelectIf<
diff --git a/thrust/thrust/system/cuda/detail/unique_by_key.h b/thrust/thrust/system/cuda/detail/unique_by_key.h
index 889bbad697c..0172d3ce66d 100644
--- a/thrust/thrust/system/cuda/detail/unique_by_key.h
+++ b/thrust/thrust/system/cuda/detail/unique_by_key.h
@@ -116,7 +116,7 @@ struct DispatchUniqueByKey
     }
     // Memory allocation for the number of selected output items
-    OffsetT* d_num_selected_out = thrust::detail::aligned_reinterpret_cast<OffsetT*>(allocations[1]);
+    auto* d_num_selected_out = thrust::detail::aligned_reinterpret_cast<OffsetT*>(allocations[1]);
     // Run algorithm
     status = cub::DeviceSelect::UniqueByKey(
@@ -159,7 +159,7 @@ THRUST_RUNTIME_FUNCTION ::cuda::std::pair unique_by_ke
 {
   using size_type = thrust::detail::it_difference_t;
-  size_type num_items = static_cast<size_type>(::cuda::std::distance(keys_first, keys_last));
+  auto num_items = static_cast<size_type>(::cuda::std::distance(keys_first, keys_last));
   ::cuda::std::pair result_end{};
   cudaError_t status = cudaSuccess;
   size_t temp_storage_bytes = 0;
diff --git a/thrust/thrust/system/detail/sequential/stable_radix_sort.h b/thrust/thrust/system/detail/sequential/stable_radix_sort.h
index 8c968651125..4ac45003a49 100644
--- a/thrust/thrust/system/detail/sequential/stable_radix_sort.h
+++ b/thrust/thrust/system/detail/sequential/stable_radix_sort.h
@@ -237,7 +237,7 @@ _CCCL_HOST_DEVICE void radix_sort(
   const unsigned int NumHistograms = (8 * sizeof(EncodedType) + (RadixBits - 1)) / RadixBits;
   const unsigned int HistogramSize = 1 << RadixBits;
-  const EncodedType BitMask = static_cast<EncodedType>((1 << RadixBits) - 1);
+  const auto BitMask = static_cast<EncodedType>((1 << RadixBits) - 1);
   Encoder encode;
@@ -285,7 +285,7 @@ _CCCL_HOST_DEVICE void radix_sort(
   // shuffle keys and (optionally) values
   for (unsigned int i = 0; i < NumHistograms; i++)
   {
-    const EncodedType BitShift = static_cast<EncodedType>(RadixBits * i);
+    const auto BitShift = static_cast<EncodedType>(RadixBits * i);
     if (!skip_shuffle[i])
     {
diff --git a/thrust/thrust/system/omp/detail/reduce_intervals.h b/thrust/thrust/system/omp/detail/reduce_intervals.h
index 580bd1ffc9e..3de0248bb3d 100644
--- a/thrust/thrust/system/omp/detail/reduce_intervals.h
+++ b/thrust/thrust/system/omp/detail/reduce_intervals.h
@@ -57,7 +57,7 @@ void reduce_intervals(
   using index_type = std::intptr_t;
-  index_type n = static_cast<index_type>(decomp.size());
+  auto n = static_cast<index_type>(decomp.size());
   THRUST_PRAGMA_OMP(parallel for)
   for (index_type i = 0; i < n; i++)