1 change: 0 additions & 1 deletion .clang-tidy
@@ -36,7 +36,6 @@ Checks:
   - '-modernize-type-traits'
   - '-modernize-use-nullptr'
   - '-modernize-return-braced-init-list'
-  - '-modernize-use-auto'
   - '-modernize-use-equals-delete'
   - '-modernize-pass-by-value'
   - '-modernize-loop-convert'
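For context, modernize-use-auto flags declarations whose type is already spelled in the initializer (casts, new expressions, iterator factories), so un-disabling it drives every change below. A minimal sketch of the rewrite the check asks for (hypothetical variables, not from this PR):

    void example(void* storage)
    {
      // Flagged: 'float*' repeats the type already named in the cast.
      float* before = static_cast<float*>(storage);

      // Accepted: 'auto*' deduces float* from the cast; the type is spelled once.
      auto* after = static_cast<float*>(storage);

      (void) before;
      (void) after;
    }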
4 changes: 2 additions & 2 deletions c/experimental/stf/test/test_cuda_kernel.cu
@@ -68,8 +68,8 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]")
   stf_cuda_kernel_add_dep(k, lX, STF_READ);
   stf_cuda_kernel_add_dep(k, lY, STF_RW);
   stf_cuda_kernel_start(k);
-  double* dX = (double*) stf_cuda_kernel_get_arg(k, 0);
-  double* dY = (double*) stf_cuda_kernel_get_arg(k, 1);
+  auto* dX = (double*) stf_cuda_kernel_get_arg(k, 0);
+  auto* dY = (double*) stf_cuda_kernel_get_arg(k, 1);
   const void* args[4] = {&N, &alpha, &dX, &dY};
   cudaError_t err = stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args);
   REQUIRE(err == cudaSuccess);
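Note the spelling used throughout the PR: pointer results become auto* rather than bare auto. Both deduce the same type, but auto* keeps the pointer-ness of the declaration visible, which matches what clang-tidy suggests for these cases. A small sketch (hypothetical, not from the diff):

    #include <type_traits>

    void sketch(void* erased)
    {
      auto* p = static_cast<double*>(erased); // deduces double*; the '*' is explicit
      auto q  = static_cast<double*>(erased); // also double*, but reads like a value
      static_assert(std::is_same_v<decltype(p), decltype(q)>);
    }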
6 changes: 3 additions & 3 deletions c/experimental/stf/test/test_host_launch.cu
@@ -84,7 +84,7 @@ C2H_TEST("host_launch with stream context", "[host_launch]")
   stf_task_set_symbol(t, "fill");
   stf_task_add_dep(t, lData, STF_WRITE);
   stf_task_start(t);
-  double* dData = (double*) stf_task_get(t, 0);
+  auto* dData = (double*) stf_task_get(t, 0);
   fill_kernel<<<2, 128, 0, (cudaStream_t) stf_task_get_custream(t)>>>((int) N, dData, 42.0);
   stf_task_end(t);
   stf_task_destroy(t);
@@ -134,8 +134,8 @@ C2H_TEST("host_launch with graph context", "[host_launch]")
   stf_task_add_dep(t, lData, STF_WRITE);
   stf_task_enable_capture(t);
   stf_task_start(t);
-  double* dData = (double*) stf_task_get(t, 0);
-  cudaStream_t stream = (cudaStream_t) stf_task_get_custream(t);
+  auto* dData = (double*) stf_task_get(t, 0);
+  auto stream = (cudaStream_t) stf_task_get_custream(t);
   fill_kernel<<<2, 128, 0, stream>>>((int) N, dData, 42.0);
   stf_task_end(t);
   stf_task_destroy(t);
6 changes: 3 additions & 3 deletions c/experimental/stf/test/test_logical_data_with_place.cu
@@ -75,7 +75,7 @@ C2H_TEST("stf_logical_data_with_place - host place (pinned memory)", "[logical_d
   cudaError_t err = cudaMallocHost(&A_raw, N * sizeof(float));
   REQUIRE(err == cudaSuccess);
   std::unique_ptr<void, decltype(&cudaFreeHost)> A_owner(A_raw, cudaFreeHost);
-  float* A = static_cast<float*>(A_owner.get());
+  auto* A = static_cast<float*>(A_owner.get());
   for (size_t i = 0; i < N; ++i)
   {
     A[i] = static_cast<float>(i);
@@ -114,7 +114,7 @@ C2H_TEST("stf_logical_data_with_place - device place (data on current device)",
   cudaError_t err = cudaMalloc(&d_raw, N * sizeof(float));
   REQUIRE(err == cudaSuccess);
   std::unique_ptr<void, decltype(&cudaFree)> d_data_owner(d_raw, cudaFree);
-  float* d_data = static_cast<float*>(d_data_owner.get());
+  auto* d_data = static_cast<float*>(d_data_owner.get());

   std::vector<float> h_init(N);
   for (size_t i = 0; i < N; ++i)
@@ -135,7 +135,7 @@ C2H_TEST("stf_logical_data_with_place - device place (data on current device)",
   stf_cuda_kernel_set_symbol(k, "scale_inplace");
   stf_cuda_kernel_add_dep(k, lD, STF_RW);
   stf_cuda_kernel_start(k);
-  float* arg_ptr = static_cast<float*>(stf_cuda_kernel_get_arg(k, 0));
+  auto* arg_ptr = static_cast<float*>(stf_cuda_kernel_get_arg(k, 0));
   REQUIRE(arg_ptr == d_data);
   int n = static_cast<int>(N);
   const void* args[3] = {&n, &arg_ptr, &factor};
2 changes: 1 addition & 1 deletion c/experimental/stf/test/test_places.cpp
@@ -28,7 +28,7 @@ static stf_pos4 blocked_mapper_1d(stf_pos4 data_coords, stf_dim4 data_dims, stf_
   {
     part_size = 1;
   }
-  int64_t c = static_cast<int64_t>(data_coords.x);
+  auto c = static_cast<int64_t>(data_coords.x);
   int64_t place_x = c / static_cast<int64_t>(part_size);
   if (place_x >= static_cast<int64_t>(nplaces))
   {
6 changes: 3 additions & 3 deletions c2h/include/c2h/bfloat16.cuh
@@ -106,9 +106,9 @@ struct bfloat16_t
   /// Cast to float
   __host__ __device__ __forceinline__ operator float() const
   {
-    float f = 0;
-    uint32_t* p = reinterpret_cast<uint32_t*>(&f);
-    *p = uint32_t(__x) << 16;
+    float f = 0;
+    auto* p = reinterpret_cast<uint32_t*>(&f);
+    *p = uint32_t(__x) << 16;
     return f;
   }

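Background for this hunk: bfloat16 is the high half of an IEEE-754 binary32, so widening is a 16-bit shift into an otherwise zero word. A host-side sketch of the same conversion using memcpy instead of the pointer aliasing above (illustrative only, not part of the diff):

    #include <cstdint>
    #include <cstring>

    float bfloat16_bits_to_float(std::uint16_t x)
    {
      const std::uint32_t bits = std::uint32_t{x} << 16; // low mantissa bits become zero
      float f;
      std::memcpy(&f, &bits, sizeof(f)); // well-defined alternative to reinterpret_cast
      return f;
    }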
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/histogram/even.cu
@@ -58,7 +58,7 @@ static void even(nvbench::state& state, nvbench::type_list<SampleT, CounterT, Of
   }

   const SampleT lower_level = 0;
-  const SampleT upper_level = get_upper_level<SampleT>(num_bins, elements);
+  const auto upper_level = get_upper_level<SampleT>(num_bins, elements);

   thrust::device_vector<SampleT> input = generate(elements, entropy, lower_level, upper_level);
   thrust::device_vector<CounterT> hist(num_bins);
@@ -70,7 +70,7 @@ static void even(nvbench::state& state, nvbench::type_list<SampleT, CounterT, Of
   std::size_t temp_storage_bytes{};

   cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;
-  OffsetT num_row_pixels = static_cast<OffsetT>(elements);
+  auto num_row_pixels = static_cast<OffsetT>(elements);
Contributor: Ditto, breaks alignment.

   OffsetT num_rows = 1;
   OffsetT row_stride_samples = num_row_pixels;
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/histogram/multi/even.cu
@@ -60,7 +60,7 @@ static void even(nvbench::state& state, nvbench::type_list<SampleT, CounterT, Of
   }

   const SampleT lower_level_r = 0;
-  const SampleT upper_level_r = get_upper_level<SampleT>(num_bins, elements);
+  const auto upper_level_r = get_upper_level<SampleT>(num_bins, elements);
Contributor: I am against this one, it breaks alignment.

Contributor (Author): What if I made them all auto? And

    const SampleT lower_level_r = 0;
    // to
    const auto lower_level_r = SampleT{0};

I can silence them with // NOLINT but I'd rather avoid that if there are other code changes we can make to satisfy the linter.

Contributor (@fbusato, Apr 16, 2026):

    const SampleT lower_level_g = lower_level_r;
    const SampleT upper_level_g = upper_level_r;
    const SampleT lower_level_b = lower_level_g;
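For illustration, the author's suggestion applied to the whole group would keep the = column aligned while still satisfying the check (a sketch of the discussed alternative, not what was merged):

    const auto lower_level_r = SampleT{0};
    const auto upper_level_r = get_upper_level<SampleT>(num_bins, elements);
    const auto lower_level_g = lower_level_r;
    const auto upper_level_g = upper_level_r;
    const auto lower_level_b = lower_level_g;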
@@ -80,7 +80,7 @@ static void even(nvbench::state& state, nvbench::type_list<SampleT, CounterT, Of
   std::size_t temp_storage_bytes{};

   cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;
-  OffsetT num_row_pixels = static_cast<OffsetT>(elements);
+  auto num_row_pixels = static_cast<OffsetT>(elements);
   OffsetT num_rows = 1;
   OffsetT row_stride_samples = num_row_pixels;
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/histogram/multi/range.cu
@@ -52,7 +52,7 @@ static void range(nvbench::state& state, nvbench::type_list<SampleT, CounterT, O
   const int num_levels_b = num_levels_g;

   const SampleT lower_level = 0;
-  const SampleT upper_level = get_upper_level<SampleT>(num_bins, elements);
+  const auto upper_level = get_upper_level<SampleT>(num_bins, elements);
Contributor: Those also break alignment.
   SampleT step = (upper_level - lower_level) / num_bins;
   thrust::device_vector<SampleT> levels_r(num_bins + 1);
@@ -80,7 +80,7 @@ static void range(nvbench::state& state, nvbench::type_list<SampleT, CounterT, O
   std::size_t temp_storage_bytes{};

   cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;
-  OffsetT num_row_pixels = static_cast<OffsetT>(elements);
+  auto num_row_pixels = static_cast<OffsetT>(elements);
   OffsetT num_rows = 1;
   OffsetT row_stride_samples = num_row_pixels;

4 changes: 2 additions & 2 deletions cub/benchmarks/bench/histogram/range.cu
@@ -50,7 +50,7 @@ static void range(nvbench::state& state, nvbench::type_list<SampleT, CounterT, O
   const int num_levels = static_cast<int>(num_bins) + 1;

   const SampleT lower_level = 0;
-  const SampleT upper_level = get_upper_level<SampleT>(num_bins, elements);
+  const auto upper_level = get_upper_level<SampleT>(num_bins, elements);

   SampleT step = (upper_level - lower_level) / num_bins;
   thrust::device_vector<SampleT> levels(num_bins + 1);
@@ -69,7 +69,7 @@ static void range(nvbench::state& state, nvbench::type_list<SampleT, CounterT, O
   std::size_t temp_storage_bytes{};

   cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;
-  OffsetT num_row_pixels = static_cast<OffsetT>(elements);
+  auto num_row_pixels = static_cast<OffsetT>(elements);
   OffsetT num_rows = 1;
   OffsetT row_stride_samples = num_row_pixels;

2 changes: 1 addition & 1 deletion cub/benchmarks/bench/reduce/by_key.cu
@@ -57,7 +57,7 @@ static void reduce(nvbench::state& state, nvbench::type_list<KeyT, ValueT, Offse

   std::uint8_t* d_temp_storage{};
   std::size_t temp_storage_bytes{};
-  const offset_t num_items = static_cast<offset_t>(elements);
+  const auto num_items = static_cast<offset_t>(elements);

   auto dispatch_on_stream = [&](cudaStream_t stream) {
     return cub::detail::reduce_by_key::dispatch</* OverrideAccumT */ ValueT>(
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/run_length_encode/encode.cu
@@ -66,7 +66,7 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT

   std::uint8_t* d_temp_storage{};
   std::size_t temp_storage_bytes{};
-  const offset_t num_items = static_cast<offset_t>(elements);
+  const auto num_items = static_cast<offset_t>(elements);

   auto dispatch_on_stream = [&](cudaStream_t stream) {
     return cub::detail::reduce_by_key::dispatch_streaming</* OverrideAccumT */ accum_t>(
@@ -64,7 +64,7 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT

   std::uint8_t* d_temp_storage{};
   std::size_t temp_storage_bytes{};
-  const offset_t num_items = static_cast<offset_t>(elements);
+  const auto num_items = static_cast<offset_t>(elements);

   auto dispatch_on_stream = [&](cudaStream_t stream) {
     cub::detail::rle::dispatch(
6 changes: 3 additions & 3 deletions cub/cub/agent/agent_batch_memcpy.cuh
@@ -98,7 +98,7 @@ template <typename VectorT>
 _CCCL_FORCEINLINE _CCCL_DEVICE void LoadVector(const char* ptr, VectorT& data_out)
 {
   const uint32_t offset = reinterpret_cast<uintptr_t>(ptr) % 4U;
-  const uint32_t* aligned_ptr = reinterpret_cast<uint32_t const*>(ptr - offset);
+  auto* aligned_ptr = reinterpret_cast<uint32_t const*>(ptr - offset);
   constexpr uint32_t bits_per_byte = 8U;
   const uint32_t bit_shift = offset * bits_per_byte;

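Aside: LoadVector is a standard unaligned-load idiom: round the pointer down to a 4-byte boundary, read whole aligned words, then shift the bytes into place. A simplified single-word sketch of the technique, assuming little-endian byte order (hypothetical helper, not the CUB code):

    #include <cstdint>

    // Read a possibly unaligned 32-bit value using only aligned loads.
    std::uint32_t load_unaligned_u32(const char* ptr)
    {
      const std::uint32_t offset = reinterpret_cast<std::uintptr_t>(ptr) % 4u;
      const auto* aligned = reinterpret_cast<const std::uint32_t*>(ptr - offset);
      if (offset == 0)
      {
        return aligned[0];
      }
      const std::uint32_t shift = offset * 8u;
      // Stitch the tail of the first aligned word to the head of the second.
      return (aligned[0] >> shift) | (aligned[1] << (32u - shift));
    }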
@@ -175,7 +175,7 @@ GetAlignedPtrs(const void* in_begin, void* out_begin, ByteOffsetT num_bytes)
   uint32_t out_start_aligned = ::cuda::round_up(in_offset_req + alignment_offset, out_datatype_size);

   // Compute the beginning of the aligned ranges (output and input pointers)
-  VectorT* out_aligned_begin = reinterpret_cast<VectorT*>(out_chars_aligned + out_start_aligned);
+  auto* out_aligned_begin = reinterpret_cast<VectorT*>(out_chars_aligned + out_start_aligned);
   const char* in_aligned_begin = in_ptr + (reinterpret_cast<char*>(out_aligned_begin) - out_ptr);

   // If the aligned range is not aligned for the input pointer, we load up to (in_datatype_size-1)
@@ -197,7 +197,7 @@ GetAlignedPtrs(const void* in_begin, void* out_begin, ByteOffsetT num_bytes)
     out_end_aligned = (num_bytes - in_end_padding_req + alignment_offset) / out_datatype_size * out_datatype_size;
   }

-  VectorT* out_aligned_end = reinterpret_cast<VectorT*>(out_chars_aligned + out_end_aligned);
+  auto* out_aligned_end = reinterpret_cast<VectorT*>(out_chars_aligned + out_end_aligned);
   const char* in_aligned_end = in_ptr + (reinterpret_cast<char*>(out_aligned_end) - out_ptr);

   return {out_aligned_begin, out_aligned_end, in_aligned_begin, in_aligned_end};
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_reduce.cuh
@@ -311,7 +311,7 @@ struct AgentReduceImpl

     // Load items as vector items
     InputT input_items[ITEMS_PER_THREAD];
-    VectorT* vec_items = reinterpret_cast<VectorT*>(input_items);
+    auto* vec_items = reinterpret_cast<VectorT*>(input_items);

     // Alias items as an array of VectorT and load it in striped fashion
     static constexpr int words = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH;
4 changes: 2 additions & 2 deletions cub/cub/agent/agent_topk.cuh
@@ -613,8 +613,8 @@ struct AgentTopK
       counter->k = k - prev;

       // The number of candidates in the next pass
-      counter->len = cur - prev;
-      const unsigned int bucket = static_cast<unsigned int>(bin_idx);
+      counter->len = cur - prev;
+      const auto bucket = static_cast<unsigned int>(bin_idx);
       // Update the "splitter" key by adding the radix digit of the k-th item bin of this pass
       set_kth_key_bits<key_in_t, bits_per_pass>(counter->kth_key_bits, pass, bucket);
     }
10 changes: 5 additions & 5 deletions cub/cub/agent/single_pass_scan_operators.cuh
@@ -741,8 +741,8 @@ struct ScanTileState<T, true>
   {
     int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;

-    TxnWord val = TxnWord();
-    TileDescriptor* descriptor = reinterpret_cast<TileDescriptor*>(&val);
+    TxnWord val = TxnWord();
+    auto* descriptor = reinterpret_cast<TileDescriptor*>(&val);

     if (tile_idx < num_tiles)
     {
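Context for the aliasing pattern in this hunk: a tile's status flag and partial aggregate are packed into a single TxnWord so the decoupled look-back can read or write both in one memory transaction. A rough sketch of the idea, with a hypothetical field layout (the real TileDescriptor depends on T):

    #include <cstdint>
    #include <cstring>

    struct TileDescriptorSketch
    {
      std::uint32_t status; // e.g. invalid / partial / inclusive available
      float value;          // the aggregate published for this tile
    };

    using TxnWordSketch = std::uint64_t; // descriptor travels as one 64-bit word

    TxnWordSketch pack(TileDescriptorSketch d)
    {
      TxnWordSketch word = 0;
      std::memcpy(&word, &d, sizeof(d)); // the CUB code aliases via reinterpret_cast instead
      return word;
    }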
@@ -1150,9 +1150,9 @@ struct ReduceByKeyScanTileState<ValueT, KeyT, true>
    */
   _CCCL_DEVICE _CCCL_FORCEINLINE void InitializeStatus(int num_tiles)
   {
-    int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-    TxnWord val = TxnWord();
-    TileDescriptor* descriptor = reinterpret_cast<TileDescriptor*>(&val);
+    int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+    TxnWord val = TxnWord();
+    auto* descriptor = reinterpret_cast<TileDescriptor*>(&val);

     if (tile_idx < num_tiles)
     {
2 changes: 1 addition & 1 deletion cub/cub/block/block_merge_sort.cuh
@@ -43,7 +43,7 @@ MergePath(KeyIt1 keys1, KeyIt2 keys2, OffsetT keys1_count, OffsetT keys2_count,

   while (keys1_begin < keys1_end)
   {
-    const OffsetT mid = cub::MidPoint<OffsetT>(keys1_begin, keys1_end);
+    const auto mid = cub::MidPoint<OffsetT>(keys1_begin, keys1_end);
     // pull copies of the keys before calling binary_pred so proxy references are unwrapped
     const detail::it_value_t<KeyIt1> key1 = keys1[mid];
     const detail::it_value_t<KeyIt2> key2 = keys2[diag - 1 - mid];
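For reference, MergePath is the classic merge-path diagonal search: it returns how many elements of keys1 precede the merge point on diagonal diag. A self-contained host sketch of the algorithm (plain pointers and operator<; names are assumptions, not taken from the CUB source):

    #include <algorithm>

    // Number of items taken from keys1 along diagonal 'diag' of the merge
    // of keys1[0..count1) and keys2[0..count2).
    template <typename T>
    int merge_path(const T* keys1, const T* keys2, int count1, int count2, int diag)
    {
      int begin = std::max(0, diag - count2);
      int end   = std::min(diag, count1);
      while (begin < end)
      {
        const int mid = begin + (end - begin) / 2;
        // keys1[mid] pairs with keys2[diag - 1 - mid] across the diagonal.
        if (keys1[mid] < keys2[diag - 1 - mid])
        {
          begin = mid + 1;
        }
        else
        {
          end = mid;
        }
      }
      return begin;
    }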
4 changes: 2 additions & 2 deletions cub/cub/block/block_radix_sort.cuh
@@ -387,7 +387,7 @@ private:
     ::cuda::std::bool_constant<KEYS_ONLY> is_keys_only,
     DecomposerT decomposer = {})
   {
-    bit_ordered_type(&unsigned_keys)[ItemsPerThread] = reinterpret_cast<bit_ordered_type(&)[ItemsPerThread]>(keys);
+    auto(&unsigned_keys)[ItemsPerThread] = reinterpret_cast<bit_ordered_type(&)[ItemsPerThread]>(keys);
Contributor: Remark: haven't seen a reinterpretation as a reference to an array in a long time.
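On the remark: reinterpreting as a reference to an array (rather than decaying to a pointer) preserves the extent in the type, so range-based for and auto(&)[N] deduction still work on the result. A minimal sketch mirroring the pattern (hypothetical function; same aliasing caveats as the original):

    #include <cstdint>

    void flip_sign_bits(float (&keys)[4])
    {
      // The cast target keeps the extent: uint32_t(&)[4], not uint32_t*.
      auto(&bits)[4] = reinterpret_cast<std::uint32_t(&)[4]>(keys);
      for (auto& b : bits) // the array bound survives, so this loop is well-formed
      {
        b ^= 0x80000000u; // toggle the IEEE-754 sign bit
      }
    }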


     _CCCL_PRAGMA_UNROLL_FULL()
     for (int KEY = 0; KEY < ItemsPerThread; KEY++)
@@ -466,7 +466,7 @@ public:
     ::cuda::std::bool_constant<KEYS_ONLY> is_keys_only,
     DecomposerT decomposer = {})
   {
-    bit_ordered_type(&unsigned_keys)[ItemsPerThread] = reinterpret_cast<bit_ordered_type(&)[ItemsPerThread]>(keys);
+    auto(&unsigned_keys)[ItemsPerThread] = reinterpret_cast<bit_ordered_type(&)[ItemsPerThread]>(keys);

     _CCCL_PRAGMA_UNROLL_FULL()
     for (int KEY = 0; KEY < ItemsPerThread; KEY++)
4 changes: 2 additions & 2 deletions cub/cub/block/block_run_length_decode.cuh
@@ -277,8 +277,8 @@ private:
     _CCCL_PRAGMA_UNROLL_FULL()
     for (int i = 0; i <= Log2<MAX_NUM_ITEMS>::VALUE; i++)
     {
-      OffsetT mid = cub::MidPoint<OffsetT>(lower_bound, upper_bound);
-      mid = (::cuda::std::min) (mid, num_items - 1);
+      auto mid = cub::MidPoint<OffsetT>(lower_bound, upper_bound);
+      mid = (::cuda::std::min) (mid, num_items - 1);

       if (val < input[mid])
       {
2 changes: 1 addition & 1 deletion cub/cub/block/block_store.cuh
@@ -174,7 +174,7 @@ StoreDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ItemsPerTh
   if (reinterpret_cast<uintptr_t>(block_ptr) % (alignof(Vector)) == 0)
   {
     // Alias global pointer
-    Vector* block_ptr_vectors = reinterpret_cast<Vector*>(const_cast<T*>(block_ptr));
+    auto* block_ptr_vectors = reinterpret_cast<Vector*>(const_cast<T*>(block_ptr));

     // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
     Vector raw_vector[VECTORS_PER_THREAD];
2 changes: 1 addition & 1 deletion cub/cub/block/specializations/block_reduce_raking.cuh
@@ -195,7 +195,7 @@ struct BlockReduceRaking

     // sync before re-using shmem (warp_storage/raking_grid are aliased)
     static_assert(RAKING_THREADS <= warp_threads, "RAKING_THREADS must be <= warp size.");
-    unsigned int mask = static_cast<unsigned int>((1ull << RAKING_THREADS) - 1);
+    auto mask = static_cast<unsigned int>((1ull << RAKING_THREADS) - 1);
     __syncwarp(mask);

     partial = WarpReduce(temp_storage.warp_storage)
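A worked instance of the mask arithmetic: with RAKING_THREADS == 8 the expression yields 0xFF, i.e. __syncwarp over lanes 0-7 only. The 1ull matters because the full-warp case would otherwise shift a 32-bit 1 by 32, which is undefined:

    static_assert(static_cast<unsigned int>((1ull << 8) - 1) == 0xFFu, "8 raking lanes");
    static_assert(static_cast<unsigned int>((1ull << 32) - 1) == 0xFFFFFFFFu, "full warp");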
12 changes: 6 additions & 6 deletions cub/cub/block/specializations/block_topk_air.cuh
@@ -242,10 +242,10 @@ private:
     for (int pass = 0; pass < num_passes; ++pass)
     {
       // Bit-range & mask of the current pass
-      const int pass_end_bit = end_bit - pass * RadixBits;
-      const int pass_begin_bit = (::cuda::std::max) (pass_end_bit - RadixBits, begin_bit);
-      const int pass_bits = pass_end_bit - pass_begin_bit;
-      const bit_ordered_type pass_mask = ::cuda::bitmask<bit_ordered_type>(pass_begin_bit, pass_bits);
+      const int pass_end_bit = end_bit - pass * RadixBits;
+      const int pass_begin_bit = (::cuda::std::max) (pass_end_bit - RadixBits, begin_bit);
+      const int pass_bits = pass_end_bit - pass_begin_bit;
+      const auto pass_mask = ::cuda::bitmask<bit_ordered_type>(pass_begin_bit, pass_bits);

       // Zero-initialize histograms for the current pass
       init_histograms();
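A worked example of the pass arithmetic, assuming 32-bit keys, RadixBits == 8, begin_bit == 0, end_bit == 32: the passes walk the key from the most significant digit down. The standalone sketch below mirrors the computation (hypothetical helper, not the ::cuda::bitmask API):

    #include <cstdint>

    // Mask covered by each radix pass, most significant digit first:
    // pass 0 -> bits [24, 32), pass 1 -> [16, 24), pass 2 -> [8, 16), pass 3 -> [0, 8)
    constexpr std::uint32_t pass_mask_sketch(int pass, int radix_bits = 8, int begin_bit = 0, int end_bit = 32)
    {
      const int pass_end_bit   = end_bit - pass * radix_bits;
      const int pass_begin_bit = pass_end_bit - radix_bits > begin_bit ? pass_end_bit - radix_bits : begin_bit;
      const int pass_bits      = pass_end_bit - pass_begin_bit;
      const std::uint32_t ones = pass_bits >= 32 ? ~0u : ((1u << pass_bits) - 1u);
      return ones << pass_begin_bit;
    }

    static_assert(pass_mask_sketch(0) == 0xFF000000u, "top digit first");
    static_assert(pass_mask_sketch(3) == 0x000000FFu, "bottom digit last");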
@@ -329,8 +329,8 @@ private:

     // Get bit-twiddled sortkeys. For float keys, track which were -0.0 (normalized to +0.0 for ranking) so we can
     // restore -0.0 in the output via a bitvector; no extra key buffer.
-    bit_ordered_type(&unsigned_keys)[ItemsPerThread] = reinterpret_cast<bit_ordered_type(&)[ItemsPerThread]>(keys);
-    constexpr int flip_back_num_words = ::cuda::ceil_div(items_per_thread, 32);
+    auto& unsigned_keys = reinterpret_cast<bit_ordered_type(&)[ItemsPerThread]>(keys);
+    constexpr int flip_back_num_words = ::cuda::ceil_div(items_per_thread, 32);
     [[maybe_unused]] ::cuda::std::uint32_t flip_back_bits[flip_back_num_words] = {};
     if constexpr (::cuda::is_floating_point_v<KeyT>)
     {
2 changes: 1 addition & 1 deletion cub/cub/detail/rfa.cuh
@@ -104,7 +104,7 @@ private:
   /// Return a binned floating-point bin
   [[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE static ftype binned_bins(int index)
   {
-    ftype* bins = get_shared_bin_array<ftype, max_index + max_fold>();
+    auto* bins = get_shared_bin_array<ftype, max_index + max_fold>();
     return bins[index];
   }
6 changes: 3 additions & 3 deletions cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
@@ -103,9 +103,9 @@ __launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).large
   using InputBufferT = it_value_t<InputBufferIt>;
   using OutputBufferT = it_value_t<OutputBufferIt>;

-  constexpr uint32_t BLOCK_THREADS = static_cast<uint32_t>(policy.block_threads);
-  constexpr uint32_t ITEMS_PER_THREAD = static_cast<uint32_t>(policy.bytes_per_thread);
-  constexpr BufferSizeT TILE_SIZE = static_cast<BufferSizeT>(BLOCK_THREADS * ITEMS_PER_THREAD);
+  constexpr auto BLOCK_THREADS = static_cast<uint32_t>(policy.block_threads);
+  constexpr auto ITEMS_PER_THREAD = static_cast<uint32_t>(policy.bytes_per_thread);
+  constexpr auto TILE_SIZE = static_cast<BufferSizeT>(BLOCK_THREADS * ITEMS_PER_THREAD);

   BufferOffsetT num_blev_buffers = buffer_offset_tile.LoadValid(last_tile_offset);
@@ -304,7 +304,7 @@ struct DispatchFixedSizeSegmentedReduce
   }

   // Alias the allocation for the privatized per-block reductions
-  AccumT* d_block_reductions = static_cast<AccumT*>(allocations[0]);
+  auto* d_block_reductions = static_cast<AccumT*>(allocations[0]);

   for (::cuda::std::int64_t invocation_index = 0; invocation_index < num_invocations; invocation_index++)
   {
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_merge.cuh
@@ -109,7 +109,7 @@ _CCCL_KERNEL_ATTRIBUTES void device_partition_merge_path_kernel(
     ValueIt3,
     Offset,
     CompareOp>::type::items_per_tile;
-  const Offset diagonal_idx = static_cast<Offset>(blockDim.x * blockIdx.x + threadIdx.x);
+  const auto diagonal_idx = static_cast<Offset>(blockDim.x * blockIdx.x + threadIdx.x);
   if (diagonal_idx < num_diagonals)
   {
     const Offset diagonal_num = (::cuda::std::min) (diagonal_idx * items_per_tile, keys1_count + keys2_count);