1 change: 0 additions & 1 deletion .clang-tidy
@@ -36,7 +36,6 @@ Checks:
   - '-modernize-type-traits'
   - '-modernize-use-nullptr'
   - '-modernize-return-braced-init-list'
-  - '-modernize-use-auto'
   - '-modernize-use-equals-delete'
   - '-modernize-pass-by-value'
   - '-modernize-loop-convert'
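For context, modernize-use-auto flags declarations whose type is already spelled in the initializer (casts, new expressions, iterator factories), so un-disabling it drives every change below. A minimal sketch of the rewrite the check asks for (hypothetical variables, not from this PR):

    void example(void* storage)
    {
      // Flagged: 'float*' repeats the type already named in the cast.
      float* before = static_cast<float*>(storage);

      // Accepted: 'auto*' deduces float* from the cast; the type is spelled once.
      auto* after = static_cast<float*>(storage);

      (void) before;
      (void) after;
    }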
4 changes: 2 additions & 2 deletions c/experimental/stf/test/test_cuda_kernel.cu
@@ -68,8 +68,8 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]")
   stf_cuda_kernel_add_dep(k, lX, STF_READ);
   stf_cuda_kernel_add_dep(k, lY, STF_RW);
   stf_cuda_kernel_start(k);
-  double* dX = (double*) stf_cuda_kernel_get_arg(k, 0);
-  double* dY = (double*) stf_cuda_kernel_get_arg(k, 1);
+  auto* dX = (double*) stf_cuda_kernel_get_arg(k, 0);
+  auto* dY = (double*) stf_cuda_kernel_get_arg(k, 1);
   const void* args[4] = {&N, &alpha, &dX, &dY};
   cudaError_t err = stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args);
   REQUIRE(err == cudaSuccess);
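Note the spelling used throughout the PR: pointer results become auto* rather than bare auto. Both deduce the same type, but auto* keeps the pointer-ness of the declaration visible, which matches what clang-tidy suggests for these cases. A small sketch (hypothetical, not from the diff):

    #include <type_traits>

    void sketch(void* erased)
    {
      auto* p = static_cast<double*>(erased); // deduces double*; the '*' is explicit
      auto q  = static_cast<double*>(erased); // also double*, but reads like a value
      static_assert(std::is_same_v<decltype(p), decltype(q)>);
    }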
6 changes: 3 additions & 3 deletions c/experimental/stf/test/test_host_launch.cu
@@ -84,7 +84,7 @@ C2H_TEST("host_launch with stream context", "[host_launch]")
   stf_task_set_symbol(t, "fill");
   stf_task_add_dep(t, lData, STF_WRITE);
   stf_task_start(t);
-  double* dData = (double*) stf_task_get(t, 0);
+  auto* dData = (double*) stf_task_get(t, 0);
   fill_kernel<<<2, 128, 0, (cudaStream_t) stf_task_get_custream(t)>>>((int) N, dData, 42.0);
   stf_task_end(t);
   stf_task_destroy(t);
@@ -134,8 +134,8 @@ C2H_TEST("host_launch with graph context", "[host_launch]")
   stf_task_add_dep(t, lData, STF_WRITE);
   stf_task_enable_capture(t);
   stf_task_start(t);
-  double* dData = (double*) stf_task_get(t, 0);
-  cudaStream_t stream = (cudaStream_t) stf_task_get_custream(t);
+  auto* dData = (double*) stf_task_get(t, 0);
+  auto stream = (cudaStream_t) stf_task_get_custream(t);
   fill_kernel<<<2, 128, 0, stream>>>((int) N, dData, 42.0);
   stf_task_end(t);
   stf_task_destroy(t);
6 changes: 3 additions & 3 deletions c/experimental/stf/test/test_logical_data_with_place.cu
@@ -75,7 +75,7 @@ C2H_TEST("stf_logical_data_with_place - host place (pinned memory)", "[logical_d
   cudaError_t err = cudaMallocHost(&A_raw, N * sizeof(float));
   REQUIRE(err == cudaSuccess);
   std::unique_ptr<void, decltype(&cudaFreeHost)> A_owner(A_raw, cudaFreeHost);
-  float* A = static_cast<float*>(A_owner.get());
+  auto* A = static_cast<float*>(A_owner.get());
   for (size_t i = 0; i < N; ++i)
   {
     A[i] = static_cast<float>(i);
@@ -114,7 +114,7 @@ C2H_TEST("stf_logical_data_with_place - device place (data on current device)",
   cudaError_t err = cudaMalloc(&d_raw, N * sizeof(float));
   REQUIRE(err == cudaSuccess);
   std::unique_ptr<void, decltype(&cudaFree)> d_data_owner(d_raw, cudaFree);
-  float* d_data = static_cast<float*>(d_data_owner.get());
+  auto* d_data = static_cast<float*>(d_data_owner.get());

   std::vector<float> h_init(N);
   for (size_t i = 0; i < N; ++i)
@@ -135,7 +135,7 @@ C2H_TEST("stf_logical_data_with_place - device place (data on current device)",
   stf_cuda_kernel_set_symbol(k, "scale_inplace");
   stf_cuda_kernel_add_dep(k, lD, STF_RW);
   stf_cuda_kernel_start(k);
-  float* arg_ptr = static_cast<float*>(stf_cuda_kernel_get_arg(k, 0));
+  auto* arg_ptr = static_cast<float*>(stf_cuda_kernel_get_arg(k, 0));
   REQUIRE(arg_ptr == d_data);
   int n = static_cast<int>(N);
   const void* args[3] = {&n, &arg_ptr, &factor};
2 changes: 1 addition & 1 deletion c/experimental/stf/test/test_places.cpp
@@ -28,7 +28,7 @@ static stf_pos4 blocked_mapper_1d(stf_pos4 data_coords, stf_dim4 data_dims, stf_
   {
     part_size = 1;
   }
-  int64_t c = static_cast<int64_t>(data_coords.x);
+  auto c = static_cast<int64_t>(data_coords.x);
   int64_t place_x = c / static_cast<int64_t>(part_size);
   if (place_x >= static_cast<int64_t>(nplaces))
   {
6 changes: 3 additions & 3 deletions c2h/include/c2h/bfloat16.cuh
@@ -106,9 +106,9 @@ struct bfloat16_t
   /// Cast to float
   __host__ __device__ __forceinline__ operator float() const
   {
-    float f = 0;
-    uint32_t* p = reinterpret_cast<uint32_t*>(&f);
-    *p = uint32_t(__x) << 16;
+    float f = 0;
+    auto* p = reinterpret_cast<uint32_t*>(&f);
+    *p = uint32_t(__x) << 16;
     return f;
   }

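Background for this hunk: bfloat16 is the high half of an IEEE-754 binary32, so widening is a 16-bit shift into an otherwise zero word. A host-side sketch of the same conversion using memcpy instead of the pointer aliasing above (illustrative only, not part of the diff):

    #include <cstdint>
    #include <cstring>

    float bfloat16_bits_to_float(std::uint16_t x)
    {
      const std::uint32_t bits = std::uint32_t{x} << 16; // low mantissa bits become zero
      float f;
      std::memcpy(&f, &bits, sizeof(f)); // well-defined alternative to reinterpret_cast
      return f;
    }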
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/histogram/even.cu
@@ -58,7 +58,7 @@ static void even(nvbench::state& state, nvbench::type_list<SampleT, CounterT, Of
   }

   const SampleT lower_level = 0;
-  const SampleT upper_level = get_upper_level<SampleT>(num_bins, elements);
+  const auto upper_level = get_upper_level<SampleT>(num_bins, elements);

   thrust::device_vector<SampleT> input = generate(elements, entropy, lower_level, upper_level);
   thrust::device_vector<CounterT> hist(num_bins);
@@ -70,7 +70,7 @@ static void even(nvbench::state& state, nvbench::type_list<SampleT, CounterT, Of
   std::size_t temp_storage_bytes{};

   cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;
-  OffsetT num_row_pixels = static_cast<OffsetT>(elements);
+  auto num_row_pixels = static_cast<OffsetT>(elements);
Contributor: Ditto, breaks alignment.

   OffsetT num_rows = 1;
   OffsetT row_stride_samples = num_row_pixels;
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/histogram/multi/even.cu
@@ -60,7 +60,7 @@ static void even(nvbench::state& state, nvbench::type_list<SampleT, CounterT, Of
   }

   const SampleT lower_level_r = 0;
-  const SampleT upper_level_r = get_upper_level<SampleT>(num_bins, elements);
+  const auto upper_level_r = get_upper_level<SampleT>(num_bins, elements);
Contributor: I am against this one, it breaks alignment.

Contributor (Author): What if I made them all auto? And

    const SampleT lower_level_r = 0;
    // to
    const auto lower_level_r = SampleT{0};

I can silence them with // NOLINT but I'd rather avoid that if there are other code changes we can make to satisfy the linter.

Contributor (@fbusato, Apr 16, 2026):

    const SampleT lower_level_g = lower_level_r;
    const SampleT upper_level_g = upper_level_r;
    const SampleT lower_level_b = lower_level_g;
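For illustration, the author's suggestion applied to the whole group would keep the = column aligned while still satisfying the check (a sketch of the discussed alternative, not what was merged):

    const auto lower_level_r = SampleT{0};
    const auto upper_level_r = get_upper_level<SampleT>(num_bins, elements);
    const auto lower_level_g = lower_level_r;
    const auto upper_level_g = upper_level_r;
    const auto lower_level_b = lower_level_g;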
@@ -80,7 +80,7 @@ static void even(nvbench::state& state, nvbench::type_list<SampleT, CounterT, Of
   std::size_t temp_storage_bytes{};

   cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;
-  OffsetT num_row_pixels = static_cast<OffsetT>(elements);
+  auto num_row_pixels = static_cast<OffsetT>(elements);
   OffsetT num_rows = 1;
   OffsetT row_stride_samples = num_row_pixels;
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/histogram/multi/range.cu
@@ -52,7 +52,7 @@ static void range(nvbench::state& state, nvbench::type_list<SampleT, CounterT, O
   const int num_levels_b = num_levels_g;

   const SampleT lower_level = 0;
-  const SampleT upper_level = get_upper_level<SampleT>(num_bins, elements);
+  const auto upper_level = get_upper_level<SampleT>(num_bins, elements);
Contributor: Those also break alignment.
   SampleT step = (upper_level - lower_level) / num_bins;
   thrust::device_vector<SampleT> levels_r(num_bins + 1);
@@ -80,7 +80,7 @@ static void range(nvbench::state& state, nvbench::type_list<SampleT, CounterT, O
   std::size_t temp_storage_bytes{};

   cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;
-  OffsetT num_row_pixels = static_cast<OffsetT>(elements);
+  auto num_row_pixels = static_cast<OffsetT>(elements);
   OffsetT num_rows = 1;
   OffsetT row_stride_samples = num_row_pixels;

4 changes: 2 additions & 2 deletions cub/benchmarks/bench/histogram/range.cu
@@ -50,7 +50,7 @@ static void range(nvbench::state& state, nvbench::type_list<SampleT, CounterT, O
   const int num_levels = static_cast<int>(num_bins) + 1;

   const SampleT lower_level = 0;
-  const SampleT upper_level = get_upper_level<SampleT>(num_bins, elements);
+  const auto upper_level = get_upper_level<SampleT>(num_bins, elements);

   SampleT step = (upper_level - lower_level) / num_bins;
   thrust::device_vector<SampleT> levels(num_bins + 1);
@@ -69,7 +69,7 @@ static void range(nvbench::state& state, nvbench::type_list<SampleT, CounterT, O
   std::size_t temp_storage_bytes{};

   cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;
-  OffsetT num_row_pixels = static_cast<OffsetT>(elements);
+  auto num_row_pixels = static_cast<OffsetT>(elements);
   OffsetT num_rows = 1;
   OffsetT row_stride_samples = num_row_pixels;

2 changes: 1 addition & 1 deletion cub/benchmarks/bench/reduce/by_key.cu
@@ -57,7 +57,7 @@ static void reduce(nvbench::state& state, nvbench::type_list<KeyT, ValueT, Offse

   std::uint8_t* d_temp_storage{};
   std::size_t temp_storage_bytes{};
-  const offset_t num_items = static_cast<offset_t>(elements);
+  const auto num_items = static_cast<offset_t>(elements);

   auto dispatch_on_stream = [&](cudaStream_t stream) {
     return cub::detail::reduce_by_key::dispatch</* OverrideAccumT */ ValueT>(
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/run_length_encode/encode.cu
@@ -66,7 +66,7 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT

   std::uint8_t* d_temp_storage{};
   std::size_t temp_storage_bytes{};
-  const offset_t num_items = static_cast<offset_t>(elements);
+  const auto num_items = static_cast<offset_t>(elements);

   auto dispatch_on_stream = [&](cudaStream_t stream) {
     return cub::detail::reduce_by_key::dispatch_streaming</* OverrideAccumT */ accum_t>(
@@ -64,7 +64,7 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT

   std::uint8_t* d_temp_storage{};
   std::size_t temp_storage_bytes{};
-  const offset_t num_items = static_cast<offset_t>(elements);
+  const auto num_items = static_cast<offset_t>(elements);

   auto dispatch_on_stream = [&](cudaStream_t stream) {
     cub::detail::rle::dispatch(
6 changes: 3 additions & 3 deletions cub/cub/agent/agent_batch_memcpy.cuh
@@ -98,7 +98,7 @@ template <typename VectorT>
 _CCCL_FORCEINLINE _CCCL_DEVICE void LoadVector(const char* ptr, VectorT& data_out)
 {
   const uint32_t offset = reinterpret_cast<uintptr_t>(ptr) % 4U;
-  const uint32_t* aligned_ptr = reinterpret_cast<uint32_t const*>(ptr - offset);
+  auto* aligned_ptr = reinterpret_cast<uint32_t const*>(ptr - offset);
   constexpr uint32_t bits_per_byte = 8U;
   const uint32_t bit_shift = offset * bits_per_byte;

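Aside: LoadVector is a standard unaligned-load idiom: round the pointer down to a 4-byte boundary, read whole aligned words, then shift the bytes into place. A simplified single-word sketch of the technique, assuming little-endian byte order (hypothetical helper, not the CUB code):

    #include <cstdint>

    // Read a possibly unaligned 32-bit value using only aligned loads.
    std::uint32_t load_unaligned_u32(const char* ptr)
    {
      const std::uint32_t offset = reinterpret_cast<std::uintptr_t>(ptr) % 4u;
      const auto* aligned = reinterpret_cast<const std::uint32_t*>(ptr - offset);
      if (offset == 0)
      {
        return aligned[0];
      }
      const std::uint32_t shift = offset * 8u;
      // Stitch the tail of the first aligned word to the head of the second.
      return (aligned[0] >> shift) | (aligned[1] << (32u - shift));
    }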
@@ -175,7 +175,7 @@ GetAlignedPtrs(const void* in_begin, void* out_begin, ByteOffsetT num_bytes)
   uint32_t out_start_aligned = ::cuda::round_up(in_offset_req + alignment_offset, out_datatype_size);

   // Compute the beginning of the aligned ranges (output and input pointers)
-  VectorT* out_aligned_begin = reinterpret_cast<VectorT*>(out_chars_aligned + out_start_aligned);
+  auto* out_aligned_begin = reinterpret_cast<VectorT*>(out_chars_aligned + out_start_aligned);
   const char* in_aligned_begin = in_ptr + (reinterpret_cast<char*>(out_aligned_begin) - out_ptr);

   // If the aligned range is not aligned for the input pointer, we load up to (in_datatype_size-1)
@@ -197,7 +197,7 @@ GetAlignedPtrs(const void* in_begin, void* out_begin, ByteOffsetT num_bytes)
     out_end_aligned = (num_bytes - in_end_padding_req + alignment_offset) / out_datatype_size * out_datatype_size;
   }

-  VectorT* out_aligned_end = reinterpret_cast<VectorT*>(out_chars_aligned + out_end_aligned);
+  auto* out_aligned_end = reinterpret_cast<VectorT*>(out_chars_aligned + out_end_aligned);
   const char* in_aligned_end = in_ptr + (reinterpret_cast<char*>(out_aligned_end) - out_ptr);

   return {out_aligned_begin, out_aligned_end, in_aligned_begin, in_aligned_end};
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_reduce.cuh
@@ -311,7 +311,7 @@ struct AgentReduceImpl

     // Load items as vector items
     InputT input_items[ITEMS_PER_THREAD];
-    VectorT* vec_items = reinterpret_cast<VectorT*>(input_items);
+    auto* vec_items = reinterpret_cast<VectorT*>(input_items);

     // Alias items as an array of VectorT and load it in striped fashion
     static constexpr int words = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH;
4 changes: 2 additions & 2 deletions cub/cub/agent/agent_topk.cuh
@@ -613,8 +613,8 @@ struct AgentTopK
       counter->k = k - prev;

       // The number of candidates in the next pass
-      counter->len = cur - prev;
-      const unsigned int bucket = static_cast<unsigned int>(bin_idx);
+      counter->len = cur - prev;
+      const auto bucket = static_cast<unsigned int>(bin_idx);
       // Update the "splitter" key by adding the radix digit of the k-th item bin of this pass
       set_kth_key_bits<key_in_t, bits_per_pass>(counter->kth_key_bits, pass, bucket);
     }
10 changes: 5 additions & 5 deletions cub/cub/agent/single_pass_scan_operators.cuh
@@ -741,8 +741,8 @@ struct ScanTileState<T, true>
   {
     int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;

-    TxnWord val = TxnWord();
-    TileDescriptor* descriptor = reinterpret_cast<TileDescriptor*>(&val);
+    TxnWord val = TxnWord();
+    auto* descriptor = reinterpret_cast<TileDescriptor*>(&val);

     if (tile_idx < num_tiles)
     {
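Context for the aliasing pattern in this hunk: a tile's status flag and partial aggregate are packed into a single TxnWord so the decoupled look-back can read or write both in one memory transaction. A rough sketch of the idea, with a hypothetical field layout (the real TileDescriptor depends on T):

    #include <cstdint>
    #include <cstring>

    struct TileDescriptorSketch
    {
      std::uint32_t status; // e.g. invalid / partial / inclusive available
      float value;          // the aggregate published for this tile
    };

    using TxnWordSketch = std::uint64_t; // descriptor travels as one 64-bit word

    TxnWordSketch pack(TileDescriptorSketch d)
    {
      TxnWordSketch word = 0;
      std::memcpy(&word, &d, sizeof(d)); // the CUB code aliases via reinterpret_cast instead
      return word;
    }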
@@ -1150,9 +1150,9 @@ struct ReduceByKeyScanTileState<ValueT, KeyT, true>
    */
   _CCCL_DEVICE _CCCL_FORCEINLINE void InitializeStatus(int num_tiles)
   {
-    int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-    TxnWord val = TxnWord();
-    TileDescriptor* descriptor = reinterpret_cast<TileDescriptor*>(&val);
+    int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+    TxnWord val = TxnWord();
+    auto* descriptor = reinterpret_cast<TileDescriptor*>(&val);

     if (tile_idx < num_tiles)
     {
2 changes: 1 addition & 1 deletion cub/cub/block/block_merge_sort.cuh
@@ -43,7 +43,7 @@ MergePath(KeyIt1 keys1, KeyIt2 keys2, OffsetT keys1_count, OffsetT keys2_count,

   while (keys1_begin < keys1_end)
   {
-    const OffsetT mid = cub::MidPoint<OffsetT>(keys1_begin, keys1_end);
+    const auto mid = cub::MidPoint<OffsetT>(keys1_begin, keys1_end);
     // pull copies of the keys before calling binary_pred so proxy references are unwrapped
     const detail::it_value_t<KeyIt1> key1 = keys1[mid];
     const detail::it_value_t<KeyIt2> key2 = keys2[diag - 1 - mid];
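For reference, MergePath is the classic merge-path diagonal search: it returns how many elements of keys1 precede the merge point on diagonal diag. A self-contained host sketch of the algorithm (plain pointers and operator<; names are assumptions, not taken from the CUB source):

    #include <algorithm>

    // Number of items taken from keys1 along diagonal 'diag' of the merge
    // of keys1[0..count1) and keys2[0..count2).
    template <typename T>
    int merge_path(const T* keys1, const T* keys2, int count1, int count2, int diag)
    {
      int begin = std::max(0, diag - count2);
      int end   = std::min(diag, count1);
      while (begin < end)
      {
        const int mid = begin + (end - begin) / 2;
        // keys1[mid] pairs with keys2[diag - 1 - mid] across the diagonal.
        if (keys1[mid] < keys2[diag - 1 - mid])
        {
          begin = mid + 1;
        }
        else
        {
          end = mid;
        }
      }
      return begin;
    }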
4 changes: 2 additions & 2 deletions cub/cub/block/block_radix_sort.cuh
@@ -387,7 +387,7 @@ private:
     ::cuda::std::bool_constant<KEYS_ONLY> is_keys_only,
     DecomposerT decomposer = {})
   {
-    bit_ordered_type(&unsigned_keys)[ItemsPerThread] = reinterpret_cast<bit_ordered_type(&)[ItemsPerThread]>(keys);
+    auto(&unsigned_keys)[ItemsPerThread] = reinterpret_cast<bit_ordered_type(&)[ItemsPerThread]>(keys);
Contributor: Remark: haven't seen a reinterpretation as a reference to an array in a long time.
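On the remark: reinterpreting as a reference to an array (rather than decaying to a pointer) preserves the extent in the type, so range-based for and auto(&)[N] deduction still work on the result. A minimal sketch mirroring the pattern (hypothetical function; same aliasing caveats as the original):

    #include <cstdint>

    void flip_sign_bits(float (&keys)[4])
    {
      // The cast target keeps the extent: uint32_t(&)[4], not uint32_t*.
      auto(&bits)[4] = reinterpret_cast<std::uint32_t(&)[4]>(keys);
      for (auto& b : bits) // the array bound survives, so this loop is well-formed
      {
        b ^= 0x80000000u; // toggle the IEEE-754 sign bit
      }
    }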


     _CCCL_PRAGMA_UNROLL_FULL()
     for (int KEY = 0; KEY < ItemsPerThread; KEY++)
@@ -466,7 +466,7 @@ public:
     ::cuda::std::bool_constant<KEYS_ONLY> is_keys_only,
     DecomposerT decomposer = {})
   {
-    bit_ordered_type(&unsigned_keys)[ItemsPerThread] = reinterpret_cast<bit_ordered_type(&)[ItemsPerThread]>(keys);
+    auto(&unsigned_keys)[ItemsPerThread] = reinterpret_cast<bit_ordered_type(&)[ItemsPerThread]>(keys);

     _CCCL_PRAGMA_UNROLL_FULL()
     for (int KEY = 0; KEY < ItemsPerThread; KEY++)
4 changes: 2 additions & 2 deletions cub/cub/block/block_run_length_decode.cuh
@@ -277,8 +277,8 @@ private:
     _CCCL_PRAGMA_UNROLL_FULL()
     for (int i = 0; i <= Log2<MAX_NUM_ITEMS>::VALUE; i++)
     {
-      OffsetT mid = cub::MidPoint<OffsetT>(lower_bound, upper_bound);
-      mid = (::cuda::std::min) (mid, num_items - 1);
+      auto mid = cub::MidPoint<OffsetT>(lower_bound, upper_bound);
+      mid = (::cuda::std::min) (mid, num_items - 1);

       if (val < input[mid])
       {
2 changes: 1 addition & 1 deletion cub/cub/block/block_store.cuh
@@ -174,7 +174,7 @@ StoreDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ItemsPerTh
   if (reinterpret_cast<uintptr_t>(block_ptr) % (alignof(Vector)) == 0)
   {
     // Alias global pointer
-    Vector* block_ptr_vectors = reinterpret_cast<Vector*>(const_cast<T*>(block_ptr));
+    auto* block_ptr_vectors = reinterpret_cast<Vector*>(const_cast<T*>(block_ptr));

     // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
     Vector raw_vector[VECTORS_PER_THREAD];
2 changes: 1 addition & 1 deletion cub/cub/block/specializations/block_reduce_raking.cuh
@@ -195,7 +195,7 @@ struct BlockReduceRaking

     // sync before re-using shmem (warp_storage/raking_grid are aliased)
     static_assert(RAKING_THREADS <= warp_threads, "RAKING_THREADS must be <= warp size.");
-    unsigned int mask = static_cast<unsigned int>((1ull << RAKING_THREADS) - 1);
+    auto mask = static_cast<unsigned int>((1ull << RAKING_THREADS) - 1);
     __syncwarp(mask);

     partial = WarpReduce(temp_storage.warp_storage)
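A worked instance of the mask arithmetic: with RAKING_THREADS == 8 the expression yields 0xFF, i.e. __syncwarp over lanes 0-7 only. The 1ull matters because the full-warp case would otherwise shift a 32-bit 1 by 32, which is undefined:

    static_assert(static_cast<unsigned int>((1ull << 8) - 1) == 0xFFu, "8 raking lanes");
    static_assert(static_cast<unsigned int>((1ull << 32) - 1) == 0xFFFFFFFFu, "full warp");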
12 changes: 6 additions & 6 deletions cub/cub/block/specializations/block_topk_air.cuh
@@ -242,10 +242,10 @@ private:
     for (int pass = 0; pass < num_passes; ++pass)
     {
       // Bit-range & mask of the current pass
-      const int pass_end_bit = end_bit - pass * RadixBits;
-      const int pass_begin_bit = (::cuda::std::max) (pass_end_bit - RadixBits, begin_bit);
-      const int pass_bits = pass_end_bit - pass_begin_bit;
-      const bit_ordered_type pass_mask = ::cuda::bitmask<bit_ordered_type>(pass_begin_bit, pass_bits);
+      const int pass_end_bit = end_bit - pass * RadixBits;
+      const int pass_begin_bit = (::cuda::std::max) (pass_end_bit - RadixBits, begin_bit);
+      const int pass_bits = pass_end_bit - pass_begin_bit;
+      const auto pass_mask = ::cuda::bitmask<bit_ordered_type>(pass_begin_bit, pass_bits);

       // Zero-initialize histograms for the current pass
       init_histograms();
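A worked example of the pass arithmetic, assuming 32-bit keys, RadixBits == 8, begin_bit == 0, end_bit == 32: the passes walk the key from the most significant digit down. The standalone sketch below mirrors the computation (hypothetical helper, not the ::cuda::bitmask API):

    #include <cstdint>

    // Mask covered by each radix pass, most significant digit first:
    // pass 0 -> bits [24, 32), pass 1 -> [16, 24), pass 2 -> [8, 16), pass 3 -> [0, 8)
    constexpr std::uint32_t pass_mask_sketch(int pass, int radix_bits = 8, int begin_bit = 0, int end_bit = 32)
    {
      const int pass_end_bit   = end_bit - pass * radix_bits;
      const int pass_begin_bit = pass_end_bit - radix_bits > begin_bit ? pass_end_bit - radix_bits : begin_bit;
      const int pass_bits      = pass_end_bit - pass_begin_bit;
      const std::uint32_t ones = pass_bits >= 32 ? ~0u : ((1u << pass_bits) - 1u);
      return ones << pass_begin_bit;
    }

    static_assert(pass_mask_sketch(0) == 0xFF000000u, "top digit first");
    static_assert(pass_mask_sketch(3) == 0x000000FFu, "bottom digit last");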
@@ -329,8 +329,8 @@ private:

     // Get bit-twiddled sortkeys. For float keys, track which were -0.0 (normalized to +0.0 for ranking) so we can
     // restore -0.0 in the output via a bitvector; no extra key buffer.
-    bit_ordered_type(&unsigned_keys)[ItemsPerThread] = reinterpret_cast<bit_ordered_type(&)[ItemsPerThread]>(keys);
-    constexpr int flip_back_num_words = ::cuda::ceil_div(items_per_thread, 32);
+    auto& unsigned_keys = reinterpret_cast<bit_ordered_type(&)[ItemsPerThread]>(keys);
+    constexpr int flip_back_num_words = ::cuda::ceil_div(items_per_thread, 32);
     [[maybe_unused]] ::cuda::std::uint32_t flip_back_bits[flip_back_num_words] = {};
     if constexpr (::cuda::is_floating_point_v<KeyT>)
     {
2 changes: 1 addition & 1 deletion cub/cub/detail/rfa.cuh
@@ -104,7 +104,7 @@ private:
   /// Return a binned floating-point bin
   [[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE static ftype binned_bins(int index)
   {
-    ftype* bins = get_shared_bin_array<ftype, max_index + max_fold>();
+    auto* bins = get_shared_bin_array<ftype, max_index + max_fold>();
     return bins[index];
   }
6 changes: 3 additions & 3 deletions cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
@@ -103,9 +103,9 @@ __launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).large
   using InputBufferT = it_value_t<InputBufferIt>;
   using OutputBufferT = it_value_t<OutputBufferIt>;

-  constexpr uint32_t BLOCK_THREADS = static_cast<uint32_t>(policy.block_threads);
-  constexpr uint32_t ITEMS_PER_THREAD = static_cast<uint32_t>(policy.bytes_per_thread);
-  constexpr BufferSizeT TILE_SIZE = static_cast<BufferSizeT>(BLOCK_THREADS * ITEMS_PER_THREAD);
+  constexpr auto BLOCK_THREADS = static_cast<uint32_t>(policy.block_threads);
+  constexpr auto ITEMS_PER_THREAD = static_cast<uint32_t>(policy.bytes_per_thread);
+  constexpr auto TILE_SIZE = static_cast<BufferSizeT>(BLOCK_THREADS * ITEMS_PER_THREAD);

   BufferOffsetT num_blev_buffers = buffer_offset_tile.LoadValid(last_tile_offset);
@@ -304,7 +304,7 @@ struct DispatchFixedSizeSegmentedReduce
   }

   // Alias the allocation for the privatized per-block reductions
-  AccumT* d_block_reductions = static_cast<AccumT*>(allocations[0]);
+  auto* d_block_reductions = static_cast<AccumT*>(allocations[0]);

   for (::cuda::std::int64_t invocation_index = 0; invocation_index < num_invocations; invocation_index++)
   {
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_merge.cuh
@@ -109,7 +109,7 @@ _CCCL_KERNEL_ATTRIBUTES void device_partition_merge_path_kernel(
     ValueIt3,
     Offset,
     CompareOp>::type::items_per_tile;
-  const Offset diagonal_idx = static_cast<Offset>(blockDim.x * blockIdx.x + threadIdx.x);
+  const auto diagonal_idx = static_cast<Offset>(blockDim.x * blockIdx.x + threadIdx.x);
   if (diagonal_idx < num_diagonals)
   {
     const Offset diagonal_num = (::cuda::std::min) (diagonal_idx * items_per_tile, keys1_count + keys2_count);