diff --git a/.clang-tidy b/.clang-tidy index 1c2737ab8fc..24f7205393d 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -36,7 +36,6 @@ Checks: - '-modernize-type-traits' - '-modernize-use-nullptr' - '-modernize-return-braced-init-list' - - '-modernize-use-auto' - '-modernize-use-equals-delete' - '-modernize-pass-by-value' - '-modernize-loop-convert' diff --git a/c/experimental/stf/test/test_cuda_kernel.cu b/c/experimental/stf/test/test_cuda_kernel.cu index 446792886a0..52fbb1be988 100644 --- a/c/experimental/stf/test/test_cuda_kernel.cu +++ b/c/experimental/stf/test/test_cuda_kernel.cu @@ -68,8 +68,8 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") stf_cuda_kernel_add_dep(k, lX, STF_READ); stf_cuda_kernel_add_dep(k, lY, STF_RW); stf_cuda_kernel_start(k); - double* dX = (double*) stf_cuda_kernel_get_arg(k, 0); - double* dY = (double*) stf_cuda_kernel_get_arg(k, 1); + auto* dX = (double*) stf_cuda_kernel_get_arg(k, 0); + auto* dY = (double*) stf_cuda_kernel_get_arg(k, 1); const void* args[4] = {&N, &alpha, &dX, &dY}; cudaError_t err = stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args); REQUIRE(err == cudaSuccess); diff --git a/c/experimental/stf/test/test_host_launch.cu b/c/experimental/stf/test/test_host_launch.cu index 1782fef304c..a848bb31e8b 100644 --- a/c/experimental/stf/test/test_host_launch.cu +++ b/c/experimental/stf/test/test_host_launch.cu @@ -84,7 +84,7 @@ C2H_TEST("host_launch with stream context", "[host_launch]") stf_task_set_symbol(t, "fill"); stf_task_add_dep(t, lData, STF_WRITE); stf_task_start(t); - double* dData = (double*) stf_task_get(t, 0); + auto* dData = (double*) stf_task_get(t, 0); fill_kernel<<<2, 128, 0, (cudaStream_t) stf_task_get_custream(t)>>>((int) N, dData, 42.0); stf_task_end(t); stf_task_destroy(t); @@ -134,8 +134,8 @@ C2H_TEST("host_launch with graph context", "[host_launch]") stf_task_add_dep(t, lData, STF_WRITE); stf_task_enable_capture(t); stf_task_start(t); - double* dData = (double*) stf_task_get(t, 0); - cudaStream_t stream = (cudaStream_t) stf_task_get_custream(t); + auto* dData = (double*) stf_task_get(t, 0); + auto stream = (cudaStream_t) stf_task_get_custream(t); fill_kernel<<<2, 128, 0, stream>>>((int) N, dData, 42.0); stf_task_end(t); stf_task_destroy(t); diff --git a/c/experimental/stf/test/test_logical_data_with_place.cu b/c/experimental/stf/test/test_logical_data_with_place.cu index 517d5937ef1..b6e17b0c71d 100644 --- a/c/experimental/stf/test/test_logical_data_with_place.cu +++ b/c/experimental/stf/test/test_logical_data_with_place.cu @@ -75,7 +75,7 @@ C2H_TEST("stf_logical_data_with_place - host place (pinned memory)", "[logical_d cudaError_t err = cudaMallocHost(&A_raw, N * sizeof(float)); REQUIRE(err == cudaSuccess); std::unique_ptr A_owner(A_raw, cudaFreeHost); - float* A = static_cast(A_owner.get()); + auto* A = static_cast(A_owner.get()); for (size_t i = 0; i < N; ++i) { A[i] = static_cast(i); @@ -114,7 +114,7 @@ C2H_TEST("stf_logical_data_with_place - device place (data on current device)", cudaError_t err = cudaMalloc(&d_raw, N * sizeof(float)); REQUIRE(err == cudaSuccess); std::unique_ptr d_data_owner(d_raw, cudaFree); - float* d_data = static_cast(d_data_owner.get()); + auto* d_data = static_cast(d_data_owner.get()); std::vector h_init(N); for (size_t i = 0; i < N; ++i) @@ -135,7 +135,7 @@ C2H_TEST("stf_logical_data_with_place - device place (data on current device)", stf_cuda_kernel_set_symbol(k, "scale_inplace"); stf_cuda_kernel_add_dep(k, lD, STF_RW); stf_cuda_kernel_start(k); - float* arg_ptr = 
static_cast(stf_cuda_kernel_get_arg(k, 0)); + auto* arg_ptr = static_cast(stf_cuda_kernel_get_arg(k, 0)); REQUIRE(arg_ptr == d_data); int n = static_cast(N); const void* args[3] = {&n, &arg_ptr, &factor}; diff --git a/c/experimental/stf/test/test_places.cpp b/c/experimental/stf/test/test_places.cpp index 62386212d6e..8c413429647 100644 --- a/c/experimental/stf/test/test_places.cpp +++ b/c/experimental/stf/test/test_places.cpp @@ -28,7 +28,7 @@ static stf_pos4 blocked_mapper_1d(stf_pos4 data_coords, stf_dim4 data_dims, stf_ { part_size = 1; } - int64_t c = static_cast(data_coords.x); + auto c = static_cast(data_coords.x); int64_t place_x = c / static_cast(part_size); if (place_x >= static_cast(nplaces)) { diff --git a/c2h/include/c2h/bfloat16.cuh b/c2h/include/c2h/bfloat16.cuh index 4a83a2146e6..27418d3329a 100644 --- a/c2h/include/c2h/bfloat16.cuh +++ b/c2h/include/c2h/bfloat16.cuh @@ -106,9 +106,9 @@ struct bfloat16_t /// Cast to float __host__ __device__ __forceinline__ operator float() const { - float f = 0; - uint32_t* p = reinterpret_cast(&f); - *p = uint32_t(__x) << 16; + float f = 0; + auto* p = reinterpret_cast(&f); + *p = uint32_t(__x) << 16; return f; } diff --git a/cub/benchmarks/bench/histogram/even.cu b/cub/benchmarks/bench/histogram/even.cu index 96498ad8dd2..915180e0e0b 100644 --- a/cub/benchmarks/bench/histogram/even.cu +++ b/cub/benchmarks/bench/histogram/even.cu @@ -58,7 +58,7 @@ static void even(nvbench::state& state, nvbench::type_list(num_bins, elements); + const auto upper_level = get_upper_level(num_bins, elements); thrust::device_vector input = generate(elements, entropy, lower_level, upper_level); thrust::device_vector hist(num_bins); @@ -70,7 +70,7 @@ static void even(nvbench::state& state, nvbench::type_list is_byte_sample; - OffsetT num_row_pixels = static_cast(elements); + auto num_row_pixels = static_cast(elements); OffsetT num_rows = 1; OffsetT row_stride_samples = num_row_pixels; diff --git a/cub/benchmarks/bench/histogram/multi/even.cu b/cub/benchmarks/bench/histogram/multi/even.cu index 8ff340388c2..1960051d528 100644 --- a/cub/benchmarks/bench/histogram/multi/even.cu +++ b/cub/benchmarks/bench/histogram/multi/even.cu @@ -60,7 +60,7 @@ static void even(nvbench::state& state, nvbench::type_list(num_bins, elements); + const auto upper_level_r = get_upper_level(num_bins, elements); const SampleT lower_level_g = lower_level_r; const SampleT upper_level_g = upper_level_r; const SampleT lower_level_b = lower_level_g; @@ -80,7 +80,7 @@ static void even(nvbench::state& state, nvbench::type_list is_byte_sample; - OffsetT num_row_pixels = static_cast(elements); + auto num_row_pixels = static_cast(elements); OffsetT num_rows = 1; OffsetT row_stride_samples = num_row_pixels; diff --git a/cub/benchmarks/bench/histogram/multi/range.cu b/cub/benchmarks/bench/histogram/multi/range.cu index 9cfb2f07994..66501c74dc9 100644 --- a/cub/benchmarks/bench/histogram/multi/range.cu +++ b/cub/benchmarks/bench/histogram/multi/range.cu @@ -52,7 +52,7 @@ static void range(nvbench::state& state, nvbench::type_list(num_bins, elements); + const auto upper_level = get_upper_level(num_bins, elements); SampleT step = (upper_level - lower_level) / num_bins; thrust::device_vector levels_r(num_bins + 1); @@ -80,7 +80,7 @@ static void range(nvbench::state& state, nvbench::type_list is_byte_sample; - OffsetT num_row_pixels = static_cast(elements); + auto num_row_pixels = static_cast(elements); OffsetT num_rows = 1; OffsetT row_stride_samples = num_row_pixels; diff --git 
a/cub/benchmarks/bench/histogram/range.cu b/cub/benchmarks/bench/histogram/range.cu index 5e83868fc28..33f2e25b87c 100644 --- a/cub/benchmarks/bench/histogram/range.cu +++ b/cub/benchmarks/bench/histogram/range.cu @@ -50,7 +50,7 @@ static void range(nvbench::state& state, nvbench::type_list(num_bins) + 1; const SampleT lower_level = 0; - const SampleT upper_level = get_upper_level(num_bins, elements); + const auto upper_level = get_upper_level(num_bins, elements); SampleT step = (upper_level - lower_level) / num_bins; thrust::device_vector levels(num_bins + 1); @@ -69,7 +69,7 @@ static void range(nvbench::state& state, nvbench::type_list is_byte_sample; - OffsetT num_row_pixels = static_cast(elements); + auto num_row_pixels = static_cast(elements); OffsetT num_rows = 1; OffsetT row_stride_samples = num_row_pixels; diff --git a/cub/benchmarks/bench/reduce/by_key.cu b/cub/benchmarks/bench/reduce/by_key.cu index c91feb9904e..e491fe916d2 100644 --- a/cub/benchmarks/bench/reduce/by_key.cu +++ b/cub/benchmarks/bench/reduce/by_key.cu @@ -57,7 +57,7 @@ static void reduce(nvbench::state& state, nvbench::type_list(elements); + const auto num_items = static_cast(elements); auto dispatch_on_stream = [&](cudaStream_t stream) { return cub::detail::reduce_by_key::dispatch( diff --git a/cub/benchmarks/bench/run_length_encode/encode.cu b/cub/benchmarks/bench/run_length_encode/encode.cu index f5ce392b536..25944fedc7e 100644 --- a/cub/benchmarks/bench/run_length_encode/encode.cu +++ b/cub/benchmarks/bench/run_length_encode/encode.cu @@ -66,7 +66,7 @@ static void rle(nvbench::state& state, nvbench::type_list(elements); + const auto num_items = static_cast(elements); auto dispatch_on_stream = [&](cudaStream_t stream) { return cub::detail::reduce_by_key::dispatch_streaming( diff --git a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu index fc902393c9d..6c5c08c323b 100644 --- a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu +++ b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu @@ -64,7 +64,7 @@ static void rle(nvbench::state& state, nvbench::type_list(elements); + const auto num_items = static_cast(elements); auto dispatch_on_stream = [&](cudaStream_t stream) { cub::detail::rle::dispatch( diff --git a/cub/cub/agent/agent_batch_memcpy.cuh b/cub/cub/agent/agent_batch_memcpy.cuh index 5ac2cf3bad7..5bf8999f72c 100644 --- a/cub/cub/agent/agent_batch_memcpy.cuh +++ b/cub/cub/agent/agent_batch_memcpy.cuh @@ -98,7 +98,7 @@ template _CCCL_FORCEINLINE _CCCL_DEVICE void LoadVector(const char* ptr, VectorT& data_out) { const uint32_t offset = reinterpret_cast(ptr) % 4U; - const uint32_t* aligned_ptr = reinterpret_cast(ptr - offset); + auto* aligned_ptr = reinterpret_cast(ptr - offset); constexpr uint32_t bits_per_byte = 8U; const uint32_t bit_shift = offset * bits_per_byte; @@ -175,7 +175,7 @@ GetAlignedPtrs(const void* in_begin, void* out_begin, ByteOffsetT num_bytes) uint32_t out_start_aligned = ::cuda::round_up(in_offset_req + alignment_offset, out_datatype_size); // Compute the beginning of the aligned ranges (output and input pointers) - VectorT* out_aligned_begin = reinterpret_cast(out_chars_aligned + out_start_aligned); + auto* out_aligned_begin = reinterpret_cast(out_chars_aligned + out_start_aligned); const char* in_aligned_begin = in_ptr + (reinterpret_cast(out_aligned_begin) - out_ptr); // If the aligned range is not aligned for the input pointer, we load up to (in_datatype_size-1) @@ -197,7 +197,7 @@ 
GetAlignedPtrs(const void* in_begin, void* out_begin, ByteOffsetT num_bytes) out_end_aligned = (num_bytes - in_end_padding_req + alignment_offset) / out_datatype_size * out_datatype_size; } - VectorT* out_aligned_end = reinterpret_cast(out_chars_aligned + out_end_aligned); + auto* out_aligned_end = reinterpret_cast(out_chars_aligned + out_end_aligned); const char* in_aligned_end = in_ptr + (reinterpret_cast(out_aligned_end) - out_ptr); return {out_aligned_begin, out_aligned_end, in_aligned_begin, in_aligned_end}; diff --git a/cub/cub/agent/agent_reduce.cuh b/cub/cub/agent/agent_reduce.cuh index 16963e15262..5b99bccfc27 100644 --- a/cub/cub/agent/agent_reduce.cuh +++ b/cub/cub/agent/agent_reduce.cuh @@ -311,7 +311,7 @@ struct AgentReduceImpl // Load items as vector items InputT input_items[ITEMS_PER_THREAD]; - VectorT* vec_items = reinterpret_cast(input_items); + auto* vec_items = reinterpret_cast(input_items); // Alias items as an array of VectorT and load it in striped fashion static constexpr int words = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH; diff --git a/cub/cub/agent/agent_topk.cuh b/cub/cub/agent/agent_topk.cuh index 21422d4c10a..34a9d823dc3 100644 --- a/cub/cub/agent/agent_topk.cuh +++ b/cub/cub/agent/agent_topk.cuh @@ -613,8 +613,8 @@ struct AgentTopK counter->k = k - prev; // The number of candidates in the next pass - counter->len = cur - prev; - const unsigned int bucket = static_cast(bin_idx); + counter->len = cur - prev; + const auto bucket = static_cast(bin_idx); // Update the "splitter" key by adding the radix digit of the k-th item bin of this pass set_kth_key_bits(counter->kth_key_bits, pass, bucket); } diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh index c29bfe2d3cf..f361e23b136 100644 --- a/cub/cub/agent/single_pass_scan_operators.cuh +++ b/cub/cub/agent/single_pass_scan_operators.cuh @@ -741,8 +741,8 @@ struct ScanTileState { int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; - TxnWord val = TxnWord(); - TileDescriptor* descriptor = reinterpret_cast(&val); + TxnWord val = TxnWord(); + auto* descriptor = reinterpret_cast(&val); if (tile_idx < num_tiles) { @@ -1150,9 +1150,9 @@ struct ReduceByKeyScanTileState */ _CCCL_DEVICE _CCCL_FORCEINLINE void InitializeStatus(int num_tiles) { - int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; - TxnWord val = TxnWord(); - TileDescriptor* descriptor = reinterpret_cast(&val); + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + TxnWord val = TxnWord(); + auto* descriptor = reinterpret_cast(&val); if (tile_idx < num_tiles) { diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh index dee105ed3b1..4b8acd7a538 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh @@ -43,7 +43,7 @@ MergePath(KeyIt1 keys1, KeyIt2 keys2, OffsetT keys1_count, OffsetT keys2_count, while (keys1_begin < keys1_end) { - const OffsetT mid = cub::MidPoint(keys1_begin, keys1_end); + const auto mid = cub::MidPoint(keys1_begin, keys1_end); // pull copies of the keys before calling binary_pred so proxy references are unwrapped const detail::it_value_t key1 = keys1[mid]; const detail::it_value_t key2 = keys2[diag - 1 - mid]; diff --git a/cub/cub/block/block_radix_sort.cuh b/cub/cub/block/block_radix_sort.cuh index 23a5e25f6fc..b8d0c6e3bd7 100644 --- a/cub/cub/block/block_radix_sort.cuh +++ b/cub/cub/block/block_radix_sort.cuh @@ -387,7 +387,7 @@ private: ::cuda::std::bool_constant is_keys_only, DecomposerT decomposer = {}) { - 
bit_ordered_type(&unsigned_keys)[ItemsPerThread] = reinterpret_cast(keys); + auto(&unsigned_keys)[ItemsPerThread] = reinterpret_cast(keys); _CCCL_PRAGMA_UNROLL_FULL() for (int KEY = 0; KEY < ItemsPerThread; KEY++) @@ -466,7 +466,7 @@ public: ::cuda::std::bool_constant is_keys_only, DecomposerT decomposer = {}) { - bit_ordered_type(&unsigned_keys)[ItemsPerThread] = reinterpret_cast(keys); + auto(&unsigned_keys)[ItemsPerThread] = reinterpret_cast(keys); _CCCL_PRAGMA_UNROLL_FULL() for (int KEY = 0; KEY < ItemsPerThread; KEY++) diff --git a/cub/cub/block/block_run_length_decode.cuh b/cub/cub/block/block_run_length_decode.cuh index 1ce6cbcb73a..7cca1c1a359 100644 --- a/cub/cub/block/block_run_length_decode.cuh +++ b/cub/cub/block/block_run_length_decode.cuh @@ -277,8 +277,8 @@ private: _CCCL_PRAGMA_UNROLL_FULL() for (int i = 0; i <= Log2::VALUE; i++) { - OffsetT mid = cub::MidPoint(lower_bound, upper_bound); - mid = (::cuda::std::min) (mid, num_items - 1); + auto mid = cub::MidPoint(lower_bound, upper_bound); + mid = (::cuda::std::min) (mid, num_items - 1); if (val < input[mid]) { diff --git a/cub/cub/block/block_store.cuh b/cub/cub/block/block_store.cuh index d55367c4a6c..8527ca7c243 100644 --- a/cub/cub/block/block_store.cuh +++ b/cub/cub/block/block_store.cuh @@ -174,7 +174,7 @@ StoreDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ItemsPerTh if (reinterpret_cast(block_ptr) % (alignof(Vector)) == 0) { // Alias global pointer - Vector* block_ptr_vectors = reinterpret_cast(const_cast(block_ptr)); + auto* block_ptr_vectors = reinterpret_cast(const_cast(block_ptr)); // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling) Vector raw_vector[VECTORS_PER_THREAD]; diff --git a/cub/cub/block/specializations/block_reduce_raking.cuh b/cub/cub/block/specializations/block_reduce_raking.cuh index 2b740da8f97..688dabcf46d 100644 --- a/cub/cub/block/specializations/block_reduce_raking.cuh +++ b/cub/cub/block/specializations/block_reduce_raking.cuh @@ -195,7 +195,7 @@ struct BlockReduceRaking // sync before re-using shmem (warp_storage/raking_grid are aliased) static_assert(RAKING_THREADS <= warp_threads, "RAKING_THREADS must be <= warp size."); - unsigned int mask = static_cast((1ull << RAKING_THREADS) - 1); + auto mask = static_cast((1ull << RAKING_THREADS) - 1); __syncwarp(mask); partial = WarpReduce(temp_storage.warp_storage) diff --git a/cub/cub/block/specializations/block_topk_air.cuh b/cub/cub/block/specializations/block_topk_air.cuh index 43c4d2a59d6..2fd3b59278c 100644 --- a/cub/cub/block/specializations/block_topk_air.cuh +++ b/cub/cub/block/specializations/block_topk_air.cuh @@ -242,10 +242,10 @@ private: for (int pass = 0; pass < num_passes; ++pass) { // Bit-range & mask of the current pass - const int pass_end_bit = end_bit - pass * RadixBits; - const int pass_begin_bit = (::cuda::std::max) (pass_end_bit - RadixBits, begin_bit); - const int pass_bits = pass_end_bit - pass_begin_bit; - const bit_ordered_type pass_mask = ::cuda::bitmask(pass_begin_bit, pass_bits); + const int pass_end_bit = end_bit - pass * RadixBits; + const int pass_begin_bit = (::cuda::std::max) (pass_end_bit - RadixBits, begin_bit); + const int pass_bits = pass_end_bit - pass_begin_bit; + const auto pass_mask = ::cuda::bitmask(pass_begin_bit, pass_bits); // Zero-initialize histograms for the current pass init_histograms(); @@ -329,8 +329,8 @@ private: // Get bit-twiddled sortkeys. 
For float keys, track which were -0.0 (normalized to +0.0 for ranking) so we can // restore -0.0 in the output via a bitvector; no extra key buffer. - bit_ordered_type(&unsigned_keys)[ItemsPerThread] = reinterpret_cast(keys); - constexpr int flip_back_num_words = ::cuda::ceil_div(items_per_thread, 32); + auto& unsigned_keys = reinterpret_cast(keys); + constexpr int flip_back_num_words = ::cuda::ceil_div(items_per_thread, 32); [[maybe_unused]] ::cuda::std::uint32_t flip_back_bits[flip_back_num_words] = {}; if constexpr (::cuda::is_floating_point_v) { diff --git a/cub/cub/detail/rfa.cuh b/cub/cub/detail/rfa.cuh index 9527729fc82..b5d516b5f4f 100644 --- a/cub/cub/detail/rfa.cuh +++ b/cub/cub/detail/rfa.cuh @@ -104,7 +104,7 @@ private: /// Return a binned floating-point bin [[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE static ftype binned_bins(int index) { - ftype* bins = get_shared_bin_array(); + auto* bins = get_shared_bin_array(); return bins[index]; } diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index 8e5ebbcb6eb..16ada67f864 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -103,9 +103,9 @@ __launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).large using InputBufferT = it_value_t; using OutputBufferT = it_value_t; - constexpr uint32_t BLOCK_THREADS = static_cast(policy.block_threads); - constexpr uint32_t ITEMS_PER_THREAD = static_cast(policy.bytes_per_thread); - constexpr BufferSizeT TILE_SIZE = static_cast(BLOCK_THREADS * ITEMS_PER_THREAD); + constexpr auto BLOCK_THREADS = static_cast(policy.block_threads); + constexpr auto ITEMS_PER_THREAD = static_cast(policy.bytes_per_thread); + constexpr auto TILE_SIZE = static_cast(BLOCK_THREADS * ITEMS_PER_THREAD); BufferOffsetT num_blev_buffers = buffer_offset_tile.LoadValid(last_tile_offset); diff --git a/cub/cub/device/dispatch/dispatch_fixed_size_segmented_reduce.cuh b/cub/cub/device/dispatch/dispatch_fixed_size_segmented_reduce.cuh index 3c6001bd3bd..d89f717b637 100644 --- a/cub/cub/device/dispatch/dispatch_fixed_size_segmented_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_fixed_size_segmented_reduce.cuh @@ -304,7 +304,7 @@ struct DispatchFixedSizeSegmentedReduce } // Alias the allocation for the privatized per-block reductions - AccumT* d_block_reductions = static_cast(allocations[0]); + auto* d_block_reductions = static_cast(allocations[0]); for (::cuda::std::int64_t invocation_index = 0; invocation_index < num_invocations; invocation_index++) { diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh index 8cd0a08ff60..ec6c6fb0c9c 100644 --- a/cub/cub/device/dispatch/dispatch_merge.cuh +++ b/cub/cub/device/dispatch/dispatch_merge.cuh @@ -109,7 +109,7 @@ _CCCL_KERNEL_ATTRIBUTES void device_partition_merge_path_kernel( ValueIt3, Offset, CompareOp>::type::items_per_tile; - const Offset diagonal_idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + const auto diagonal_idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); if (diagonal_idx < num_diagonals) { const Offset diagonal_num = (::cuda::std::min) (diagonal_idx * items_per_tile, keys1_count + keys2_count); diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index 786f8380f25..75d084719dc 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -556,8 
+556,8 @@ private: // for testing purposes, one portion is <= 2**28 elements const PortionOffsetT PORTION_SIZE = ((1 << 28) - 1) / ONESWEEP_TILE_ITEMS * ONESWEEP_TILE_ITEMS; int num_passes = ::cuda::ceil_div(end_bit - begin_bit, RADIX_BITS); - OffsetT num_portions = static_cast(::cuda::ceil_div(num_items, PORTION_SIZE)); - PortionOffsetT max_num_blocks = ::cuda::ceil_div( + auto num_portions = static_cast(::cuda::ceil_div(num_items, PORTION_SIZE)); + auto max_num_blocks = ::cuda::ceil_div( static_cast(::cuda::std::min(num_items, static_cast(PORTION_SIZE))), ONESWEEP_TILE_ITEMS); size_t value_size = KEYS_ONLY ? 0 : kernel_source.ValueSize(); @@ -587,11 +587,11 @@ private: return cudaSuccess; } - OffsetT* d_bins = (OffsetT*) allocations[0]; - AtomicOffsetT* d_lookback = (AtomicOffsetT*) allocations[1]; - KeyT* d_keys_tmp2 = (KeyT*) allocations[2]; - ValueT* d_values_tmp2 = (ValueT*) allocations[3]; - AtomicOffsetT* d_ctrs = (AtomicOffsetT*) allocations[4]; + auto* d_bins = (OffsetT*) allocations[0]; + auto* d_lookback = (AtomicOffsetT*) allocations[1]; + auto* d_keys_tmp2 = (KeyT*) allocations[2]; + auto* d_values_tmp2 = (ValueT*) allocations[3]; + auto* d_ctrs = (AtomicOffsetT*) allocations[4]; // initialization if (const auto error = @@ -688,7 +688,7 @@ private: int num_bits = ::cuda::std::min(end_bit - current_bit, RADIX_BITS); for (OffsetT portion = 0; portion < num_portions; ++portion) { - PortionOffsetT portion_num_items = static_cast( + auto portion_num_items = static_cast( ::cuda::std::min(num_items - portion * PORTION_SIZE, static_cast(PORTION_SIZE))); PortionOffsetT num_blocks = ::cuda::ceil_div(portion_num_items, ONESWEEP_TILE_ITEMS); @@ -902,7 +902,7 @@ private: int alt_end_bit = ::cuda::std::min(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits)); // Alias the temporary storage allocations - OffsetT* d_spine = static_cast(allocations[0]); + auto* d_spine = static_cast(allocations[0]); DoubleBuffer d_keys_remaining_passes( (is_overwrite_okay || is_num_passes_odd) ? 
d_keys.Alternate() : static_cast(allocations[1]), diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 1856a8ad1f7..ceca1c48800 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -376,7 +376,7 @@ struct DispatchReduce } // Alias the allocation for the privatized per-block reductions - AccumT* d_block_reductions = static_cast(allocations[0]); + auto* d_block_reductions = static_cast(allocations[0]); // Get grid size for device_reduce_sweep_kernel int reduce_grid_size = even_share.grid_size; @@ -663,7 +663,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_passes( } // Alias the allocation for the privatized per-block reductions - AccumT* d_block_reductions = static_cast(allocations[0]); + auto* d_block_reductions = static_cast(allocations[0]); // Get grid size for device_reduce_sweep_kernel const int reduce_grid_size = even_share.grid_size; diff --git a/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh b/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh index 0c0ef14a3c0..51318bf9889 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh @@ -221,7 +221,7 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t invok } // Alias the allocation for the privatized per-block reductions - DeterministicAccumT* d_block_reductions = static_cast(allocations[0]); + auto* d_block_reductions = static_cast(allocations[0]); auto d_chunk_block_reductions = d_block_reductions; for (int chunk_index = 0; chunk_index < num_chunks; chunk_index++) diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index f27b2fd1e5a..09d9e718ade 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -749,7 +749,7 @@ private: cub::detail::temporary_storage::alias& group_sizes, WrappedPolicyT wrapped_policy) { - constexpr global_segment_offset_t num_segments_per_invocation_limit = + constexpr auto num_segments_per_invocation_limit = static_cast(::cuda::std::numeric_limits::max()); // We repeatedly invoke the partitioning and sorting kernels until all segments are processed. 
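Note: every hunk in this diff follows the same pattern. The first hunk removes '-modernize-use-auto' from the suppressed checks in .clang-tidy, which enables clang-tidy's modernize-use-auto rule; the remaining hunks rewrite declarations whose type is already spelled on the right-hand side of a cast or new-expression to use auto / auto*. A minimal standalone sketch of the pattern, for illustration only (the names sum_words, bytes, words, num_words are hypothetical and not taken from this diff):

  #include <cstddef>
  #include <cstdint>

  std::uint64_t sum_words(const char* bytes, std::size_t n)
  {
    // Before: const std::uint32_t* words = reinterpret_cast<const std::uint32_t*>(bytes);
    // After: the pointee type is deduced from the cast, so it is spelled only once.
    auto* words = reinterpret_cast<const std::uint32_t*>(bytes);

    // Before: std::size_t num_words = n / sizeof(std::uint32_t);
    auto num_words = n / sizeof(std::uint32_t);

    std::uint64_t sum = 0;
    for (std::size_t i = 0; i < num_words; ++i)
    {
      // Assumes bytes is suitably aligned for std::uint32_t, as in the diff's own aliasing code.
      sum += words[i];
    }
    return sum;
  }
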
diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 3fa0cbd6e33..c8d642f58db 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -668,7 +668,7 @@ struct DispatchSelectIf // Initialize the streaming context with the temporary storage for double-buffering the previously selected items // and the total number (across all partitions) of items - num_total_items_t* tmp_num_selected_out = reinterpret_cast(allocations[2]); + auto* tmp_num_selected_out = reinterpret_cast(allocations[2]); streaming_context_t streaming_context{ tmp_num_selected_out, (tmp_num_selected_out + 1), num_items, (num_partitions <= 1)}; @@ -959,7 +959,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch_policy( return cudaSuccess; } - OffsetT* tmp_num_selected_out = reinterpret_cast(allocations[2]); + auto* tmp_num_selected_out = reinterpret_cast(allocations[2]); streaming_context_t streaming_context{ tmp_num_selected_out, (tmp_num_selected_out + 1), num_items, (num_partitions <= 1)}; diff --git a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh index 9c87aed6546..bdc2bba48f9 100644 --- a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh @@ -277,7 +277,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch_streaming_arg_reduce } // Pointer to the double-buffer of global accumulators, which aggregate cross-partition results - global_accum_t* const d_global_aggregates = static_cast(allocations[1]); + auto* const d_global_aggregates = static_cast(allocations[1]); accumulating_out_op = accumulating_transform_output_op_t{ true, diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh index b1dbcee0c8a..958b1e192a4 100644 --- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -243,7 +243,7 @@ struct DispatchThreeWayPartitionIf // Initialize the streaming context with the temporary storage for double-buffering the previously selected items // and the total number (across all partitions) of items - OffsetT* tmp_num_selected_out = static_cast(allocations[1]); + auto* tmp_num_selected_out = static_cast(allocations[1]); streaming_context_t streaming_context{ tmp_num_selected_out, (tmp_num_selected_out + num_counters_per_pass), (num_partitions <= 1)}; diff --git a/cub/cub/device/dispatch/dispatch_topk.cuh b/cub/cub/device/dispatch/dispatch_topk.cuh index 878ad58a231..721d5d3bce1 100644 --- a/cub/cub/device/dispatch/dispatch_topk.cuh +++ b/cub/cub/device/dispatch/dispatch_topk.cuh @@ -597,8 +597,8 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( #endif // CUB_DEBUG_LOG // Initialize address variables - counter_t* counter = static_cast(allocations[0]); - OffsetT* histogram = static_cast(allocations[1]); + auto* counter = static_cast(allocations[0]); + auto* histogram = static_cast(allocations[1]); // Pass 0: dedicated histogram-only kernel over the full input { diff --git a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh index 542032819b7..5b84092f897 100644 --- a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh @@ -267,7 +267,7 @@ struct Transforms template _CCCL_HOST_DEVICE 
_CCCL_FORCEINLINE void BinSelect(SampleT sample, int& bin, bool valid) const { - const CommonT common_sample = static_cast(sample); + const auto common_sample = static_cast(sample); if (valid && this->SampleIsValid(common_sample, m_max, m_min)) { diff --git a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh index 5ea003b55a6..d9d4f075936 100644 --- a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh @@ -158,7 +158,7 @@ _CCCL_KERNEL_ATTRIBUTES void DeviceMergeSortPartitionKernel( _CCCL_GRID_CONSTANT const OffsetT target_merged_tiles_number, _CCCL_GRID_CONSTANT const int items_per_tile) { - const OffsetT partition_idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + const auto partition_idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); if (partition_idx < num_partitions) { AgentPartition{ diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce_deterministic.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce_deterministic.cuh index 4d7c372311c..0616e31ee88 100644 --- a/cub/cub/device/dispatch/kernels/kernel_reduce_deterministic.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_reduce_deterministic.cuh @@ -77,7 +77,7 @@ _CCCL_KERNEL_ATTRIBUTES __launch_bounds__(int( constexpr int bin_length = AccumT::max_index + AccumT::max_fold; const int tid = block_threads * blockIdx.x + threadIdx.x; - ftype* shared_bins = detail::rfa::get_shared_bin_array(); + auto* shared_bins = detail::rfa::get_shared_bin_array(); _CCCL_PRAGMA_UNROLL_FULL() for (int index = threadIdx.x; index < bin_length; index += block_threads) @@ -217,7 +217,7 @@ _CCCL_KERNEL_ATTRIBUTES __launch_bounds__( using float_type = typename AccumT::ftype; constexpr int bin_length = AccumT::max_index + AccumT::max_fold; - float_type* shared_bins = detail::rfa::get_shared_bin_array(); + auto* shared_bins = detail::rfa::get_shared_bin_array(); _CCCL_PRAGMA_UNROLL_FULL() for (int index = threadIdx.x; index < bin_length; index += block_threads) diff --git a/cub/cub/thread/thread_load.cuh b/cub/cub/thread/thread_load.cuh index 70cc4cefa2a..ea9a1ab87b1 100644 --- a/cub/cub/thread/thread_load.cuh +++ b/cub/cub/thread/thread_load.cuh @@ -292,7 +292,7 @@ ThreadLoadVolatilePointer(const T* ptr, ::cuda::std::false_type /*is_primitive*/ constexpr int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); T retval; - VolatileWord* words = reinterpret_cast(&retval); + auto* words = reinterpret_cast(&retval); UnrolledCopy(reinterpret_cast(ptr), words); return retval; } diff --git a/cub/cub/util_allocator.cuh b/cub/cub/util_allocator.cuh index a65bcb9a8a3..c3764a4bf14 100644 --- a/cub/cub/util_allocator.cuh +++ b/cub/cub/util_allocator.cuh @@ -432,7 +432,7 @@ struct CachingDeviceAllocator } // Iterate through the range of cached blocks on the same device in the same bin - CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); + auto block_itr = cached_blocks.lower_bound(search_key); while ((block_itr != cached_blocks.end()) && (block_itr->device == device) && (block_itr->bin == search_key.bin)) { // To prevent races with reusing blocks returned by the host but still @@ -525,7 +525,7 @@ struct CachingDeviceAllocator // Iterate the range of free blocks on the same device BlockDescriptor free_key(device); - CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key); + auto block_itr = cached_blocks.lower_bound(free_key); while ((block_itr != cached_blocks.end()) && (block_itr->device == 
device)) { @@ -683,7 +683,7 @@ struct CachingDeviceAllocator // Find corresponding block descriptor bool recached = false; BlockDescriptor search_key(d_ptr, device); - BusyBlocks::iterator block_itr = live_blocks.find(search_key); + auto block_itr = live_blocks.find(search_key); if (block_itr != live_blocks.end()) { // Remove from live blocks @@ -811,7 +811,7 @@ struct CachingDeviceAllocator while (!cached_blocks.empty()) { // Get first block - CachedBlocks::iterator begin = cached_blocks.begin(); + auto begin = cached_blocks.begin(); // Get entry-point device ordinal if necessary if (entrypoint_device == INVALID_DEVICE_ORDINAL) diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index ecff44b8fc0..aa5b54fc2a0 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -218,8 +218,8 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleUp(T input, int src_offset, int first_th constexpr int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); T output; - ShuffleWord* output_alias = reinterpret_cast(&output); - ShuffleWord* input_alias = reinterpret_cast(&input); + auto* output_alias = reinterpret_cast(&output); + auto* input_alias = reinterpret_cast(&input); unsigned int shuffle_word; shuffle_word = SHFL_UP_SYNC((unsigned int) input_alias[0], src_offset, first_thread | SHFL_C, member_mask); @@ -296,8 +296,8 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleDown(T input, int src_offset, int last_t constexpr int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); T output; - ShuffleWord* output_alias = reinterpret_cast(&output); - ShuffleWord* input_alias = reinterpret_cast(&input); + auto* output_alias = reinterpret_cast(&output); + auto* input_alias = reinterpret_cast(&input); unsigned int shuffle_word; shuffle_word = SHFL_DOWN_SYNC((unsigned int) input_alias[0], src_offset, last_thread | SHFL_C, member_mask); @@ -370,8 +370,8 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleIndex(T input, int src_lane, unsigned in constexpr int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); T output; - ShuffleWord* output_alias = reinterpret_cast(&output); - ShuffleWord* input_alias = reinterpret_cast(&input); + auto* output_alias = reinterpret_cast(&output); + auto* input_alias = reinterpret_cast(&input); unsigned int shuffle_word; shuffle_word = __shfl_sync(member_mask, (unsigned int) input_alias[0], src_lane, LOGICAL_WARP_THREADS); diff --git a/cub/examples/block/example_block_radix_sort.cu b/cub/examples/block/example_block_radix_sort.cu index 614a38a8e52..39e97f61187 100644 --- a/cub/examples/block/example_block_radix_sort.cu +++ b/cub/examples/block/example_block_radix_sort.cu @@ -140,9 +140,9 @@ void Test() constexpr int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD; // Allocate host arrays - Key* h_in = new Key[TILE_SIZE * g_grid_size]; - Key* h_reference = new Key[TILE_SIZE * g_grid_size]; - clock_t* h_elapsed = new clock_t[g_grid_size]; + auto* h_in = new Key[TILE_SIZE * g_grid_size]; + auto* h_reference = new Key[TILE_SIZE * g_grid_size]; + auto* h_elapsed = new clock_t[g_grid_size]; // Initialize problem and reference output on host Initialize(h_in, h_reference, TILE_SIZE * g_grid_size, TILE_SIZE); diff --git a/cub/examples/device/example_device_partition_flagged.cu b/cub/examples/device/example_device_partition_flagged.cu index e91fadedb09..a68aa31eeeb 100644 --- a/cub/examples/device/example_device_partition_flagged.cu +++ b/cub/examples/device/example_device_partition_flagged.cu @@ -134,9 +134,9 @@ int main(int argc, char** argv) 
CubDebugExit(args.DeviceInit()); // Allocate host arrays - int* h_in = new int[num_items]; - int* h_reference = new int[num_items]; - unsigned char* h_flags = new unsigned char[num_items]; + auto* h_in = new int[num_items]; + auto* h_reference = new int[num_items]; + auto* h_flags = new unsigned char[num_items]; // Initialize problem and solution Initialize(h_in, h_flags, num_items, max_segment); diff --git a/cub/examples/device/example_device_partition_if.cu b/cub/examples/device/example_device_partition_if.cu index 41c01523675..f149e8f4027 100644 --- a/cub/examples/device/example_device_partition_if.cu +++ b/cub/examples/device/example_device_partition_if.cu @@ -150,7 +150,7 @@ int main(int argc, char** argv) // DevicePartition a pivot index unsigned int pivot_index; - unsigned int max_int = (unsigned int) -1; + auto max_int = (unsigned int) -1; RandomBits(pivot_index); pivot_index = (unsigned int) ((float(pivot_index) * (float(num_items - 1) / float(max_int)))); printf("Pivot idx: %d\n", pivot_index); diff --git a/cub/examples/device/example_device_radix_sort.cu b/cub/examples/device/example_device_radix_sort.cu index c8d64c46b42..417b71767c1 100644 --- a/cub/examples/device/example_device_radix_sort.cu +++ b/cub/examples/device/example_device_radix_sort.cu @@ -127,8 +127,8 @@ int main(int argc, char** argv) fflush(stdout); // Allocate host arrays - float* h_keys = new float[num_items]; - float* h_reference_keys = new float[num_items]; + auto* h_keys = new float[num_items]; + auto* h_reference_keys = new float[num_items]; int* h_values = new int[num_items]; int* h_reference_values = new int[num_items]; diff --git a/cub/examples/device/example_device_select_flagged.cu b/cub/examples/device/example_device_select_flagged.cu index 251ba65e570..70a23b8cdc2 100644 --- a/cub/examples/device/example_device_select_flagged.cu +++ b/cub/examples/device/example_device_select_flagged.cu @@ -134,9 +134,9 @@ int main(int argc, char** argv) CubDebugExit(args.DeviceInit()); // Allocate host arrays - int* h_in = new int[num_items]; - int* h_reference = new int[num_items]; - unsigned char* h_flags = new unsigned char[num_items]; + int* h_in = new int[num_items]; + int* h_reference = new int[num_items]; + auto* h_flags = new unsigned char[num_items]; // Initialize problem and solution Initialize(h_in, h_flags, num_items, max_segment); diff --git a/cub/examples/device/example_device_select_if.cu b/cub/examples/device/example_device_select_if.cu index ed282c5afde..d238e6f4d47 100644 --- a/cub/examples/device/example_device_select_if.cu +++ b/cub/examples/device/example_device_select_if.cu @@ -150,7 +150,7 @@ int main(int argc, char** argv) // Select a pivot index unsigned int pivot_index; - unsigned int max_int = (unsigned int) -1; + auto max_int = (unsigned int) -1; RandomBits(pivot_index); pivot_index = (unsigned int) ((float(pivot_index) * (float(num_items - 1) / float(max_int)))); printf("Pivot idx: %d\n", pivot_index); diff --git a/cub/examples/device/example_device_sort_find_non_trivial_runs.cu b/cub/examples/device/example_device_sort_find_non_trivial_runs.cu index aa27d140e4d..0ad839a922d 100644 --- a/cub/examples/device/example_device_sort_find_non_trivial_runs.cu +++ b/cub/examples/device/example_device_sort_find_non_trivial_runs.cu @@ -99,7 +99,7 @@ int Solve(Key* h_keys, Value* h_values, int num_items, int* h_offsets_reference, { // Sort - Pair* h_pairs = new Pair[num_items]; + auto* h_pairs = new Pair[num_items]; for (int i = 0; i < num_items; ++i) { h_pairs[i].key = h_keys[i]; @@ -197,7 +197,7 
@@ int main(int argc, char** argv) // Allocate host arrays (problem and reference solution) Key* h_keys = new Key[num_items]; - Value* h_values = new Value[num_items]; + auto* h_values = new Value[num_items]; int* h_offsets_reference = new int[num_items]; int* h_lengths_reference = new int[num_items]; diff --git a/cub/test/catch2_radix_sort_helper.cuh b/cub/test/catch2_radix_sort_helper.cuh index ecb501a287a..3587966f000 100644 --- a/cub/test/catch2_radix_sort_helper.cuh +++ b/cub/test/catch2_radix_sort_helper.cuh @@ -186,7 +186,7 @@ c2h::host_vector get_striped_keys(const c2h::host_vector& h_keys, in for (std::size_t i = 0; i < h_keys.size(); i++) { - bit_ordered_t key = ::cuda::std::bit_cast(h_keys[i]); + auto key = ::cuda::std::bit_cast(h_keys[i]); if constexpr (::cuda::is_floating_point_v) { @@ -445,9 +445,9 @@ struct radix_offset_scan_op_t template void generate_segment_offsets(c2h::seed_t seed, c2h::device_vector& offsets, std::size_t num_items) { - const std::size_t num_segments = offsets.size() - 1; - const OffsetT expected_segment_length = static_cast(::cuda::ceil_div(num_items, num_segments)); - const OffsetT max_segment_length = (expected_segment_length * 2) + 1; + const std::size_t num_segments = offsets.size() - 1; + const auto expected_segment_length = static_cast(::cuda::ceil_div(num_items, num_segments)); + const OffsetT max_segment_length = (expected_segment_length * 2) + 1; c2h::gen(seed, offsets, OffsetT{0}, max_segment_length); thrust::exclusive_scan( c2h::device_policy, diff --git a/cub/test/catch2_segmented_sort_helper.cuh b/cub/test/catch2_segmented_sort_helper.cuh index d0bd0a6d7c1..c7858c32a43 100644 --- a/cub/test/catch2_segmented_sort_helper.cuh +++ b/cub/test/catch2_segmented_sort_helper.cuh @@ -266,7 +266,7 @@ using unwrap_value_t = typename unwrap_value_t_impl::type; template __host__ __device__ __forceinline__ double compute_conversion_factor(int segment_size, T) { - const double max_value = static_cast(::cuda::std::numeric_limits::max()); + const auto max_value = static_cast(::cuda::std::numeric_limits::max()); return (max_value + 1) / segment_size; } @@ -433,8 +433,8 @@ private: // Compute the original input value corresponding to the current duplicate key. // NOTE: Keys and values are generated using opposing ascending/descending parameters, so the generated input // values are descending when generating ascending input keys for a descending sort. - const int conv_idx = sort_descending ? (segment_size - 1 - in_seg_idx) : in_seg_idx; - const ValueT current_value = static_cast(conv_idx * value_conversion); + const int conv_idx = sort_descending ? 
(segment_size - 1 - in_seg_idx) : in_seg_idx; + const auto current_value = static_cast(conv_idx * value_conversion); if constexpr (STABLE) { // For stable sorts, the output value must appear at an exact offset: diff --git a/cub/test/catch2_test_block_histogram.cu b/cub/test/catch2_test_block_histogram.cu index 6475a2c2568..c9920f1bdfd 100644 --- a/cub/test/catch2_test_block_histogram.cu +++ b/cub/test/catch2_test_block_histogram.cu @@ -78,7 +78,7 @@ C2H_TEST("Block histogram can be computed with uniform input", using params = params_t; using sample_t = typename params::sample_t; - const sample_t uniform_value = static_cast(GENERATE_COPY(take(10, random(0, params::bins - 1)))); + const auto uniform_value = static_cast(GENERATE_COPY(take(10, random(0, params::bins - 1)))); c2h::host_vector h_samples(params::num_samples, uniform_value); c2h::host_vector h_reference(params::bins); @@ -149,7 +149,7 @@ C2H_TEST("Block histogram can be computed with random input", c2h::device_vector d_histogram(params::bins); c2h::device_vector d_samples(params::num_samples); - const sample_t min_bin = static_cast(0); + const auto min_bin = static_cast(0); const sample_t max_bin = static_cast( std::min(static_cast(cuda::std::numeric_limits::max()), static_cast(params::bins - 1))); diff --git a/cub/test/catch2_test_device_find_if.cu b/cub/test/catch2_test_device_find_if.cu index aadf74f0c76..1336393641a 100644 --- a/cub/test/catch2_test_device_find_if.cu +++ b/cub/test/catch2_test_device_find_if.cu @@ -165,7 +165,7 @@ C2H_TEST("Device find_if works with non primitive iterator", "[device][find_if]" constexpr offset_t min_items = 1; constexpr offset_t max_items = 10000000; // 10M items for reasonable test time - input_t val_to_find = static_cast(GENERATE_COPY(take(1, random(min_items, max_items)))); + auto val_to_find = static_cast(GENERATE_COPY(take(1, random(min_items, max_items)))); // Generate the input sizes to test for const offset_t num_items = GENERATE_COPY( take(1, random(min_items, max_items)), diff --git a/cub/test/catch2_test_device_merge_sort.cu b/cub/test/catch2_test_device_merge_sort.cu index 4cf70182cc9..a533754f310 100644 --- a/cub/test/catch2_test_device_merge_sort.cu +++ b/cub/test/catch2_test_device_merge_sort.cu @@ -412,7 +412,7 @@ C2H_TEST("DeviceMergeSort::StableSortPairs works for large inputs", // Clamp 64-bit offset type problem sizes to just slightly larger than 2^32 items auto num_items_ull = std::min(static_cast(cuda::std::numeric_limits::max()) - 1, cuda::std::numeric_limits::max() + static_cast(2000000ULL)); - offset_t num_items = static_cast(num_items_ull); + auto num_items = static_cast(num_items_ull); SECTION("Random") { diff --git a/cub/test/catch2_test_device_partition_flagged.cu b/cub/test/catch2_test_device_partition_flagged.cu index 4725ccafc8e..5b9509b9f4e 100644 --- a/cub/test/catch2_test_device_partition_flagged.cu +++ b/cub/test/catch2_test_device_partition_flagged.cu @@ -390,7 +390,7 @@ try using type = std::int64_t; using offset_t = typename c2h::get<0, TestType>; - const offset_t num_items_max = detail::make_large_offset(); + const auto num_items_max = detail::make_large_offset(); const offset_t num_items_min = num_items_max > 10000 ? 
num_items_max - 10000ULL : offset_t{0}; const offset_t num_items = GENERATE_COPY( values( diff --git a/cub/test/catch2_test_device_partition_if.cu b/cub/test/catch2_test_device_partition_if.cu index f346aedeab7..ac6fc1f5a23 100644 --- a/cub/test/catch2_test_device_partition_if.cu +++ b/cub/test/catch2_test_device_partition_if.cu @@ -303,7 +303,7 @@ try using type = std::int64_t; using offset_t = typename c2h::get<0, TestType>; - const offset_t num_items_max = detail::make_large_offset(); + const auto num_items_max = detail::make_large_offset(); const offset_t num_items_min = num_items_max > 10000 ? num_items_max - 10000ULL : offset_t{0}; const offset_t num_items = GENERATE_COPY( values( diff --git a/cub/test/catch2_test_device_radix_sort_keys.cu b/cub/test/catch2_test_device_radix_sort_keys.cu index b02fccf3eb4..f1febc7bdd0 100644 --- a/cub/test/catch2_test_device_radix_sort_keys.cu +++ b/cub/test/catch2_test_device_radix_sort_keys.cu @@ -168,8 +168,8 @@ C2H_TEST("DeviceRadixSort::SortKeys: negative zero handling", "[keys][radix][sor using bits_t = typename cub::Traits::UnsignedBits; constexpr std::size_t num_bits = sizeof(key_t) * CHAR_BIT; - const key_t positive_zero = cuda::std::bit_cast(bits_t(0)); - const key_t negative_zero = cuda::std::bit_cast(bits_t(1) << (num_bits - 1)); + const auto positive_zero = cuda::std::bit_cast(bits_t(0)); + const auto negative_zero = cuda::std::bit_cast(bits_t(1) << (num_bits - 1)); constexpr std::size_t max_num_items = 1 << 18; const std::size_t num_items = GENERATE_COPY(take(1, random(max_num_items / 2, max_num_items))); @@ -461,7 +461,7 @@ void do_large_offset_test(std::size_t num_items) double_buffer_sort_t action(is_descending); action.initialize(); - const num_items_t typed_num_items = static_cast(num_items); + const auto typed_num_items = static_cast(num_items); launch(action, arrays.keys_buffer, typed_num_items, begin_bit(), end_bit()); arrays.keys_buffer.selector = action.selector(); diff --git a/cub/test/catch2_test_device_radix_sort_pairs.cu b/cub/test/catch2_test_device_radix_sort_pairs.cu index 05d3493bb56..3f6f9b422cb 100644 --- a/cub/test/catch2_test_device_radix_sort_pairs.cu +++ b/cub/test/catch2_test_device_radix_sort_pairs.cu @@ -146,7 +146,7 @@ void do_large_offset_test(std::size_t num_items) double_buffer_sort_t action(is_descending); action.initialize(); - const num_items_t typed_num_items = static_cast(num_items); + const auto typed_num_items = static_cast(num_items); launch(action, arrays.keys_buffer, arrays.values_buffer, typed_num_items, begin_bit(), end_bit()); TIME(timer.print_elapsed_seconds_and_reset("Device sort")); diff --git a/cub/test/catch2_test_device_reduce.cu b/cub/test/catch2_test_device_reduce.cu index fca7ca9a511..c7e7c844608 100644 --- a/cub/test/catch2_test_device_reduce.cu +++ b/cub/test/catch2_test_device_reduce.cu @@ -137,9 +137,8 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f auto reduction_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); // Prepare verification data - using accum_t = cuda::std::__accumulator_t; - output_t expected_result = - static_cast(compute_single_problem_reference(in_items, reduction_op, accum_t{})); + using accum_t = cuda::std::__accumulator_t; + auto expected_result = static_cast(compute_single_problem_reference(in_items, reduction_op, accum_t{})); // Run test c2h::device_vector out_result(num_segments); @@ -161,7 +160,7 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f using accum_t = 
cuda::std::__accumulator_t; // Prepare verification data - output_t expected_result = static_cast(compute_single_problem_reference(in_items, op_t{}, accum_t{})); + auto expected_result = static_cast(compute_single_problem_reference(in_items, op_t{}, accum_t{})); // Run test c2h::device_vector out_result(num_segments); @@ -219,8 +218,8 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f device_arg_max(unwrap_it(d_in_it), d_extremum_out, d_index_out, num_items); // Verify result - result_t gpu_result = out_result[0]; - output_t gpu_extremum = static_cast(gpu_result.second); // Explicitly rewrap the gpu value + result_t gpu_result = out_result[0]; + auto gpu_extremum = static_cast(gpu_result.second); // Explicitly rewrap the gpu value REQUIRE(expected_result[0] == gpu_extremum); REQUIRE((expected_result - host_items.cbegin()) == gpu_result.first); } @@ -240,8 +239,8 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f device_arg_min(unwrap_it(d_in_it), d_extremum_out, d_index_out, num_items); // Verify result - result_t gpu_result = out_result[0]; - output_t gpu_extremum = static_cast(gpu_result.second); // Explicitly rewrap the gpu value + result_t gpu_result = out_result[0]; + auto gpu_extremum = static_cast(gpu_result.second); // Explicitly rewrap the gpu value REQUIRE(expected_result[0] == gpu_extremum); REQUIRE((expected_result - host_items.cbegin()) == gpu_result.first); } @@ -259,7 +258,7 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f // Verify result for the deprecated interface result_t gpu_result = out_result[0]; - output_t gpu_value = static_cast(gpu_result.value); // Explicitly rewrap the gpu value + auto gpu_value = static_cast(gpu_result.value); // Explicitly rewrap the gpu value REQUIRE(expected_result[0] == gpu_value); REQUIRE((expected_result - host_items.cbegin()) == gpu_result.key); } @@ -277,7 +276,7 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f // Verify result for the deprecated interface result_t gpu_result = out_result[0]; - output_t gpu_value = static_cast(gpu_result.value); // Explicitly rewrap the gpu value + auto gpu_value = static_cast(gpu_result.value); // Explicitly rewrap the gpu value REQUIRE(expected_result[0] == gpu_value); REQUIRE((expected_result - host_items.cbegin()) == gpu_result.key); } @@ -300,8 +299,8 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f device_arg_min(unwrap_it(d_in_it), d_extremum_out, d_index_out, num_items, compare_op); // Verify result - result_t gpu_result = out_result[0]; - output_t gpu_extremum = static_cast(gpu_result.second); // Explicitly rewrap the gpu value + result_t gpu_result = out_result[0]; + auto gpu_extremum = static_cast(gpu_result.second); // Explicitly rewrap the gpu value REQUIRE(expected_result[0] == gpu_extremum); REQUIRE((expected_result - host_items.cbegin()) == gpu_result.first); } @@ -323,8 +322,8 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f device_arg_max(unwrap_it(d_in_it), d_extremum_out, d_index_out, num_items, compare_op); // Verify result - result_t gpu_result = out_result[0]; - output_t gpu_extremum = static_cast(gpu_result.second); // Explicitly rewrap the gpu value + result_t gpu_result = out_result[0]; + auto gpu_extremum = static_cast(gpu_result.second); // Explicitly rewrap the gpu value REQUIRE(expected_result[0] == gpu_extremum); REQUIRE((expected_result - host_items.cbegin()) == 
gpu_result.first); } diff --git a/cub/test/catch2_test_device_reduce_by_key.cu b/cub/test/catch2_test_device_reduce_by_key.cu index 75a1766921c..478c132e405 100644 --- a/cub/test/catch2_test_device_reduce_by_key.cu +++ b/cub/test/catch2_test_device_reduce_by_key.cu @@ -81,7 +81,7 @@ C2H_TEST("Device reduce-by-key works", "[by_key][reduce][device]", full_type_lis C2H_SEED(1), num_items, std::get<0>(seg_size_range), std::get<1>(seg_size_range)); // Get array of keys from segment offsets - const offset_t num_segments = static_cast(segment_offsets.size() - 1); + const auto num_segments = static_cast(segment_offsets.size() - 1); c2h::device_vector segment_keys(num_items); c2h::init_key_segments(segment_offsets, segment_keys); auto d_keys_it = thrust::raw_pointer_cast(segment_keys.data()); diff --git a/cub/test/catch2_test_device_reduce_by_key_iterators.cu b/cub/test/catch2_test_device_reduce_by_key_iterators.cu index 1eb8a34a9ca..4557724696e 100644 --- a/cub/test/catch2_test_device_reduce_by_key_iterators.cu +++ b/cub/test/catch2_test_device_reduce_by_key_iterators.cu @@ -52,7 +52,7 @@ C2H_TEST("Device reduce-by-key works with iterators", "[by_key][reduce][device]" C2H_SEED(1), num_items, std::get<0>(seg_size_range), std::get<1>(seg_size_range)); // Get array of keys from segment offsets - const offset_t num_segments = static_cast(segment_offsets.size() - 1); + const auto num_segments = static_cast(segment_offsets.size() - 1); c2h::device_vector segment_keys(num_items); c2h::init_key_segments(segment_offsets, segment_keys); auto d_keys_it = segment_keys.cbegin(); diff --git a/cub/test/catch2_test_device_reduce_by_key_vsmem.cu b/cub/test/catch2_test_device_reduce_by_key_vsmem.cu index 6c722a7394e..47ba7b342db 100644 --- a/cub/test/catch2_test_device_reduce_by_key_vsmem.cu +++ b/cub/test/catch2_test_device_reduce_by_key_vsmem.cu @@ -38,7 +38,7 @@ C2H_TEST("Device reduce-by-key works with huge keys", "[by_key][reduce][device]" // Generate input segments c2h::device_vector segment_offsets = c2h::gen_uniform_offsets( C2H_SEED(1), num_items, std::get<0>(seg_size_range), std::get<1>(seg_size_range)); - const offset_t num_segments = static_cast(segment_offsets.size() - 1); + const auto num_segments = static_cast(segment_offsets.size() - 1); c2h::device_vector segment_keys(num_items); c2h::init_key_segments(segment_offsets, segment_keys); auto d_keys_it = thrust::raw_pointer_cast(segment_keys.data()); diff --git a/cub/test/catch2_test_device_reduce_large_offsets.cu b/cub/test/catch2_test_device_reduce_large_offsets.cu index 9f691aaf56d..e4600d8bf43 100644 --- a/cub/test/catch2_test_device_reduce_large_offsets.cu +++ b/cub/test/catch2_test_device_reduce_large_offsets.cu @@ -67,7 +67,7 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", o CAPTURE(c2h::type_name()); - const offset_t num_items_max = detail::make_large_offset(); + const auto num_items_max = detail::make_large_offset(); const offset_t num_items_min = num_items_max > 10000 ? 
num_items_max - 10000ULL : offset_t{0}; // Generate the input sizes to test for diff --git a/cub/test/catch2_test_device_rle_custom_policy_hub.cu b/cub/test/catch2_test_device_rle_custom_policy_hub.cu index 98e3cec75c1..d5391a57235 100644 --- a/cub/test/catch2_test_device_rle_custom_policy_hub.cu +++ b/cub/test/catch2_test_device_rle_custom_policy_hub.cu @@ -39,7 +39,7 @@ C2H_TEST("DeviceRleDispatch::Dispatch: custom policy hub", "[device][run_length_ using equal_t = cuda::std::equal_to<>; c2h::device_vector d_in{1, 1, 2, 2, 2, 3, 3, 4, 4}; - const offset_t num_items = static_cast(d_in.size()); + const auto num_items = static_cast(d_in.size()); c2h::device_vector d_offsets(4, thrust::no_init); c2h::device_vector d_lengths(4, thrust::no_init); diff --git a/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu b/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu index df23d5d99d9..7f02e320fe7 100644 --- a/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu +++ b/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu @@ -167,10 +167,10 @@ bool validate_results( const c2h::host_vector& h_out_lengths = out_lengths; const c2h::host_vector& h_out_num_runs = out_num_runs; - const cuda::std::size_t num_runs = static_cast(h_out_num_runs.front()); + const auto num_runs = static_cast(h_out_num_runs.front()); for (cuda::std::size_t run = 0; run < num_runs; ++run) { - const cuda::std::size_t first_index = static_cast(h_out_offsets[run]); + const auto first_index = static_cast(h_out_offsets[run]); const cuda::std::size_t final_index = first_index + static_cast(h_out_lengths[run]); // Ensure we started a new run diff --git a/cub/test/catch2_test_device_scan.cuh b/cub/test/catch2_test_device_scan.cuh index 5aeda0cbcbc..548a2703a2d 100644 --- a/cub/test/catch2_test_device_scan.cuh +++ b/cub/test/catch2_test_device_scan.cuh @@ -42,7 +42,7 @@ void compute_exclusive_scan_reference(InputIt first, InputIt last, OutputIt resu using value_t = cub::detail::it_value_t; using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::it_value_t; - accum_t acc = static_cast(init); + auto acc = static_cast(init); for (; first != last; ++first) { auto v = *first; @@ -57,7 +57,7 @@ void compute_inclusive_scan_reference(InputIt first, InputIt last, OutputIt resu using value_t = cub::detail::it_value_t; using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::it_value_t; - accum_t acc = static_cast(init); + auto acc = static_cast(init); for (; first != last; ++first) { acc = op(acc, *first); @@ -88,9 +88,9 @@ void compute_exclusive_scan_by_key_reference( { for (std::size_t i = 0; i < num_items;) { - accum_t val = static_cast(h_values_it[i]); - result_out_it[i] = init; - accum_t inclusive = static_cast(scan_op(init, val)); + auto val = static_cast(h_values_it[i]); + result_out_it[i] = init; + auto inclusive = static_cast(scan_op(init, val)); ++i; diff --git a/cub/test/catch2_test_device_scan_by_key_large_offsets.cu b/cub/test/catch2_test_device_scan_by_key_large_offsets.cu index 66915bf87f9..22c202e10d5 100644 --- a/cub/test/catch2_test_device_scan_by_key_large_offsets.cu +++ b/cub/test/catch2_test_device_scan_by_key_large_offsets.cu @@ -81,7 +81,7 @@ try using index_t = std::uint64_t; using offset_t = typename c2h::get<0, TestType>; - const offset_t num_items_max = detail::make_large_offset(); + const auto num_items_max = detail::make_large_offset(); const offset_t num_items_min = num_items_max > 10000 ? 
num_items_max - 10000ULL : offset_t{0}; const offset_t num_items = GENERATE_COPY( values( diff --git a/cub/test/catch2_test_device_scan_large_offsets.cu b/cub/test/catch2_test_device_scan_large_offsets.cu index 9b7f4e443b4..2f49ffaa5b5 100644 --- a/cub/test/catch2_test_device_scan_large_offsets.cu +++ b/cub/test/catch2_test_device_scan_large_offsets.cu @@ -81,7 +81,7 @@ try using offset_t = typename c2h::get<0, TestType>; // Clamp 64-bit offset type problem sizes to just slightly larger than 2^32 items - const offset_t num_items_max = detail::make_large_offset(); + const auto num_items_max = detail::make_large_offset(); const offset_t num_items_min = num_items_max > 10000 ? num_items_max - 10000ULL : offset_t{0}; const offset_t num_items = GENERATE_COPY( values( @@ -124,7 +124,7 @@ try using offset_t = typename c2h::get<0, TestType>; // Clamp 64-bit offset type problem sizes to just slightly larger than 2^32 items - const offset_t num_items_max = detail::make_large_offset(); + const auto num_items_max = detail::make_large_offset(); const offset_t num_items_min = num_items_max > 10000 ? num_items_max - 10000ULL : offset_t{0}; const offset_t num_items = GENERATE_COPY( values( diff --git a/cub/test/catch2_test_device_segmented_reduce.cu b/cub/test/catch2_test_device_segmented_reduce.cu index 33aa1d9a54b..0e6a8396d4a 100644 --- a/cub/test/catch2_test_device_segmented_reduce.cu +++ b/cub/test/catch2_test_device_segmented_reduce.cu @@ -92,8 +92,8 @@ C2H_TEST("Device reduce works with all device interfaces", "[segmented][reduce][ // Generate input segments c2h::device_vector segment_offsets = c2h::gen_uniform_offsets( C2H_SEED(1), num_items, std::get<0>(seg_size_range), std::get<1>(seg_size_range)); - const offset_t num_segments = static_cast(segment_offsets.size() - 1); - auto d_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); + const auto num_segments = static_cast(segment_offsets.size() - 1); + auto d_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); // Generate input data c2h::device_vector in_items(num_items); @@ -266,7 +266,7 @@ C2H_TEST("Device fixed size segmented reduce works with all device interfaces", auto d_out_it = thrust::raw_pointer_cast(out_result.data()); using init_t = cub::detail::it_value_t; - init_t init = static_cast(*unwrap_it(&default_constant)); + auto init = static_cast(*unwrap_it(&default_constant)); device_segmented_reduce(unwrap_it(d_in_it), unwrap_it(d_out_it), num_segments, segment_size, reduction_op, init); // Verify result REQUIRE(expected_result == out_result); diff --git a/cub/test/catch2_test_device_segmented_reduce_iterators.cu b/cub/test/catch2_test_device_segmented_reduce_iterators.cu index f8d89293d5b..497ba00d617 100644 --- a/cub/test/catch2_test_device_segmented_reduce_iterators.cu +++ b/cub/test/catch2_test_device_segmented_reduce_iterators.cu @@ -51,8 +51,8 @@ C2H_TEST("Device segmented reduce works with fancy input iterators", "[reduce][d // Generate input segments c2h::device_vector segment_offsets = c2h::gen_uniform_offsets( C2H_SEED(1), num_items, std::get<0>(seg_size_range), std::get<1>(seg_size_range)); - const offset_t num_segments = static_cast(segment_offsets.size() - 1); - auto d_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); + const auto num_segments = static_cast(segment_offsets.size() - 1); + auto d_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); // Prepare input data item_t default_constant{}; diff --git a/cub/test/catch2_test_device_segmented_reduce_max_seg_size.cu 
b/cub/test/catch2_test_device_segmented_reduce_max_seg_size.cu index 478205f6451..f5e8576a3d3 100644 --- a/cub/test/catch2_test_device_segmented_reduce_max_seg_size.cu +++ b/cub/test/catch2_test_device_segmented_reduce_max_seg_size.cu @@ -69,8 +69,8 @@ C2H_TEST("Device segmented reduce works with dynamic max segment sizes", // Generate input segments c2h::device_vector segment_offsets = c2h::gen_uniform_offsets(C2H_SEED(1), num_items, seg_size_min, seg_size_max); - const offset_t num_segments = static_cast(segment_offsets.size() - 1); - auto d_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); + const auto num_segments = static_cast(segment_offsets.size() - 1); + auto d_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); // Generate input data c2h::device_vector in_items(num_items); diff --git a/cub/test/catch2_test_device_segmented_scan.cu b/cub/test/catch2_test_device_segmented_scan.cu index 30c2770456f..7cfdb2cee5b 100644 --- a/cub/test/catch2_test_device_segmented_scan.cu +++ b/cub/test/catch2_test_device_segmented_scan.cu @@ -145,8 +145,8 @@ C2H_TEST("Device segmented_scan works with all device interfaces", "[segmented][ // Generate input segments c2h::device_vector d_segment_offsets = c2h::gen_uniform_offsets( C2H_SEED(1), num_items, std::get<0>(seg_size_range), std::get<1>(seg_size_range)); - const offset_t num_segments = static_cast(d_segment_offsets.size() - 1); - auto d_offsets_it = thrust::raw_pointer_cast(d_segment_offsets.data()); + const auto num_segments = static_cast(d_segment_offsets.size() - 1); + auto d_offsets_it = thrust::raw_pointer_cast(d_segment_offsets.data()); INFO("Num segments: " << num_segments); INFO("Types: " << typeid(input_t).name() << " " << typeid(output_t).name() << " " << typeid(offset_t).name()); diff --git a/cub/test/catch2_test_device_segmented_topk_keys.cu b/cub/test/catch2_test_device_segmented_topk_keys.cu index 496ead85b73..0c0e436a85c 100644 --- a/cub/test/catch2_test_device_segmented_topk_keys.cu +++ b/cub/test/catch2_test_device_segmented_topk_keys.cu @@ -212,9 +212,9 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segment constexpr auto max_segment_size = static_max_segment_size; c2h::device_vector segment_offsets = c2h::gen_uniform_offsets(C2H_SEED(3), num_items, min_segment_size, max_segment_size); - const segment_index_t num_segments = static_cast(segment_offsets.size() - 1); - auto segment_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); - auto segment_size_it = cuda::make_transform_iterator( + const auto num_segments = static_cast(segment_offsets.size() - 1); + auto segment_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); + auto segment_size_it = cuda::make_transform_iterator( cuda::make_counting_iterator(segment_index_t{0}), segment_size_op{segment_offsets_it}); // Set the k value diff --git a/cub/test/catch2_test_device_segmented_topk_pairs.cu b/cub/test/catch2_test_device_segmented_topk_pairs.cu index 47a4bd4f950..24a188ead68 100644 --- a/cub/test/catch2_test_device_segmented_topk_pairs.cu +++ b/cub/test/catch2_test_device_segmented_topk_pairs.cu @@ -329,9 +329,9 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small variable-size segmen constexpr auto max_segment_size = static_max_segment_size; c2h::device_vector segment_offsets = c2h::gen_uniform_offsets(C2H_SEED(3), num_items, min_segment_size, max_segment_size); - const segment_index_t num_segments = static_cast(segment_offsets.size() - 1); - auto segment_offsets_it = 
thrust::raw_pointer_cast(segment_offsets.data()); - auto segment_size_it = cuda::make_transform_iterator( + const auto num_segments = static_cast(segment_offsets.size() - 1); + auto segment_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); + auto segment_size_it = cuda::make_transform_iterator( cuda::make_counting_iterator(segment_index_t{0}), segment_size_op{segment_offsets_it}); // Set the k value diff --git a/cub/test/catch2_test_device_three_way_partition.cu b/cub/test/catch2_test_device_three_way_partition.cu index ced8389e96f..a6de1943e92 100644 --- a/cub/test/catch2_test_device_three_way_partition.cu +++ b/cub/test/catch2_test_device_three_way_partition.cu @@ -427,7 +427,7 @@ try { using offset_t = typename c2h::get<0, TestType>; - const offset_t num_items_max = detail::make_large_offset(); + const auto num_items_max = detail::make_large_offset(); const offset_t num_items_min = num_items_max > 10000 ? num_items_max - 10000ULL : offset_t{0}; const offset_t num_items = GENERATE_COPY( values( diff --git a/cub/test/catch2_test_device_topk_keys.cu b/cub/test/catch2_test_device_topk_keys.cu index a0410883d89..7b47493466d 100644 --- a/cub/test/catch2_test_device_topk_keys.cu +++ b/cub/test/catch2_test_device_topk_keys.cu @@ -161,7 +161,7 @@ try using comparator_t = direction_to_comparator_t; // Clamp 64-bit offset type problem sizes to just slightly larger than 2^32 items - const num_items_t num_items_max = detail::make_large_offset(); + const auto num_items_max = detail::make_large_offset(); const num_items_t num_items_min = num_items_max > 10000 ? num_items_max - 10000ULL : num_items_t{0}; const num_items_t num_items = GENERATE_COPY(values({num_items_max, static_cast(num_items_max - 1), num_items_t{1}, num_items_t{3}}), @@ -277,7 +277,7 @@ try using comparator_t = direction_to_comparator_t; // Set input size - const num_items_t num_items = detail::make_large_offset(); + const auto num_items = detail::make_large_offset(); // Set the k value const auto limit_k = static_cast( diff --git a/cub/test/catch2_test_device_topk_pairs.cu b/cub/test/catch2_test_device_topk_pairs.cu index af51baa3858..ea99262ba72 100644 --- a/cub/test/catch2_test_device_topk_pairs.cu +++ b/cub/test/catch2_test_device_topk_pairs.cu @@ -228,7 +228,7 @@ C2H_TEST("DeviceTopK::MaxPairs: Test for large num_items", "[pairs][topk][device // Set input size // Clamp 64-bit offset type problem sizes to just slightly larger than 2^32 items - const num_items_t num_items_max = detail::make_large_offset(); + const auto num_items_max = detail::make_large_offset(); const num_items_t num_items_min = num_items_max > 10000 ? num_items_max - 10000ULL : num_items_t{0}; const num_items_t num_items = GENERATE_COPY(values({num_items_max, static_cast(num_items_max - 1), num_items_t{1}, num_items_t{3}}), diff --git a/cub/test/catch2_test_device_transform.cu b/cub/test/catch2_test_device_transform.cu index 32b29a1d796..e0d5abe4e49 100644 --- a/cub/test/catch2_test_device_transform.cu +++ b/cub/test/catch2_test_device_transform.cu @@ -86,7 +86,7 @@ C2H_TEST("DeviceTransform::Transform with multiple inputs works for large number { using offset_t = c2h::get<0, TestType>; CAPTURE(c2h::type_name()); - const offset_t num_items = detail::make_large_offset(); + const auto num_items = detail::make_large_offset(); auto a_it = cuda::counting_iterator(offset_t{0}); auto b_it = cuda::constant_iterator(offset_t{42}); @@ -119,7 +119,7 @@ try using offset_t = c2h::get<0, TestType>; // make size a few thread blocks below/beyond 4GiB. 
need to make sure I32 num_items stays below 2^31 - constexpr offset_t num_items = static_cast((1ll << 31) + (sizeof(offset_t) == 4 ? -123456 : 123456)); + constexpr auto num_items = static_cast((1ll << 31) + (sizeof(offset_t) == 4 ? -123456 : 123456)); REQUIRE(num_items > 0); c2h::device_vector input(static_cast(num_items), thrust::no_init); diff --git a/cub/test/test_device_batch_copy.cu b/cub/test/test_device_batch_copy.cu index bba9b0eec7c..ff7199bfa61 100644 --- a/cub/test/test_device_batch_copy.cu +++ b/cub/test/test_device_batch_copy.cu @@ -55,7 +55,7 @@ template c2h::host_vector GetShuffledRangeOffsets(const c2h::host_vector& range_sizes, const std::uint_fast32_t seed = 320981U) { - RangeOffsetT num_ranges = static_cast(range_sizes.size()); + auto num_ranges = static_cast(range_sizes.size()); // We're remapping the i-th range to pmt_idxs[i] std::mt19937 rng(seed); @@ -380,12 +380,12 @@ int main(int argc, char** argv) for (const auto& size_range : size_ranges) { // The most granular type being copied. - using AtomicCopyT = int64_t; - RangeSizeT min_range_size = static_cast(cuda::round_up(size_range.first, sizeof(AtomicCopyT))); - RangeSizeT max_range_size = + using AtomicCopyT = int64_t; + auto min_range_size = static_cast(cuda::round_up(size_range.first, sizeof(AtomicCopyT))); + auto max_range_size = static_cast(cuda::round_up(size_range.second, static_cast(sizeof(AtomicCopyT)))); - double average_range_size = (min_range_size + max_range_size) / 2.0; - RangeOffsetT target_num_ranges = static_cast(target_copy_size / average_range_size); + double average_range_size = (min_range_size + max_range_size) / 2.0; + auto target_num_ranges = static_cast(target_copy_size / average_range_size); // Run tests with output ranges being consecutive RunTest( @@ -399,12 +399,12 @@ int main(int argc, char** argv) for (const auto& size_range : size_ranges) { // The most granular type being copied. 
- using AtomicCopyT = cuda::std::tuple; - RangeSizeT min_range_size = static_cast(cuda::round_up(size_range.first, sizeof(AtomicCopyT))); - RangeSizeT max_range_size = + using AtomicCopyT = cuda::std::tuple; + auto min_range_size = static_cast(cuda::round_up(size_range.first, sizeof(AtomicCopyT))); + auto max_range_size = static_cast(cuda::round_up(size_range.second, static_cast(sizeof(AtomicCopyT)))); - double average_range_size = (min_range_size + max_range_size) / 2.0; - RangeOffsetT target_num_ranges = static_cast(target_copy_size / average_range_size); + double average_range_size = (min_range_size + max_range_size) / 2.0; + auto target_num_ranges = static_cast(target_copy_size / average_range_size); // Run tests with output ranges being consecutive RunTest( diff --git a/cub/test/test_util.h b/cub/test/test_util.h index 8ac38867f49..7d1b526c3ad 100644 --- a/cub/test/test_util.h +++ b/cub/test/test_util.h @@ -421,7 +421,7 @@ template T RandomValue(T max) { unsigned int bits; - unsigned int max_int = (unsigned int) -1; + auto max_int = (unsigned int) -1; do { RandomBits(bits); diff --git a/libcudacxx/include/cuda/__launch/host_launch.h b/libcudacxx/include/cuda/__launch/host_launch.h index 794e873ea02..3100a653944 100644 --- a/libcudacxx/include/cuda/__launch/host_launch.h +++ b/libcudacxx/include/cuda/__launch/host_launch.h @@ -100,9 +100,8 @@ _CCCL_HOST_API void host_launch(stream_ref __stream, _Callable __callable, _Args } else { - using _CallbackData = __stream_callback_data<_Callable, _Args...>; - _CallbackData* __callback_data_ptr = - new _CallbackData{::cuda::std::move(__callable), {::cuda::std::move(__args)...}}; + using _CallbackData = __stream_callback_data<_Callable, _Args...>; + auto* __callback_data_ptr = new _CallbackData{::cuda::std::move(__callable), {::cuda::std::move(__args)...}}; // We use the callback here to have it execute even on stream error, because it needs to free the above allocation ::cuda::__driver::__streamAddCallback( diff --git a/libcudacxx/include/cuda/__random/feistel_bijection.h b/libcudacxx/include/cuda/__random/feistel_bijection.h index 91aa3ac2861..8a4be8f1eba 100644 --- a/libcudacxx/include/cuda/__random/feistel_bijection.h +++ b/libcudacxx/include/cuda/__random/feistel_bijection.h @@ -81,14 +81,14 @@ class __feistel_bijection { // Mitchell, Rory, et al. "Bandwidth-optimal random shuffling for GPUs." ACM Transactions on Parallel Computing 9.1 // (2022): 1-20. 
- uint32_t __L = static_cast(__val >> __R_bits_); - uint32_t __R = static_cast(__val & __R_mask_); - for (uint32_t __i = 0; __i < __num_rounds; __i++) + auto __L = static_cast(__val >> __R_bits_); + auto __R = static_cast(__val & __R_mask_); + for (const auto __key : __keys_) { constexpr uint64_t __m0 = 0xD2B74407B1CE6E93; const uint64_t __product = __m0 * __L; - uint32_t __F_k = (__product >> 32) ^ __keys_[__i]; - uint32_t __B_k = static_cast(__product); + uint32_t __F_k = (__product >> 32) ^ __key; + auto __B_k = static_cast(__product); uint32_t __L_prime = __F_k ^ __R; uint32_t __R_prime = (__B_k << (__R_bits_ - __L_bits_)) | __R >> __L_bits_; diff --git a/libcudacxx/include/cuda/std/__algorithm/copy_n.h b/libcudacxx/include/cuda/std/__algorithm/copy_n.h index 8556ec6f9ff..d3b99755b4b 100644 --- a/libcudacxx/include/cuda/std/__algorithm/copy_n.h +++ b/libcudacxx/include/cuda/std/__algorithm/copy_n.h @@ -39,7 +39,7 @@ _CCCL_API inline _CCCL_CONSTEXPR_CXX20 _OutputIterator copy_n(_InputIterator __first, _Size __orig_n, _OutputIterator __result) { using _IntegralSize = decltype(__convert_to_integral(__orig_n)); - _IntegralSize __n = static_cast<_IntegralSize>(__orig_n); + auto __n = static_cast<_IntegralSize>(__orig_n); if (__n > 0) { *__result = *__first; @@ -62,7 +62,7 @@ template (__orig_n); + auto __n = static_cast<_IntegralSize>(__orig_n); return ::cuda::std::copy(__first, __first + __n, __result); } diff --git a/libcudacxx/include/cuda/std/__algorithm/generate_n.h b/libcudacxx/include/cuda/std/__algorithm/generate_n.h index 233c4566d98..54e5896c066 100644 --- a/libcudacxx/include/cuda/std/__algorithm/generate_n.h +++ b/libcudacxx/include/cuda/std/__algorithm/generate_n.h @@ -31,7 +31,7 @@ template _CCCL_API constexpr _OutputIterator generate_n(_OutputIterator __first, _Size __orig_n, _Generator __gen) { using _IntegralSize = decltype(__convert_to_integral(__orig_n)); - _IntegralSize __n = static_cast<_IntegralSize>(__orig_n); + auto __n = static_cast<_IntegralSize>(__orig_n); for (; __n > 0; ++__first, (void) --__n) { *__first = __gen(); diff --git a/libcudacxx/include/cuda/std/__algorithm/search_n.h b/libcudacxx/include/cuda/std/__algorithm/search_n.h index cba94f2ecac..154b4424c51 100644 --- a/libcudacxx/include/cuda/std/__algorithm/search_n.h +++ b/libcudacxx/include/cuda/std/__algorithm/search_n.h @@ -95,7 +95,7 @@ template (__last - __first); + auto __len = static_cast<_Size>(__last - __first); if (__len < __count) { return __last; diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index 7ebca48711f..f816aa91a58 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -55,7 +55,7 @@ _CCCL_DEVICE static void __cuda_atomic_load(const _Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) { constexpr uint64_t __alignmask = (sizeof(uint16_t) - 1); - uint16_t* __aligned = (uint16_t*) ((intptr_t) __ptr & (~__alignmask)); + auto* __aligned = (uint16_t*) ((intptr_t) __ptr & (~__alignmask)); const uint8_t __offset = uint16_t((intptr_t) __ptr & __alignmask) * 8; uint16_t __value = 0; @@ -70,7 +70,7 @@ __cuda_atomic_compare_exchange(_Type* __ptr, _Type& __dst, _Type __cmp, _Type __ { constexpr uint64_t __alignmask = (sizeof(uint32_t) - 1); constexpr uint32_t __sizemask = (1 << (sizeof(_Type) * 8)) - 1; - uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & (~__alignmask)); + 
auto* __aligned = (uint32_t*) ((intptr_t) __ptr & (~__alignmask)); const uint8_t __offset = uint32_t((intptr_t) __ptr & __alignmask) * 8; const uint32_t __valueMask = __sizemask << __offset; const uint32_t __windowMask = ~__valueMask; @@ -115,7 +115,7 @@ _CCCL_DEVICE_API _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, { constexpr uint64_t __alignmask = (sizeof(uint32_t) - 1); constexpr uint32_t __sizemask = (1 << (sizeof(_Type) * 8)) - 1; - uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & (~__alignmask)); + auto* __aligned = (uint32_t*) ((intptr_t) __ptr & (~__alignmask)); const uint8_t __offset = uint8_t((intptr_t) __ptr & __alignmask) * 8; const uint32_t __valueMask = __sizemask << __offset; const uint32_t __windowMask = ~__valueMask; diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h index 479815f4136..ab4b578f7d7 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h @@ -931,8 +931,8 @@ static inline _CCCL_DEVICE void __atomic_load_cuda(const _Type* __ptr, _Type& __ { using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; - const __proxy_t* __ptr_proxy = reinterpret_cast(__ptr); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + const auto* __ptr_proxy = reinterpret_cast(__ptr); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); if (__cuda_load_weak_if_local(__ptr_proxy, __dst_proxy, sizeof(__proxy_t))) {{return;}} __cuda_atomic_bind_load<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_load{__ptr_proxy, __dst_proxy}; __cuda_atomic_load_memory_order_dispatch(__bound_load, __memorder, _Sco{}); @@ -942,8 +942,8 @@ static inline _CCCL_DEVICE void __atomic_load_cuda(const _Type volatile* __ptr, { using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; - const __proxy_t* __ptr_proxy = reinterpret_cast(const_cast<_Type*>(__ptr)); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + const auto* __ptr_proxy = reinterpret_cast(const_cast<_Type*>(__ptr)); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); if (__cuda_load_weak_if_local(__ptr_proxy, __dst_proxy, sizeof(__proxy_t))) {{return;}} __cuda_atomic_bind_load<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_load{__ptr_proxy, __dst_proxy}; __cuda_atomic_load_memory_order_dispatch(__bound_load, __memorder, _Sco{}); @@ -1364,8 +1364,8 @@ static inline _CCCL_DEVICE void __atomic_store_cuda(_Type* __ptr, _Type& __val, { using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); - __proxy_t* __val_proxy = reinterpret_cast<__proxy_t*>(&__val); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + auto* __val_proxy = reinterpret_cast<__proxy_t*>(&__val); if (__cuda_store_weak_if_local(__ptr_proxy, __val_proxy, sizeof(__proxy_t))) {{return;}} __cuda_atomic_bind_store<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_store{__ptr_proxy, __val_proxy}; __cuda_atomic_store_memory_order_dispatch(__bound_store, __memorder, _Sco{}); @@ -1375,8 +1375,8 @@ static inline _CCCL_DEVICE void __atomic_store_cuda(volatile 
_Type* __ptr, _Type { using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); - __proxy_t* __val_proxy = reinterpret_cast<__proxy_t*>(&__val); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + auto* __val_proxy = reinterpret_cast<__proxy_t*>(&__val); if (__cuda_store_weak_if_local(__ptr_proxy, __val_proxy, sizeof(__proxy_t))) {{return;}} __cuda_atomic_bind_store<__proxy_t, __proxy_tag, _Sco, __atomic_cuda_mmio_disable> __bound_store{__ptr_proxy, __val_proxy}; __cuda_atomic_store_memory_order_dispatch(__bound_store, __memorder, _Sco{}); @@ -1969,9 +1969,9 @@ static inline _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type* __ptr, _Ty { using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); - __proxy_t* __exp_proxy = reinterpret_cast<__proxy_t*>(__exp); - __proxy_t* __des_proxy = reinterpret_cast<__proxy_t*>(&__des); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + auto* __exp_proxy = reinterpret_cast<__proxy_t*>(__exp); + auto* __des_proxy = reinterpret_cast<__proxy_t*>(&__des); bool __res = false; if (__cuda_compare_exchange_weak_if_local(__ptr_proxy, __exp_proxy, __des_proxy, &__res)) {return __res;} __cuda_atomic_bind_compare_exchange<__proxy_t, __proxy_tag, _Sco> __bound_compare_swap{__ptr_proxy, __exp_proxy, __des_proxy}; @@ -1982,9 +1982,9 @@ static inline _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type volatile* _ { using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); - __proxy_t* __exp_proxy = reinterpret_cast<__proxy_t*>(__exp); - __proxy_t* __des_proxy = reinterpret_cast<__proxy_t*>(&__des); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + auto* __exp_proxy = reinterpret_cast<__proxy_t*>(__exp); + auto* __des_proxy = reinterpret_cast<__proxy_t*>(&__des); bool __res = false; if (__cuda_compare_exchange_weak_if_local(__ptr_proxy, __exp_proxy, __des_proxy, &__res)) {return __res;} __cuda_atomic_bind_compare_exchange<__proxy_t, __proxy_tag, _Sco> __bound_compare_swap{__ptr_proxy, __exp_proxy, __des_proxy}; @@ -2576,9 +2576,9 @@ static inline _CCCL_DEVICE void __atomic_exchange_cuda(_Type* __ptr, _Type& __ol { using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); - __proxy_t* __old_proxy = reinterpret_cast<__proxy_t*>(&__old); - __proxy_t* __new_proxy = reinterpret_cast<__proxy_t*>(&__new); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + auto* __old_proxy = reinterpret_cast<__proxy_t*>(&__old); + auto* __new_proxy = reinterpret_cast<__proxy_t*>(&__new); if(__cuda_exchange_weak_if_local(__ptr_proxy, __new_proxy, __old_proxy)) {{return;}} __cuda_atomic_bind_exchange<__proxy_t, __proxy_tag, _Sco> __bound_swap{__ptr_proxy, __old_proxy, __new_proxy}; __cuda_atomic_exchange_memory_order_dispatch(__bound_swap, __memorder, _Sco{}); @@ -2588,9 +2588,9 @@ static inline _CCCL_DEVICE void __atomic_exchange_cuda(_Type volatile* __ptr, _T { using __proxy_t 
= typename __atomic_cuda_deduce_bitwise<_Type>::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); - __proxy_t* __old_proxy = reinterpret_cast<__proxy_t*>(&__old); - __proxy_t* __new_proxy = reinterpret_cast<__proxy_t*>(&__new); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + auto* __old_proxy = reinterpret_cast<__proxy_t*>(&__old); + auto* __new_proxy = reinterpret_cast<__proxy_t*>(&__new); if(__cuda_exchange_weak_if_local(__ptr_proxy, __new_proxy, __old_proxy)) {{return;}} __cuda_atomic_bind_exchange<__proxy_t, __proxy_tag, _Sco> __bound_swap{__ptr_proxy, __old_proxy, __new_proxy}; __cuda_atomic_exchange_memory_order_dispatch(__bound_swap, __memorder, _Sco{}); @@ -3044,9 +3044,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_arithmetic<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_add_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_add<__proxy_t, __proxy_tag, _Sco> __bound_add{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_add, __memorder, _Sco{}); @@ -3060,9 +3060,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_arithmetic<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_add_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_add<__proxy_t, __proxy_tag, _Sco> __bound_add{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_add, __memorder, _Sco{}); @@ -3249,9 +3249,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_and_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_and<__proxy_t, __proxy_tag, _Sco> __bound_and{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_and, __memorder, _Sco{}); @@ -3265,9 +3265,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = 
reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_and_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_and<__proxy_t, __proxy_tag, _Sco> __bound_and{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_and, __memorder, _Sco{}); @@ -3614,9 +3614,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_max_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_max<__proxy_t, __proxy_tag, _Sco> __bound_max{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_max, __memorder, _Sco{}); @@ -3630,9 +3630,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_max_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_max<__proxy_t, __proxy_tag, _Sco> __bound_max{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_max, __memorder, _Sco{}); @@ -3979,9 +3979,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_min_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_min<__proxy_t, __proxy_tag, _Sco> __bound_min{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_min, __memorder, _Sco{}); @@ -3995,9 +3995,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_min_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_min<__proxy_t, __proxy_tag, _Sco> __bound_min{__ptr_proxy, __dst_proxy, __op_proxy}; 
__cuda_atomic_fetch_memory_order_dispatch(__bound_min, __memorder, _Sco{}); @@ -4184,9 +4184,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_or_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_or<__proxy_t, __proxy_tag, _Sco> __bound_or{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_or, __memorder, _Sco{}); @@ -4200,9 +4200,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_or_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_or<__proxy_t, __proxy_tag, _Sco> __bound_or{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_or, __memorder, _Sco{}); @@ -4389,9 +4389,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_xor_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_xor<__proxy_t, __proxy_tag, _Sco> __bound_xor{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_xor, __memorder, _Sco{}); @@ -4405,9 +4405,9 @@ template ::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; _Type __dst{}; - __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); - __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); - __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + auto* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + auto* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + auto* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); if (__cuda_fetch_xor_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) {return __dst;} __cuda_atomic_bind_fetch_xor<__proxy_t, __proxy_tag, _Sco> __bound_xor{__ptr_proxy, __dst_proxy, __op_proxy}; __cuda_atomic_fetch_memory_order_dispatch(__bound_xor, __memorder, _Sco{}); diff --git a/libcudacxx/include/cuda/std/__atomic/functions/host.h b/libcudacxx/include/cuda/std/__atomic/functions/host.h index 0a20a333ff8..1a6c594aad9 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/host.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/host.h @@ -63,8 +63,7 @@ struct 
_CCCL_ALIGNAS(sizeof(_Tp)) __atomic_alignment_wrapper template __atomic_alignment_wrapper<_Tp>* __atomic_force_align_host(_Tp* __a) { - __atomic_alignment_wrapper<_Tp>* __w = - reinterpret_cast<__atomic_alignment_wrapper<_Tp>*>(const_cast*>(__a)); + auto* __w = reinterpret_cast<__atomic_alignment_wrapper<_Tp>*>(const_cast*>(__a)); return __w; } diff --git a/libcudacxx/include/cuda/std/__complex/exponential_functions.h b/libcudacxx/include/cuda/std/__complex/exponential_functions.h index 655c3cdf062..c7b4fe24051 100644 --- a/libcudacxx/include/cuda/std/__complex/exponential_functions.h +++ b/libcudacxx/include/cuda/std/__complex/exponential_functions.h @@ -201,7 +201,7 @@ _CCCL_API inline complex exp(const complex& __x) __exp_r_ldexp_factor = -151.0f; } - const int32_t __ans_ldexp_factor = static_cast(__exp_r_ldexp_factor); + const auto __ans_ldexp_factor = static_cast(__exp_r_ldexp_factor); // Split this j up into four parts to fit it into four float exponents's. // (Splitting j in 4 better than in 3). @@ -212,8 +212,8 @@ _CCCL_API inline complex exp(const complex& __x) __ans_ldexp_factor_quarter = (__ans_ldexp_factor_quarter + 127) << 23; __ans_ldexp_factor_remainder = (__ans_ldexp_factor_remainder + 127) << 23; - const float __ldexp_factor_1 = ::cuda::std::bit_cast(__ans_ldexp_factor_quarter); - const float __ldexp_factor_2 = ::cuda::std::bit_cast(__ans_ldexp_factor_remainder); + const auto __ldexp_factor_1 = ::cuda::std::bit_cast(__ans_ldexp_factor_quarter); + const auto __ldexp_factor_2 = ::cuda::std::bit_cast(__ans_ldexp_factor_remainder); // Need to order our multiplications to avoid intermediate under/overflow, including when __sin_r is denormal. // Experiment suggests this is (one of) the better ways to do it, there's not that many combinations that work for all @@ -287,7 +287,7 @@ _CCCL_API inline complex exp(const complex& __x) __exp_r_ldexp_factor = -1076.0; } - const int64_t __ans_ldexp_factor = static_cast(__exp_r_ldexp_factor); + const auto __ans_ldexp_factor = static_cast(__exp_r_ldexp_factor); // Split this j up into four parts to fit it into four float exponents's. // (Splitting j in 4 better than in 3). @@ -298,8 +298,8 @@ _CCCL_API inline complex exp(const complex& __x) __ans_ldexp_factor_quarter = (__ans_ldexp_factor_quarter + 1023) << 52; __ans_ldexp_factor_remainder = (__ans_ldexp_factor_remainder + 1023) << 52; - const double __ldexp_factor_1 = ::cuda::std::bit_cast(__ans_ldexp_factor_quarter); - const double __ldexp_factor_2 = ::cuda::std::bit_cast(__ans_ldexp_factor_remainder); + const auto __ldexp_factor_1 = ::cuda::std::bit_cast(__ans_ldexp_factor_quarter); + const auto __ldexp_factor_2 = ::cuda::std::bit_cast(__ans_ldexp_factor_remainder); // Need to order our multiplications to avoid intermediate under/overflow, including when __sin_r is denormal. // Experiment suggests this is (one of) the better ways to do it, there's not that many combinations that work for all diff --git a/libcudacxx/include/cuda/std/__complex/inverse_hyperbolic_functions.h b/libcudacxx/include/cuda/std/__complex/inverse_hyperbolic_functions.h index b72ad2fa576..cd2473f881c 100644 --- a/libcudacxx/include/cuda/std/__complex/inverse_hyperbolic_functions.h +++ b/libcudacxx/include/cuda/std/__complex/inverse_hyperbolic_functions.h @@ -195,7 +195,7 @@ template // but not small enough that the asinh(x) ~ log(2x) estimate does // not break down. 
We are not able to reduce this with a single simple reduction, // so we do a fast/inlined frexp/ldexp: - const int32_t __exp_biased = static_cast(::cuda::std::__fp_get_storage(__max) >> __mant_nbits); + const auto __exp_biased = static_cast(::cuda::std::__fp_get_storage(__max) >> __mant_nbits); // Get a factor such that (__max * __exp_mul_factor) <= __max_allowed_exponent const __uint_t __exp_reduce_factor = @@ -491,7 +491,7 @@ template // but not small enough that the acosh(x) ~ log(2x) estimate does // not break down. We are not able to reduce this with a single simple reduction, // so we do a fast/inlined frexp/ldexp: - const int32_t __exp_biased = static_cast(::cuda::std::__fp_get_storage(__max) >> __mant_nbits); + const auto __exp_biased = static_cast(::cuda::std::__fp_get_storage(__max) >> __mant_nbits); // Get a factor such that (__max * __exp_mul_factor) <= __max_allowed_exponent const __uint_t __exp_reduce_factor = diff --git a/libcudacxx/include/cuda/std/__iterator/advance.h b/libcudacxx/include/cuda/std/__iterator/advance.h index 10ad3a20f0e..ebdc0c1d298 100644 --- a/libcudacxx/include/cuda/std/__iterator/advance.h +++ b/libcudacxx/include/cuda/std/__iterator/advance.h @@ -41,7 +41,7 @@ template ::difference_type; - _Difference __n = static_cast<_Difference>(::cuda::std::__convert_to_integral(__orig_n)); + auto __n = static_cast<_Difference>(::cuda::std::__convert_to_integral(__orig_n)); if constexpr (__has_random_access_traversal<_InputIter>) // To support pointers to incomplete types { __i += __n; diff --git a/libcudacxx/include/cuda/std/__memory/align.h b/libcudacxx/include/cuda/std/__memory/align.h index 678a4a266f3..522ac0f5a9a 100644 --- a/libcudacxx/include/cuda/std/__memory/align.h +++ b/libcudacxx/include/cuda/std/__memory/align.h @@ -46,7 +46,7 @@ _CCCL_API inline void* align(size_t __alignment, size_t __size, void*& __ptr, si char* __char_ptr = static_cast(__ptr); char* __aligned_ptr = reinterpret_cast( // NOLINT(performance-no-int-to-ptr) reinterpret_cast(__char_ptr + (__alignment - 1)) & -__alignment); - const size_t __diff = static_cast(__aligned_ptr - __char_ptr); + const auto __diff = static_cast(__aligned_ptr - __char_ptr); if (__diff > (__space - __size)) { return nullptr; diff --git a/libcudacxx/include/cuda/std/__new/allocate.h b/libcudacxx/include/cuda/std/__new/allocate.h index a52ddeef8ca..bed9a8fe825 100644 --- a/libcudacxx/include/cuda/std/__new/allocate.h +++ b/libcudacxx/include/cuda/std/__new/allocate.h @@ -79,7 +79,7 @@ _CCCL_API inline void* __cccl_allocate(size_t __size, [[maybe_unused]] size_t __ #if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() if (::cuda::std::__is_overaligned_for_new(__align)) { - const align_val_t __align_val = static_cast(__align); + const auto __align_val = static_cast(__align); return ::cuda::std::__cccl_operator_new(__size, __align_val); } #endif // _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() @@ -101,7 +101,7 @@ _CCCL_API inline void __cccl_deallocate(void* __ptr, size_t __size, [[maybe_unus #if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() if (::cuda::std::__is_overaligned_for_new(__align)) { - const align_val_t __align_val = static_cast(__align); + const auto __align_val = static_cast(__align); return ::cuda::std::__do_deallocate_handle_size(__ptr, __size, __align_val); } #endif // _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() @@ -113,7 +113,7 @@ _CCCL_API inline void __cccl_deallocate_unsized(void* __ptr, [[maybe_unused]] si #if _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() if (::cuda::std::__is_overaligned_for_new(__align)) { - const align_val_t __align_val = 
static_cast(__align); + const auto __align_val = static_cast(__align); return ::cuda::std::__cccl_operator_delete(__ptr, __align_val); } #endif // _LIBCUDACXX_HAS_ALIGNED_ALLOCATION() diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/managed_memory_resource.cu b/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/managed_memory_resource.cu index f7054e2def5..b006d9b9398 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/managed_memory_resource.cu +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/managed_memory_resource.cu @@ -96,7 +96,7 @@ C2H_CCCLRT_TEST_LIST("managed_memory_resource allocation", "[memory_resource]", return; } #endif // _CCCL_CTK_AT_LEAST(13, 0) - managed_resource res = get_resource(); + auto res = get_resource(); cuda::stream stream{cuda::device_ref{0}}; { // allocate_sync / deallocate_sync @@ -188,9 +188,9 @@ C2H_CCCLRT_TEST_LIST("managed_memory_resource comparison", "[memory_resource]", return; } #endif // _CCCL_CTK_AT_LEAST(13, 0) - managed_resource first = get_resource(); + auto first = get_resource(); { // comparison against a plain managed_memory_resource - managed_resource second = get_resource(); + auto second = get_resource(); CHECK((first == second)); CHECK(!(first != second)); } @@ -212,7 +212,7 @@ C2H_CCCLRT_TEST_LIST("managed_memory_resource comparison", "[memory_resource]", #endif // _CCCL_CTK_AT_LEAST(13, 0) { // comparison against a managed_memory_resource wrapped inside a synchronous_resource_ref - managed_resource second = get_resource(); + auto second = get_resource(); cuda::mr::synchronous_resource_ref<::cuda::mr::device_accessible> second_ref{second}; CHECK((first == second_ref)); CHECK(!(first != second_ref)); diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/memory_pools.cu b/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/memory_pools.cu index 09a00a57b80..48da4546bb2 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/memory_pools.cu +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/memory_pools.cu @@ -138,7 +138,7 @@ C2H_CCCLRT_TEST_LIST("device_memory_pool construction", "[memory_resource]", TES using memory_pool = TestType; SECTION("Construct from device id") { - memory_pool from_device = construct_pool(current_device); + auto from_device = construct_pool(current_device); ::cudaMemPool_t get = from_device.get(); CHECK(get != current_default_pool); @@ -156,7 +156,7 @@ C2H_CCCLRT_TEST_LIST("device_memory_pool construction", "[memory_resource]", TES SECTION("Construct with empty properties") { cuda::memory_pool_properties props{}; - memory_pool from_defaulted_properties = construct_pool(current_device, props); + auto from_defaulted_properties = construct_pool(current_device, props); ::cudaMemPool_t get = from_defaulted_properties.get(); CHECK(get != current_default_pool); @@ -174,7 +174,7 @@ C2H_CCCLRT_TEST_LIST("device_memory_pool construction", "[memory_resource]", TES SECTION("Construct with initial pool size") { cuda::memory_pool_properties props = {20, 42}; - memory_pool with_threshold = construct_pool(current_device, props); + auto with_threshold = construct_pool(current_device, props); ::cudaMemPool_t get = with_threshold.get(); CHECK(get != current_default_pool); @@ -261,7 +261,7 @@ C2H_CCCLRT_TEST_LIST("base_memory_pool construction", "[memory_resource]", TEST_ else # endif // _CCCL_CTK_AT_LEAST(13, 0) { - memory_pool with_max_pool_size = construct_pool(current_device, props); + auto with_max_pool_size = 
construct_pool(current_device, props); ::cudaMemPool_t get = with_max_pool_size.get(); CHECK(get != current_default_pool); @@ -333,9 +333,9 @@ C2H_CCCLRT_TEST_LIST("device_memory_pool comparison", "[memory_resource]", TEST_ } using memory_pool = TestType; - memory_pool first = construct_pool(current_device); + auto first = construct_pool(current_device); { // comparison against a plain device_memory_pool - memory_pool second = construct_pool(current_device); + auto second = construct_pool(current_device); CHECK(first == first); CHECK(first != second); } @@ -357,7 +357,7 @@ C2H_CCCLRT_TEST_LIST("device_memory_pool accessors", "[memory_resource]", TEST_T using memory_resource = typename memory_pool::reference_type; SECTION("device_memory_pool::set_attribute") { - memory_pool pool = construct_pool(current_device); + auto pool = construct_pool(current_device); { // cudaMemPoolReuseFollowEventDependencies // Get the attribute value @@ -511,7 +511,7 @@ C2H_CCCLRT_TEST_LIST("device_memory_pool accessors", "[memory_resource]", TEST_T SECTION("device_memory_pool::trim_to") { - memory_pool pool = construct_pool(current_device); + auto pool = construct_pool(current_device); // prime the pool to a given size memory_resource resource{pool}; @@ -566,8 +566,8 @@ C2H_CCCLRT_TEST_LIST("device_memory_pool accessors", "[memory_resource]", TEST_T SECTION("memory_pool::as_ref") { - memory_pool pool = construct_pool(current_device); - auto ref = pool.as_ref(); + auto pool = construct_pool(current_device); + auto ref = pool.as_ref(); static_assert(!cuda::std::copyable); static_assert(cuda::std::copyable); CHECK(ref == pool); diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/pinned_memory_resource.cu b/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/pinned_memory_resource.cu index 56b5a7e7174..51d5a80e4fd 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/pinned_memory_resource.cu +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/pinned_memory_resource.cu @@ -83,7 +83,7 @@ C2H_CCCLRT_TEST_LIST("pinned_memory_resource allocation", "[memory_resource]", T return; } #endif // _CCCL_CTK_AT_LEAST(12, 9) - pinned_resource res = get_resource(); + auto res = get_resource(); cuda::stream stream{cuda::device_ref{0}}; { // allocate_sync / deallocate_sync @@ -202,15 +202,15 @@ static_assert(cuda::mr::synchronous_resource, ""); C2H_CCCLRT_TEST_LIST("pinned_memory_resource comparison", "[memory_resource]", TEST_TYPES) { using pinned_resource = TestType; - pinned_resource first = get_resource(); + auto first = get_resource(); { // comparison against a plain pinned_memory_resource - pinned_resource second = get_resource(); + auto second = get_resource(); CHECK((first == second)); CHECK(!(first != second)); } { // comparison against a pinned_memory_resource wrapped inside a synchronous_resource_ref - pinned_resource second = get_resource(); + auto second = get_resource(); cuda::mr::synchronous_resource_ref<::cuda::mr::device_accessible> const second_ref{second}; CHECK((first == second_ref)); @@ -221,7 +221,7 @@ C2H_CCCLRT_TEST_LIST("pinned_memory_resource comparison", "[memory_resource]", T if constexpr (cuda::mr::resource) { // comparison against a pinned_memory_resource wrapped inside a resource_ref - pinned_resource second = get_resource(); + auto second = get_resource(); cuda::mr::resource_ref<::cuda::mr::device_accessible> second_ref{second}; CHECK((first == second_ref)); diff --git a/thrust/examples/bucket_sort2d.cu b/thrust/examples/bucket_sort2d.cu index 
25256f83f2a..1568de2f31d 100644 --- a/thrust/examples/bucket_sort2d.cu +++ b/thrust/examples/bucket_sort2d.cu @@ -37,8 +37,8 @@ struct point_to_bucket_index __host__ __device__ unsigned int operator()(const vec2& v) const { // find the raster indices of p's bucket - unsigned int x = static_cast(cuda::std::get<0>(v) * width); - unsigned int y = static_cast(cuda::std::get<1>(v) * height); + auto x = static_cast(cuda::std::get<0>(v) * width); + auto y = static_cast(cuda::std::get<1>(v) * height); // return the bucket's linear index return y * width + x; diff --git a/thrust/testing/copy.cu b/thrust/testing/copy.cu index 73a6892d66e..d96a9488bf4 100644 --- a/thrust/testing/copy.cu +++ b/thrust/testing/copy.cu @@ -30,8 +30,8 @@ void TestCopyFromConstIterator() std::vector v{0, 1, 2, 3, 4}; - std::vector::const_iterator begin = v.begin(); - std::vector::const_iterator end = v.end(); + auto begin = v.begin(); + auto end = v.end(); // copy to host_vector thrust::host_vector h(5, (T) 10); @@ -196,7 +196,7 @@ void TestCopyListTo() ASSERT_EQUAL(l.size(), 5lu); - typename std::list::const_iterator iter = l.begin(); + auto iter = l.begin(); ASSERT_EQUAL(*iter, T(0)); iter++; ASSERT_EQUAL(*iter, T(1)); @@ -466,7 +466,7 @@ void TestCopyIfNonTrivial() std::fill(buffer.begin(), buffer.end(), static_cast(0)); object_with_non_trivial_ctor initialized; - object_with_non_trivial_ctor* uninitialized = reinterpret_cast(buffer.data()); + auto* uninitialized = reinterpret_cast(buffer.data()); object_with_non_trivial_ctor source(42); initialized = source; diff --git a/thrust/testing/copy_n.cu b/thrust/testing/copy_n.cu index 9ee3039a706..35707f654fc 100644 --- a/thrust/testing/copy_n.cu +++ b/thrust/testing/copy_n.cu @@ -18,7 +18,7 @@ void TestCopyNFromConstIterator() std::vector v{0, 1, 2, 3, 4}; - std::vector::const_iterator begin = v.begin(); + auto begin = v.begin(); // copy to host_vector thrust::host_vector h(5, (T) 10); @@ -150,7 +150,7 @@ void TestCopyNListTo() ASSERT_EQUAL(l.size(), 5lu); - typename std::list::const_iterator iter = l.begin(); + auto iter = l.begin(); ASSERT_EQUAL(*iter, T(0)); iter++; ASSERT_EQUAL(*iter, T(1)); diff --git a/thrust/testing/cuda/copy_if.cu b/thrust/testing/cuda/copy_if.cu index b423713e911..45c960bc07b 100644 --- a/thrust/testing/cuda/copy_if.cu +++ b/thrust/testing/cuda/copy_if.cu @@ -306,7 +306,7 @@ void TestCopyIfWithMagnitude(int magnitude) auto selected_out_end = thrust::copy_if(begin, end, copied_out.begin(), mod_n{match_every_nth}); // Ensure number of selected items are correct - offset_t num_selected_out = static_cast(::cuda::std::distance(copied_out.begin(), selected_out_end)); + auto num_selected_out = static_cast(::cuda::std::distance(copied_out.begin(), selected_out_end)); ASSERT_EQUAL(num_selected_out, expected_num_copied); copied_out.resize(expected_num_copied); @@ -343,7 +343,7 @@ void TestCopyIfStencilWithMagnitude(int magnitude) auto selected_out_end = thrust::copy_if(begin, end, stencil, copied_out.begin(), mod_n{match_every_nth}); // Ensure number of selected items are correct - offset_t num_selected_out = static_cast(::cuda::std::distance(copied_out.begin(), selected_out_end)); + auto num_selected_out = static_cast(::cuda::std::distance(copied_out.begin(), selected_out_end)); ASSERT_EQUAL(num_selected_out, expected_num_copied); copied_out.resize(expected_num_copied); diff --git a/thrust/testing/cuda/unique.cu b/thrust/testing/cuda/unique.cu index 04675a91a09..872f04973d4 100644 --- a/thrust/testing/cuda/unique.cu +++ b/thrust/testing/cuda/unique.cu @@ 
-389,8 +389,8 @@ void TestUniqueWithMagnitude(int magnitude) using offset_t = std::int64_t; using equality_op_t = div_n_equality_op; - offset_t run_length_of_equal_items = offset_t{10}; - equality_op_t equality_op = equality_op_t{run_length_of_equal_items}; + auto run_length_of_equal_items = offset_t{10}; + auto equality_op = equality_op_t{run_length_of_equal_items}; // Prepare input offset_t num_items = offset_t{1ull} << magnitude; @@ -403,7 +403,7 @@ void TestUniqueWithMagnitude(int magnitude) auto unique_out_end = thrust::unique_copy(begin, end, unique_out.begin(), equality_op); // Ensure number of selected items are correct - offset_t num_selected_out = static_cast(cuda::std::distance(unique_out.begin(), unique_out_end)); + auto num_selected_out = static_cast(cuda::std::distance(unique_out.begin(), unique_out_end)); ASSERT_EQUAL(num_selected_out, expected_num_unique); unique_out.resize(expected_num_unique); diff --git a/thrust/testing/zip_iterator_reduce_by_key.cu b/thrust/testing/zip_iterator_reduce_by_key.cu index acf066c8112..56b81b467e8 100644 --- a/thrust/testing/zip_iterator_reduce_by_key.cu +++ b/thrust/testing/zip_iterator_reduce_by_key.cu @@ -71,7 +71,7 @@ struct TestZipIteratorReduceByKey // The tests below get miscompiled on Tesla hw for 8b types #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA - if (const CUDATestDriver* driver = dynamic_cast(&UnitTestDriver::s_driver())) + if (const auto* driver = dynamic_cast(&UnitTestDriver::s_driver())) { if (typeid(T) == typeid(unittest::uint8_t) && driver->current_device_architecture() < 200) { diff --git a/thrust/thrust/detail/execute_with_allocator.h b/thrust/thrust/detail/execute_with_allocator.h index 233d1466a19..e05c6e983cd 100644 --- a/thrust/thrust/detail/execute_with_allocator.h +++ b/thrust/thrust/detail/execute_with_allocator.h @@ -37,7 +37,7 @@ get_temporary_buffer(thrust::detail::execute_with_allocator(::cuda::ceil_div(sizeof(T) * n, sizeof(value_type))); + const auto num_elements = static_cast(::cuda::ceil_div(sizeof(T) * n, sizeof(value_type))); void_pointer ptr = alloc_traits::allocate(system.get_allocator(), num_elements); @@ -58,7 +58,7 @@ _CCCL_HOST void return_temporary_buffer( size_type num_elements = ::cuda::ceil_div(sizeof(T) * n, sizeof(value_type)); - pointer to_ptr = thrust::reinterpret_pointer_cast(p); + auto to_ptr = thrust::reinterpret_pointer_cast(p); alloc_traits::deallocate(system.get_allocator(), to_ptr, num_elements); } } // namespace detail diff --git a/thrust/thrust/detail/random_bijection.h b/thrust/thrust/detail/random_bijection.h index 4c38808dc4d..124c5e3e9da 100644 --- a/thrust/thrust/detail/random_bijection.h +++ b/thrust/thrust/detail/random_bijection.h @@ -58,14 +58,14 @@ class feistel_bijection // We cannot use the above because thrust PRNG generators incorrectly implement URBG requirements. // Mitchell, Rory, et al. "Bandwidth-optimal random shuffling for GPUs." ACM Transactions on Parallel Computing 9.1 // (2022): 1-20. 
-    uint32_t L = static_cast<uint32_t>(val >> R_bits);
-    uint32_t R = static_cast<uint32_t>(val & R_mask);
-    for (uint32_t i = 0; i < num_rounds; i++)
+    auto L = static_cast<uint32_t>(val >> R_bits);
+    auto R = static_cast<uint32_t>(val & R_mask);
+    for (const auto k : key)
     {
       constexpr uint64_t m0 = 0xD2B74407B1CE6E93;
       const uint64_t product = m0 * L;
-      uint32_t F_k = (product >> 32) ^ key[i];
-      uint32_t B_k = static_cast<uint32_t>(product);
+      uint32_t F_k = (product >> 32) ^ k;
+      auto B_k = static_cast<uint32_t>(product);
       uint32_t L_prime = F_k ^ R;
       uint32_t R_prime = (B_k << (R_bits - L_bits)) | R >> L_bits;
diff --git a/thrust/thrust/detail/reference.h b/thrust/thrust/detail/reference.h
index 8f6ee12e6b5..142e26ad5a1 100644
--- a/thrust/thrust/detail/reference.h
+++ b/thrust/thrust/detail/reference.h
@@ -439,7 +439,7 @@ class reference
   template
   _CCCL_HOST_DEVICE value_type strip_const_get_value(System const& system) const
   {
-    System& non_const_system = const_cast<System&>(system);
+    auto& non_const_system = const_cast<System&>(system);
     using thrust::system::detail::generic::get_value;
     return get_value(thrust::detail::derived_cast(non_const_system), ptr);
@@ -464,7 +464,7 @@ class reference
   template
   _CCCL_HOST_DEVICE void strip_const_assign_value(System const& system, OtherPointer src) const
   {
-    System& non_const_system = const_cast<System&>(system);
+    auto& non_const_system = const_cast<System&>(system);
     using thrust::system::detail::generic::assign_value;
     assign_value(thrust::detail::derived_cast(non_const_system), ptr, src);
diff --git a/thrust/thrust/detail/temporary_buffer.h b/thrust/thrust/detail/temporary_buffer.h
index dbc61685e33..165712299d2 100644
--- a/thrust/thrust/detail/temporary_buffer.h
+++ b/thrust/thrust/detail/temporary_buffer.h
@@ -43,8 +43,7 @@ _CCCL_HOST_DEVICE ::cuda::std::pair, down_cast_pair(Pair p)
 {
   // XXX should use a hypothetical thrust::static_pointer_cast here
-  thrust::pointer ptr =
-    thrust::pointer(static_cast(thrust::raw_pointer_cast(p.first)));
+  auto ptr = thrust::pointer(static_cast(thrust::raw_pointer_cast(p.first)));
   using result_type = ::cuda::std::pair, typename thrust::pointer::difference_type>;
diff --git a/thrust/thrust/mr/pool.h b/thrust/thrust/mr/pool.h
index 12773e90dac..7d644934120 100644
--- a/thrust/thrust/mr/pool.h
+++ b/thrust/thrust/mr/pool.h
@@ -223,7 +223,7 @@ class unsynchronized_pool_resource final
       chunk_descriptor_ptr alloc = m_allocated;
       m_allocated = thrust::raw_reference_cast(*m_allocated).next;
-      void_ptr p = static_cast(
-        static_cast(static_cast(alloc)) - thrust::raw_reference_cast(*alloc).size);
+      auto p = static_cast(
+        static_cast(static_cast(alloc)) - thrust::raw_reference_cast(*alloc).size);
       m_upstream->do_deallocate(
         p, thrust::raw_reference_cast(*alloc).size + sizeof(chunk_descriptor), m_options.alignment);
@@ -237,7 +237,7 @@ class unsynchronized_pool_resource final
       oversized_block_descriptor desc = thrust::raw_reference_cast(*alloc);
-      void_ptr p = static_cast(static_cast(static_cast(alloc)) - desc.current_size);
+      auto p = static_cast(static_cast(static_cast(alloc)) - desc.current_size);
       m_upstream->do_deallocate(p, desc.size + sizeof(oversized_block_descriptor), desc.alignment);
     }
@@ -334,7 +334,7 @@ class unsynchronized_pool_resource final
     // no fitting cached block found; allocate a new one that's just up to the specs
     void_ptr allocated = m_upstream->do_allocate(bytes + sizeof(oversized_block_descriptor), alignment);
-    oversized_block_descriptor_ptr block =
+    auto block =
       static_cast(static_cast(static_cast(allocated) + bytes));
     oversized_block_descriptor desc;
@@ -397,7 +397,7 @@ class unsynchronized_pool_resource final
     std::size_t chunk_size = block_size * n;
     void_ptr allocated = m_upstream->do_allocate(chunk_size + sizeof(chunk_descriptor), m_options.alignment);
-    chunk_descriptor_ptr chunk =
+    auto chunk =
       static_cast(static_cast(static_cast(allocated) + chunk_size));
     chunk_descriptor chunk_desc;
@@ -408,7 +408,7 @@ class unsynchronized_pool_resource final
     for (std::size_t i = 0; i < n; ++i)
     {
-      block_descriptor_ptr block = static_cast(
-        static_cast(static_cast(allocated) + block_size * i + bytes));
+      auto block = static_cast(
+        static_cast(static_cast(allocated) + block_size * i + bytes));
       block_descriptor block_desc;
@@ -435,8 +435,7 @@ class unsynchronized_pool_resource final
     // the deallocated block is oversized and/or overaligned
     if (n > m_options.largest_block_size || alignment > m_options.alignment)
     {
-      oversized_block_descriptor_ptr block =
-        static_cast(static_cast(static_cast(p) + n));
+      auto block = static_cast(static_cast(static_cast(p) + n));
       oversized_block_descriptor desc = *block;
       assert(desc.current_size == n);
@@ -498,7 +497,7 @@ class unsynchronized_pool_resource final
     n = static_cast(1) << n_log2;
-    block_descriptor_ptr block = static_cast(static_cast(static_cast(p) + n));
+    auto block = static_cast(static_cast(static_cast(p) + n));
     block_descriptor desc;
     desc.next = bucket.free_list;
diff --git a/thrust/thrust/random/detail/normal_distribution_base.h b/thrust/thrust/random/detail/normal_distribution_base.h
index ebceb9b27bc..9df8ade382e 100644
--- a/thrust/thrust/random/detail/normal_distribution_base.h
+++ b/thrust/thrust/random/detail/normal_distribution_base.h
@@ -42,10 +42,10 @@ class normal_distribution_nvcc
     constexpr uint_type urng_range = UniformRandomNumberGenerator::max - UniformRandomNumberGenerator::min;
     // Constants for conversion
-    constexpr RealType S1 = static_cast<RealType>(1. / static_cast(urng_range));
+    constexpr auto S1 = static_cast<RealType>(1. / static_cast(urng_range));
     constexpr RealType S2 = S1 / 2;
-    RealType S3 = static_cast<RealType>(-1.4142135623730950488016887242097); // -sqrt(2)
+    auto S3 = static_cast<RealType>(-1.4142135623730950488016887242097); // -sqrt(2)
     // Get the integer value
     uint_type u = urng() - UniformRandomNumberGenerator::min;
@@ -118,7 +118,7 @@ class normal_distribution_portable
       m_valid = false;
     }
-    const RealType pi = RealType(3.14159265358979323846);
+    const auto pi = RealType(3.14159265358979323846);
     RealType result = m_cached_rho * (m_valid ? cos(RealType(2) * pi * m_r1) : sin(RealType(2) * pi * m_r1));
diff --git a/thrust/thrust/random/detail/uniform_int_distribution.inl b/thrust/thrust/random/detail/uniform_int_distribution.inl
index dddca6aa706..111bd751708 100644
--- a/thrust/thrust/random/detail/uniform_int_distribution.inl
+++ b/thrust/thrust/random/detail/uniform_int_distribution.inl
@@ -54,8 +54,8 @@ uniform_int_distribution::operator()(UniformRandomNumberGenerator& urng
   using float_type = typename thrust::detail::largest_available_float::type;
-  const float_type real_min(static_cast<float_type>(parm.first));
-  const float_type real_max(static_cast<float_type>(parm.second));
+  const auto real_min(static_cast<float_type>(parm.first));
+  const auto real_max(static_cast<float_type>(parm.second));
   // add one to the right end of the interval because it is half-open
   // XXX adding 1.0 to a potentially large floating point number seems like a bad idea
diff --git a/thrust/thrust/random/detail/uniform_real_distribution.inl b/thrust/thrust/random/detail/uniform_real_distribution.inl
index 3f026ecfc89..294ad316911 100644
--- a/thrust/thrust/random/detail/uniform_real_distribution.inl
+++ b/thrust/thrust/random/detail/uniform_real_distribution.inl
@@ -49,7 +49,7 @@ _CCCL_HOST_DEVICE typename uniform_real_distribution::result_type
 uniform_real_distribution::operator()(UniformRandomNumberGenerator& urng, const param_type& parm)
 {
   // call the urng & map its result to [0,1)
-  result_type result = static_cast<result_type>(urng() - UniformRandomNumberGenerator::min);
+  auto result = static_cast<result_type>(urng() - UniformRandomNumberGenerator::min);
   // adding one to the denominator ensures that the interval is half-open at 1.0
   // XXX adding 1.0 to a potentially large floating point number seems like a bad idea
diff --git a/thrust/thrust/system/cuda/detail/copy_if.h b/thrust/thrust/system/cuda/detail/copy_if.h
index 65533cb08c6..52b0f204d28 100644
--- a/thrust/thrust/system/cuda/detail/copy_if.h
+++ b/thrust/thrust/system/cuda/detail/copy_if.h
@@ -123,7 +123,7 @@ struct DispatchCopyIf
     }
     // Memory allocation for the number of selected output items
-    OffsetT* d_num_selected_out = thrust::detail::aligned_reinterpret_cast<OffsetT*>(allocations[1]);
+    auto* d_num_selected_out = thrust::detail::aligned_reinterpret_cast<OffsetT*>(allocations[1]);
     // Run algorithm
     status = cub::
@@ -166,7 +166,7 @@ THRUST_RUNTIME_FUNCTION OutputIt copy_if(
 {
   using size_type = thrust::detail::it_difference_t;
-  size_type num_items = static_cast<size_type>(::cuda::std::distance(first, last));
+  auto num_items = static_cast<size_type>(::cuda::std::distance(first, last));
   cudaError_t status = cudaSuccess;
   size_t temp_storage_bytes = 0;
diff --git a/thrust/thrust/system/cuda/detail/cross_system.h b/thrust/thrust/system/cuda/detail/cross_system.h
index e2f94b123a5..2014c5a2ec5 100644
--- a/thrust/thrust/system/cuda/detail/cross_system.h
+++ b/thrust/thrust/system/cuda/detail/cross_system.h
@@ -200,8 +200,8 @@ _CCCL_HOST_DEVICE auto select_device_system(thrust::cuda::execution_policy
 _CCCL_HOST_DEVICE cross_system select_system(
   execution_policy const& sys1, thrust::cpp::execution_policy const& sys2)
 {
-  thrust::execution_policy& non_const_sys1 = const_cast&>(sys1);
-  thrust::cpp::execution_policy& non_const_sys2 = const_cast&>(sys2);
+  thrust::execution_policy& non_const_sys1 = const_cast&>(sys1);
+  auto& non_const_sys2 = const_cast&>(sys2);
   return cross_system(non_const_sys1, non_const_sys2);
 }
@@ -210,8 +210,8 @@ template
 _CCCL_HOST_DEVICE cross_system
 select_system(thrust::cpp::execution_policy const& sys1, execution_policy const& sys2)
 {
-  thrust::cpp::execution_policy& non_const_sys1 = const_cast&>(sys1);
-  thrust::execution_policy& non_const_sys2 = const_cast&>(sys2);
+  auto& non_const_sys1 = const_cast&>(sys1);
+  thrust::execution_policy& non_const_sys2 = const_cast&>(sys2);
   return cross_system(non_const_sys1, non_const_sys2);
 }
 } // namespace cuda_cub
diff --git a/thrust/thrust/system/cuda/detail/extrema.h b/thrust/thrust/system/cuda/detail/extrema.h
index d2d9f69dc80..fce273173a2 100644
--- a/thrust/thrust/system/cuda/detail/extrema.h
+++ b/thrust/thrust/system/cuda/detail/extrema.h
@@ -347,7 +347,7 @@ element(execution_policy& policy, ItemsIt first, ItemsIt last, BinaryPr
   using InputType = thrust::detail::it_value_t;
   using IndexType = thrust::detail::it_difference_t;
-  IndexType num_items = static_cast<IndexType>(::cuda::std::distance(first, last));
+  auto num_items = static_cast<IndexType>(::cuda::std::distance(first, last));
   using iterator_tuple = ::cuda::std::tuple>;
   using zip_iterator = thrust::zip_iterator;
diff --git a/thrust/thrust/system/cuda/detail/find.h b/thrust/thrust/system/cuda/detail/find.h
index 90952c0cca2..46170484851 100644
--- a/thrust/thrust/system/cuda/detail/find.h
+++ b/thrust/thrust/system/cuda/detail/find.h
@@ -69,8 +69,8 @@ find_if_n_impl(execution_policy& policy, InputIt first, Size num_items,
     thrust::detail::temporary_array tmp(policy, sizeof(adjusted_size_type) + tmp_size);
     // Run find_if.
-    adjusted_size_type* result_ptr = thrust::detail::aligned_reinterpret_cast<adjusted_size_type*>(tmp.data().get());
-    void* tmp_ptr = static_cast<void*>((tmp.data() + sizeof(adjusted_size_type)).get());
+    auto* result_ptr = thrust::detail::aligned_reinterpret_cast<adjusted_size_type*>(tmp.data().get());
+    void* tmp_ptr = static_cast<void*>((tmp.data() + sizeof(adjusted_size_type)).get());
     status = cub::DeviceFind::FindIf(tmp_ptr, tmp_size, first, result_ptr, predicate, num_items_fixed, stream);
     cuda_cub::throw_on_error(status, "find_if: failed to run algorithm");
diff --git a/thrust/thrust/system/cuda/detail/for_each.h b/thrust/thrust/system/cuda/detail/for_each.h
index 6ca50350d17..8655cb14983 100644
--- a/thrust/thrust/system/cuda/detail/for_each.h
+++ b/thrust/thrust/system/cuda/detail/for_each.h
@@ -50,7 +50,7 @@ template
 Input _CCCL_API _CCCL_FORCEINLINE for_each(execution_policy& policy, Input first, Input last, UnaryOp op)
 {
   using size_type = thrust::detail::it_difference_t;
-  size_type count = static_cast<size_type>(::cuda::std::distance(first, last));
+  auto count = static_cast<size_type>(::cuda::std::distance(first, last));
   return THRUST_NS_QUALIFIER::cuda_cub::for_each_n(policy, first, count, op);
 }
diff --git a/thrust/thrust/system/cuda/detail/partition.h b/thrust/thrust/system/cuda/detail/partition.h
index 96e7a536b0c..d270c4de479 100644
--- a/thrust/thrust/system/cuda/detail/partition.h
+++ b/thrust/thrust/system/cuda/detail/partition.h
@@ -100,7 +100,7 @@ struct DispatchPartitionIf
     }
     // Memory allocation for the number of selected output items
-    OffsetT* d_num_selected_out = thrust::detail::aligned_reinterpret_cast<OffsetT*>(allocations[1]);
+    auto* d_num_selected_out = thrust::detail::aligned_reinterpret_cast<OffsetT*>(allocations[1]);
     // Run algorithm
     status = cub::DispatchSelectIf<
@@ -200,7 +200,7 @@ THRUST_RUNTIME_FUNCTION ::cuda::std::pair stable_p
   }
   using output_it_wrapper_t = cub::detail::select::partition_distinct_output_t;
-  std::size_t num_items = static_cast<std::size_t>(::cuda::std::distance(first, last));
+  auto num_items = static_cast<std::size_t>(::cuda::std::distance(first, last));
   std::size_t num_selected =
     partition(policy, first, last, stencil, output_it_wrapper_t{selected_result, rejected_result}, predicate);
   return ::cuda::std::make_pair(selected_result + num_selected, rejected_result + num_items - num_selected);
@@ -216,8 +216,8 @@ THRUST_RUNTIME_FUNCTION InputIt inplace_partition(
   }
   // Element type of the input iterator
-  using value_t = thrust::detail::it_value_t;
-  std::size_t num_items = static_cast<std::size_t>(::cuda::std::distance(first, last));
+  using value_t = thrust::detail::it_value_t;
+  auto num_items = static_cast<std::size_t>(::cuda::std::distance(first, last));
   // Allocate temporary storage, which will serve as the input to the partition
   thrust::detail::temporary_array tmp(policy, num_items);
diff --git a/thrust/thrust/system/cuda/detail/reduce.h b/thrust/thrust/system/cuda/detail/reduce.h
index b8d326e27ca..664c762df2c 100644
--- a/thrust/thrust/system/cuda/detail/reduce.h
+++ b/thrust/thrust/system/cuda/detail/reduce.h
@@ -277,7 +277,7 @@ struct ReduceAgent
       T items[ITEMS_PER_THREAD];
-      Vector* vec_items = reinterpret_cast<Vector*>(items);
+      auto* vec_items = reinterpret_cast<Vector*>(items);
      // Vector Input iterator wrapper type (for applying cache modifier)
       T* d_in_unqualified = const_cast(input_it) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
@@ -700,7 +700,7 @@ _CCCL_HOST_DEVICE T reduce(execution_policy& policy, InputIt first, Inp
 {
   using size_type = thrust::detail::it_difference_t;
   // FIXME: Check for RA iterator.
-  size_type num_items = static_cast<size_type>(::cuda::std::distance(first, last));
+  auto num_items = static_cast<size_type>(::cuda::std::distance(first, last));
   return cuda_cub::reduce_n(policy, first, num_items, init, binary_op);
 }
@@ -724,7 +724,7 @@ reduce_into(execution_policy& policy, InputIt first, InputIt last, Outp
 {
   using size_type = thrust::detail::it_difference_t;
   // FIXME: Check for RA iterator.
-  size_type num_items = static_cast<size_type>(::cuda::std::distance(first, last));
+  auto num_items = static_cast<size_type>(::cuda::std::distance(first, last));
   cuda_cub::reduce_n_into(policy, first, num_items, output, init, binary_op);
 }
diff --git a/thrust/thrust/system/cuda/detail/set_operations.h b/thrust/thrust/system/cuda/detail/set_operations.h
index 6caa5126960..9d5973b99cf 100644
--- a/thrust/thrust/system/cuda/detail/set_operations.h
+++ b/thrust/thrust/system/cuda/detail/set_operations.h
@@ -1059,8 +1059,8 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
   status = tile_state.Init(static_cast(num_tiles), allocations[0], allocation_sizes[0]);
   _CUDA_CUB_RET_IF_FAIL(status);
-  ::cuda::std::pair* partitions = (::cuda::std::pair*) allocations[1];
-  char* vshmem_ptr = vshmem_storage > 0 ? (char*) allocations[2] : nullptr;
+  auto* partitions = (::cuda::std::pair*) allocations[1];
+  char* vshmem_ptr = vshmem_storage > 0 ? (char*) allocations[2] : nullptr;
   init_agent ia(init_plan, num_tiles, stream, "set_op::init_agent");
   ia.launch(tile_state, num_tiles);
@@ -1115,8 +1115,8 @@ THRUST_RUNTIME_FUNCTION ::cuda::std::pair set_oper
 {
   using size_type = thrust::detail::it_difference_t;
-  size_type num_keys1 = static_cast<size_type>(::cuda::std::distance(keys1_first, keys1_last));
-  size_type num_keys2 = static_cast<size_type>(::cuda::std::distance(keys2_first, keys2_last));
+  auto num_keys1 = static_cast<size_type>(::cuda::std::distance(keys1_first, keys1_last));
+  auto num_keys2 = static_cast<size_type>(::cuda::std::distance(keys2_first, keys2_last));
   if (num_keys1 + num_keys2 == 0)
   {
@@ -1163,7 +1163,7 @@ THRUST_RUNTIME_FUNCTION ::cuda::std::pair set_oper
     status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes);
     cuda_cub::throw_on_error(status, "set_operations failed on 2nd alias_storage");
-    std::size_t* d_output_count = thrust::detail::aligned_reinterpret_cast<std::size_t*>(allocations[0]);
+    auto* d_output_count = thrust::detail::aligned_reinterpret_cast<std::size_t*>(allocations[0]);
     THRUST_DOUBLE_INDEX_TYPE_DISPATCH(
       status,
diff --git a/thrust/thrust/system/cuda/detail/sort.h b/thrust/thrust/system/cuda/detail/sort.h
index 8fd01877e8c..1169be16bc5 100644
--- a/thrust/thrust/system/cuda/detail/sort.h
+++ b/thrust/thrust/system/cuda/detail/sort.h
@@ -128,7 +128,7 @@ THRUST_RUNTIME_FUNCTION void merge_sort(
 {
   using size_type = thrust::detail::it_difference_t;
-  size_type count = static_cast<size_type>(::cuda::std::distance(keys_first, keys_last));
+  auto count = static_cast<size_type>(::cuda::std::distance(keys_first, keys_last));
   size_t storage_size = 0;
   cudaStream_t stream = cuda_cub::stream(policy);
diff --git a/thrust/thrust/system/cuda/detail/transform_reduce.h b/thrust/thrust/system/cuda/detail/transform_reduce.h
index 0782fb5dcdf..47ca05d8b91 100644
--- a/thrust/thrust/system/cuda/detail/transform_reduce.h
+++ b/thrust/thrust/system/cuda/detail/transform_reduce.h
@@ -102,8 +102,8 @@ template
 & policy, InputIt first, InputIt last, TransformOp transform_op, T init, ReduceOp reduce_op)
 {
-  using size_type = thrust::detail::it_difference_t;
-  const size_type num_items = static_cast<size_type>(::cuda::std::distance(first, last));
+  using size_type = thrust::detail::it_difference_t;
+  const auto num_items = static_cast<size_type>(::cuda::std::distance(first, last));
   THRUST_CDP_DISPATCH(
     (init = thrust::cuda_cub::detail::transform_reduce_n_impl(policy, first, num_items, transform_op, init, reduce_op);),
diff --git a/thrust/thrust/system/cuda/detail/transform_scan.h b/thrust/thrust/system/cuda/detail/transform_scan.h
index 19cb48c54f6..3e5755cd96e 100644
--- a/thrust/thrust/system/cuda/detail/transform_scan.h
+++ b/thrust/thrust/system/cuda/detail/transform_scan.h
@@ -40,7 +40,7 @@ OutputIt _CCCL_HOST_DEVICE transform_inclusive_scan(
   using value_type = ::cuda::std::remove_cvref_t;
   using size_type = thrust::detail::it_difference_t;
-  size_type num_items = static_cast<size_type>(::cuda::std::distance(first, last));
+  auto num_items = static_cast<size_type>(::cuda::std::distance(first, last));
   using transformed_iterator_t = transform_iterator;
   return cuda_cub::inclusive_scan_n(policy, transformed_iterator_t(first, transform_op), num_items, result, scan_op);
@@ -61,7 +61,7 @@ OutputIt _CCCL_HOST_DEVICE transform_inclusive_scan(
   using value_type = ::cuda::std::remove_cvref_t;
   using size_type = thrust::detail::it_difference_t;
-  size_type num_items = static_cast<size_type>(::cuda::std::distance(first, last));
+  auto num_items = static_cast<size_type>(::cuda::std::distance(first, last));
   using transformed_iterator_t = transform_iterator;
   return cuda_cub::inclusive_scan_n(
@@ -82,7 +82,7 @@ OutputIt _CCCL_HOST_DEVICE transform_exclusive_scan(
   using result_type = ::cuda::std::remove_cvref_t;
   using size_type = thrust::detail::it_difference_t;
-  size_type num_items = static_cast<size_type>(::cuda::std::distance(first, last));
+  auto num_items = static_cast<size_type>(::cuda::std::distance(first, last));
   using transformed_iterator_t = transform_iterator;
   return cuda_cub::exclusive_scan_n(
diff --git a/thrust/thrust/system/cuda/detail/unique.h b/thrust/thrust/system/cuda/detail/unique.h
index e6544b79d01..738346fd409 100644
--- a/thrust/thrust/system/cuda/detail/unique.h
+++ b/thrust/thrust/system/cuda/detail/unique.h
@@ -89,7 +89,7 @@ THRUST_RUNTIME_FUNCTION cudaError_t dispatch_select_unique(
   void* allocations[2] = {nullptr, nullptr};
   // The flag iterator is not used for unique, so we set it to nullptr.
-  flag_iterator_t flag_it = static_cast<flag_iterator_t>(nullptr);
+  auto flag_it = static_cast<flag_iterator_t>(nullptr);
   // Query algorithm memory requirements
   status = cub::DispatchSelectIf<
@@ -128,7 +128,7 @@ THRUST_RUNTIME_FUNCTION cudaError_t dispatch_select_unique(
   }
   // Memory allocation for the number of selected output items
-  OffsetT* d_num_selected_out = thrust::detail::aligned_reinterpret_cast<OffsetT*>(allocations[1]);
+  auto* d_num_selected_out = thrust::detail::aligned_reinterpret_cast<OffsetT*>(allocations[1]);
   // Run algorithm
   status = cub::DispatchSelectIf<
diff --git a/thrust/thrust/system/cuda/detail/unique_by_key.h b/thrust/thrust/system/cuda/detail/unique_by_key.h
index 889bbad697c..0172d3ce66d 100644
--- a/thrust/thrust/system/cuda/detail/unique_by_key.h
+++ b/thrust/thrust/system/cuda/detail/unique_by_key.h
@@ -116,7 +116,7 @@ struct DispatchUniqueByKey
     }
     // Memory allocation for the number of selected output items
-    OffsetT* d_num_selected_out = thrust::detail::aligned_reinterpret_cast<OffsetT*>(allocations[1]);
+    auto* d_num_selected_out = thrust::detail::aligned_reinterpret_cast<OffsetT*>(allocations[1]);
     // Run algorithm
     status = cub::DeviceSelect::UniqueByKey(
@@ -159,7 +159,7 @@ THRUST_RUNTIME_FUNCTION ::cuda::std::pair unique_by_ke
 {
   using size_type = thrust::detail::it_difference_t;
-  size_type num_items = static_cast<size_type>(::cuda::std::distance(keys_first, keys_last));
+  auto num_items = static_cast<size_type>(::cuda::std::distance(keys_first, keys_last));
   ::cuda::std::pair result_end{};
   cudaError_t status = cudaSuccess;
   size_t temp_storage_bytes = 0;
diff --git a/thrust/thrust/system/detail/sequential/stable_radix_sort.h b/thrust/thrust/system/detail/sequential/stable_radix_sort.h
index 8c968651125..4ac45003a49 100644
--- a/thrust/thrust/system/detail/sequential/stable_radix_sort.h
+++ b/thrust/thrust/system/detail/sequential/stable_radix_sort.h
@@ -237,7 +237,7 @@ _CCCL_HOST_DEVICE void radix_sort(
   const unsigned int NumHistograms = (8 * sizeof(EncodedType) + (RadixBits - 1)) / RadixBits;
   const unsigned int HistogramSize = 1 << RadixBits;
-  const EncodedType BitMask = static_cast<EncodedType>((1 << RadixBits) - 1);
+  const auto BitMask = static_cast<EncodedType>((1 << RadixBits) - 1);
   Encoder encode;
@@ -285,7 +285,7 @@ _CCCL_HOST_DEVICE void radix_sort(
   // shuffle keys and (optionally) values
   for (unsigned int i = 0; i < NumHistograms; i++)
   {
-    const EncodedType BitShift = static_cast<EncodedType>(RadixBits * i);
+    const auto BitShift = static_cast<EncodedType>(RadixBits * i);
     if (!skip_shuffle[i])
     {
diff --git a/thrust/thrust/system/omp/detail/reduce_intervals.h b/thrust/thrust/system/omp/detail/reduce_intervals.h
index 580bd1ffc9e..3de0248bb3d 100644
--- a/thrust/thrust/system/omp/detail/reduce_intervals.h
+++ b/thrust/thrust/system/omp/detail/reduce_intervals.h
@@ -57,7 +57,7 @@ void reduce_intervals(
   using index_type = std::intptr_t;
-  index_type n = static_cast<index_type>(decomp.size());
+  auto n = static_cast<index_type>(decomp.size());
   THRUST_PRAGMA_OMP(parallel for)
   for (index_type i = 0; i < n; i++)