|
11 | 11 | #include <cub/block/block_reduce.cuh> |
12 | 12 | #include <cub/block/block_store.cuh> |
13 | 13 | #include <cub/cub.cuh> |
| 14 | +#include <cuda/std/functional> |
14 | 15 | #include <cub/warp/warp_reduce.cuh> |
15 | 16 | #include <cuda_fp16.h> |
16 | 17 | #include <math_constants.h> |
@@ -416,7 +417,7 @@ __global__ void kQuantizeBlockwise( |
416 | 417 | for (int j = 0; j < NUM_PER_TH; j++) |
417 | 418 | local_abs_max = fmaxf(local_abs_max, fabsf((float)vals[j])); |
418 | 419 |
|
419 | | - local_abs_max = BlockReduce(reduce).Reduce(local_abs_max, cub::Max(), valid_items); |
| 420 | + local_abs_max = BlockReduce(reduce).Reduce(local_abs_max, cuda::maximum<>{}, valid_items); |
420 | 421 |
|
421 | 422 | if (threadIdx.x == 0) { |
422 | 423 | smem_absmax_value[0] = 1.0f / local_abs_max; |
@@ -1002,9 +1003,9 @@ __global__ void __launch_bounds__(NUM_THREADS, 2) kPreconditionOptimizerStatic8b |
1002 | 1003 | } |
1003 | 1004 |
|
1004 | 1005 | __syncthreads(); |
1005 | | - local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, cub::Max(), valid_items); |
| 1006 | + local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, cuda::maximum<>{}, valid_items); |
1006 | 1007 | __syncthreads(); |
1007 | | - local_max_s2 = BlockReduce(temp_storage.reduce).Reduce(local_max_s2, cub::Max(), valid_items); |
| 1008 | + local_max_s2 = BlockReduce(temp_storage.reduce).Reduce(local_max_s2, cuda::maximum<>{}, valid_items); |
1008 | 1009 | if (unorm != NULL) { |
1009 | 1010 | __syncthreads(); |
1010 | 1011 | local_unorm = BlockReduce(temp_storage.reduce).Reduce(local_unorm, cub::Sum(), valid_items); |
@@ -1213,7 +1214,7 @@ __global__ void __launch_bounds__(NUM_THREADS, 2) kPreconditionOptimizerStatic8b |
1213 | 1214 | } |
1214 | 1215 |
|
1215 | 1216 | __syncthreads(); |
1216 | | - local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, cub::Max(), valid_items); |
| 1217 | + local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, cuda::maximum<>{}, valid_items); |
1217 | 1218 | if (threadIdx.x == 0) { |
1218 | 1219 | atomicMax(&new_max1[0], local_max_s1); |
1219 | 1220 | } |
@@ -1843,7 +1844,7 @@ __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__ |
1843 | 1844 | } |
1844 | 1845 |
|
1845 | 1846 | // Reduce thread-local absmax across the block. |
1846 | | - const TReduction row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cub::Max(), cols); |
| 1847 | + const TReduction row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cuda::maximum<>{}, cols); |
1847 | 1848 | if (threadIdx.x == 0) { |
1848 | 1849 | // Save our block's absmax to shared memory for the quantization step. |
1849 | 1850 | rowStats[row_id] = smem_row_absmax = row_absmax; |
@@ -1898,7 +1899,7 @@ __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__ |
1898 | 1899 |
|
1899 | 1900 | // Reduce thread-local absmax across the block. |
1900 | 1901 | // TODO: Consider algorithm BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY |
1901 | | - const float row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cub::Max(), cols); |
| 1902 | + const float row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cuda::maximum<>{}, cols); |
1902 | 1903 | if (threadIdx.x == 0) { |
1903 | 1904 | // Save our block's absmax to shared memory for the quantization step. |
1904 | 1905 | rowStats[row_id] = row_absmax; |
|
0 commit comments