
Commit 45c11cb

JYMiracle305 authored and kilinchange committed

feat: compatible with Cub versions

1 parent c4703e1 · commit 45c11cb

4 files changed: 33 additions & 13 deletions

infini_train/include/common/cuda/cub_compat.cuh

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <cub/version.cuh>
+
+namespace infini_train::kernels::cuda {
+
+#if defined(CUB_VERSION) && CUB_VERSION >= 200800
+using CubSumOp = ::cuda::std::plus<>;
+using CubMaxOp = ::cuda::maximum<>;
+using CubMinOp = ::cuda::minimum<>;
+#else
+using CubSumOp = cub::Sum;
+using CubMaxOp = cub::Max;
+using CubMinOp = cub::Min;
+#endif
+
+} // namespace infini_train::kernels::cuda
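The `200800` threshold follows CUB's version encoding: `cub/version.cuh` defines `CUB_VERSION` as `CUB_MAJOR_VERSION * 100000 + CUB_MINOR_VERSION * 100 + CUB_SUBMINOR_VERSION`, so the gate picks the libcu++ functors (`cuda::std::plus<>`, `cuda::maximum<>`, `cuda::minimum<>`) from CUB 2.8.0 onward, where the classic `cub::Sum`/`cub::Max`/`cub::Min` functors are deprecated, and falls back to the classic functors on older toolkits. A minimal sketch of a compile-time check documenting that encoding assumption (illustrative only, not part of this commit):

    #include <cub/version.cuh>

    // cub/version.cuh encodes major.minor.subminor as major*100000 + minor*100 + subminor,
    // so CUB 2.8.0 yields 200800 -- the threshold used in the #if above.
    static_assert(CUB_VERSION == CUB_MAJOR_VERSION * 100000 + CUB_MINOR_VERSION * 100 + CUB_SUBMINOR_VERSION,
                  "unexpected CUB version encoding");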

infini_train/src/kernels/cuda/cross_entropy.cu

Lines changed: 3 additions & 2 deletions
@@ -6,6 +6,7 @@
 #include <cuda_runtime.h>
 
 #include "infini_train/include/common/cuda/common_cuda.h"
+#include "infini_train/include/common/cuda/cub_compat.cuh"
 #include "infini_train/include/common/cuda/kernel_helper.cuh"
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
@@ -44,7 +45,7 @@ __global__ void CrossEntropyForwardKernel(const InputType *__restrict__ input_pt
     for (int i = tid; i < num_classes; i += BLOCK_SIZE) {
         thread_max = fmaxf(thread_max, common::cuda::Cast<float>(input_ptr[base + i]));
     }
-    const float block_max = cub::BlockReduce<float, BLOCK_SIZE>(shared.reduce).Reduce(thread_max, ::cuda::maximum<>());
+    const float block_max = cub::BlockReduce<float, BLOCK_SIZE>(shared.reduce).Reduce(thread_max, CubMaxOp());
     if (tid == 0) {
         shared.max_logit = block_max;
     }
@@ -139,7 +140,7 @@ __global__ void CrossEntropyBackwardKernel(const InputType *__restrict__ input_p
     for (int i = tid; i < num_classes; i += BLOCK_SIZE) {
         thread_max = fmaxf(thread_max, common::cuda::Cast<float>(input_ptr[idx_base + i]));
     }
-    const float block_max = cub::BlockReduce<float, BLOCK_SIZE>(shared.reduce).Reduce(thread_max, ::cuda::maximum<>());
+    const float block_max = cub::BlockReduce<float, BLOCK_SIZE>(shared.reduce).Reduce(thread_max, CubMaxOp());
     if (tid == 0) {
         shared.max_logit = block_max;
     }
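Both alternatives behind `CubMaxOp` are stateless, default-constructible binary functors, which is why the call sites above can construct one inline as `CubMaxOp()` under either CUB version. A small illustrative check (hypothetical snippet, not in the commit; assumes the compat header is on the include path and the file is compiled with nvcc):

    #include <cstdio>
    #include <cub/cub.cuh>

    #include "infini_train/include/common/cuda/cub_compat.cuh"

    int main() {
        // cub::Max and ::cuda::maximum<> are both host/device-callable,
        // so the alias behaves identically from host code as well.
        using infini_train::kernels::cuda::CubMaxOp;
        std::printf("%g\n", CubMaxOp()(1.0f, 2.0f)); // prints 2
        return 0;
    }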

infini_train/src/kernels/cuda/reduction.cu

Lines changed: 11 additions & 10 deletions
@@ -1,6 +1,7 @@
 #include <cub/cub.cuh>
 
 #include "infini_train/include/common/cuda/common_cuda.h"
+#include "infini_train/include/common/cuda/cub_compat.cuh"
 #include "infini_train/include/common/cuda/kernel_helper.cuh"
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
@@ -14,22 +15,22 @@ namespace {
 // Reduction operators
 template <typename T, typename ReduceFunc> struct CubOp;
 
-template <typename T> struct CubOp<T, ::cuda::std::plus<>> {
+template <typename T> struct CubOp<T, CubSumOp> {
     __device__ static T Init() { return common::cuda::Cast<T>(0); }
     __device__ static T Reduce(T a, T b) { return common::cuda::Add<T>(a, b); }
-    __device__ static ::cuda::std::plus<> Op() { return ::cuda::std::plus<>(); }
+    __device__ static CubSumOp Op() { return CubSumOp(); }
 };
 
-template <typename T> struct CubOp<T, ::cuda::maximum<>> {
+template <typename T> struct CubOp<T, CubMaxOp> {
     __device__ static T Init() { return common::cuda::Cast<T>(-kInfinity); }
     __device__ static T Reduce(T a, T b) { return common::cuda::Max<T>(a, b); }
-    __device__ static ::cuda::maximum<> Op() { return ::cuda::maximum<>(); }
+    __device__ static CubMaxOp Op() { return CubMaxOp(); }
 };
 
-template <typename T> struct CubOp<T, ::cuda::minimum<>> {
+template <typename T> struct CubOp<T, CubMinOp> {
     __device__ static T Init() { return common::cuda::Cast<T>(kInfinity); }
     __device__ static T Reduce(T a, T b) { return common::cuda::Min<T>(a, b); }
-    __device__ static ::cuda::minimum<> Op() { return ::cuda::minimum<>(); }
+    __device__ static CubMinOp Op() { return CubMinOp(); }
 };
 
 // Finalization strategies
@@ -179,19 +180,19 @@ std::shared_ptr<Tensor> ReduceOpBackward(const std::shared_ptr<Tensor> &grad_out
 }
 
 std::shared_ptr<Tensor> MeanForward(const std::shared_ptr<Tensor> &input, const int64_t dim, const bool keep_dim) {
-    return ReduceOpForward<::cuda::std::plus<>, MeanFinalize>(input, dim, keep_dim);
+    return ReduceOpForward<CubSumOp, MeanFinalize>(input, dim, keep_dim);
 }
 
 std::shared_ptr<Tensor> SumForward(const std::shared_ptr<Tensor> &input, const int64_t dim, const bool keep_dim) {
-    return ReduceOpForward<::cuda::std::plus<>, IdentityFinalize>(input, dim, keep_dim);
+    return ReduceOpForward<CubSumOp, IdentityFinalize>(input, dim, keep_dim);
 }
 
 std::shared_ptr<Tensor> MaxForward(const std::shared_ptr<Tensor> &input, const int64_t dim, const bool keep_dim) {
-    return ReduceOpForward<::cuda::maximum<>, IdentityFinalize>(input, dim, keep_dim);
+    return ReduceOpForward<CubMaxOp, IdentityFinalize>(input, dim, keep_dim);
 }
 
 std::shared_ptr<Tensor> MinForward(const std::shared_ptr<Tensor> &input, const int64_t dim, const bool keep_dim) {
-    return ReduceOpForward<::cuda::minimum<>, IdentityFinalize>(input, dim, keep_dim);
+    return ReduceOpForward<CubMinOp, IdentityFinalize>(input, dim, keep_dim);
 }
 
 std::shared_ptr<Tensor> MeanBackward(const std::shared_ptr<Tensor> &grad_output, const std::vector<int64_t> &input_dims,
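The `CubOp` trait keeps each reduction generic over its operator: `Init()` supplies the identity element, `Reduce()` the per-thread combine, and `Op()` the functor handed to CUB. A sketch of how a kernel can dispatch through it (illustrative only; it assumes the `CubOp` specializations above are in scope, and `GenericReduceKernel` is a made-up name, since the body of the commit's actual `ReduceOpForward` is not shown in this diff):

    #include <cub/cub.cuh>

    // Illustrative block-wide reduction driven entirely by the trait.
    template <typename T, typename ReduceFunc, int BLOCK_SIZE>
    __global__ void GenericReduceKernel(const T *in, T *out, int n) {
        using BlockReduce = cub::BlockReduce<T, BLOCK_SIZE>;
        __shared__ typename BlockReduce::TempStorage temp;

        // Start from the operator's identity (0, -inf, or +inf) and fold a strided slice.
        T acc = CubOp<T, ReduceFunc>::Init();
        for (int i = threadIdx.x; i < n; i += BLOCK_SIZE) {
            acc = CubOp<T, ReduceFunc>::Reduce(acc, in[i]);
        }
        // Hand the version-portable functor to CUB; thread 0 holds the block result.
        T result = BlockReduce(temp).Reduce(acc, CubOp<T, ReduceFunc>::Op());
        if (threadIdx.x == 0) {
            *out = result;
        }
    }

    // e.g. GenericReduceKernel<float, CubSumOp, 256><<<1, 256>>>(d_in, d_out, n);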

infini_train/src/kernels/cuda/softmax.cu

Lines changed: 2 additions & 1 deletion
@@ -6,6 +6,7 @@
 #include "glog/logging.h"
 
 #include "infini_train/include/common/cuda/common_cuda.h"
+#include "infini_train/include/common/cuda/cub_compat.cuh"
 #include "infini_train/include/common/cuda/kernel_helper.cuh"
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
@@ -31,7 +32,7 @@ __global__ void SoftmaxForwardKernel(T *output, const T *input, int64_t outer_si
         int64_t idx = (group * axis_size + axis) * inner_size + inner_idx;
         thread_max = max(thread_max, common::cuda::Cast<float>(input[idx]));
     }
-    float block_max = BlockReduce(temp_storage_max).Reduce(thread_max, ::cuda::maximum<>());
+    float block_max = BlockReduce(temp_storage_max).Reduce(thread_max, CubMaxOp());
 
     if (tid == 0) {
         row_max = block_max;
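The changed line is the max-reduction pass of the numerically stable softmax: the row maximum is subtracted before exponentiating so `expf` cannot overflow. A condensed single-row sketch of the full pattern (illustrative; the kernel name and structure are assumptions, not the commit's actual `SoftmaxForwardKernel`):

    #include <cub/cub.cuh>

    #include "infini_train/include/common/cuda/cub_compat.cuh"

    template <int BLOCK_SIZE>
    __global__ void StableSoftmaxRow(const float *in, float *out, int n) {
        using BlockReduce = cub::BlockReduce<float, BLOCK_SIZE>;
        __shared__ typename BlockReduce::TempStorage temp;
        __shared__ float row_max, row_sum;

        // Pass 1: block-wide max via the version-portable functor alias.
        float tmax = -INFINITY;
        for (int i = threadIdx.x; i < n; i += BLOCK_SIZE) { tmax = fmaxf(tmax, in[i]); }
        float bmax = BlockReduce(temp).Reduce(tmax, infini_train::kernels::cuda::CubMaxOp());
        if (threadIdx.x == 0) { row_max = bmax; }
        __syncthreads(); // also required before reusing `temp` below

        // Pass 2: sum of max-shifted exponentials (cannot overflow).
        float tsum = 0.0f;
        for (int i = threadIdx.x; i < n; i += BLOCK_SIZE) { tsum += expf(in[i] - row_max); }
        float bsum = BlockReduce(temp).Sum(tsum);
        if (threadIdx.x == 0) { row_sum = bsum; }
        __syncthreads();

        // Pass 3: normalize.
        for (int i = threadIdx.x; i < n; i += BLOCK_SIZE) { out[i] = expf(in[i] - row_max) / row_sum; }
    }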
