Skip to content

Commit c76c064

Browse files
crapromer and root authored
issue/367 - Fix compile bug on cuda 13.0
* fix compile bug on cuda 13.0
* issue/367 - clang format code on ubuntu
---------
Co-authored-by: root <root@Crapromer>
1 parent f796aaa commit c76c064

4 files changed

Lines changed: 30 additions & 2 deletions

File tree

src/infiniop/ops/logsoftmax/cuda/kernel.cuh

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -54,7 +54,11 @@ __device__ void logSoftmaxKernel(
5454
}
5555
}
5656
}
57+
#if CUDART_VERSION >= 12090
58+
max_val = BlockReduce(temp_storage).Reduce(max_val, ::cuda::maximum());
59+
#else
5760
max_val = BlockReduce(temp_storage).Reduce(max_val, cub::Max());
61+
#endif
5862
if (tid == 0) {
5963
shared_max_val = max_val;
6064
}

src/infiniop/ops/random_sample/nvidia/random_sample_kernel.cuh

Lines changed: 14 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -89,9 +89,13 @@ utils::Result<size_t> calculateWorkspace(size_t n_) {
8989
nullptr, size_inclusive_sum,
9090
nullptr, n,
9191
nullptr));
92+
#if CUDART_VERSION >= 12090
93+
size_random += ::cuda::maximum()(size_radix_sort, size_inclusive_sum);
94+
return utils::Result<size_t>(::cuda::maximum()(argmax, size_random));
95+
#else
9296
size_random += cub::Max()(size_radix_sort, size_inclusive_sum);
93-
9497
return utils::Result<size_t>(cub::Max()(argmax, size_random));
98+
#endif
9599
}
96100

97101
// ↑↑↑ 计算 workspace
@@ -161,8 +165,13 @@ static __global__ void randomSampleKernel(
161165
const Tidx *__restrict__ indices_out,
162166
size_t n,
163167
float random, float topp, size_t topk) {
168+
#if CUDART_VERSION >= 12090
169+
topk = ::cuda::minimum()(topk, n);
170+
auto p = (Tval)(random * ::cuda::minimum()(topp * (float)sorted[n - 1], (float)sorted[topk - 1]));
171+
#else
164172
topk = cub::Min()(topk, n);
165173
auto p = (Tval)(random * cub::Min()(topp * (float)sorted[n - 1], (float)sorted[topk - 1]));
174+
#endif
166175
for (size_t i = 0;; ++i) {
167176
if ((sorted[i]) >= p) {
168177
*result = indices_out[i];
@@ -228,8 +237,11 @@ struct Algo {
228237

229238
workspace_ = reinterpret_cast<void *>(workspace);
230239
workspace_size = workspace_end - workspace;
231-
240+
#if CUDART_VERSION >= 12090
241+
auto block = ::cuda::minimum()((size_t)block_size, n);
242+
#else
232243
auto block = cub::Min()((size_t)block_size, n);
244+
#endif
233245
auto grid = (n + block - 1) / block;
234246
// sort
235247
fillIndices<<<static_cast<unsigned int>(grid), static_cast<unsigned int>(block), 0, stream>>>(indices, static_cast<int>(n));

src/infiniop/ops/topksoftmax/cuda/kernel.cuh

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,11 @@ __global__ void softmax_topk_row_kernel(float *values_topk, // 输出数据, 形
5555

5656
{
5757
__shared__ typename BlockReduce::TempStorage temp_storage_max;
58+
#if CUDART_VERSION >= 12090
59+
T value_max = BlockReduce(temp_storage_max).Reduce(thread_max, ::cuda::maximum());
60+
#else
5861
T value_max = BlockReduce(temp_storage_max).Reduce(thread_max, cub::Max());
62+
#endif
5963
if (tid == 0) {
6064
shared_max = value_max;
6165
}

src/infiniop/reduce/cuda/reduce.cuh

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -54,8 +54,12 @@ __device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) {
5454
for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
5555
#ifdef ENABLE_HYGON_API
5656
max_ = (data_ptr[i] > max_) ? data_ptr[i] : max_;
57+
#else
58+
#if CUDART_VERSION >= 12090
59+
max_ = ::cuda::maximum()(max_, data_ptr[i]);
5760
#else
5861
max_ = cub::Max()(max_, data_ptr[i]);
62+
#endif
5963
#endif
6064
}
6165

@@ -65,9 +69,13 @@ __device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) {
6569
#ifdef ENABLE_HYGON_API
6670
return BlockReduce(temp_storage).Reduce(
6771
max_, [](const Tdata &a, const Tdata &b) { return (a > b) ? a : b; }, BLOCK_SIZE);
72+
#else
73+
#if CUDART_VERSION >= 12090
74+
return BlockReduce(temp_storage).Reduce(max_, ::cuda::maximum(), BLOCK_SIZE);
6875
#else
6976
return BlockReduce(temp_storage).Reduce(max_, cub::Max(), BLOCK_SIZE);
7077
#endif
78+
#endif
7179
}
7280

7381
} // namespace op::common_cuda::reduce_op

0 commit comments

Comments
 (0)