Skip to content

Commit e81256b

Browse files
committed
feat(ops): add shared CUDA and NVIDIA implementations for Cast
1 parent 589fc04 commit e81256b

3 files changed

Lines changed: 146 additions & 0 deletions

File tree

src/cuda/cast/kernel.cuh

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#ifndef INFINI_OPS_CUDA_CAST_KERNEL_CUH_
2+
#define INFINI_OPS_CUDA_CAST_KERNEL_CUH_
3+
4+
#include "cuda/kernel_commons.cuh"
5+
6+
namespace infini::ops {
7+
8+
template <Device::Type kDev, typename InT, typename OutT,
9+
unsigned int BLOCK_SIZE>
10+
__global__ void CastKernel(OutT* __restrict__ out,
11+
const InT* __restrict__ input,
12+
const size_t* __restrict__ out_shape,
13+
const size_t* __restrict__ input_shape,
14+
const ptrdiff_t* __restrict__ out_strides,
15+
const ptrdiff_t* __restrict__ input_strides,
16+
size_t output_size, size_t ndim, bool out_contiguous,
17+
bool input_contiguous) {
18+
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
19+
20+
if (idx < output_size) {
21+
size_t out_idx =
22+
out_contiguous ? idx : IndexToOffset(idx, ndim, out_shape, out_strides);
23+
size_t input_idx =
24+
input_contiguous ? idx
25+
: IndexToOffset(idx, ndim, input_shape, input_strides);
26+
27+
out[out_idx] = Caster<kDev>::template Cast<OutT>(input[input_idx]);
28+
}
29+
}
30+
31+
} // namespace infini::ops
32+
33+
#endif

src/cuda/cast/kernel.h

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
#ifndef INFINI_OPS_CUDA_CAST_KERNEL_H_
2+
#define INFINI_OPS_CUDA_CAST_KERNEL_H_
3+
4+
#include <cstddef>
5+
#include <cstdint>
6+
#include <cstring>
7+
#include <vector>
8+
9+
#include "base/cast.h"
10+
#include "common/generic_utils.h"
11+
#include "cuda/cast/kernel.cuh"
12+
#include "cuda/kernel_commons.cuh"
13+
#include "cuda/runtime_utils.h"
14+
15+
namespace infini::ops {
16+
17+
template <typename Backend>
18+
class CudaCast : public Cast {
19+
public:
20+
CudaCast(const Tensor input, Tensor out) : Cast{input, out} {
21+
size_t shape_size = ndim_ * sizeof(*d_input_shape_);
22+
size_t strides_size = ndim_ * sizeof(*d_input_strides_);
23+
const size_t metadata_size = 2 * (shape_size + strides_size);
24+
std::vector<std::byte> metadata(metadata_size);
25+
26+
Backend::Malloc((void**)&d_metadata_, metadata_size);
27+
28+
size_t offset = 0;
29+
d_input_shape_ = reinterpret_cast<Tensor::Size*>(d_metadata_ + offset);
30+
std::memcpy(metadata.data() + offset, input_shape_.data(), shape_size);
31+
offset += shape_size;
32+
33+
d_out_shape_ = reinterpret_cast<Tensor::Size*>(d_metadata_ + offset);
34+
std::memcpy(metadata.data() + offset, out_shape_.data(), shape_size);
35+
offset += shape_size;
36+
37+
d_input_strides_ = reinterpret_cast<Tensor::Stride*>(d_metadata_ + offset);
38+
std::memcpy(metadata.data() + offset, input_strides_.data(), strides_size);
39+
offset += strides_size;
40+
41+
d_out_strides_ = reinterpret_cast<Tensor::Stride*>(d_metadata_ + offset);
42+
std::memcpy(metadata.data() + offset, out_strides_.data(), strides_size);
43+
44+
Backend::Memcpy(d_metadata_, metadata.data(), metadata_size,
45+
Backend::MemcpyHostToDevice);
46+
}
47+
48+
~CudaCast() { Backend::Free(d_metadata_); }
49+
50+
void operator()(const Tensor input, Tensor out) const override {
51+
int block_size = RuntimeUtils<Backend::kDeviceType>::GetOptimalBlockSize();
52+
53+
DispatchFunc<AllTypes, AllTypes, AllCudaBlockSizes>(
54+
{static_cast<int64_t>(input_dtype_), static_cast<int64_t>(out_dtype_),
55+
block_size},
56+
[&](auto list_tag) {
57+
using InT = TypeMapType<Backend::kDeviceType, ListGet<0>(list_tag)>;
58+
using OutT = TypeMapType<Backend::kDeviceType, ListGet<1>(list_tag)>;
59+
constexpr int kBlockSize = ListGet<2>(list_tag);
60+
61+
auto cuda_stream =
62+
static_cast<typename Backend::Stream>(stream_ ? stream_ : 0);
63+
dim3 blockDims(
64+
std::min(static_cast<Tensor::Size>(block_size), output_size_));
65+
dim3 gridDims(utils::CeilDiv(output_size_, blockDims.x));
66+
67+
CastKernel<Backend::kDeviceType, InT, OutT, kBlockSize>
68+
<<<gridDims, blockDims, 0, cuda_stream>>>(
69+
reinterpret_cast<OutT*>(out.data()),
70+
reinterpret_cast<const InT*>(input.data()), d_out_shape_,
71+
d_input_shape_, d_out_strides_, d_input_strides_,
72+
output_size_, ndim_, is_out_contiguous_,
73+
is_input_contiguous_);
74+
},
75+
"CudaCast::operator()");
76+
}
77+
78+
private:
79+
std::byte* d_metadata_{nullptr};
80+
81+
Tensor::Size* d_input_shape_{nullptr};
82+
83+
Tensor::Size* d_out_shape_{nullptr};
84+
85+
Tensor::Stride* d_input_strides_{nullptr};
86+
87+
Tensor::Stride* d_out_strides_{nullptr};
88+
};
89+
90+
} // namespace infini::ops
91+
92+
#endif

src/nvidia/cast/kernel.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#ifndef INFINI_OPS_NVIDIA_CAST_KERNEL_H_
2+
#define INFINI_OPS_NVIDIA_CAST_KERNEL_H_
3+
4+
#include <utility>
5+
6+
#include "cuda/cast/kernel.h"
7+
#include "nvidia/caster.cuh"
8+
#include "nvidia/runtime_.h"
9+
10+
namespace infini::ops {
11+
12+
template <>
13+
class Operator<Cast, Device::Type::kNvidia>
14+
: public CudaCast<Runtime<Device::Type::kNvidia>> {
15+
public:
16+
using CudaCast<Runtime<Device::Type::kNvidia>>::CudaCast;
17+
};
18+
19+
} // namespace infini::ops
20+
21+
#endif

0 commit comments

Comments
 (0)