Skip to content

Commit 5d6dfc3

Browse files
authored
feat: add cuda sigmoid infinilm (#680)
1 parent 1b1dd5f commit 5d6dfc3

8 files changed

Lines changed: 355 additions & 0 deletions

File tree

src/base/sigmoid_infinilm.h

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#ifndef INFINI_OPS_BASE_SIGMOID_INFINILM_H_
2+
#define INFINI_OPS_BASE_SIGMOID_INFINILM_H_
3+
4+
#include <cassert>
5+
6+
#include "operator.h"
7+
8+
namespace infini::ops {
9+
10+
class SigmoidInfinilm : public Operator<SigmoidInfinilm> {
11+
public:
12+
SigmoidInfinilm(const Tensor input, Tensor out)
13+
: input_shape_{input.shape()},
14+
input_strides_{input.strides()},
15+
input_type_{input.dtype()},
16+
out_shape_{out.shape()},
17+
out_strides_{out.strides()},
18+
out_type_{out.dtype()},
19+
output_size_{out.numel()},
20+
ndim_{out.ndim()},
21+
is_input_contiguous_{input.IsContiguous()},
22+
is_out_contiguous_{out.IsContiguous()},
23+
device_index_{out.device().index()} {
24+
assert(input_shape_ == out_shape_ &&
25+
"`SigmoidInfinilm` input and output shapes must match");
26+
assert(input_type_ == out_type_ &&
27+
"`SigmoidInfinilm` input and output dtypes must match");
28+
assert(!out.HasBroadcastDim() &&
29+
"`SigmoidInfinilm` output must not have broadcasted dimensions");
30+
}
31+
32+
virtual void operator()(const Tensor input, Tensor out) const = 0;
33+
34+
protected:
35+
Tensor::Shape input_shape_;
36+
37+
Tensor::Strides input_strides_;
38+
39+
DataType input_type_;
40+
41+
Tensor::Shape out_shape_;
42+
43+
Tensor::Strides out_strides_;
44+
45+
DataType out_type_;
46+
47+
Tensor::Size output_size_{0};
48+
49+
Tensor::Size ndim_{0};
50+
51+
bool is_input_contiguous_{false};
52+
53+
bool is_out_contiguous_{false};
54+
55+
int device_index_{0};
56+
};
57+
58+
} // namespace infini::ops
59+
60+
#endif
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#ifndef INFINI_OPS_ILUVATAR_SIGMOID_INFINILM_KERNEL_H_
2+
#define INFINI_OPS_ILUVATAR_SIGMOID_INFINILM_KERNEL_H_
3+
4+
#include <utility>
5+
6+
#include "native/cuda/iluvatar/caster.cuh"
7+
#include "native/cuda/iluvatar/runtime_.h"
8+
#include "native/cuda/ops/sigmoid_infinilm/kernel.h"
9+
10+
namespace infini::ops {
11+
12+
template <>
13+
class Operator<SigmoidInfinilm, Device::Type::kIluvatar>
14+
: public CudaSigmoidInfinilm<Runtime<Device::Type::kIluvatar>> {
15+
public:
16+
using CudaSigmoidInfinilm<
17+
Runtime<Device::Type::kIluvatar>>::CudaSigmoidInfinilm;
18+
};
19+
20+
} // namespace infini::ops
21+
22+
#endif
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#ifndef INFINI_OPS_METAX_SIGMOID_INFINILM_KERNEL_H_
2+
#define INFINI_OPS_METAX_SIGMOID_INFINILM_KERNEL_H_
3+
4+
#include <utility>
5+
6+
#include "native/cuda/metax/caster.cuh"
7+
#include "native/cuda/metax/runtime_.h"
8+
#include "native/cuda/ops/sigmoid_infinilm/kernel.h"
9+
10+
namespace infini::ops {
11+
12+
template <>
13+
class Operator<SigmoidInfinilm, Device::Type::kMetax>
14+
: public CudaSigmoidInfinilm<Runtime<Device::Type::kMetax>> {
15+
public:
16+
using CudaSigmoidInfinilm<Runtime<Device::Type::kMetax>>::CudaSigmoidInfinilm;
17+
};
18+
19+
} // namespace infini::ops
20+
21+
#endif
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#ifndef INFINI_OPS_MOORE_SIGMOID_INFINILM_KERNEL_H_
2+
#define INFINI_OPS_MOORE_SIGMOID_INFINILM_KERNEL_H_
3+
4+
#include <utility>
5+
6+
#include "native/cuda/moore/caster.cuh"
7+
#include "native/cuda/moore/polyfills.cuh"
8+
#include "native/cuda/moore/runtime_.h"
9+
#include "native/cuda/ops/sigmoid_infinilm/kernel.h"
10+
11+
namespace infini::ops {
12+
13+
template <>
14+
class Operator<SigmoidInfinilm, Device::Type::kMoore>
15+
: public CudaSigmoidInfinilm<Runtime<Device::Type::kMoore>> {
16+
public:
17+
using CudaSigmoidInfinilm<Runtime<Device::Type::kMoore>>::CudaSigmoidInfinilm;
18+
};
19+
20+
} // namespace infini::ops
21+
22+
#endif
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#ifndef INFINI_OPS_NVIDIA_SIGMOID_INFINILM_KERNEL_H_
2+
#define INFINI_OPS_NVIDIA_SIGMOID_INFINILM_KERNEL_H_
3+
4+
#include <utility>
5+
6+
#include "native/cuda/nvidia/caster.cuh"
7+
#include "native/cuda/nvidia/runtime_.h"
8+
#include "native/cuda/ops/sigmoid_infinilm/kernel.h"
9+
10+
namespace infini::ops {
11+
12+
template <>
13+
class Operator<SigmoidInfinilm, Device::Type::kNvidia>
14+
: public CudaSigmoidInfinilm<Runtime<Device::Type::kNvidia>> {
15+
public:
16+
using CudaSigmoidInfinilm<
17+
Runtime<Device::Type::kNvidia>>::CudaSigmoidInfinilm;
18+
};
19+
20+
} // namespace infini::ops
21+
22+
#endif
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#ifndef INFINI_OPS_CUDA_SIGMOID_INFINILM_KERNEL_CUH_
2+
#define INFINI_OPS_CUDA_SIGMOID_INFINILM_KERNEL_CUH_
3+
4+
#include <cmath>
5+
#include <cstddef>
6+
#include <type_traits>
7+
8+
#include "native/cuda/caster.cuh"
9+
#include "native/cuda/kernel_commons.cuh"
10+
11+
namespace infini::ops {
12+
13+
namespace {
14+
15+
template <Device::Type kDev, typename T>
16+
__device__ __forceinline__ T SigmoidInfinilmValue(T x) {
17+
if constexpr (std::is_same_v<T, double>) {
18+
return 1.0 / (1.0 + exp(-x));
19+
} else {
20+
const float v = Caster<kDev>::template Cast<float>(x);
21+
const float y = 1.0f / (1.0f + expf(-v));
22+
return Caster<kDev>::template Cast<T>(y);
23+
}
24+
}
25+
26+
} // namespace
27+
28+
template <Device::Type kDev, typename T, unsigned int block_size>
29+
__global__ void SigmoidInfinilmKernel(
30+
T* __restrict__ out, const T* __restrict__ input,
31+
const size_t* __restrict__ out_shape,
32+
const size_t* __restrict__ input_shape,
33+
const ptrdiff_t* __restrict__ out_strides,
34+
const ptrdiff_t* __restrict__ input_strides, size_t output_size,
35+
size_t ndim, bool out_contiguous, bool input_contiguous) {
36+
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
37+
38+
if (idx < output_size) {
39+
size_t out_idx =
40+
out_contiguous ? idx : IndexToOffset(idx, ndim, out_shape, out_strides);
41+
size_t input_idx =
42+
input_contiguous ? idx
43+
: IndexToOffset(idx, ndim, input_shape, input_strides);
44+
out[out_idx] = SigmoidInfinilmValue<kDev>(input[input_idx]);
45+
}
46+
}
47+
48+
} // namespace infini::ops
49+
50+
#endif
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
#ifndef INFINI_OPS_CUDA_SIGMOID_INFINILM_KERNEL_H_
2+
#define INFINI_OPS_CUDA_SIGMOID_INFINILM_KERNEL_H_
3+
4+
#include <algorithm>
5+
#include <cstddef>
6+
#include <cstring>
7+
#include <vector>
8+
9+
#include "base/sigmoid_infinilm.h"
10+
#include "common/generic_utils.h"
11+
#include "data_type.h"
12+
#include "dispatcher.h"
13+
#include "native/cuda/kernel_commons.cuh"
14+
#include "native/cuda/ops/sigmoid_infinilm/kernel.cuh"
15+
#include "native/cuda/runtime_utils.h"
16+
17+
namespace infini::ops {
18+
19+
template <typename Backend>
20+
class CudaSigmoidInfinilm : public SigmoidInfinilm {
21+
public:
22+
CudaSigmoidInfinilm(const Tensor input, Tensor out)
23+
: SigmoidInfinilm{input, out} {
24+
size_t shape_size = ndim_ * sizeof(*d_input_shape_);
25+
size_t strides_size = ndim_ * sizeof(*d_input_strides_);
26+
const size_t metadata_size = 2 * (shape_size + strides_size);
27+
std::vector<std::byte> metadata(metadata_size);
28+
29+
Backend::Malloc((void**)&d_metadata_, metadata_size);
30+
31+
size_t offset = 0;
32+
d_input_shape_ = reinterpret_cast<Tensor::Size*>(d_metadata_ + offset);
33+
std::memcpy(metadata.data() + offset, input_shape_.data(), shape_size);
34+
offset += shape_size;
35+
36+
d_out_shape_ = reinterpret_cast<Tensor::Size*>(d_metadata_ + offset);
37+
std::memcpy(metadata.data() + offset, out_shape_.data(), shape_size);
38+
offset += shape_size;
39+
40+
d_input_strides_ = reinterpret_cast<Tensor::Stride*>(d_metadata_ + offset);
41+
std::memcpy(metadata.data() + offset, input_strides_.data(), strides_size);
42+
offset += strides_size;
43+
44+
d_out_strides_ = reinterpret_cast<Tensor::Stride*>(d_metadata_ + offset);
45+
std::memcpy(metadata.data() + offset, out_strides_.data(), strides_size);
46+
47+
Backend::Memcpy(d_metadata_, metadata.data(), metadata_size,
48+
Backend::MemcpyHostToDevice);
49+
}
50+
51+
~CudaSigmoidInfinilm() { Backend::Free(d_metadata_); }
52+
53+
void operator()(const Tensor input, Tensor out) const override {
54+
auto cuda_stream =
55+
static_cast<typename Backend::Stream>(stream_ ? stream_ : 0);
56+
int block_size = std::min(
57+
RuntimeUtils<Backend::kDeviceType>::GetOptimalBlockSize(), 1024);
58+
dim3 block(std::min(static_cast<Tensor::Size>(block_size), output_size_));
59+
dim3 grid(utils::CeilDiv(output_size_, block.x));
60+
61+
DispatchFunc<AllFloatTypes, List<128, 256, 512, 1024>>(
62+
{static_cast<int64_t>(out_type_), block_size},
63+
[&](auto list_tag) {
64+
using T = TypeMapType<Backend::kDeviceType, ListGet<0>(list_tag)>;
65+
constexpr int kBlockSize = ListGet<1>(list_tag);
66+
67+
SigmoidInfinilmKernel<Backend::kDeviceType, T, kBlockSize>
68+
<<<grid, block, 0, cuda_stream>>>(
69+
reinterpret_cast<T*>(out.data()),
70+
reinterpret_cast<const T*>(input.data()), d_out_shape_,
71+
d_input_shape_, d_out_strides_, d_input_strides_,
72+
output_size_, ndim_, is_out_contiguous_,
73+
is_input_contiguous_);
74+
},
75+
"CudaSigmoidInfinilm::operator()");
76+
}
77+
78+
private:
79+
std::byte* d_metadata_{nullptr};
80+
81+
Tensor::Size* d_input_shape_{nullptr};
82+
83+
Tensor::Size* d_out_shape_{nullptr};
84+
85+
Tensor::Stride* d_input_strides_{nullptr};
86+
87+
Tensor::Stride* d_out_strides_{nullptr};
88+
};
89+
90+
} // namespace infini::ops
91+
92+
#endif

tests/test_sigmoid_infinilm.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import infini.ops
2+
import pytest
3+
import torch
4+
5+
from tests.utils import Payload, empty_strided, get_stream, randn_strided
6+
7+
8+
@pytest.mark.auto_act_and_assert
9+
@pytest.mark.parametrize(
10+
"shape, input_strides, out_strides, inplace",
11+
(
12+
((13, 4), None, None, False),
13+
((13, 4), None, None, True),
14+
((13, 4), (10, 1), (10, 1), False),
15+
((13, 4), (0, 1), None, False),
16+
((13, 4, 4), None, None, False),
17+
((13, 4, 4), None, None, True),
18+
((13, 4, 4), (20, 4, 1), (20, 4, 1), False),
19+
((13, 4, 4), (4, 0, 1), None, False),
20+
((16, 5632), None, None, False),
21+
((16, 5632), None, None, True),
22+
((16, 5632), (13312, 1), (13312, 1), False),
23+
((4, 4, 5632), None, None, False),
24+
((4, 4, 5632), None, None, True),
25+
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), False),
26+
((4, 4, 56320), None, None, False),
27+
),
28+
)
29+
@pytest.mark.parametrize(
30+
("dtype", "rtol", "atol"),
31+
(
32+
(torch.float32, 1e-7, 1e-7),
33+
(torch.float16, 1e-3, 1e-3),
34+
(torch.bfloat16, 1e-2, 1e-2),
35+
),
36+
)
37+
def test_sigmoid_infinilm(
38+
shape, input_strides, out_strides, inplace, dtype, device, rtol, atol
39+
):
40+
input = randn_strided(shape, input_strides, dtype=dtype, device=device)
41+
out = (
42+
input
43+
if inplace
44+
else empty_strided(shape, out_strides, dtype=dtype, device=device)
45+
)
46+
47+
return Payload(
48+
_sigmoid_infinilm,
49+
_torch_sigmoid_infinilm,
50+
(input, out),
51+
{},
52+
rtol=rtol,
53+
atol=atol,
54+
)
55+
56+
57+
def _sigmoid_infinilm(input, out):
58+
infini.ops.sigmoid_infinilm(input, out, stream=get_stream(input.device))
59+
60+
return out
61+
62+
63+
def _torch_sigmoid_infinilm(input, out):
64+
torch.sigmoid(input, out=out)
65+
66+
return out

0 commit comments

Comments
 (0)