Skip to content

Commit 6effbfb

Browse files
Susskind115wooway777
authored and committed
Add quickgelu/gelutanh ops needed by KV compression
1 parent fa2a580 commit 6effbfb

File tree

22 files changed

+886
-14
lines changed

22 files changed

+886
-14
lines changed

include/infiniop.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
#include "infiniop/ops/fmin.h"
4848
#include "infiniop/ops/fmod.h"
4949
#include "infiniop/ops/gelu.h"
50+
#include "infiniop/ops/gelutanh.h"
5051
#include "infiniop/ops/gemm.h"
5152
#include "infiniop/ops/hardswish.h"
5253
#include "infiniop/ops/hardtanh.h"
@@ -84,6 +85,7 @@
8485
#include "infiniop/ops/pixel_shuffle.h"
8586
#include "infiniop/ops/quant/per_channel_quant_int8.h"
8687
#include "infiniop/ops/quant/per_tensor_quant_int8.h"
88+
#include "infiniop/ops/quickgelu.h"
8789
#include "infiniop/ops/random_sample.h"
8890
#include "infiniop/ops/rearrange.h"
8991
#include "infiniop/ops/reciprocal.h"

include/infiniop/ops/gelutanh.h

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#ifndef __INFINIOP_GELUTANH_API_H__
#define __INFINIOP_GELUTANH_API_H__

#include "../operator_descriptor.h"

/* Opaque handle to a GELU-Tanh operator instance. */
typedef struct InfiniopDescriptor *infiniopGeluTanhDescriptor_t;

/**
 * Create GELU-Tanh descriptor
 *
 * y = x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
 *
 * @param handle   device handle the operator is created for
 * @param desc_ptr receives the new descriptor on success
 * @param y        output tensor descriptor
 * @param x        input tensor descriptor
 */
__INFINI_C __export infiniStatus_t infiniopCreateGeluTanhDescriptor(
    infiniopHandle_t handle,
    infiniopGeluTanhDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y,
    infiniopTensorDescriptor_t x);

/**
 * Query workspace size
 *
 * @param desc descriptor created by infiniopCreateGeluTanhDescriptor
 * @param size receives the workspace byte count required by infiniopGeluTanh
 */
__INFINI_C __export infiniStatus_t infiniopGetGeluTanhWorkspaceSize(
    infiniopGeluTanhDescriptor_t desc,
    size_t *size);

/**
 * Launch GELU-Tanh operator
 *
 * @param desc           operator descriptor
 * @param workspace      scratch buffer of at least the queried size
 * @param workspace_size size of `workspace` in bytes
 * @param y              output tensor data
 * @param x              input tensor data
 * @param stream         backend stream/queue to enqueue on
 */
__INFINI_C __export infiniStatus_t infiniopGeluTanh(
    infiniopGeluTanhDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream);

/**
 * Destroy descriptor
 */
__INFINI_C __export infiniStatus_t infiniopDestroyGeluTanhDescriptor(
    infiniopGeluTanhDescriptor_t desc);

#endif

include/infiniop/ops/quickgelu.h

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#ifndef __INFINIOP_QUICKGELU_API_H__
#define __INFINIOP_QUICKGELU_API_H__

#include "../operator_descriptor.h"

/* Opaque handle to a QuickGELU operator instance. */
typedef struct InfiniopDescriptor *infiniopQuickGeluDescriptor_t;

/**
 * Create QuickGELU descriptor
 *
 * y = x * sigmoid(1.702 * x)
 *
 * @param handle   device handle the operator is created for
 * @param desc_ptr receives the new descriptor on success
 * @param y        output tensor descriptor
 * @param x        input tensor descriptor
 */
__INFINI_C __export infiniStatus_t infiniopCreateQuickGeluDescriptor(
    infiniopHandle_t handle,
    infiniopQuickGeluDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y,
    infiniopTensorDescriptor_t x);

/**
 * Query workspace size
 *
 * @param desc descriptor created by infiniopCreateQuickGeluDescriptor
 * @param size receives the workspace byte count required by infiniopQuickGelu
 */
__INFINI_C __export infiniStatus_t infiniopGetQuickGeluWorkspaceSize(
    infiniopQuickGeluDescriptor_t desc,
    size_t *size);

/**
 * Launch QuickGELU operator
 *
 * @param desc           operator descriptor
 * @param workspace      scratch buffer of at least the queried size
 * @param workspace_size size of `workspace` in bytes
 * @param y              output tensor data
 * @param x              input tensor data
 * @param stream         backend stream/queue to enqueue on
 */
__INFINI_C __export infiniStatus_t infiniopQuickGelu(
    infiniopQuickGeluDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream);

/**
 * Destroy descriptor
 */
__INFINI_C __export infiniStatus_t infiniopDestroyQuickGeluDescriptor(
    infiniopQuickGeluDescriptor_t desc);

#endif

src/infiniop/ops/conv/operator.cc

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
#ifdef ENABLE_CPU_API
66
#include "cpu/conv_cpu.h"
77
#endif
8-
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
8+
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API) || defined(ENABLE_HYGON_API)
99
#include "nvidia/conv_nvidia.cuh"
1010
#endif
1111

@@ -48,6 +48,9 @@ __INFINI_C __export infiniStatus_t infiniopCreateConvDescriptor(infiniopHandle_t
4848
#ifdef ENABLE_ALI_API
4949
CREATE(INFINI_DEVICE_ALI, nvidia);
5050
#endif
51+
#ifdef ENABLE_HYGON_API
52+
CREATE(INFINI_DEVICE_HYGON, nvidia);
53+
#endif
5154

5255
default:
5356
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -82,6 +85,9 @@ infiniopGetConvWorkspaceSize(
8285
#ifdef ENABLE_ALI_API
8386
GET(INFINI_DEVICE_ALI, nvidia);
8487
#endif
88+
#ifdef ENABLE_HYGON_API
89+
GET(INFINI_DEVICE_HYGON, nvidia);
90+
#endif
8591

8692
default:
8793
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -124,6 +130,9 @@ __INFINI_C infiniStatus_t infiniopConv(
124130
#ifdef ENABLE_ALI_API
125131
CALCULATE(INFINI_DEVICE_ALI, nvidia);
126132
#endif
133+
#ifdef ENABLE_HYGON_API
134+
CALCULATE(INFINI_DEVICE_HYGON, nvidia);
135+
#endif
127136

128137
default:
129138
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -154,6 +163,9 @@ infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc) {
154163
#ifdef ENABLE_ALI_API
155164
DELETE(INFINI_DEVICE_ALI, nvidia);
156165
#endif
166+
#ifdef ENABLE_HYGON_API
167+
DELETE(INFINI_DEVICE_HYGON, nvidia);
168+
#endif
157169

158170
default:
159171
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

src/infiniop/ops/gelu/operator.cc

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
#ifdef ENABLE_CPU_API
66
#include "cpu/gelu_cpu.h"
77
#endif
8-
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
8+
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API) || defined(ENABLE_HYGON_API)
99
#include "nvidia/gelu_nvidia.cuh"
1010
#endif
1111
#ifdef ENABLE_METAX_API
@@ -43,6 +43,9 @@ __INFINI_C infiniStatus_t infiniopCreateGeluDescriptor(
4343
#ifdef ENABLE_QY_API
4444
CREATE(INFINI_DEVICE_QY, nvidia);
4545
#endif
46+
#ifdef ENABLE_HYGON_API
47+
CREATE(INFINI_DEVICE_HYGON, nvidia);
48+
#endif
4649
#ifdef ENABLE_METAX_API
4750
CREATE(INFINI_DEVICE_METAX, metax);
4851
#endif
@@ -80,6 +83,9 @@ __INFINI_C infiniStatus_t infiniopGetGeluWorkspaceSize(infiniopGeluDescriptor_t
8083
#ifdef ENABLE_QY_API
8184
GET(INFINI_DEVICE_QY, nvidia);
8285
#endif
86+
#ifdef ENABLE_HYGON_API
87+
GET(INFINI_DEVICE_HYGON, nvidia);
88+
#endif
8389
#ifdef ENABLE_METAX_API
8490
GET(INFINI_DEVICE_METAX, metax);
8591
#endif
@@ -125,6 +131,9 @@ __INFINI_C infiniStatus_t infiniopGelu(
125131
#ifdef ENABLE_QY_API
126132
CALCULATE(INFINI_DEVICE_QY, nvidia);
127133
#endif
134+
#ifdef ENABLE_HYGON_API
135+
CALCULATE(INFINI_DEVICE_HYGON, nvidia);
136+
#endif
128137
#ifdef ENABLE_METAX_API
129138
CALCULATE(INFINI_DEVICE_METAX, metax);
130139
#endif
@@ -164,6 +173,9 @@ infiniopDestroyGeluDescriptor(infiniopGeluDescriptor_t desc) {
164173
#ifdef ENABLE_QY_API
165174
DELETE(INFINI_DEVICE_QY, nvidia);
166175
#endif
176+
#ifdef ENABLE_HYGON_API
177+
DELETE(INFINI_DEVICE_HYGON, nvidia);
178+
#endif
167179
#ifdef ENABLE_METAX_API
168180
DELETE(INFINI_DEVICE_METAX, metax);
169181
#endif
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#include "gelutanh_cpu.h"

namespace op::gelutanh::cpu {

Descriptor::~Descriptor() = default;

// Validate the (output, input) tensor pair and build a CPU elementwise
// descriptor for the GELU-tanh operator.
//
// Accepted dtypes: F16, F32, F64, BF16. The single input x must have exactly
// the same shape as the output y (no broadcasting is checked in here).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    // Unary op: the only input is x.
    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, x_shape);

    // NOTE(review): the macro presumably constructs the descriptor into
    // *desc_ptr using the surrounding `handle`/`dtype` locals and returns
    // early on failure — confirm against elementwise_cpu.h.
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);

    return INFINI_STATUS_SUCCESS;
}

// Apply GELU-tanh elementwise to inputs[0], writing into `output`, dispatched
// on the dtype captured at create() time.
//
// The CPU path uses no scratch memory, so workspace/workspace_size are
// deliberately ignored; `stream` is forwarded to the elementwise engine.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {

    (void)workspace;
    (void)workspace_size;

    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<GeluTanhOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<GeluTanhOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<GeluTanhOp, double>(_info, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<GeluTanhOp, bf16_t>(_info, output, inputs, stream);
    default:
        // Should be unreachable: create() already rejected other dtypes.
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}

} // namespace op::gelutanh::cpu
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#ifndef __GELUTANH_CPU_H__
2+
#define __GELUTANH_CPU_H__
3+
4+
#include "../../../elementwise/cpu/elementwise_cpu.h"
5+
6+
#include <cmath>
7+
8+
ELEMENTWISE_DESCRIPTOR(gelutanh, cpu)
9+
10+
namespace op::gelutanh::cpu {
// Elementwise GELU, tanh approximation:
//   y = x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
//
// The math is evaluated in double and cast back to T. Doing the arithmetic
// directly in T (as a naive implementation would) has two problems:
//   * `constexpr T alpha = static_cast<T>(...)` does not compile when T is a
//     non-literal class type such as fp16_t / bf16_t;
//   * evaluating the cubic and tanh in half precision double-rounds every
//     step and truncates sqrt(2/pi) to a handful of significant digits.
// (Assumes fp16_t / bf16_t convert to/from double, as their use with
// static_cast and std::tanh elsewhere in this file already requires.)
typedef struct GeluTanhOp {
public:
    // Unary operator: consumes exactly one input tensor.
    static constexpr size_t num_inputs = 1;

    template <typename T>
    T operator()(const T &x) const {
        constexpr double alpha = 0.7978845608028653558; // sqrt(2/pi), full f64 precision
        constexpr double beta = 0.044715;
        const double xd = static_cast<double>(x);
        const double inner = alpha * (xd + beta * xd * xd * xd);
        return static_cast<T>(xd * 0.5 * (1.0 + std::tanh(inner)));
    }
} GeluTanhOp;
} // namespace op::gelutanh::cpu
25+
26+
#endif // __GELUTANH_CPU_H__
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
#ifndef __GELUTANH_CUDA_H__
2+
#define __GELUTANH_CUDA_H__
3+
4+
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
5+
#include <cmath>
6+
#include <cuda_bf16.h>
7+
#include <cuda_fp16.h>
8+
9+
namespace op::gelutanh::cuda {
10+
11+
typedef struct GeluTanhOp {
12+
public:
13+
static constexpr size_t num_inputs = 1;
14+
15+
// GELU-Tanh constants
16+
// static constexpr float alpha = std::sqrt(2.0 / M_PI);
17+
// static constexpr float beta = 0.044715f;
18+
static constexpr float alpha = 0.7978845608f; // sqrt(2/pi)
19+
static constexpr float beta = 0.044715f;
20+
// f32 tanh helper
21+
__device__ __forceinline__ float tanh_f32_func(float x) const {
22+
return tanhf(x);
23+
}
24+
25+
template <typename T>
26+
__device__ __forceinline__ T operator()(const T &x) const {
27+
if constexpr (std::is_same_v<T, half2>) {
28+
// half2 -> float2
29+
float2 vf = __half22float2(x);
30+
float inner_x0 = alpha * (vf.x + beta * vf.x * vf.x * vf.x);
31+
float inner_x1 = alpha * (vf.y + beta * vf.y * vf.y * vf.y);
32+
float2 vr = make_float2(tanh_f32_func(inner_x0) * 0.5f + 0.5f,
33+
tanh_f32_func(inner_x1) * 0.5f + 0.5f);
34+
return __hmul2(x, __float22half2_rn(vr)); // y = x * 0.5 * (1 + tanh(...))
35+
} else if constexpr (std::is_same_v<T, half>) {
36+
float xf = __half2float(x);
37+
float inner = alpha * (xf + beta * xf * xf * xf);
38+
float yf = xf * 0.5f * (1.0f + tanh_f32_func(inner));
39+
return __float2half_rn(yf);
40+
} else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
41+
float xf = __bfloat162float(x);
42+
float inner = alpha * (xf + beta * xf * xf * xf);
43+
float yf = xf * 0.5f * (1.0f + tanh_f32_func(inner));
44+
return __float2bfloat16(yf);
45+
} else if constexpr (std::is_same_v<T, float>) {
46+
float inner = alpha * (x + beta * x * x * x);
47+
return x * 0.5f * (1.0f + tanh_f32_func(inner));
48+
} else { // double
49+
double inner = alpha * (x + beta * x * x * x);
50+
return x * 0.5 * (1.0 + std::tanh(inner));
51+
}
52+
}
53+
54+
} GeluTanhOp;
55+
56+
} // namespace op::gelutanh::cuda
57+
58+
#endif // __GELUTANH_CUDA_H__

0 commit comments

Comments
 (0)