Skip to content

Commit ae2d77e

Browse files
author
zhangyue
committed
style: apply clang-format and fix code convention violations (round 1)
- C1: auto-format all C++ files with clang-format (25 files) - C4: lowercase assert messages, remove trailing periods (10 messages) - G4: backtick-fence identifiers in comments (causal_softmax) - P5: add blank lines before return statements (generate_wrappers.py)
1 parent 2c0bcf8 commit ae2d77e

26 files changed

Lines changed: 201 additions & 264 deletions

File tree

scripts/generate_wrappers.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -99,6 +99,7 @@ def _find_optional_tensor_params(op_name):
9999
source text.
100100
"""
101101
source = (_BASE_DIR / f"{op_name}.h").read_text()
102+
102103
return set(re.findall(r"std::optional<Tensor>\s+(\w+)", source))
103104

104105

@@ -109,6 +110,7 @@ def _find_vector_tensor_params(op_name):
109110
import re
110111

111112
source = (_BASE_DIR / f"{op_name}.h").read_text()
113+
112114
return set(re.findall(r"std::vector<Tensor>\s+(\w+)", source))
113115

114116

src/ascend/add_rms_norm/kernel.h

Lines changed: 8 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -7,8 +7,8 @@
77
#include "aclnn/aclnn_base.h"
88
#include "aclnn_add.h"
99
#include "aclnn_rms_norm.h"
10-
#include "ascend/common.h"
1110
#include "ascend/add_rms_norm/registry.h"
11+
#include "ascend/common.h"
1212
#include "ascend/workspace_pool_.h"
1313
#include "operator.h"
1414

@@ -63,10 +63,8 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 0> : public AddRmsNorm {
6363
&add_exec_);
6464
aclSetAclOpExecutorRepeatable(add_exec_);
6565
} else {
66-
aclSetInputTensorAddr(add_exec_, 0, t_x1,
67-
const_cast<void*>(x1.data()));
68-
aclSetInputTensorAddr(add_exec_, 1, t_x2,
69-
const_cast<void*>(x2.data()));
66+
aclSetInputTensorAddr(add_exec_, 0, t_x1, const_cast<void*>(x1.data()));
67+
aclSetInputTensorAddr(add_exec_, 1, t_x2, const_cast<void*>(x2.data()));
7068
aclSetOutputTensorAddr(add_exec_, 0, t_x_out, x_out.data());
7169
}
7270
auto& add_arena = ascend::workspacePool().ensure(stream, add_ws_);
@@ -78,18 +76,17 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 0> : public AddRmsNorm {
7876

7977
// Lazily create rstd tensor descriptor on first call.
8078
if (!rstd_tensor_) {
81-
rstd_tensor_ = aclCreateTensor(
82-
rstd_shape_.data(), 2, ACL_FLOAT,
83-
/*strides=*/nullptr, 0, ACL_FORMAT_ND, rstd_shape_.data(), 2,
84-
rstd_arena.buf);
79+
rstd_tensor_ = aclCreateTensor(rstd_shape_.data(), 2, ACL_FLOAT,
80+
/*strides=*/nullptr, 0, ACL_FORMAT_ND,
81+
rstd_shape_.data(), 2, rstd_arena.buf);
8582
} else {
8683
aclSetRawTensorAddr(rstd_tensor_, rstd_arena.buf);
8784
}
8885

8986
// Step 2: y_out = rms_norm(x_out, gamma, eps).
9087
if (!norm_exec_) {
91-
aclnnRmsNormGetWorkspaceSize(t_x_out, t_gamma, eps, t_y_out,
92-
rstd_tensor_, &norm_ws_, &norm_exec_);
88+
aclnnRmsNormGetWorkspaceSize(t_x_out, t_gamma, eps, t_y_out, rstd_tensor_,
89+
&norm_ws_, &norm_exec_);
9390
aclSetAclOpExecutorRepeatable(norm_exec_);
9491
} else {
9592
aclSetInputTensorAddr(norm_exec_, 0, t_x_out, x_out.data());

src/ascend/add_rms_norm/kernel_custom.h

Lines changed: 21 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -10,22 +10,22 @@
1010
#include "acl/acl.h"
1111
#include "aclnn/aclnn_base.h"
1212
#include "aclnnop/aclnn_cast.h"
13-
#include "ascend/common.h"
1413
#include "ascend/add_rms_norm/registry.h"
14+
#include "ascend/common.h"
1515
#include "ascend/workspace_pool_.h"
1616
#include "base/add_rms_norm.h"
1717
#include "operator.h"
1818

1919
// Forward-declare the generated AscendC kernel launch function.
2020
// This symbol is provided by the `no_workspace_kernel` static library
21-
// built from `ascend/custom_kernel/csrc/ops/add_rms_norm/op_kernel/add_rms_norm.cpp`
22-
// via `ascendc_library()`.
21+
// built from
22+
// `ascend/custom_kernel/csrc/ops/add_rms_norm/op_kernel/add_rms_norm.cpp` via
23+
// `ascendc_library()`.
2324
extern "C" uint32_t aclrtlaunch_add_rms_norm(
24-
uint32_t blockDim, void* stream,
25-
void* x1, void* x2, void* weight, void* y, void* x_out,
26-
int64_t totalRows, int64_t dimLength, int64_t dimLengthAlign,
27-
int64_t formerNum, int64_t formerLength, int64_t tailLength,
28-
float eps, int64_t dtypeSize);
25+
uint32_t blockDim, void* stream, void* x1, void* x2, void* weight, void* y,
26+
void* x_out, int64_t totalRows, int64_t dimLength, int64_t dimLengthAlign,
27+
int64_t formerNum, int64_t formerLength, int64_t tailLength, float eps,
28+
int64_t dtypeSize);
2929

3030
namespace infini::ops {
3131

@@ -62,8 +62,8 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 2> : public AddRmsNorm {
6262
assert(static_cast<int64_t>(dim_) == dim_length_align_ &&
6363
"Custom AddRmsNorm kernel requires 32-byte aligned last dimension");
6464

65-
total_rows_ = static_cast<int64_t>(batch_size_) *
66-
static_cast<int64_t>(nhead_);
65+
total_rows_ =
66+
static_cast<int64_t>(batch_size_) * static_cast<int64_t>(nhead_);
6767

6868
// For fp16 input, weight needs fp32 conversion because the custom
6969
// kernel always reads weight as fp32.
@@ -72,16 +72,15 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 2> : public AddRmsNorm {
7272
if (needs_weight_cast_) {
7373
// Allocate persistent fp32 weight buffer on device.
7474
size_t fp32_bytes = static_cast<size_t>(dim_) * sizeof(float);
75-
aclrtMalloc(&weight_fp32_data_, fp32_bytes,
76-
ACL_MEM_MALLOC_NORMAL_ONLY);
75+
aclrtMalloc(&weight_fp32_data_, fp32_bytes, ACL_MEM_MALLOC_NORMAL_ONLY);
7776

7877
// AclTensorCache for the cast source (fp16 weight descriptor).
79-
weight_src_cache_ = ascend::AclTensorCache(
80-
{static_cast<int64_t>(dim_)}, ACL_FLOAT16, nullptr);
78+
weight_src_cache_ = ascend::AclTensorCache({static_cast<int64_t>(dim_)},
79+
ACL_FLOAT16, nullptr);
8180

8281
// AclTensorCache for the cast destination (fp32 weight buffer).
83-
weight_dst_cache_ = ascend::AclTensorCache(
84-
{static_cast<int64_t>(dim_)}, ACL_FLOAT, weight_fp32_data_);
82+
weight_dst_cache_ = ascend::AclTensorCache({static_cast<int64_t>(dim_)},
83+
ACL_FLOAT, weight_fp32_data_);
8584
}
8685
}
8786

@@ -105,8 +104,7 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 2> : public AddRmsNorm {
105104
const void* cur_weight = gamma.data();
106105

107106
if (cur_weight != last_weight_ptr_) {
108-
auto t_src =
109-
weight_src_cache_.get(const_cast<void*>(cur_weight));
107+
auto t_src = weight_src_cache_.get(const_cast<void*>(cur_weight));
110108
auto t_dst = weight_dst_cache_.get(weight_fp32_data_);
111109

112110
if (!cast_exec_) {
@@ -133,25 +131,17 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 2> : public AddRmsNorm {
133131
// Block-level tiling: distribute rows across cores.
134132
static constexpr int64_t kMaxBlockDim = 40;
135133
int64_t used_cores = std::min(total_rows_, kMaxBlockDim);
136-
int64_t former_length =
137-
(total_rows_ + used_cores - 1) / used_cores;
134+
int64_t former_length = (total_rows_ + used_cores - 1) / used_cores;
138135
int64_t tail_length = former_length - 1;
139136
int64_t former_num = total_rows_ - tail_length * used_cores;
140137
uint32_t block_dim = static_cast<uint32_t>(used_cores);
141138

142139
// Launch custom AscendC kernel.
143140
aclrtlaunch_add_rms_norm(
144-
block_dim, stream,
145-
const_cast<void*>(x1.data()),
146-
const_cast<void*>(x2.data()),
147-
weight_fp32,
148-
y_out.data(),
149-
x_out.data(),
150-
total_rows_,
151-
static_cast<int64_t>(dim_),
152-
dim_length_align_,
153-
former_num, former_length, tail_length,
154-
eps, dtype_size_);
141+
block_dim, stream, const_cast<void*>(x1.data()),
142+
const_cast<void*>(x2.data()), weight_fp32, y_out.data(), x_out.data(),
143+
total_rows_, static_cast<int64_t>(dim_), dim_length_align_, former_num,
144+
former_length, tail_length, eps, dtype_size_);
155145
}
156146

157147
private:

src/ascend/add_rms_norm/kernel_fused.h

Lines changed: 5 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -76,16 +76,13 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 1> : public AddRmsNorm {
7676
auto stream = static_cast<aclrtStream>(stream_);
7777

7878
if (!executor_) {
79-
aclnnAddRmsNormGetWorkspaceSize(t_x1, t_x2, t_gamma,
80-
static_cast<double>(eps), t_y_out,
81-
rstd_tensor_, t_x_out, &ws_size_,
82-
&executor_);
79+
aclnnAddRmsNormGetWorkspaceSize(
80+
t_x1, t_x2, t_gamma, static_cast<double>(eps), t_y_out, rstd_tensor_,
81+
t_x_out, &ws_size_, &executor_);
8382
aclSetAclOpExecutorRepeatable(executor_);
8483
} else {
85-
aclSetInputTensorAddr(executor_, 0, t_x1,
86-
const_cast<void*>(x1.data()));
87-
aclSetInputTensorAddr(executor_, 1, t_x2,
88-
const_cast<void*>(x2.data()));
84+
aclSetInputTensorAddr(executor_, 0, t_x1, const_cast<void*>(x1.data()));
85+
aclSetInputTensorAddr(executor_, 1, t_x2, const_cast<void*>(x2.data()));
8986
aclSetInputTensorAddr(executor_, 2, t_gamma,
9087
const_cast<void*>(gamma.data()));
9188
aclSetOutputTensorAddr(executor_, 0, t_y_out, y_out.data());

src/ascend/atb_common_.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,10 +9,10 @@
99
#include <vector>
1010

1111
#include "acl/acl.h"
12+
#include "ascend/data_type_.h"
1213
#include "atb/context.h"
1314
#include "atb/operation.h"
1415
#include "atb/types.h"
15-
#include "ascend/data_type_.h"
1616
#include "tensor.h"
1717

1818
namespace infini::ops::ascend {

src/ascend/cat/kernel.h

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,8 @@
44
#include <vector>
55

66
#include "acl/acl.h"
7-
#include "aclnn/aclnn_base.h"
87
#include "aclnn/acl_meta.h"
8+
#include "aclnn/aclnn_base.h"
99
#include "aclnnop/aclnn_cat.h"
1010
#include "ascend/common.h"
1111
#include "ascend/workspace_pool_.h"
@@ -55,9 +55,9 @@ class Operator<Cat, Device::Type::kAscend> : public Cat {
5555
in_caches_[i].get(const_cast<void*>(inputs[i]->data()));
5656
}
5757

58-
tensor_list_ = aclCreateTensorList(
59-
const_cast<const aclTensor**>(acl_tensors.data()),
60-
static_cast<uint64_t>(input_count_));
58+
tensor_list_ =
59+
aclCreateTensorList(const_cast<const aclTensor**>(acl_tensors.data()),
60+
static_cast<uint64_t>(input_count_));
6161

6262
aclnnCatGetWorkspaceSize(tensor_list_, dim_, t_out, &ws_size_,
6363
&executor_);

src/ascend/causal_softmax/kernel.h

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -18,20 +18,18 @@
1818
namespace infini::ops {
1919

2020
// Implements causal softmax via three ACLNN calls:
21-
// 1. InplaceCopy(temp, input) — stride-aware copy to contiguous temp
21+
// 1. `InplaceCopy(temp, input)` — stride-aware copy to contiguous temp
2222
// buffer.
23-
// 2. InplaceMaskedFillScalar(temp, mask, -inf) — apply upper-triangle mask.
24-
// 3. Softmax(temp, dim=-1, out) — softmax over the last dimension.
23+
// 2. `InplaceMaskedFillScalar(temp, mask, -inf)` — apply upper-triangle mask.
24+
// 3. `Softmax(temp, dim=-1, out)` — softmax over the last dimension.
2525
//
2626
// The boolean causal mask is pre-computed and uploaded to device once in the
2727
// constructor. Its shape (seq_len, total_seq_len) broadcasts over the batch.
2828
template <>
2929
class Operator<CausalSoftmax, Device::Type::kAscend> : public CausalSoftmax {
3030
public:
3131
Operator(const Tensor input, Tensor out)
32-
: CausalSoftmax(input, out),
33-
in_cache_(input),
34-
out_cache_(out) {
32+
: CausalSoftmax(input, out), in_cache_(input), out_cache_(out) {
3533
// Compute temp buffer size — allocated lazily from pool in `operator()`.
3634
size_t n_elems = input.numel();
3735
size_t elem_bytes = kDataTypeToSize.at(dtype_);

src/ascend/common.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -73,8 +73,8 @@ class AclTensorCache {
7373
public:
7474
AclTensorCache() = default;
7575

76-
// Construct from explicit metadata (for device buffers not wrapped in Tensor).
77-
// Computes contiguous strides from shape.
76+
// Construct from explicit metadata (for device buffers not wrapped in
77+
// Tensor). Computes contiguous strides from shape.
7878
AclTensorCache(std::vector<int64_t> shape, aclDataType dtype, void* data)
7979
: shape_(std::move(shape)), dtype_(dtype) {
8080
strides_.resize(shape_.size());

src/ascend/flash_attention/kernel.h

Lines changed: 10 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -34,9 +34,8 @@ inline aclIntArray* extractSeqLengths(const Tensor& cu_seqlens,
3434
cu_host_ptr = static_cast<const int64_t*>(cu_seqlens.data());
3535
} else {
3636
cu_host_buf.resize(n);
37-
aclrtMemcpyAsync(cu_host_buf.data(), n * sizeof(int64_t),
38-
cu_seqlens.data(), n * sizeof(int64_t),
39-
ACL_MEMCPY_DEVICE_TO_HOST, stream);
37+
aclrtMemcpyAsync(cu_host_buf.data(), n * sizeof(int64_t), cu_seqlens.data(),
38+
n * sizeof(int64_t), ACL_MEMCPY_DEVICE_TO_HOST, stream);
4039
aclrtSynchronizeStream(stream);
4140
cu_host_ptr = cu_host_buf.data();
4241
}
@@ -67,9 +66,8 @@ inline aclIntArray* cumSeqLengths(const Tensor& cu_seqlens,
6766
cu_host_ptr = static_cast<const int64_t*>(cu_seqlens.data());
6867
} else {
6968
cu_host_buf.resize(n);
70-
aclrtMemcpyAsync(cu_host_buf.data(), n * sizeof(int64_t),
71-
cu_seqlens.data(), n * sizeof(int64_t),
72-
ACL_MEMCPY_DEVICE_TO_HOST, stream);
69+
aclrtMemcpyAsync(cu_host_buf.data(), n * sizeof(int64_t), cu_seqlens.data(),
70+
n * sizeof(int64_t), ACL_MEMCPY_DEVICE_TO_HOST, stream);
7371
aclrtSynchronizeStream(stream);
7472
cu_host_ptr = cu_host_buf.data();
7573
}
@@ -141,10 +139,10 @@ class Operator<FlashAttention, Device::Type::kAscend> : public FlashAttention {
141139
const int64_t D = query.size(2);
142140
const int64_t B = query.size(0);
143141

144-
decode_q_cache_ = ascend::AclTensorCache(
145-
{B, N, 1, D}, acl_dt, const_cast<void*>(query.data()));
146-
decode_out_cache_ = ascend::AclTensorCache(
147-
{B, N, 1, D}, acl_dt, output.data());
142+
decode_q_cache_ = ascend::AclTensorCache({B, N, 1, D}, acl_dt,
143+
const_cast<void*>(query.data()));
144+
decode_out_cache_ =
145+
ascend::AclTensorCache({B, N, 1, D}, acl_dt, output.data());
148146
block_table_cache_ = ascend::AclTensorCache(block_table.value());
149147

150148
// Pre-compute KV reshape metadata.
@@ -224,8 +222,8 @@ class Operator<FlashAttention, Device::Type::kAscend> : public FlashAttention {
224222
t_q, key_list, val_list,
225223
nullptr, // pseShift
226224
causal_mask_, // attenMask (pre-computed, or nullptr)
227-
seq_q, // actualSeqLengths
228-
seq_kv, // actualSeqLengthsKv
225+
seq_q, // actualSeqLengths
226+
seq_kv, // actualSeqLengthsKv
229227
nullptr, nullptr, nullptr, nullptr,
230228
nullptr, // deqScale1..quantOffset2
231229
nullptr, nullptr, // antiquantScale, antiquantOffset

src/ascend/linear/kernel.h

Lines changed: 6 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -60,10 +60,8 @@ class Operator<Linear, Device::Type::kAscend> : public Linear {
6060
} else {
6161
aclSetInputTensorAddr(executor_, 0, t_bias,
6262
const_cast<void*>(bias->data()));
63-
aclSetInputTensorAddr(executor_, 1, t_a,
64-
const_cast<void*>(a.data()));
65-
aclSetInputTensorAddr(executor_, 2, t_b,
66-
const_cast<void*>(b.data()));
63+
aclSetInputTensorAddr(executor_, 1, t_a, const_cast<void*>(a.data()));
64+
aclSetInputTensorAddr(executor_, 2, t_b, const_cast<void*>(b.data()));
6765
aclSetOutputTensorAddr(executor_, 0, t_out, out.data());
6866
}
6967

@@ -77,14 +75,12 @@ class Operator<Linear, Device::Type::kAscend> : public Linear {
7775
} else {
7876
if (!executor_) {
7977
int8_t cube_math_type = 1;
80-
aclnnMatmulGetWorkspaceSize(t_a, t_b, t_out, cube_math_type,
81-
&ws_size_, &executor_);
78+
aclnnMatmulGetWorkspaceSize(t_a, t_b, t_out, cube_math_type, &ws_size_,
79+
&executor_);
8280
aclSetAclOpExecutorRepeatable(executor_);
8381
} else {
84-
aclSetInputTensorAddr(executor_, 0, t_a,
85-
const_cast<void*>(a.data()));
86-
aclSetInputTensorAddr(executor_, 1, t_b,
87-
const_cast<void*>(b.data()));
82+
aclSetInputTensorAddr(executor_, 0, t_a, const_cast<void*>(a.data()));
83+
aclSetInputTensorAddr(executor_, 1, t_b, const_cast<void*>(b.data()));
8884
aclSetOutputTensorAddr(executor_, 0, t_out, out.data());
8985
}
9086

0 commit comments

Comments (0)