Skip to content

Commit 255ba00

Browse files
author
zhangyue
committed
style: fix code convention violations (round 2)
- C4: lowercase assert message starts (workspace_pool_, rms_norm, rotary_embedding)
- C4: remove trailing period from workspace_pool_ assert
- C9: add blank line between SlotKey struct members
- G4: backtick-fence identifiers in comments across 12 files
- G4: backtick-fence identifiers in assert messages (flash_attention, rotary_embedding)
- P1: remove duplicate `import re` in generate_wrappers.py
- P4: add blank lines around control flow in test_flash_attention.py
1 parent ae2d77e commit 255ba00

20 files changed

Lines changed: 52 additions & 45 deletions

scripts/generate_wrappers.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,8 +107,6 @@ def _find_vector_tensor_params(op_name):
107107
"""Return a set of parameter names declared as `std::vector<Tensor>` in
108108
the base header.
109109
"""
110-
import re
111-
112110
source = (_BASE_DIR / f"{op_name}.h").read_text()
113111

114112
return set(re.findall(r"std::vector<Tensor>\s+(\w+)", source))

src/ascend/add/kernel.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ class Operator<Add, Device::Type::kAscend> : public Add {
2020
in_cache_(input),
2121
oth_cache_(other),
2222
out_cache_(out) {
23-
// aclCreateScalar stores the pointer rather than copying the value, so
24-
// alpha_storage_* must remain alive for the lifetime of alpha_.
23+
// `aclCreateScalar` stores the pointer rather than copying the value, so
24+
// `alpha_storage_*` must remain alive for the lifetime of `alpha_`.
2525
// The alpha scalar type must match the tensor dtype: use int64 for integer
2626
// dtypes and float for floating-point dtypes.
2727
if (ascend::isIntegerDtype(input.dtype())) {
@@ -71,8 +71,9 @@ class Operator<Add, Device::Type::kAscend> : public Add {
7171
mutable uint64_t ws_size_ = 0;
7272

7373
float alpha_float_storage_ =
74-
1.0f; // stable address for aclCreateScalar (float)
75-
int64_t alpha_int_storage_ = 1; // stable address for aclCreateScalar (int)
74+
1.0f; // Stable address for `aclCreateScalar` (float).
75+
int64_t alpha_int_storage_ =
76+
1; // Stable address for `aclCreateScalar` (int).
7677
aclScalar* alpha_ = nullptr;
7778
};
7879

src/ascend/add_rms_norm/kernel.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@
1414

1515
namespace infini::ops {
1616

17-
// Decomposed implementation: aclnnAdd + aclnnRmsNorm.
17+
// Decomposed implementation: `aclnnAdd` + `aclnnRmsNorm`.
1818
//
19-
// The fused aclnnAddRmsNorm API has ~200 us host-side launch overhead that
19+
// The fused `aclnnAddRmsNorm` API has ~200 us host-side launch overhead that
2020
// dominates small-tensor dispatch. Decomposing into two fast ACLNN calls
2121
// reduces host dispatch from ~224 us to ~56 us (4x faster) with negligible
2222
// NPU-side impact for inference tensor sizes.
@@ -31,10 +31,10 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 0> : public AddRmsNorm {
3131
gamma_cache_(gamma),
3232
y_out_cache_(y_out),
3333
x_out_cache_(x_out) {
34-
// Alpha scalar for aclnnAdd (x_out = x1 + 1.0 * x2).
34+
// Alpha scalar for `aclnnAdd` (x_out = x1 + 1.0 * x2).
3535
alpha_ = aclCreateScalar(&alpha_storage_, ACL_FLOAT);
3636

37-
// aclnnRmsNorm writes rstd as a required side output.
37+
// `aclnnRmsNorm` writes `rstd` as a required side output.
3838
// Size computed here; buffer obtained from pool in `operator()`.
3939
rstd_shape_ = {static_cast<int64_t>(batch_size_),
4040
static_cast<int64_t>(nhead_)};

src/ascend/add_rms_norm/kernel_custom.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,9 @@ namespace infini::ops {
3333
//
3434
// A single-kernel implementation that computes x_out = x1 + x2 followed by
3535
// y = rms_norm(x_out, gamma, eps) in one launch, avoiding the decomposed
36-
// aclnnAdd + aclnnRmsNorm calls (index 0) or the fused aclnnAddRmsNorm call
37-
// (index 1). Migrated from the custom RmsNorm kernel (index 1 of RmsNorm).
36+
// `aclnnAdd` + `aclnnRmsNorm` calls (index 0) or the fused `aclnnAddRmsNorm`
37+
// call (index 1). Migrated from the custom RmsNorm kernel (index 1 of
38+
// RmsNorm).
3839
//
3940
// Select via `implementation_index=2` in Python:
4041
// infini.ops.add_rms_norm(x1, x2, gamma, eps, y_out, x_out,
@@ -59,8 +60,9 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 2> : public AddRmsNorm {
5960
dim_length_align_ =
6061
((static_cast<int64_t>(dim_) + align_elems - 1) / align_elems) *
6162
align_elems;
62-
assert(static_cast<int64_t>(dim_) == dim_length_align_ &&
63-
"Custom AddRmsNorm kernel requires 32-byte aligned last dimension");
63+
assert(
64+
static_cast<int64_t>(dim_) == dim_length_align_ &&
65+
"custom `AddRmsNorm` kernel requires 32-byte aligned last dimension");
6466

6567
total_rows_ =
6668
static_cast<int64_t>(batch_size_) * static_cast<int64_t>(nhead_);

src/ascend/add_rms_norm/kernel_fused.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,12 @@
1313

1414
namespace infini::ops {
1515

16-
// Fused implementation via aclnnAddRmsNorm (implementation index 1).
16+
// Fused implementation via `aclnnAddRmsNorm` (implementation index 1).
1717
//
1818
// Computes x_out = x1 + x2 and y_out = rms_norm(x_out, gamma, eps) in a
1919
// single CANN launch. The fused API has higher host-side launch overhead
20-
// (~200 us) compared to the decomposed aclnnAdd + aclnnRmsNorm path (~39 us),
21-
// but may offer better NPU-side efficiency for large tensors where kernel
20+
// (~200 us) compared to the decomposed `aclnnAdd` + `aclnnRmsNorm` path (~39
21+
// us), but may offer better NPU-side efficiency for large tensors where kernel
2222
// fusion reduces memory traffic.
2323
//
2424
// Select via `implementation_index=1` in Python:
@@ -34,7 +34,7 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 1> : public AddRmsNorm {
3434
gamma_cache_(gamma),
3535
y_out_cache_(y_out),
3636
x_out_cache_(x_out) {
37-
// aclnnAddRmsNorm requires rstdOut to have the same ndim as x1, with
37+
// `aclnnAddRmsNorm` requires `rstdOut` to have the same ndim as x1, with
3838
// the last gamma.ndim() dimensions set to 1. For example:
3939
// x1 shape(2, 32, 128), gamma shape(128) -> rstdOut shape(2, 32, 1)
4040
// x1 shape(64, 128), gamma shape(128) -> rstdOut shape(64, 1)

src/ascend/causal_softmax/kernel.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,11 @@ class Operator<CausalSoftmax, Device::Type::kAscend> : public CausalSoftmax {
6464
mstrides.data(), 0, ACL_FORMAT_ND,
6565
mshape.data(), mshape.size(), mask_buf_);
6666

67-
// Scalar -inf for the masked-fill step. aclCreateScalar stores the pointer
68-
// rather than copying, so neg_inf_storage_ must stay alive with the object.
67+
// Scalar -inf for the masked-fill step. `aclCreateScalar` stores the
68+
// pointer rather than copying, so `neg_inf_storage_` must stay alive with
69+
// the object.
6970
neg_inf_ = aclCreateScalar(&neg_inf_storage_, ACL_FLOAT);
70-
// Workspaces are allocated lazily on first operator() call.
71+
// Workspaces are allocated lazily on first `operator()` call.
7172
}
7273

7374
~Operator() {

src/ascend/reshape_and_cache/kernel.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,11 @@
1515

1616
namespace infini::ops {
1717

18-
// Device-side scatter via aclnnInplaceIndexCopy.
18+
// Device-side scatter via `aclnnInplaceIndexCopy`.
1919
//
2020
// The previous implementation copied slot_mapping D2H (aclrtSynchronizeStream),
2121
// then issued per-token D2D memcpy in a host loop. For batch=256, this meant
22-
// ~100 us sync + ~500 us host loop overhead. aclnnInplaceIndexCopy performs
22+
// ~100 us sync + ~500 us host loop overhead. `aclnnInplaceIndexCopy` performs
2323
// the scatter entirely on the NPU with two ACLNN calls (one for K, one for V),
2424
// eliminating all D2H synchronisation and host-side loops.
2525
//
@@ -72,7 +72,7 @@ class Operator<ReshapeAndCache, Device::Type::kAscend>
7272
auto t_slot = slot_cache_.get(const_cast<void*>(slot_mapping.data()));
7373

7474
// K cache scatter: kv_k[slot_mapping[i]] = key[i] along dim 0.
75-
// Executor caching is not used here because aclnnInplaceIndexCopy is an
75+
// Executor caching is not used here because `aclnnInplaceIndexCopy` is an
7676
// inplace operation where self is both input and output; the executor
7777
// reuse via aclSetInputTensorAddr does not update the output reference.
7878
uint64_t k_ws = 0;

src/ascend/rms_norm/kernel.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ class Operator<RmsNorm, Device::Type::kAscend> : public RmsNorm {
2222
in_cache_(input),
2323
weight_cache_(weight),
2424
out_cache_(out) {
25-
// aclnnRmsNorm writes rstd as a required side output.
25+
// `aclnnRmsNorm` writes `rstd` as a required side output.
2626
// Size computed here; buffer obtained from pool in `operator()`.
2727
rstd_shape_ = {static_cast<int64_t>(batch_size_),
2828
static_cast<int64_t>(nhead_)};

src/ascend/rms_norm/kernel_custom.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ class Operator<RmsNorm, Device::Type::kAscend, 1> : public RmsNorm {
5858
((static_cast<int64_t>(dim_) + align_elems - 1) / align_elems) *
5959
align_elems;
6060
assert(static_cast<int64_t>(dim_) == dim_length_align_ &&
61-
"Custom RmsNorm kernel requires 32-byte aligned last dimension");
61+
"custom `RmsNorm` kernel requires 32-byte aligned last dimension");
6262

6363
total_rows_ =
6464
static_cast<int64_t>(batch_size_) * static_cast<int64_t>(nhead_);

src/ascend/rotary_embedding/kernel.h

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
namespace infini::ops {
1919

20-
// Rotary position embedding via aclnnApplyRotaryPosEmbV2.
20+
// Rotary position embedding via `aclnnApplyRotaryPosEmbV2`.
2121
//
2222
// V2 handles Q and K simultaneously in a single inplace call (layout=4, TND).
2323
// The `rotaryMode` parameter accepts "half", "interleave", or "quarter", but
@@ -42,12 +42,13 @@ class Operator<RotaryEmbedding, Device::Type::kAscend>
4242
: RotaryEmbedding(positions, query, key, cos_sin_cache, head_size,
4343
rotary_dim, is_neox_style, query_out, key_out) {
4444
assert(rotary_dim == head_size &&
45-
"Ascend `RotaryEmbedding` requires rotary_dim == head_size "
45+
"ascend `RotaryEmbedding` requires `rotary_dim` == `head_size` "
4646
"(partial rotation not supported)");
4747
assert(is_neox_style &&
48-
"Ascend `RotaryEmbedding` requires neox style — "
49-
"aclnnApplyRotaryPosEmbV2 rotaryMode only supports \"half\"; "
50-
"\"interleave\" and \"quarter\" return ACLNN_ERR_PARAM_INVALID");
48+
"ascend `RotaryEmbedding` requires neox style — "
49+
"`aclnnApplyRotaryPosEmbV2` `rotaryMode` only supports "
50+
"\"half\"; \"interleave\" and \"quarter\" return "
51+
"`ACLNN_ERR_PARAM_INVALID`");
5152

5253
const int64_t max_seq_len = cos_sin_cache.size(0);
5354
const int64_t D = head_size_;
@@ -101,7 +102,7 @@ class Operator<RotaryEmbedding, Device::Type::kAscend>
101102
const int64_t Nkv = num_kv_heads_;
102103
aclDataType acl_dt = ascend::toAclDtype(query.dtype());
103104

104-
// Gathered cos/sin buffers [T, D] — filled by aclnnIndexSelect each call.
105+
// Gathered cos/sin buffers [T, D] — filled by `aclnnIndexSelect` each call.
105106
size_t gathered_bytes = static_cast<size_t>(T * D) * elem_sz;
106107
aclrtMalloc(&cos_dev_, gathered_bytes, ACL_MEM_MALLOC_NORMAL_ONLY);
107108
aclrtMalloc(&sin_dev_, gathered_bytes, ACL_MEM_MALLOC_NORMAL_ONLY);
@@ -147,7 +148,7 @@ class Operator<RotaryEmbedding, Device::Type::kAscend>
147148
const int64_t Nkv = key.size(1);
148149
const int64_t D = head_size;
149150

150-
// Step 1: Gather cos/sin by positions via aclnnIndexSelect (async).
151+
// Step 1: Gather cos/sin by positions via `aclnnIndexSelect` (async).
151152
{
152153
auto t_cos_table = cos_table_cache_.get(cos_table_dev_);
153154
auto t_sin_table = sin_table_cache_.get(sin_table_dev_);

0 commit comments

Comments (0)