
Commit f1ab525

Author: zhangyue

style: fix code convention violations (round 3)

- C4: lowercase "rope" in ATB assert messages
- G4: backtick-fence `VariantPack`, `rotaryCoeff`, `sparseMode`, `hostData`
- G4: backtick-fence identifiers in Python test comments
- P4: add blank line before `if` in test_rms_norm_precision.py
1 parent 255ba00 commit f1ab525

8 files changed: 19 additions & 18 deletions


src/ascend/custom_kernel/csrc/ops/rms_norm/test/test_rms_norm_precision.py

Lines changed: 1 addition & 0 deletions
@@ -68,6 +68,7 @@ def _compute_metrics(out, ref):
 
     ref_abs = ref.float().abs()
     nonzero = ref_abs > 1e-10
+
     if nonzero.any():
         rel_err = diff[nonzero] / ref_abs[nonzero]
         max_rel_err = rel_err.max().item()
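
For context on the blank line added here, a minimal sketch of the kind of max-abs / max-rel error helper this hunk touches, assuming `out` and `ref` are same-shaped tensors; the body below is illustrative, not the repository's exact `_compute_metrics`:

import torch

def _compute_metrics_sketch(out: torch.Tensor, ref: torch.Tensor):
    # Absolute error in float32 for a stable comparison.
    diff = (out.float() - ref.float()).abs()
    max_abs_err = diff.max().item()

    ref_abs = ref.float().abs()
    nonzero = ref_abs > 1e-10  # avoid dividing by (near-)zero reference values

    if nonzero.any():
        rel_err = diff[nonzero] / ref_abs[nonzero]
        max_rel_err = rel_err.max().item()
    else:
        max_rel_err = 0.0  # assumption: treat an all-zero reference as zero relative error

    return max_abs_err, max_rel_err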

src/ascend/custom_kernel/tests/test_add_rms_norm.py

Lines changed: 1 addition & 1 deletion
@@ -77,7 +77,7 @@ def test_add_rms_norm_correctness(dtype, shape):
         f"{(x_out_npu.cpu() - x_out_ref).abs().max().item()}"
     )
 
-    # Check y = rms_norm(x_out) * weight.
+    # Check `y = rms_norm(x_out) * weight`.
     rtol = 1e-3 if dtype == torch.float16 else 1e-5
     atol = 1e-3 if dtype == torch.float16 else 1e-5
     assert torch.allclose(y_npu.cpu(), y_ref, rtol=rtol, atol=atol), (
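
The check above compares against `y = rms_norm(x_out) * weight` with dtype-dependent tolerances. A minimal reference sketch, assuming `x_out = x1 + x2` and float32 accumulation; the function name and the `eps` default are illustrative, not the test's exact helpers:

import torch

def add_rms_norm_ref(x1, x2, weight, eps=1e-6):
    # Residual add first, then RMS norm of the sum, as the comment in the hunk describes.
    x_out = x1.float() + x2.float()
    variance = x_out.pow(2).mean(dim=-1, keepdim=True)
    y = x_out * torch.rsqrt(variance + eps) * weight.float()
    return y.to(x1.dtype), x_out.to(x1.dtype)

# Tolerances mirror the hunk: fp16 gets 1e-3, other dtypes 1e-5.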

src/ascend/custom_kernel/tests/test_rms_norm.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 import pytest
 import torch
 import torch_npu
-import ascend_kernel  # noqa: F401 Loads libascend_kernel.so into torch.ops.npu.
+import ascend_kernel  # noqa: F401 Loads `libascend_kernel.so` into `torch.ops.npu`.
 
 
 def rms_norm_ref(x: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
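
The hunk ends at the `rms_norm_ref` signature. A plausible body for such a reference, computing in float32 and casting back; this is a sketch of the usual formulation, not necessarily the file's exact implementation:

import torch

def rms_norm_ref(x: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
    # RMS norm: scale each row by 1 / sqrt(mean(x^2) + eps), then apply the learned weight.
    x_f = x.float()
    variance = x_f.pow(2).mean(dim=-1, keepdim=True)
    return (x_f * torch.rsqrt(variance + eps) * weight.float()).to(x.dtype)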

src/ascend/flash_attention/kernel.h

Lines changed: 1 addition & 1 deletion
@@ -77,7 +77,7 @@ inline aclIntArray* cumSeqLengths(const Tensor& cu_seqlens,
 }
 
 // Allocate a 2048x2048 lower-triangular UINT8 causal mask on device.
-// Required for sparseMode >= 2.
+// Required for `sparseMode` >= 2.
 inline aclTensor* makeCausalMask(void** mask_buf, aclrtStream stream) {
   constexpr int64_t kMaskDim = 2048;
   const int64_t mask_elems = kMaskDim * kMaskDim;
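
The comment describes a 2048x2048 lower-triangular UINT8 causal mask. A host-side sketch of that layout in PyTorch; whether 1 marks kept or masked positions depends on the attention API's convention, so the polarity below is an assumption:

import torch

k_mask_dim = 2048
# Lower-triangular uint8 pattern matching the comment; the C++ kernel would copy an
# equivalent buffer to device memory and wrap it as an aclTensor.
causal_mask = torch.tril(torch.ones(k_mask_dim, k_mask_dim, dtype=torch.uint8))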

src/ascend/paged_attention/kernel_atb.h

Lines changed: 4 additions & 4 deletions
@@ -34,7 +34,7 @@ namespace infini::ops {
 // synchronous D2H copies for these two small tensors in each call.
 // All other tensors are device-only.
 //
-// ATB VariantPack layout (BSND with S=1):
+// ATB `VariantPack` layout (BSND with S=1):
 //   inTensors[0] = query [B, N, D]
 //   inTensors[1] = key_cache [num_blocks, block_size, Nkv, D]
 //   inTensors[2] = value_cache [num_blocks, block_size, Nkv, D]
@@ -154,7 +154,7 @@ class Operator<PagedAttention, Device::Type::kAscend, 0>
   }
 
  private:
-  // Build the ATB VariantPack.
+  // Build the ATB `VariantPack`.
   //
   // Query and output are 3D [B, N, D] (BSND with S=1 for decode).
   // Block table and context lens carry both `deviceData` and
@@ -183,12 +183,12 @@ class Operator<PagedAttention, Device::Type::kAscend, 0>
     atb::Tensor t_value_cache = ascend::toAtbTensor(kv_cache_shape_, acl_dt_,
                                                     value_cache_data, kv_bytes);
 
-    // Block table [B, max_blocks] — with hostData for `aclIntArray*`.
+    // Block table [B, max_blocks] — with `hostData` for `aclIntArray*`.
     atb::Tensor t_block_table = ascend::toAtbTensor(
         block_table_shape_, bt_dt_, block_table_data, bt_host_bytes_);
     t_block_table.hostData = bt_host_;
 
-    // Context lens [B] — with hostData for `aclIntArray*`.
+    // Context lens [B] — with `hostData` for `aclIntArray*`.
     atb::Tensor t_context_lens = ascend::toAtbTensor(
         context_lens_shape_, sl_dt_, seq_lens_data, sl_host_bytes_);
     t_context_lens.hostData = sl_host_;
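
To make the `VariantPack` layout above concrete, a naive decode-time reference in PyTorch that gathers K/V through the block table and applies the per-sequence context length; the function name, the `scale` argument, and the `N == Nkv` simplification are assumptions for the sketch, not the kernel's API:

import torch

def paged_attention_decode_ref(query, key_cache, value_cache,
                               block_table, context_lens, scale):
    # query:        [B, N, D]    one decode token per sequence (S=1)
    # key_cache:    [num_blocks, block_size, Nkv, D]
    # value_cache:  [num_blocks, block_size, Nkv, D]
    # block_table:  [B, max_blocks]  logical block index -> physical block id
    # context_lens: [B]              valid tokens per sequence
    B, N, D = query.shape
    block_size = key_cache.shape[1]
    out = torch.empty_like(query)
    for b in range(B):
        ctx = int(context_lens[b])
        n_blocks = (ctx + block_size - 1) // block_size
        blocks = block_table[b, :n_blocks].long()
        k = key_cache[blocks].reshape(-1, key_cache.shape[2], D)[:ctx].float()
        v = value_cache[blocks].reshape(-1, value_cache.shape[2], D)[:ctx].float()
        scores = torch.einsum("nd,tnd->nt", query[b].float(), k) * scale
        probs = scores.softmax(dim=-1)
        out[b] = torch.einsum("nt,tnd->nd", probs, v).to(query.dtype)
    return out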

src/ascend/reshape_and_cache/kernel_atb.h

Lines changed: 4 additions & 4 deletions
@@ -29,7 +29,7 @@ namespace infini::ops {
 // `aclnnInplaceIndexCopy` path (index 0, ~35 us).
 //
 // The ATB operation is created once in the constructor. Setup is called
-// before each Execute to bind the VariantPack.
+// before each `Execute` to bind the `VariantPack`.
 //
 // NOTE: `ReshapeAndCacheParam` requires int32 `slot_mapping`. When the
 // caller passes int64 (the default in PyTorch / vLLM), this operator casts
@@ -57,7 +57,7 @@ class Operator<ReshapeAndCache, Device::Type::kAscend, 2>
     int64_t hs = static_cast<int64_t>(head_size_);
     int64_t T = static_cast<int64_t>(num_tokens_);
 
-    // Cache shapes for rebuilding VariantPack on each call.
+    // Cache shapes for rebuilding `VariantPack` on each call.
     kv_shape_ = {num_blocks, bs, nkv, hs};
     key_shape_ = {T, nkv, hs};
     slot_shape_ = {T};
@@ -134,7 +134,7 @@ class Operator<ReshapeAndCache, Device::Type::kAscend, 2>
                                 const_cast<void*>(value.data()),
                                 kv_cache_out.data(), slot32_ptr);
 
-    // Setup binds the VariantPack and computes workspace requirements.
+    // `Setup` binds the `VariantPack` and computes workspace requirements.
     uint64_t ws_size = 0;
     atb::Status s = op_->Setup(vp, ws_size, ctx);
     assert(s == atb::NO_ERROR &&
@@ -154,7 +154,7 @@ class Operator<ReshapeAndCache, Device::Type::kAscend, 2>
   }
 
  private:
-  // Build the ATB VariantPack for this operation.
+  // Build the ATB `VariantPack` for this operation.
   //
   // ATB `ReshapeAndCache` expects 5 inputs and 2 outputs:
   //   inTensors[0] = key [num_tokens, num_kv_heads, head_size]
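
As a rough picture of what the operation does, a PyTorch sketch that scatters per-token K/V into paged caches by flat slot index; the `block_id * block_size + offset` slot encoding is the usual vLLM convention and is assumed here, as are the function and argument names:

import torch

def reshape_and_cache_ref(key, value, key_cache, value_cache, slot_mapping):
    # key, value:   [num_tokens, num_kv_heads, head_size]
    # key_cache:    [num_blocks, block_size, num_kv_heads, head_size] (value_cache likewise)
    # slot_mapping: [num_tokens], flat slot = block_id * block_size + offset (assumed)
    block_size = key_cache.shape[1]
    slots = slot_mapping.long()   # reference math is dtype-agnostic; ATB itself wants int32
    blocks = slots // block_size
    offsets = slots % block_size
    key_cache[blocks, offsets] = key
    value_cache[blocks, offsets] = value
    return key_cache, value_cache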

src/ascend/rotary_embedding/kernel_atb.h

Lines changed: 6 additions & 6 deletions
@@ -44,7 +44,7 @@ namespace infini::ops {
 //
 // Restrictions:
 // - rotary_dim must equal head_size (full rotation only).
-// - is_neox_style must be true (rotaryCoeff=2).
+// - is_neox_style must be true (`rotaryCoeff`=2).
 // - fp16 only (ATB inference constraint).
 template <>
 class Operator<RotaryEmbedding, Device::Type::kAscend, 1>
@@ -74,7 +74,7 @@ class Operator<RotaryEmbedding, Device::Type::kAscend, 1>
     aclrtMemcpy(cache_host.data(), table_bytes, cos_sin_cache.data(),
                 table_bytes, ACL_MEMCPY_DEVICE_TO_HOST);
 
-    // ATB Rope with rotaryCoeff=2 expects cos/sin of shape [S, D].
+    // ATB Rope with `rotaryCoeff`=2 expects cos/sin of shape [S, D].
     // Neox-style expansion: [c0..c_{hD-1}, c0..c_{hD-1}].
     std::vector<uint8_t> cos_host(table_bytes);
     std::vector<uint8_t> sin_host(table_bytes);
@@ -208,7 +208,7 @@ class Operator<RotaryEmbedding, Device::Type::kAscend, 1>
                       ACL_MEMCPY_HOST_TO_DEVICE, stream);
     }
 
-    // Build ATB VariantPack with 5 inputs + 2 outputs.
+    // Build ATB `VariantPack` with 5 inputs + 2 outputs.
     atb::Context* ctx = ascend::getAtbContext(stream);
 
     uint64_t q_bytes = static_cast<uint64_t>(T * hiddenQ) * elem_size_;
@@ -233,7 +233,7 @@ class Operator<RotaryEmbedding, Device::Type::kAscend, 1>
     uint64_t ws_size = 0;
     atb::Status s = op_->Setup(vp, ws_size, ctx);
 
-    assert(s == atb::NO_ERROR && "ATB Rope setup failed");
+    assert(s == atb::NO_ERROR && "ATB rope setup failed");
 
     uint8_t* ws_ptr = nullptr;
 
@@ -244,7 +244,7 @@ class Operator<RotaryEmbedding, Device::Type::kAscend, 1>
 
     s = op_->Execute(vp, ws_ptr, ws_size, ctx);
 
-    assert(s == atb::NO_ERROR && "ATB Rope execute failed");
+    assert(s == atb::NO_ERROR && "ATB rope execute failed");
   }
 
  private:
@@ -260,7 +260,7 @@ class Operator<RotaryEmbedding, Device::Type::kAscend, 1>
 
   mutable size_t pos_buf_size_ = 0;
 
-  // Cached shapes for ATB VariantPack.
+  // Cached shapes for ATB `VariantPack`.
   std::vector<int64_t> q_2d_shape_;
 
   std::vector<int64_t> k_2d_shape_;
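
The cos/sin expansion noted above, [c0..c_{hD-1}, c0..c_{hD-1}], corresponds to neox-style rotation. A per-token sketch of that math in PyTorch; the actual kernel works on flattened [T, hidden] fp16 tensors, and the names below are illustrative:

import torch

def neox_rope_ref(q, cos_half, sin_half):
    # q:        [num_heads, head_size]
    # cos_half: [head_size // 2] cos values for this position (sin_half likewise)
    cos = torch.cat([cos_half, cos_half])   # the [c0..c_{hD-1}, c0..c_{hD-1}] expansion
    sin = torch.cat([sin_half, sin_half])
    half = q.shape[-1] // 2
    rotated = torch.cat([-q[..., half:], q[..., :half]], dim=-1)  # rotate_half
    return q * cos + rotated * sin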

tests/test_add_rms_norm.py

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ def _add_rms_norm(x1, x2, gamma, *, eps=1e-6, y_out=None, x_out=None,
         implementation_index=implementation_index,
     )
 
-    # Concatenate both outputs into a single flat tensor for allclose comparison.
+    # Concatenate both outputs into a single flat tensor for `allclose` comparison.
     return torch.cat([y_out.contiguous().flatten(), x_out.contiguous().flatten()])
 
 
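
The concatenation above lets a single `allclose` call cover both outputs. An illustrative comparison, where `y_ref` and `x_ref` stand in for reference tensors computed elsewhere in the test:

combined_npu = torch.cat([y_out.contiguous().flatten(), x_out.contiguous().flatten()])
combined_ref = torch.cat([y_ref.flatten(), x_ref.flatten()])  # assumed reference tensors
assert torch.allclose(combined_npu.cpu(), combined_ref, rtol=1e-3, atol=1e-3)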
