Skip to content

Commit 1654d1d

Browse files
author
zhangyue
committed
fix: address PR #46 review feedback
- Rename `toAclDtype` → `ToAclDtype`, `isIntegerDtype` → `IsIntegerDtype` (Google C++ Style Guide PascalCase).
- Reorder `switch` cases in `ToAclDtype` to match `DataType` enum definition.
- Simplify `device_.h` include to `#include "device.h"`.
- Add Markdown backticks to code references in comments and help messages.
- Add blank lines before `return`/`if` per CONTRIBUTING.md Python style rules.
- Reorder pybind11 generated params: `Handle` (`stream`) before `Config` (`implementation_index`), matching `Operator::call` signature.
- Rename `Matmul` → `MatMul` (ONNX convention), params → `input`/`other`/`out`, remove `trans_a`/`trans_b` (use `Gemm` for transposed matmul).
- Rename `AddRmsNorm` params: `x1`/`x2`/`gamma` → `input`/`other`/`weight`, `y_out`/`x_out` → `out`/`rstd_out` (PyTorch conventions).
- Rename `skip_unsupported_dtype` → `skip_unsupported_dtypes`.
- Replace `get_npu_stream` with generic `get_stream(device)` using `torch.accelerator.current_stream` with device-specific fallbacks.
- Reorder `_PLATFORM_TO_TORCH_DEVICE` with `nvidia` first.
1 parent 7628b2f commit 1654d1d

File tree

10 files changed

+110
-92
lines changed

10 files changed

+110
-92
lines changed

scripts/generate_wrappers.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -94,11 +94,12 @@ def __init__(self, name, constructors, calls):
9494

9595
def _find_optional_tensor_params(op_name):
9696
"""Return a set of parameter names declared as `std::optional<Tensor>` in
97-
the base header. libclang resolves the type to ``int`` when the STL
97+
the base header. `libclang` resolves the type to `int` when the STL
9898
headers are not fully available, so we fall back to a regex scan of the
9999
source text.
100100
"""
101101
source = (_BASE_DIR / f"{op_name}.h").read_text()
102+
102103
return set(re.findall(r"std::optional<Tensor>\s+(\w+)", source))
103104

104105

@@ -108,6 +109,7 @@ def _generate_pybind11(operator):
108109
def _is_optional_tensor(arg):
109110
if arg.spelling in optional_tensor_params:
110111
return True
112+
111113
return "std::optional" in arg.type.spelling and "Tensor" in arg.type.spelling
112114

113115
def _generate_params(node):
@@ -116,6 +118,7 @@ def _generate_params(node):
116118
for arg in node.get_arguments():
117119
if arg.spelling == "stream":
118120
continue
121+
119122
if _is_optional_tensor(arg):
120123
parts.append(f"std::optional<py::object> {arg.spelling}")
121124
else:
@@ -132,6 +135,7 @@ def _generate_arguments(node):
132135
for arg in node.get_arguments():
133136
if arg.spelling == "stream":
134137
continue
138+
135139
if _is_optional_tensor(arg):
136140
args.append(f"OptionalTensorFromPybind11Handle({arg.spelling})")
137141
elif "Tensor" in arg.type.spelling:
@@ -163,23 +167,23 @@ def _generate_call(op_name, call, method=True):
163167

164168
if not method:
165169
params = (
166-
f"{call_params}, std::size_t implementation_index, std::uintptr_t stream"
170+
f"{call_params}, std::uintptr_t stream, std::size_t implementation_index"
167171
if call_params
168-
else "std::size_t implementation_index, std::uintptr_t stream"
172+
else "std::uintptr_t stream, std::size_t implementation_index"
169173
)
170174
py_args = _generate_py_args(call)
171175
py_args_str = f"{py_args}, " if py_args else ""
172176

173177
return (
174178
f' m.def("{op_name}", []({params}) {{\n'
175-
f" Config config;\n"
176-
f" config.set_implementation_index(implementation_index);\n"
177179
f" Handle handle;\n"
178180
f" if (stream) {{\n"
179181
f" handle.set_stream(reinterpret_cast<void*>(stream));\n"
180182
f" }}\n"
183+
f" Config config;\n"
184+
f" config.set_implementation_index(implementation_index);\n"
181185
f" return Self::call(handle, config, {call_args});\n"
182-
f' }}, {py_args_str}py::kw_only(), py::arg("implementation_index") = 0, py::arg("stream") = 0);'
186+
f' }}, {py_args_str}py::kw_only(), py::arg("stream") = 0, py::arg("implementation_index") = 0);'
183187
)
184188

185189
return f""" .def("__call__", [](const Self& self, {call_params}) {{
@@ -438,7 +442,7 @@ def _get_all_ops(devices):
438442
nargs="+",
439443
default="cpu",
440444
type=str,
441-
help="Devices to use. Please pick from cpu, nvidia, cambricon, ascend, metax, moore, iluvatar, kunlun, hygon, and qy. (default: cpu)",
445+
help="Devices to use. Please pick from `cpu`, `nvidia`, `cambricon`, `ascend`, `metax`, `moore`, `iluvatar`, `kunlun`, `hygon`, and `qy`. (default: `cpu`)",
442446
)
443447

444448
args = parser.parse_args()

src/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ if(WITH_ASCEND)
178178
"ascend/*.cc"
179179
"ascend/*.cpp"
180180
)
181-
# Exclude kernel_impl.cpp — AscendC device code, not compiled by the host C++ compiler.
181+
# Exclude `kernel_impl.cpp` — AscendC device code, not compiled by the host C++ compiler.
182182
list(FILTER ASCEND_SOURCES EXCLUDE REGEX ".*kernel_impl\\.cpp$")
183183

184184
target_compile_definitions(infiniops PUBLIC WITH_ASCEND=1)

src/ascend/common.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@
1111

1212
namespace infini::ops::ascend {
1313

14-
// Build an aclTensor descriptor from an InfiniOps Tensor.
14+
// Build an `aclTensor` descriptor from an InfiniOps `Tensor`.
1515
//
1616
// When `transpose_last2` is true the last two dimensions are swapped in the
17-
// descriptor (shape and strides) without copying data. This is used by GEMM
18-
// and Matmul to express a transpose via the view.
17+
// descriptor (shape and strides) without copying data. This is used by `Gemm`
18+
// and `MatMul` to express a transpose via the view.
1919
inline aclTensor* buildAclTensor(const Tensor& t,
2020
bool transpose_last2 = false) {
2121
std::vector<int64_t> shape(t.shape().begin(), t.shape().end());
@@ -45,7 +45,7 @@ inline aclTensor* buildAclTensor(const Tensor& t,
4545
std::vector<int64_t> storage_shape = {storage_elems};
4646

4747
return aclCreateTensor(
48-
shape.data(), static_cast<int64_t>(shape.size()), toAclDtype(t.dtype()),
48+
shape.data(), static_cast<int64_t>(shape.size()), ToAclDtype(t.dtype()),
4949
strides.data(),
5050
/*storageOffset=*/0, ACL_FORMAT_ND, storage_shape.data(),
5151
static_cast<int64_t>(storage_shape.size()), const_cast<void*>(t.data()));

src/ascend/data_type_.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,8 @@
99

1010
namespace infini::ops::ascend {
1111

12-
inline aclDataType toAclDtype(DataType dt) {
12+
inline aclDataType ToAclDtype(DataType dt) {
1313
switch (dt) {
14-
case DataType::kFloat16:
15-
return ACL_FLOAT16;
16-
case DataType::kBFloat16:
17-
return ACL_BF16;
18-
case DataType::kFloat32:
19-
return ACL_FLOAT;
2014
case DataType::kInt8:
2115
return ACL_INT8;
2216
case DataType::kInt16:
@@ -33,14 +27,20 @@ inline aclDataType toAclDtype(DataType dt) {
3327
return ACL_UINT32;
3428
case DataType::kUInt64:
3529
return ACL_UINT64;
30+
case DataType::kFloat16:
31+
return ACL_FLOAT16;
32+
case DataType::kBFloat16:
33+
return ACL_BF16;
34+
case DataType::kFloat32:
35+
return ACL_FLOAT;
3636
default:
37-
assert(false && "unsupported dtype for Ascend backend");
37+
assert(false && "Unsupported dtype for Ascend backend.");
3838
return ACL_DT_UNDEFINED;
3939
}
4040
}
4141

42-
// Returns true for integer (signed or unsigned) DataType values.
43-
inline bool isIntegerDtype(DataType dt) {
42+
// Returns true for integer (signed or unsigned) `DataType` values.
43+
inline bool IsIntegerDtype(DataType dt) {
4444
switch (dt) {
4545
case DataType::kInt8:
4646
case DataType::kInt16:

src/ascend/device_.h

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,7 @@
11
#ifndef INFINI_OPS_ASCEND_DEVICE__H_
22
#define INFINI_OPS_ASCEND_DEVICE__H_
33

4-
// NOTE: Cannot use `#include "device.h"` here — GCC resolves quoted includes
5-
// relative to the current file first, and `src/ascend/` used to contain a
6-
// `device.h`. Use `data_type.h` which transitively pulls in `src/device.h`.
7-
#include "data_type.h"
4+
#include "device.h"
85

96
namespace infini::ops {
107

src/base/add_rms_norm.h

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,23 +11,24 @@ namespace infini::ops {
1111

1212
class AddRmsNorm : public Operator<AddRmsNorm> {
1313
public:
14-
AddRmsNorm(const Tensor x1, const Tensor x2, const Tensor gamma, float eps,
15-
Tensor y_out, Tensor x_out)
16-
: input_shape_{x1.shape()},
14+
AddRmsNorm(const Tensor input, const Tensor other, const Tensor weight,
15+
float eps, Tensor out, Tensor rstd_out)
16+
: input_shape_{input.shape()},
1717
eps_{eps},
18-
dim_{x1.size(-1)},
19-
ndim_{x1.ndim()},
20-
batch_size_{ndim_ == 2 ? x1.size(-2) : x1.size(-3)},
21-
nhead_{ndim_ == 2 ? 1 : x1.size(-2)},
18+
dim_{input.size(-1)},
19+
ndim_{input.ndim()},
20+
batch_size_{ndim_ == 2 ? input.size(-2) : input.size(-3)},
21+
nhead_{ndim_ == 2 ? 1 : input.size(-2)},
2222
rstd_shape_{static_cast<int64_t>(batch_size_),
2323
static_cast<int64_t>(nhead_)} {
24-
assert(x1.dtype() == x2.dtype());
25-
assert(x1.dtype() == y_out.dtype());
26-
assert(x1.dtype() == x_out.dtype());
24+
assert(input.dtype() == other.dtype());
25+
assert(input.dtype() == out.dtype());
26+
assert(input.dtype() == rstd_out.dtype());
2727
}
2828

29-
virtual void operator()(const Tensor x1, const Tensor x2, const Tensor gamma,
30-
float eps, Tensor y_out, Tensor x_out) const = 0;
29+
virtual void operator()(const Tensor input, const Tensor other,
30+
const Tensor weight, float eps, Tensor out,
31+
Tensor rstd_out) const = 0;
3132

3233
protected:
3334
Tensor::Shape input_shape_;

src/base/matmul.h

Lines changed: 11 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6,34 +6,24 @@
66

77
namespace infini::ops {
88

9-
class Matmul : public Operator<Matmul> {
9+
class MatMul : public Operator<MatMul> {
1010
public:
11-
// `trans_a` / `trans_b`: If true, transpose the last two dims of `a` / `b`
12-
// before multiplying. These are constructor parameters so the `CacheKey`
13-
// encodes the transposition and distinct descriptors are cached for each
14-
// combination.
15-
Matmul(const Tensor a, const Tensor b, Tensor c, bool trans_a, bool trans_b)
16-
: a_shape_{a.shape()},
17-
b_shape_{b.shape()},
18-
c_shape_{c.shape()},
19-
trans_a_{trans_a},
20-
trans_b_{trans_b} {
21-
assert(a.dtype() == b.dtype());
11+
MatMul(const Tensor input, const Tensor other, Tensor out)
12+
: input_shape_{input.shape()},
13+
other_shape_{other.shape()},
14+
out_shape_{out.shape()} {
15+
assert(input.dtype() == other.dtype());
2216
}
2317

24-
virtual void operator()(const Tensor a, const Tensor b, Tensor c,
25-
bool trans_a, bool trans_b) const = 0;
18+
virtual void operator()(const Tensor input, const Tensor other,
19+
Tensor out) const = 0;
2620

2721
protected:
28-
Tensor::Shape a_shape_;
22+
Tensor::Shape input_shape_;
2923

30-
Tensor::Shape b_shape_;
24+
Tensor::Shape other_shape_;
3125

32-
Tensor::Shape c_shape_;
33-
34-
bool trans_a_{false};
35-
36-
bool trans_b_{false};
26+
Tensor::Shape out_shape_;
3727
};
3828

3929
} // namespace infini::ops

tests/conftest.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def pytest_addoption(parser):
1616
"--devices",
1717
nargs="+",
1818
default=None,
19-
help="Device(s) to test on (e.g., --devices ascend cpu). Accepts platform names (ascend, nvidia, cambricon, metax, moore, iluvatar) or PyTorch device types (npu, cuda, mlu, musa). Defaults to all available devices.",
19+
help="Device(s) to test on (e.g., `--devices ascend cpu`). Accepts platform names (`nvidia`, `metax`, `iluvatar`, `moore`, `cambricon`, `ascend`) or PyTorch device types (`cuda`, `mlu`, `musa`, `npu`). Defaults to all available devices.",
2020
)
2121

2222

@@ -46,15 +46,17 @@ def set_seed_per_test(request):
4646

4747
_NPU_UNSUPPORTED_DTYPES = {torch.float64}
4848

49-
# `torch_npu` does not implement random number generation for `uint16`/`uint32`/`uint64`.
49+
# `torch_npu` does not implement random number generation for
50+
# `uint16`/`uint32`/`uint64`.
5051
for _bits in (16, 32, 64):
5152
_t = getattr(torch, f"uint{_bits}", None)
5253
if _t is not None:
5354
_NPU_UNSUPPORTED_DTYPES.add(_t)
5455

5556

5657
@pytest.fixture(autouse=True)
57-
def skip_unsupported_dtype(request):
58+
def skip_unsupported_dtypes(request):
59+
5860
if not hasattr(request.node, "callspec"):
5961
return
6062

@@ -71,16 +73,16 @@ def _set_random_seed(seed):
7173

7274
_PLATFORM_TO_TORCH_DEVICE = {
7375
"nvidia": "cuda",
74-
"iluvatar": "cuda",
7576
"metax": "cuda",
76-
"cambricon": "mlu",
77+
"iluvatar": "cuda",
7778
"moore": "musa",
79+
"cambricon": "mlu",
7880
"ascend": "npu",
7981
}
8082

8183

8284
def _resolve_device(name):
83-
"""Map a platform name (e.g., ``ascend``) to a PyTorch device type (e.g., ``npu``)."""
85+
"""Map a platform name (e.g., `ascend`) to a PyTorch device type (e.g., `npu`)."""
8486
return _PLATFORM_TO_TORCH_DEVICE.get(name, name)
8587

8688

tests/test_gemm.py

Lines changed: 12 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import pytest
33
import torch
44

5-
from tests.utils import Payload, get_npu_stream, randn_strided
5+
from tests.utils import Payload, get_stream, randn_strided
66

77

88
@pytest.mark.auto_act_and_assert
@@ -84,28 +84,17 @@ def test_gemm(
8484

8585

8686
def _gemm(a, b, alpha, beta, trans_a, trans_b, c, implementation_index=0):
87-
if a.device.type == "npu":
88-
infini.ops.gemm(
89-
a,
90-
b,
91-
alpha,
92-
beta,
93-
trans_a,
94-
trans_b,
95-
c,
96-
stream=get_npu_stream(a),
97-
)
98-
else:
99-
infini.ops.gemm(
100-
a,
101-
b,
102-
alpha,
103-
beta,
104-
trans_a,
105-
trans_b,
106-
c,
107-
implementation_index=implementation_index,
108-
)
87+
infini.ops.gemm(
88+
a,
89+
b,
90+
alpha,
91+
beta,
92+
trans_a,
93+
trans_b,
94+
c,
95+
stream=get_stream(a.device),
96+
implementation_index=implementation_index,
97+
)
10998

11099
return c
111100

tests/utils.py

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,12 +82,47 @@ def randint_strided(low, high, shape, strides, *, dtype=None, device=None):
8282
return output
8383

8484

85-
def get_npu_stream(tensor):
86-
"""Return the current NPU stream handle for `tensor`, or 0 on other devices."""
87-
if tensor.device.type != "npu":
85+
def get_stream(device):
86+
"""Return the raw stream handle for `device`, or 0 for CPU.
87+
88+
Uses `torch.accelerator.current_stream` when available, falling back to
89+
device-specific APIs for older PyTorch versions.
90+
"""
91+
if isinstance(device, torch.device):
92+
device = device.type
93+
94+
if isinstance(device, str) and ":" in device:
95+
device = device.split(":")[0]
96+
97+
if device == "cpu":
98+
return 0
99+
100+
if hasattr(torch, "accelerator") and hasattr(torch.accelerator, "current_stream"):
101+
stream = torch.accelerator.current_stream()
102+
103+
# Each backend exposes the raw handle under a different attribute name.
104+
for attr in ("npu_stream", "cuda_stream", "mlu_stream", "musa_stream"):
105+
if hasattr(stream, attr):
106+
return getattr(stream, attr)
107+
88108
return 0
89109

90-
return torch.npu.current_stream().npu_stream
110+
# Fallback for older PyTorch builds without `torch.accelerator`.
111+
_STREAM_ACCESSORS = {
112+
"npu": ("npu", "npu_stream"),
113+
"cuda": ("cuda", "cuda_stream"),
114+
"mlu": ("mlu", "mlu_stream"),
115+
"musa": ("musa", "musa_stream"),
116+
}
117+
118+
if device in _STREAM_ACCESSORS:
119+
mod_name, attr = _STREAM_ACCESSORS[device]
120+
mod = getattr(torch, mod_name, None)
121+
122+
if mod is not None and hasattr(mod, "current_stream"):
123+
return getattr(mod.current_stream(), attr)
124+
125+
return 0
91126

92127

93128
def clone_strided(input):

0 commit comments

Comments (0)