InfiniTensor
diff --git a/‎include/infinicore/adaptor/aten_adaptor.hpp‎
Lines changed: 6 additions & 6 deletions b/‎include/infinicore/adaptor/aten_adaptor.hpp‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎include/infinicore/adaptor/flash_attention_adaptor.hpp‎
Lines changed: 21 additions & 2 deletions b/‎include/infinicore/adaptor/flash_attention_adaptor.hpp‎
Lines changed: 21 additions & 2 deletions
diff --git a/‎scripts/install.py‎
Lines changed: 7 additions & 1 deletion b/‎scripts/install.py‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎scripts/set_env.py‎
Lines changed: 75 additions & 0 deletions b/‎scripts/set_env.py‎
Lines changed: 75 additions & 0 deletions
diff --git a/‎src/infinicore/adaptor/aten_adaptor.cc‎
Lines changed: 1 addition & 1 deletion b/‎src/infinicore/adaptor/aten_adaptor.cc‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/infinicore/ops/mha_kvcache/mha_kvcache_flashattn.cc‎
Lines changed: 38 additions & 12 deletions b/‎src/infinicore/ops/mha_kvcache/mha_kvcache_flashattn.cc‎
Lines changed: 38 additions & 12 deletions
@@ -5,9 +5,11 @@
 
 #include <ATen/ATen.h>
 
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_METAX_API)
 #include <c10/cuda/CUDAStream.h>
-#include <c10/cuda/CUDAGuard.h>
+#endif
+
+#ifdef ENABLE_NVIDIA_API
 #include <ATen/cuda/CUDAContext.h>
 #endif
 
@@ -30,20 +32,18 @@ inline at::ScalarType to_at_dtype(DataType dtype) {
 }
 
+// Translate an infinicore Device into the ATen device used to address it:
+// NVIDIA and METAX devices map to at::kCUDA with the same device index, CPU maps to at::kCPU,
+// and every other device type is rejected with std::runtime_error.
 inline at::Device to_at_device(const Device &device) {
-    if (device.getType() == Device::Type::NVIDIA) {
+    if (device.getType() == Device::Type::NVIDIA || device.getType() == Device::Type::METAX) {
         return at::Device(at::kCUDA, device.getIndex());
     } else if (device.getType() == Device::Type::CPU) {
         return at::Device(at::kCPU);
-    } else if (device.getType() == Device::Type::QY) {
-        return at::Device(at::kCUDA, device.getIndex());
     } else {
         throw std::runtime_error("Unsupported device type for ATen");
     }
 }
 
 at::Tensor to_aten_tensor(const infinicore::Tensor &t);
 
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_METAX_API)
 c10::cuda::CUDAStream get_cuda_stream();
 #endif
 } // namespace infinicore::adaptor
 
@@ -2,7 +2,12 @@
 #pragma once
 #include "aten_adaptor.hpp"
 
+// NVIDIA flash-attn-nvidia.so uses namespace flash. The pip/MetaX flash_attn_2_cuda extension
+// exports the same entry points at global scope (no namespace), matching FLASH_NAMESPACE builds
+// where the namespace is empty.
+#if !defined(ENABLE_METAX_API)
 namespace flash {
+#endif
 std::vector<at::Tensor>
 mha_fwd(at::Tensor &q,                            // batch_size x seqlen_q x num_heads x round_multiple(head_size, 8)
         const at::Tensor &k,                      // batch_size x seqlen_k x num_heads_k x round_multiple(head_size, 8)
@@ -39,7 +44,13 @@ mha_varlen_fwd(at::Tensor &q,                               // total_q x num_hea
                int window_size_right,
                const float softcap,
                const bool return_softmax,
-               std::optional<at::Generator> gen_);
+               std::optional<at::Generator> gen_
+#if defined(ENABLE_METAX_API) && defined(INFINICORE_HPCC_VERSION_MAJOR) && (INFINICORE_HPCC_VERSION_MAJOR >= 3)
+               // MetaX/Mars `flash_attn_2_cuda` (e.g. 2.6.x+mars) appends this argument vs upstream Dao-AILab flash-attn.
+               ,
+               std::optional<at::Tensor> &flash_attn_mars_ext_
+#endif
+    );
 
 std::vector<at::Tensor>
 mha_bwd(const at::Tensor &dout,                   // batch_size x seqlen_q x num_heads, x multiple_of(head_size_og, 8)
@@ -108,7 +119,15 @@ mha_fwd_kvcache(at::Tensor &q,                                     // batch_size
                 int window_size_right,
                 const float softcap,
                 bool is_rotary_interleaved, // if true, rotary combines indices 0 & 1, else indices 0 & rotary_dim / 2
-                int num_splits);
+                int num_splits
+#if defined(ENABLE_METAX_API) && defined(INFINICORE_HPCC_VERSION_MAJOR) && (INFINICORE_HPCC_VERSION_MAJOR >= 3)
+                // MetaX/Mars `flash_attn_2_cuda` (e.g. 2.6.x+mars) appends this argument vs upstream Dao-AILab flash-attn.
+                ,
+                std::optional<at::Tensor> &flash_attn_mars_ext_
+#endif
+    );
 
+#if !defined(ENABLE_METAX_API)
 } // namespace flash
+#endif
 #endif // ENABLE_FLASH_ATTN
@@ -2,7 +2,11 @@
 import subprocess
 import platform
 import sys
-from set_env import set_env
+from set_env import (
+    set_env,
+    ensure_metax_hpc_compiler_includes,
+    xmake_flags_need_metax_aten_torch_includes,
+)
 
 PROJECT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 os.chdir(PROJECT_DIR)
@@ -12,6 +16,8 @@ def run_cmd(cmd):
 
 
 def install(xmake_config_flags=""):
+    # When the flags enable MetaX GPU together with ATen, export the HPCC/cu-bridge
+    # include directories before xmake is configured and built below.
+    if xmake_flags_need_metax_aten_torch_includes(xmake_config_flags):
+        ensure_metax_hpc_compiler_includes()
     run_cmd(f"xmake f {xmake_config_flags} -cv")
     run_cmd("xmake")
     run_cmd("xmake install")
 
@@ -2,6 +2,81 @@
 import platform
 
 
+def _maca_root_from_env():
+    """Return the MACA install root taken from the environment, or "" if unset.
+
+    MACA_PATH, MACA_HOME and MACA_ROOT are consulted in that order. Each value
+    is stripped before it is tested, so a blank or whitespace-only variable
+    does not hide a usable value held by a later one.
+    """
+    for name in ("MACA_PATH", "MACA_HOME", "MACA_ROOT"):
+        value = os.environ.get(name, "").strip()
+        if value:
+            return value
+    return ""
+
+
+def metax_hpc_compiler_include_dirs():
+    """Include directories that let g++ find cuda_runtime_api.h (cu-bridge) when building against PyTorch c10/cuda headers on MetaX/HPCC.
+
+    Returns an empty list when no MACA root is configured in the environment.
+    """
+    root = _maca_root_from_env()
+    if root:
+        relative_dirs = (
+            ("tools", "cu-bridge", "include"),
+            ("include", "hcr"),
+            ("include",),
+        )
+        return [os.path.join(root, *parts) for parts in relative_dirs]
+    return []
+
+
+def _prepend_path_var(name, prefixes):
+    """Prepend colon-separated *prefixes* to env var *name* (POSIX).
+
+    Does nothing when *prefixes* is empty. The call is idempotent: if *name*
+    already starts with exactly these prefixes it is left untouched, so
+    repeated calls do not pile up duplicate entries.
+    """
+    if not prefixes:
+        return
+    chunk = ":".join(prefixes)
+    cur = os.environ.get(name, "")
+    # Match on a whole-entry boundary so "/a" is not mistaken for "/a/sub".
+    if cur == chunk or cur.startswith(chunk + ":"):
+        return
+    os.environ[name] = f"{chunk}:{cur}" if cur else chunk
+
+
+def ensure_metax_hpc_compiler_includes():
+    """
+    Put the HPCC/cu-bridge include directories at the front of CPATH,
+    CPLUS_INCLUDE_PATH, and C_INCLUDE_PATH. All three are set because g++ reads
+    CPLUS_INCLUDE_PATH for .cc files; C_INCLUDE_PATH alone is not enough.
+    Nothing is changed when no include directories are available.
+    """
+    include_dirs = metax_hpc_compiler_include_dirs()
+    if include_dirs:
+        for env_name in ("CPATH", "CPLUS_INCLUDE_PATH", "C_INCLUDE_PATH"):
+            _prepend_path_var(env_name, include_dirs)
+
+
+def _parse_xmake_cli_flag_values(flags: str):
+    """Turn a flag string such as '--metax-gpu=y --aten=y' into a {key: value} dict.
+
+    Every '=' is treated like whitespace. A '--key' token takes the following
+    token as its value unless that token also starts with '--', in which case
+    the value defaults to "y". Keys and values are lower-cased; tokens that are
+    not '--key' options are ignored.
+    """
+    tokens = flags.replace("=", " ").split()
+    parsed = {}
+    idx = 0
+    while idx < len(tokens):
+        token = tokens[idx]
+        idx += 1
+        if not (token.startswith("--") and len(token) > 2):
+            continue
+        name = token[2:].lower()
+        has_value = idx < len(tokens) and not tokens[idx].startswith("--")
+        if has_value:
+            parsed[name] = tokens[idx].lower()
+            idx += 1
+        else:
+            parsed[name] = "y"
+    return parsed
+
+
+def _truthy_flag_value(v: str) -> bool:
+    """Report whether *v* exactly matches one of the accepted "enabled" spellings."""
+    accepted = ("y", "yes", "true", "1", "on")
+    return any(v == spelling for spelling in accepted)
+
+
+def xmake_flags_need_metax_aten_torch_includes(flags: str) -> bool:
+    """Return True only if the install.py-style flag string turns on both MetaX GPU and ATen (PyTorch)."""
+    options = _parse_xmake_cli_flag_values(flags)
+    if not _truthy_flag_value(options.get("metax-gpu", "n")):
+        return False
+    return _truthy_flag_value(options.get("aten", "n"))
+
+
 def set_env():
     if os.environ.get("INFINI_ROOT") == None:
         os.environ["INFINI_ROOT"] = os.path.expanduser("~/.infini")
 
@@ -32,7 +32,7 @@ at::Tensor to_aten_tensor(const infinicore::Tensor &t) {
         options);
 }
 
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_METAX_API)
 c10::cuda::CUDAStream get_cuda_stream() {
     return c10::cuda::getStreamFromExternal(
         cudaStream_t(infinicore::context::getStream()), infinicore::context::getDevice().getIndex());
 
@@ -4,6 +4,18 @@
 
 #include <stdexcept>
 
+#ifdef ENABLE_FLASH_ATTN
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_METAX_API)
+#include <c10/cuda/CUDAGuard.h>
+#endif
+#endif
+
+// Spell a flash-attention entry point for the active backend: the global-scope symbol
+// (::name) when ENABLE_METAX_API is defined, flash::name otherwise.
+#if defined(ENABLE_METAX_API)
+#define INFINICORE_FLASH_OP(name) ::name
+#else
+#define INFINICORE_FLASH_OP(name) flash::name
+#endif
+
 namespace infinicore::op::mha_kvcache_impl::flashattn {
 
 struct PlannedMeta {
@@ -33,22 +45,24 @@ void *plan(Tensor out,
 
 void run(void *planned_meta) {
 #ifdef ENABLE_FLASH_ATTN
+#ifdef ENABLE_NVIDIA_API
     c10::cuda::CUDAStreamGuard guard(infinicore::adaptor::get_cuda_stream());
+#elif defined(ENABLE_METAX_API)
+    c10::cuda::CUDAStreamGuard guard(infinicore::adaptor::get_cuda_stream());
+#endif
     auto *p = reinterpret_cast<PlannedMeta *>(planned_meta);
 
-    auto out_tensor = infinicore::adaptor::to_aten_tensor(p->out);
-    auto q = infinicore::adaptor::to_aten_tensor(p->q);
-#if defined(ENABLE_NVIDIA_API)
-    auto k_cache = infinicore::adaptor::to_aten_tensor(p->k_cache);
-    auto v_cache = infinicore::adaptor::to_aten_tensor(p->v_cache);
-#elif defined(ENABLE_QY_API)
+    // FlashAttention kernels expect standard dense layout (contiguous last dimension).
+    auto out_at = infinicore::adaptor::to_aten_tensor(p->out);
+    const bool out_need_copy_back = !out_at.is_contiguous();
+    auto out_tensor = out_need_copy_back ? out_at.contiguous() : out_at;
+    auto q = infinicore::adaptor::to_aten_tensor(p->q).contiguous();
     auto k_cache = infinicore::adaptor::to_aten_tensor(p->k_cache).contiguous();
     auto v_cache = infinicore::adaptor::to_aten_tensor(p->v_cache).contiguous();
-#endif
-    auto seqlens_k = std::optional<const at::Tensor>(infinicore::adaptor::to_aten_tensor(p->seqlens_k));
-    auto block_table = std::optional<at::Tensor>(infinicore::adaptor::to_aten_tensor(p->block_table));
+    auto seqlens_k = std::optional<const at::Tensor>(infinicore::adaptor::to_aten_tensor(p->seqlens_k).contiguous());
+    auto block_table = std::optional<at::Tensor>(infinicore::adaptor::to_aten_tensor(p->block_table).contiguous());
     auto alibi_slopes = p->alibi_slopes
-                          ? std::optional<at::Tensor>(infinicore::adaptor::to_aten_tensor(*p->alibi_slopes))
+                          ? std::optional<at::Tensor>(infinicore::adaptor::to_aten_tensor(*p->alibi_slopes).contiguous())
                           : std::nullopt;
 
     std::optional<const at::Tensor> k_new = std::nullopt;
@@ -65,7 +79,11 @@ void run(void *planned_meta) {
     auto out = use_dynamic_out ? std::optional<at::Tensor>(std::nullopt)
                                : std::optional<at::Tensor>(out_tensor);
 
-    auto result = flash::mha_fwd_kvcache(
+#if defined(ENABLE_METAX_API) && defined(INFINICORE_HPCC_VERSION_MAJOR) && (INFINICORE_HPCC_VERSION_MAJOR >= 3)
+    std::optional<at::Tensor> flash_attn_mars_ext = std::nullopt;
+#endif
+
+    auto result = INFINICORE_FLASH_OP(mha_fwd_kvcache)(
         q,
         k_cache,
         v_cache,
@@ -85,11 +103,19 @@ void run(void *planned_meta) {
         -1,
         0.0f,
         false,
-        0);
+        0
+#if defined(ENABLE_METAX_API) && defined(INFINICORE_HPCC_VERSION_MAJOR) && (INFINICORE_HPCC_VERSION_MAJOR >= 3)
+        ,
+        flash_attn_mars_ext
+#endif
+    );
 
     if (use_dynamic_out) {
         out_tensor.copy_(result[0]);
     }
+    if (out_need_copy_back) {
+        out_at.copy_(out_tensor);
+    }
 #else
     throw std::runtime_error("FlashAttention is not enabled in this build");
 #endif
Original file line number	Diff line number	Diff line change
`@@ -32,7 +32,7 @@ at::Tensor to_aten_tensor(const infinicore::Tensor &t) {`
`32`	`32`	`options);`
`33`	`33`	`}`
`34`	`34`
`35`		`-#if defined(ENABLE_NVIDIA_API) \|\| defined(ENABLE_QY_API)`
	`35`	`+#if defined(ENABLE_NVIDIA_API) \|\| defined(ENABLE_METAX_API)`
`36`	`36`	`c10::cuda::CUDAStream get_cuda_stream() {`
`37`	`37`	`return c10::cuda::getStreamFromExternal(`
`38`	`38`	`cudaStream_t(infinicore::context::getStream()), infinicore::context::getDevice().getIndex());`