Skip to content

Commit fb8eba4

Browse files
committed
issue/1124: minicpm-sala model
Signed-off-by: Ceng23333 <441651826@qq.com>
1 parent 6e88052 commit fb8eba4

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

53 files changed

+3574
-21
lines changed

.gitmodules

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,7 @@
55
path = third_party/nlohmann_json
66
url = https://github.com/nlohmann/json.git
77
branch = master
8+
[submodule "third_party/infllmv2_cuda_impl"]
9+
path = third_party/infllmv2_cuda_impl
10+
url = https://github.com/Ceng23333/infllmv2_cuda_impl.git
11+
branch = minicpm_sala_patches

include/infinicore/ops.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,20 @@
1919
#include "ops/flash_attention.hpp"
2020
#include "ops/fmin.hpp"
2121
#include "ops/fmod.hpp"
22+
#include "ops/simple_gla_attention.hpp"
23+
#include "ops/simple_gla_decode_step.hpp"
24+
#include "ops/simple_gla_recurrent_state_append.hpp"
25+
#include "ops/simple_gla_prefill.hpp"
26+
#include "ops/infllmv2_attention.hpp"
2227
#include "ops/hardswish.hpp"
2328
#include "ops/hardtanh.hpp"
2429
#include "ops/kv_caching.hpp"
2530
#include "ops/matmul.hpp"
31+
#include "ops/mha_kvcache.hpp"
32+
#include "ops/mha_varlen.hpp"
33+
#include "ops/mul.hpp"
2634
#include "ops/ones.hpp"
35+
#include "ops/zeros.hpp"
2736
#include "ops/paged_attention.hpp"
2837
#include "ops/paged_attention_prefill.hpp"
2938
#include "ops/paged_caching.hpp"
@@ -34,6 +43,7 @@
3443
#include "ops/reciprocal.hpp"
3544
#include "ops/rms_norm.hpp"
3645
#include "ops/rope.hpp"
46+
#include "ops/sigmoid.hpp"
3747
#include "ops/silu.hpp"
3848
#include "ops/silu_and_mul.hpp"
3949
#include "ops/swiglu.hpp"
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
/**
 * C++ API declarations for the InfLLM-V2 attention kernels.
 *
 * When ENABLE_INFLLMV2 is defined, the build must link against an
 * InfLLM-V2 library (e.g. infllmv2_cuda_impl) that defines these symbols.
 * ENABLE_ATEN is additionally required because the interface is expressed
 * in terms of at::Tensor.
 *
 * NOTE: the declarations deliberately live in the global namespace so
 * they match the definitions in entry.cu.
 */
#pragma once

#if defined(ENABLE_INFLLMV2) && defined(ENABLE_ATEN)

#include <ATen/ATen.h>
#include <c10/util/Optional.h>
#include <vector>

/// Variable-length forward pass over unpadded Q/K/V, with sequences
/// delimited by the cu_seqlens_q / cu_seqlens_k prefix sums.
/// Returns {out, softmax_lse, ...}.
std::vector<at::Tensor> mha_varlen_fwd(
    at::Tensor &q,                                // [total_q, nheads, head_dim]; non-const per library interface
    const at::Tensor &k,                          // [total_k, nheads_k, head_dim]
    const at::Tensor &v,                          // [total_k, nheads_k, head_dim]
    c10::optional<at::Tensor> &out_,              // optional pre-allocated output buffer
    const at::Tensor &cu_seqlens_q,               // [batch + 1] query offsets
    const at::Tensor &cu_seqlens_k,               // [batch + 1] key offsets
    c10::optional<at::Tensor> &seqused_k,         // per-sequence used-K lengths
    c10::optional<const at::Tensor> &leftpad_k_,  // per-sequence left padding of K
    c10::optional<at::Tensor> &block_table_,      // paged-KV block table
    c10::optional<at::Tensor> &alibi_slopes_,     // ALiBi bias slopes
    int max_seqlen_q,
    int max_seqlen_k,
    float p_dropout,
    float softmax_scale,
    bool zero_tensors,
    bool is_causal,
    int window_size_left,                         // sliding-window bounds; -1 presumably disables (FlashAttn convention) — confirm
    int window_size_right,
    float softcap,
    bool return_softmax,
    c10::optional<at::Generator> gen_,            // RNG for dropout
    c10::optional<at::Tensor> &blockmask_);       // presumably the InfLLM-V2 block-sparsity mask — confirm against entry.cu

/// KV-cache forward pass (decode). Returns {out, softmax_lse}.
std::vector<at::Tensor> mha_fwd_kvcache(
    at::Tensor &q,                                     // query; non-const per library interface
    const at::Tensor &kcache,                          // cached keys
    const at::Tensor &vcache,                          // cached values
    c10::optional<const at::Tensor> &k_,               // new K to append, if any
    c10::optional<const at::Tensor> &v_,               // new V to append, if any
    c10::optional<const at::Tensor> &seqlens_k_,       // current KV lengths per sequence
    c10::optional<const at::Tensor> &rotary_cos_,      // RoPE cos table
    c10::optional<const at::Tensor> &rotary_sin_,      // RoPE sin table
    c10::optional<const at::Tensor> &cache_batch_idx_, // indirection into the cache batch dim
    c10::optional<const at::Tensor> &leftpad_k_,
    c10::optional<at::Tensor> &block_table_,           // paged-KV block table
    c10::optional<at::Tensor> &alibi_slopes_,
    c10::optional<at::Tensor> &out_,                   // optional pre-allocated output buffer
    float softmax_scale,
    bool is_causal,
    int window_size_left,
    int window_size_right,
    float softcap,
    bool is_rotary_interleaved,
    int num_splits,                                    // split-KV parallelism factor
    c10::optional<at::Tensor> &blockmask_);            // presumably the InfLLM-V2 block-sparsity mask — confirm against entry.cu

#endif // ENABLE_INFLLMV2 && ENABLE_ATEN
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#pragma once

#include "../device.hpp"
#include "common/op.hpp"
#include <optional>

namespace infinicore::op {

/// Varlen InfLLM-V2 attention over unpadded Q/K/V.
///
/// Shapes follow the FlashAttn-style varlen convention:
///   q            : [total_q, nheads, head_dim]
///   k, v         : [total_k, nheads_k, head_dim]
///   cu_seqlens_q : [batch_size + 1] (int32)
///   cu_seqlens_k : [batch_size + 1] (int32)
///
/// @return [total_q, nheads, head_dim]
Tensor infllmv2_varlen(const Tensor &q, const Tensor &k, const Tensor &v,
                       const Tensor &cu_seqlens_q, const Tensor &cu_seqlens_k,
                       int max_seqlen_q, int max_seqlen_k,
                       float scale, bool causal,
                       int window_size_left = -1, int window_size_right = -1);

/// Decode-time InfLLM-V2 attention with a KV cache.
///
/// Shapes:
///   q          : [batch, seqlen_q, nheads, head_dim]
///   k_cache    : [num_blocks, block_size, nheads_k, head_dim]
///                or [batch, seqlen_cache, nheads_k, head_dim]
///   v_cache    : same layout as k_cache
///   cache_lens : [batch] (int32) total KV length per sequence
///
/// @return [batch, seqlen_q, nheads, head_dim]
Tensor infllmv2_kvcache(const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
                        const Tensor &cache_lens,
                        float scale, bool causal,
                        int window_size_left = -1, int window_size_right = -1);

/// Decode-time InfLLM-V2 attention with a KV cache, updating the cache
/// in place (k_new/v_new are appended at the cache_lens offsets).
///
/// Shapes:
///   q            : [batch, seqlen_q, nheads, head_dim]
///   k_cache      : [batch, seqlen_cache, nheads_k, head_dim] (dense cache)
///   v_cache      : same layout as k_cache
///   k_new, v_new : [batch, seqlen_new, nheads_k, head_dim]
///   cache_lens   : [batch] (int32) current KV length per sequence BEFORE appending
///
/// @return [batch, seqlen_q, nheads, head_dim]
Tensor infllmv2_kvcache_update(const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
                               const Tensor &k_new, const Tensor &v_new,
                               const Tensor &cache_lens,
                               float scale, bool causal,
                               int window_size_left = -1, int window_size_right = -1);

} // namespace infinicore::op
73+

include/infinicore/ops/sigmoid.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

/// Element-wise sigmoid op, dispatched per device.
class Sigmoid {
public:
    /// Kernel signature: (output, input).
    using schema = void (*)(Tensor, Tensor);

    /// Run the kernel registered for the current device, writing into output.
    static void execute(Tensor output, Tensor input);

    /// Per-device kernel registry.
    static common::OpDispatcher<schema> &dispatcher();
};

/// Out-of-place variant: returns the result as a new tensor.
Tensor sigmoid(Tensor input);

/// Writes sigmoid(input) into the caller-provided output tensor.
void sigmoid_(Tensor output, Tensor input);
} // namespace infinicore::op
17+
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#pragma once

#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"

namespace infinicore::op {

/// Simple GLA (recurrent linear) attention with a per-head decay gate.
///
/// Shapes: q, k, v are [B, T, H, D]; g_gamma is [H] (log-decay per head).
/// Recurrence:
///   gate = exp(g_gamma)
///   S    = S * gate + outer(k_t, v_t)
///   o_t  = (q_t * scale) @ S
/// Output shape: [B, T, H, D].
class SimpleGlaAttention {
public:
    using schema = void (*)(Tensor &out, const Tensor &q, const Tensor &k, const Tensor &v,
                            const Tensor &g_gamma, float scale);

    /// Invoke the kernel registered for the current device.
    static void execute(Tensor &out, const Tensor &q, const Tensor &k, const Tensor &v,
                        const Tensor &g_gamma, float scale);

    /// Per-device kernel registry.
    static common::OpDispatcher<schema> &dispatcher();
};

/// Convenience wrapper returning the [B, T, H, D] output tensor.
Tensor simple_gla_attention(const Tensor &q, const Tensor &k, const Tensor &v,
                            const Tensor &g_gamma, float scale);

} // namespace infinicore::op
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#pragma once

#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"

namespace infinicore::op {

/// One decode timestep of Simple GLA (same recurrence as SimpleGlaAttention).
///
/// Shapes: q, k, v are [B, 1, H, D]; g_gamma is [H] (log-decay per head);
/// state is [B, H, D, D] float32 and is updated in place:
///   state = state * exp(g_gamma) + outer(k, v)
///   out[b, 0, h, :] = (q * scale) @ state[b, h]
/// Output has shape [B, 1, H, D] and the same dtype as q.
class SimpleGlaDecodeStep {
public:
    using schema = void (*)(Tensor &out, Tensor &state, const Tensor &q, const Tensor &k, const Tensor &v,
                            const Tensor &g_gamma, float scale);

    /// Invoke the kernel registered for the current device.
    static void execute(Tensor &out, Tensor &state, const Tensor &q, const Tensor &k, const Tensor &v,
                        const Tensor &g_gamma, float scale);

    /// Per-device kernel registry.
    static common::OpDispatcher<schema> &dispatcher();
};

/// Convenience wrapper; mutates `state` in place and returns the output.
Tensor simple_gla_decode_step(const Tensor &q, const Tensor &k, const Tensor &v, Tensor &state,
                              const Tensor &g_gamma, float scale);

} // namespace infinicore::op
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#pragma once

#include "../device.hpp"
#include "../graph/graph.hpp"
#include "../tensor.hpp"
#include "common/op.hpp"

namespace infinicore::op {

// Graph-op boilerplate for the prefill kernel; declares the op class with
// signature Tensor(const Tensor &, const Tensor &, const Tensor &, const Tensor &, float).
INFINICORE_GRAPH_OP_CLASS(SimpleGLAPrefill,
                          Tensor,
                          const Tensor &,
                          const Tensor &,
                          const Tensor &,
                          const Tensor &,
                          float);

/// Fused/chunked Simple GLA prefill forward.
/// q, k, v: [B, T, H, D] (F16/BF16); g_gamma: [H] (F32).
/// @return [B, T, H, D], same dtype as the inputs.
Tensor simple_gla_prefill(const Tensor &q, const Tensor &k, const Tensor &v,
                          const Tensor &g_gamma, float scale);

} // namespace infinicore::op
27+
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#pragma once

#include "../device.hpp"
#include "../tensor.hpp"
#include "common/dispatcher.hpp"

namespace infinicore::op {

/// Batched update of the Simple GLA recurrent state (float32 [B, H, D, D])
/// for a contiguous K/V segment [B, L, H, D], equivalent to applying
/// simple_gla_decode_step L times:
///   S <- g^L * S + sum_{j=0}^{L-1} g^{L-1-j} * outer(k_j, v_j)
/// g_gamma: [H], same log-gate as simple_gla_decode_step (gate = exp(g_gamma)).
class SimpleGlaRecurrentStateAppend {
public:
    using schema = void (*)(Tensor &state, const Tensor &k_seg, const Tensor &v_seg, const Tensor &g_gamma);

    /// Invoke the kernel registered for the current device.
    static void execute(Tensor &state, const Tensor &k_seg, const Tensor &v_seg, const Tensor &g_gamma);

    /// Per-device kernel registry.
    static common::OpDispatcher<schema> &dispatcher();
};

/// Free-function entry point; mutates `state` in place.
void simple_gla_recurrent_state_append_segment(Tensor &state, const Tensor &k_seg, const Tensor &v_seg,
                                               const Tensor &g_gamma);

} // namespace infinicore::op

include/infinicore/ops/zeros.hpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#pragma once

#include "common/op.hpp"

namespace infinicore::op {

/// Zero-fill op, dispatched per device.
class Zeros {
public:
    /// Kernel signature: (output).
    using schema = void (*)(Tensor);

    /// Run the kernel registered for the current device on `output`.
    static void execute(Tensor output);

    /// Per-device kernel registry.
    static common::OpDispatcher<schema> &dispatcher();
};

/// In-place variant: fills `output` with zeros.
void zeros_(Tensor output);
} // namespace infinicore::op

0 commit comments

Comments
 (0)