1+ /* Copyright 2026 The xLLM Authors. All Rights Reserved.
2+
3+ Licensed under the Apache License, Version 2.0 (the "License");
4+ you may not use this file except in compliance with the License.
5+ You may obtain a copy of the License at
6+
7+ https://github.com/jd-opensource/xllm/blob/main/LICENSE
8+
9+ Unless required by applicable law or agreed to in writing, software
10+ distributed under the License is distributed on an "AS IS" BASIS,
11+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ See the License for the specific language governing permissions and
13+ limitations under the License.
14+ ==============================================================================*/
15+
#include <c10/core/Device.h>
#include <glog/logging.h>
#include <torch/torch.h>
#include <torch_npu/csrc/libs/init_npu.h>
#include <torch_npu/torch_npu.h>

#include <cmath>
#include <initializer_list>
#include <nlohmann/json.hpp>
#ifdef TORCH_HIGHER_THAN_PTA6
#include <torch_npu/csrc/framework/OpCommand.h>
#else
#include <torch_npu/csrc/aten/NPUNativeFunctions.h>
#include <torch_npu/csrc/framework/utils/OpPreparation.h>
#endif

#include "acl/acl.h"
#include "aclnn_recurrent_gated_delta_rule.h"
#include "core/common/macros.h"
#include "core/kernels/npu/utils.h"
#include "npu_ops_api.h"
35+
36+ namespace xllm ::kernel::npu {
37+
38+ torch::Tensor npu_recurrent_gated_delta_rule (
39+ const torch::Tensor& query,
40+ const torch::Tensor& key,
41+ const torch::Tensor& value,
42+ torch::Tensor& state,
43+ const std::optional<torch::Tensor>& beta,
44+ const std::optional<double > scale,
45+ const std::optional<torch::Tensor>& actual_seq_lengths,
46+ const std::optional<torch::Tensor>& ssm_state_indices,
47+ const std::optional<torch::Tensor>& num_accepted_tokens,
48+ const std::optional<torch::Tensor>& g,
49+ const std::optional<torch::Tensor>& gk) {
50+ check_tensor (query, " query" , " recurrent_gated_delta_rule" );
51+ check_tensor (key, " key" , " recurrent_gated_delta_rule" );
52+ check_tensor (value, " value" , " recurrent_gated_delta_rule" );
53+ check_tensor (state, " state" , " recurrent_gated_delta_rule" );
54+
55+ aclTensor* query_ids = nullptr ;
56+ aclTensor* key_ids = nullptr ;
57+ aclTensor* value_ids = nullptr ;
58+ aclTensor* state_ids = nullptr ;
59+ aclTensor* beta_ids = nullptr ;
60+ aclTensor* actual_seq_lengths_ids = nullptr ;
61+ aclTensor* ssm_state_indices_ids = nullptr ;
62+ aclTensor* num_accepted_tokens_ids = nullptr ;
63+ aclTensor* g_ids = nullptr ;
64+ aclTensor* gk_ids = nullptr ;
65+ aclTensor* out_ids = nullptr ;
66+
67+ int32_t device_id = query.device ().index ();
68+ aclrtStream stream = c10_npu::getCurrentNPUStream (device_id).stream ();
69+
70+ create_acltensor (&query_ids, query);
71+ create_acltensor (&key_ids, key);
72+ create_acltensor (&value_ids, value);
73+ create_acltensor (&state_ids, state);
74+
75+ if (beta.has_value () && beta.value ().defined ()) {
76+ create_acltensor (&beta_ids, beta.value ());
77+ }
78+ if (actual_seq_lengths.has_value () && actual_seq_lengths.value ().defined ()) {
79+ create_acltensor (&actual_seq_lengths_ids, actual_seq_lengths.value ());
80+ }
81+ if (ssm_state_indices.has_value () && ssm_state_indices.value ().defined ()) {
82+ create_acltensor (&ssm_state_indices_ids, ssm_state_indices.value ());
83+ }
84+ if (num_accepted_tokens.has_value () &&
85+ num_accepted_tokens.value ().defined ()) {
86+ create_acltensor (&num_accepted_tokens_ids, num_accepted_tokens.value ());
87+ }
88+ if (g.has_value () && g.value ().defined ()) {
89+ create_acltensor (&g_ids, g.value ());
90+ }
91+ if (gk.has_value () && gk.value ().defined ()) {
92+ create_acltensor (&gk_ids, gk.value ());
93+ }
94+
95+ at::Tensor out_result = at::empty_like (value);
96+ create_acltensor (&out_ids, out_result);
97+
98+ float scale_value = static_cast <float >(scale.value ());
99+
100+ uint64_t workspace_size = 0 ;
101+ aclOpExecutor* executor = nullptr ;
102+
103+ CHECK_ACL_SUCCESS (
104+ aclnnRecurrentGatedDeltaRuleGetWorkspaceSize (query_ids,
105+ key_ids,
106+ value_ids,
107+ beta_ids,
108+ state_ids,
109+ actual_seq_lengths_ids,
110+ ssm_state_indices_ids,
111+ g_ids,
112+ gk_ids,
113+ num_accepted_tokens_ids,
114+ scale_value,
115+ out_ids,
116+ &workspace_size,
117+ &executor),
118+ " recurrent_gated_delta_rule: failed to get workspace size" );
119+
120+ void * workspace_addr = nullptr ;
121+ if (workspace_size > 0 ) {
122+ CHECK_ACL_SUCCESS (
123+ aclrtMalloc (&workspace_addr, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST),
124+ " recurrent_gated_delta_rule: failed to allocate workspace" );
125+ }
126+
127+ CHECK_ACL_SUCCESS (aclnnRecurrentGatedDeltaRule (
128+ workspace_addr, workspace_size, executor, stream),
129+ " recurrent_gated_delta_rule: failed to perform recurrent "
130+ " gated delta rule" );
131+
132+ aclDestroyTensor (query_ids);
133+ aclDestroyTensor (key_ids);
134+ aclDestroyTensor (value_ids);
135+ aclDestroyTensor (state_ids);
136+ aclDestroyTensor (out_ids);
137+
138+ if (beta_ids != nullptr ) {
139+ aclDestroyTensor (beta_ids);
140+ }
141+ if (actual_seq_lengths_ids != nullptr ) {
142+ aclDestroyTensor (actual_seq_lengths_ids);
143+ }
144+ if (ssm_state_indices_ids != nullptr ) {
145+ aclDestroyTensor (ssm_state_indices_ids);
146+ }
147+ if (num_accepted_tokens_ids != nullptr ) {
148+ aclDestroyTensor (num_accepted_tokens_ids);
149+ }
150+ if (g_ids != nullptr ) {
151+ aclDestroyTensor (g_ids);
152+ }
153+ if (gk_ids != nullptr ) {
154+ aclDestroyTensor (gk_ids);
155+ }
156+
157+ if (workspace_size > 0 ) {
158+ CHECK_ACL_SUCCESS (aclrtFree (workspace_addr),
159+ " recurrent_gated_delta_rule: failed to free workspace" );
160+ }
161+
162+ return out_result;
163+ }
164+
165+ } // namespace xllm::kernel::npu