feat - support xqa spec

zerozw · zerozw · commit 68ff0898bca6 · 2026-04-07T14:14:05.000+08:00
diff --git a/rtp_llm/cpp/cuda/ops/CudaXqa.h b/rtp_llm/cpp/cuda/ops/CudaXqa.h
@@ -13,6 +13,8 @@ struct XQAParams: public ParamsBase {
     size_t        max_seq_len;
     torch::Tensor kv_cache_offset;
     torch::Tensor sequence_lengths;
+    torch::Tensor q_cu_seqlens;
+    size_t        max_q_len{1};
 };
 
 using XQAParamsPtr = std::shared_ptr<XQAParams>;
diff --git a/rtp_llm/models_py/bindings/cuda/XQAAttnOp.cc b/rtp_llm/models_py/bindings/cuda/XQAAttnOp.cc
@@ -7,8 +7,12 @@
 namespace rtp_llm {
 
 XQAAttnOp::XQAAttnOp(const AttentionConfigs& attn_configs): attn_configs_(attn_configs) {}
+XQASpecAttnOp::XQASpecAttnOp(const AttentionConfigs& attn_configs): attn_configs_(attn_configs) {}
 
 bool XQAAttnOp::support(torch_ext::PyAttentionInputs attn_inputs) {
+    if (attn_inputs.is_target_verify) {
+        return false;
+    }
     return attn_configs_.kv_cache_dtype != KvCacheDataType::INT8 && get_sm() >= tensorrt_llm::kernels::kSM_90
            && supportXqa(DataType::TYPE_BF16,
                          DataType::TYPE_BF16,
@@ -41,6 +45,51 @@ ParamsBasePtr XQAAttnOp::prepare(torch_ext::PyAttentionInputs attn_inputs) {
     return ParamsBasePtr(params);
 }
 
+bool XQASpecAttnOp::support(torch_ext::PyAttentionInputs attn_inputs) {
+    if (!attn_inputs.is_target_verify || !attn_inputs.decode_cu_seqlens_d.defined()
+        || attn_inputs.decode_cu_seqlens_d.numel() <= 1 || attn_configs_.kv_cache_dtype != KvCacheDataType::FP8
+        || get_sm() != tensorrt_llm::kernels::kSM_90) {
+        return false;
+    }
+    const auto input_type = attn_configs_.dtype == torch::kBFloat16 ? DataType::TYPE_BF16 : DataType::TYPE_FP16;
+    const auto kv_type    = DataType::TYPE_FP8_E4M3;
+    return supportXqa(input_type,
+                      input_type,
+                      kv_type,
+                      attn_configs_.head_num / attn_configs_.kv_head_num,
+                      attn_configs_.size_per_head,
+                      attn_configs_.kernel_tokens_per_block);
+}
+
+ParamsBasePtr XQASpecAttnOp::prepare(torch_ext::PyAttentionInputs attn_inputs) {
+    XQAParamsPtr params     = std::make_shared<XQAParams>();
+    int          batch_size = attn_inputs.sequence_lengths.size(0);
+    RTP_LLM_CHECK_WITH_INFO(attn_inputs.kv_cache_kernel_block_id_host.defined()
+                                && attn_inputs.kv_cache_kernel_block_id_device.defined(),
+                            "decode should have kv cache block id.");
+
+    auto run_stream   = at::cuda::getCurrentCUDAStream(at::cuda::current_device()).stream();
+    bool use_fp8_fmha = attn_configs_.kv_cache_dtype == KvCacheDataType::FP8;
+    auto trt_params   = prepareTrtAttnParams(
+        attn_configs_, attn_inputs.kv_cache_kernel_block_id_device, batch_size, use_fp8_fmha, run_stream, false);
+    params->kv_block_array  = ((TRTAttn*)trt_params.get())->kv_block_array;
+    params->kv_cache_offset = ((TRTAttn*)trt_params.get())->kv_cache_offset.clone();
+    params->batch_size      = batch_size;
+    params->sequence_lengths =
+        (attn_inputs.sequence_lengths + attn_inputs.input_lengths)
+            .to(torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA), /*non_blocking=*/true);
+    params->q_cu_seqlens = attn_inputs.decode_cu_seqlens_d;
+    params->max_q_len    = static_cast<size_t>(
+        (attn_inputs.decode_cu_seqlens_d.slice(0, 1) - attn_inputs.decode_cu_seqlens_d.slice(0, 0, -1))
+            .max()
+            .item<int32_t>());
+    params->max_seq_len =
+        attn_inputs.input_lengths.max().item<int32_t>() + attn_inputs.prefix_lengths.max().item<int32_t>();
+    params->kv_block_array.cache_type = attn_configs_.kv_cache_dtype;
+
+    return ParamsBasePtr(params);
+}
+
 torch::Tensor XQAAttnOp::forward(const torch::Tensor&                   input,
                                  std::optional<torch_ext::LayerKVCache> kv_cache,
                                  const XQAParamsPtr&                    params) {
@@ -80,6 +129,49 @@ torch::Tensor XQAAttnOp::forward(const torch::Tensor&                   input,
     return output;
 }
 
+torch::Tensor XQASpecAttnOp::forward(const torch::Tensor&                   input,
+                                     std::optional<torch_ext::LayerKVCache> kv_cache,
+                                     const XQAParamsPtr&                    params) {
+    const int            batch_size        = params->batch_size;
+    const int            local_head_num    = attn_configs_.head_num;
+    const int            local_head_num_kv = attn_configs_.kv_head_num;
+    const int            size_per_head     = attn_configs_.size_per_head;
+    torch::TensorOptions options           = torch::TensorOptions(input.dtype()).device(input.device());
+    torch::Tensor        output =
+        torch::empty({batch_size, static_cast<int64_t>(params->max_q_len), local_head_num, size_per_head}, options);
+
+    KVBlockArray kv_block_array;
+    if (kv_cache.has_value()) {
+        kv_block_array                 = params->kv_block_array;
+        kv_block_array.mPrimaryPoolPtr = kv_cache.value().kv_cache_base.data_ptr();
+        if (kv_cache.value().kv_scale_base.defined() && kv_cache.value().kv_scale_base.numel() > 0) {
+            kv_block_array.scale = kv_cache.value().kv_scale_base.data_ptr();
+        }
+    }
+
+    RTP_LLM_CHECK_WITH_INFO(kv_cache.has_value(), "spec decode should have kv cache.");
+
+    torch::Tensor xqa_input = input.contiguous();
+    runXqa(xqa_input.data_ptr(),
+           input.dtype() == torch::kBFloat16,
+           output.data_ptr(),
+           local_head_num,
+           local_head_num_kv,
+           size_per_head,
+           params->batch_size,
+           static_cast<size_t>(kv_block_array.mMaxBlocksPerSeq),
+           params->max_seq_len,
+           attn_configs_.kernel_tokens_per_block,
+           kv_block_array.mPrimaryPoolPtr,
+           reinterpret_cast<int32_t*>((KVCacheIndex*)(params->kv_cache_offset.data_ptr())),
+           kv_block_array.cache_type == KvCacheDataType::FP8,
+           reinterpret_cast<uint32_t*>(params->sequence_lengths.data_ptr()),
+           nullptr,
+           params->max_q_len,
+           params->q_cu_seqlens.data_ptr());
+    return output;
+}
+
 void registerXQAAttnOp(const py::module& m) {
     pybind11::class_<XQAParams, std::shared_ptr<XQAParams>, rtp_llm::ParamsBase>(m, "XQAParams")
         .def(pybind11::init<>())
@@ -93,6 +185,11 @@ void registerXQAAttnOp(const py::module& m) {
         .def("support", &XQAAttnOp::support, py::arg("attn_inputs").noconvert())
         .def("prepare", &XQAAttnOp::prepare, py::arg("attn_inputs"))
         .def("forward", &XQAAttnOp::forward, py::arg("input"), py::arg("kv_cache"), py::arg("params"));
+    pybind11::class_<XQASpecAttnOp>(m, "XQASpecAttnOp")
+        .def(pybind11::init<const AttentionConfigs&>(), py::arg("attn_configs"))
+        .def("support", &XQASpecAttnOp::support, py::arg("attn_inputs").noconvert())
+        .def("prepare", &XQASpecAttnOp::prepare, py::arg("attn_inputs"))
+        .def("forward", &XQASpecAttnOp::forward, py::arg("input"), py::arg("kv_cache"), py::arg("params"));
 }
 
 }  // namespace rtp_llm
diff --git a/rtp_llm/models_py/bindings/cuda/XQAAttnOp.h b/rtp_llm/models_py/bindings/cuda/XQAAttnOp.h
@@ -24,6 +24,20 @@ class XQAAttnOp {
     AttentionConfigs attn_configs_;
 };
 
+class XQASpecAttnOp {
+public:
+    XQASpecAttnOp(const AttentionConfigs& attn_configs);
+    bool support(torch_ext::PyAttentionInputs attn_inputs);
+
+    ParamsBasePtr prepare(torch_ext::PyAttentionInputs attn_inputs);
+
+    torch::Tensor
+    forward(const torch::Tensor& input, std::optional<torch_ext::LayerKVCache> kv_cache, const XQAParamsPtr& params);
+
+protected:
+    AttentionConfigs attn_configs_;
+};
+
 void registerXQAAttnOp(const py::module& m);
 
 }  // namespace rtp_llm
diff --git a/rtp_llm/models_py/modules/factory/attention/__init__.py b/rtp_llm/models_py/modules/factory/attention/__init__.py
@@ -69,6 +69,7 @@
         )
         from rtp_llm.models_py.modules.factory.attention.cuda_impl.xqa import (
             get_xqa_impl,
+            XQASpecImpl,
         )
 
         PREFILL_MHA_IMPS.extend(
@@ -77,6 +78,7 @@
                 HeadWisePrefillImpl,
                 FlashInferTRTLLMSpecDecodeImpl,
                 FlashInferTRTLLMPrefillImpl,
+                XQASpecImpl,
                 TRTMHAImpl,
                 PyFlashinferPrefillImpl,
                 PyFlashinferPagedPrefillImpl,
diff --git a/rtp_llm/models_py/modules/factory/attention/cuda_impl/test/base_attention_test.py b/rtp_llm/models_py/modules/factory/attention/cuda_impl/test/base_attention_test.py
@@ -5,7 +5,7 @@
 
 import torch
 
-from rtp_llm.ops import AttentionConfigs, ParallelismConfig
+from rtp_llm.ops import AttentionConfigs, KvCacheDataType, ParallelismConfig
 from rtp_llm.ops.compute_ops import LayerKVCache, PyAttentionInputs, get_typemeta
 
 logging.basicConfig(level=logging.INFO, format="%(message)s")
@@ -72,6 +72,7 @@ def _create_config(
         seq_size_per_block: int = 64,
         tp_size: int = 1,
         data_type: str = "fp16",
+        kv_cache_dtype: KvCacheDataType = KvCacheDataType.BASE,
     ) -> TestConfig:
         """Helper to create a test config"""
         attn_configs = AttentionConfigs()
@@ -89,6 +90,7 @@ def _create_config(
             "bf16": torch.bfloat16,
         }
         attn_configs.dtype = dtype_map.get(data_type, torch.float16)
+        attn_configs.kv_cache_dtype = kv_cache_dtype
 
         parallelism_config = ParallelismConfig()
         parallelism_config.tp_size = tp_size
@@ -263,15 +265,24 @@ def _create_kv_cache(
 
         # Create combined KV cache with shape [total_blocks, 2, num_kv_heads, seq_size_per_block, head_dim]
         # where dim=1, index=0 is K and index=1 is V
+        sample_dtype = torch.bfloat16 if dtype == torch.float8_e4m3fn else dtype
         kv_cache_combined = torch.randn(
             total_blocks,
             2,  # K and V
             num_kv_heads,
             seq_size_per_block,
             head_dim,
-            dtype=dtype,
+            dtype=sample_dtype,
             device=self.device,
         )
+        if dtype == torch.float8_e4m3fn:
+            kv_cache_combined = kv_cache_combined.to(dtype)
+            kv_cache.kv_scale_base = torch.ones(
+                total_blocks,
+                num_kv_heads * seq_size_per_block,
+                dtype=torch.float32,
+                device=self.device,
+            )
 
         kv_cache.kv_cache_base = kv_cache_combined
 
diff --git a/rtp_llm/models_py/modules/factory/attention/cuda_impl/test/test_xqa.py b/rtp_llm/models_py/modules/factory/attention/cuda_impl/test/test_xqa.py
diff --git a/rtp_llm/models_py/modules/factory/attention/cuda_impl/xqa.py b/rtp_llm/models_py/modules/factory/attention/cuda_impl/xqa.py

Original file line number	Diff line number	Diff line change
`@@ -69,6 +69,7 @@`
`69`	`69`	`)`
`70`	`70`	`from rtp_llm.models_py.modules.factory.attention.cuda_impl.xqa import (`
`71`	`71`	`get_xqa_impl,`
	`72`	`+ XQASpecImpl,`
`72`	`73`	`)`
`73`	`74`
`74`	`75`	`PREFILL_MHA_IMPS.extend(`
`@@ -77,6 +78,7 @@`
`77`	`78`	`HeadWisePrefillImpl,`
`78`	`79`	`FlashInferTRTLLMSpecDecodeImpl,`
`79`	`80`	`FlashInferTRTLLMPrefillImpl,`
	`81`	`+ XQASpecImpl,`
`80`	`82`	`TRTMHAImpl,`
`81`	`83`	`PyFlashinferPrefillImpl,`
`82`	`84`	`PyFlashinferPagedPrefillImpl,`