alibaba
diff --git a/rtp_llm/cpp/core/BUILD b/rtp_llm/cpp/core/BUILD
Lines changed: 1 addition & 0 deletions
diff --git a/rtp_llm/cpp/core/ExecOps.h b/rtp_llm/cpp/core/ExecOps.h
Lines changed: 4 additions & 0 deletions
diff --git a/rtp_llm/cpp/cuda/ops/CudaFlashInfer.cc b/rtp_llm/cpp/cuda/ops/CudaFlashInfer.cc
Lines changed: 5 additions & 1 deletion
diff --git a/rtp_llm/cpp/cuda_graph/BUILD b/rtp_llm/cpp/cuda_graph/BUILD
Lines changed: 3 additions & 0 deletions
diff --git a/rtp_llm/cpp/cuda_graph/cuda_graph_prefill.cc b/rtp_llm/cpp/cuda_graph/cuda_graph_prefill.cc
Lines changed: 9 additions & 7 deletions
@@ -184,6 +184,7 @@ cc_library(
         ":event",
         "//rtp_llm/cpp/config:config_modules",
         "//rtp_llm/cpp/models:stats",
+        "//rtp_llm/models_py/bindings/common/kernels:fuse_copy_util",
     ] + torch_deps() + select({
         "@//:using_rocm": ["@local_config_rocm//rocm:rocm_headers"],
         "//conditions:default": [],
 
@@ -5,6 +5,7 @@
 #include "rtp_llm/cpp/core/Event.h"
 #include "rtp_llm/cpp/config/ConfigModules.h"
 #include "rtp_llm/cpp/models/eplb/stats/ExpertStats.h"
+#include "rtp_llm/models_py/bindings/common/kernels/fuse_copy_util.h"
 
 #include <memory>
 #include <atomic>
@@ -92,6 +93,9 @@ void execNoBlockCopy(const CopyParams& params);
 void execBatchCopy(const BatchCopyParams& params);
 void execMultiMergeCopy(const MultiMergeCopyParams& params);
 
+void fusedCopy(const FusedD2DCopyParams& params);
+void fusedStridedCopy(const FusedStridedCopyParams& params);
+
 // ===================================================================
 // Sample ops
 // ===================================================================
 
@@ -76,7 +76,10 @@ FlashInferAttnParams::allocateManyBuffer(const std::vector<std::vector<int64_t>>
     auto buf_options = torch::TensorOptions(torch::kInt32);
     if (atype == AllocationType::DEVICE) {
         buf_options = buf_options.device(torch::kCUDA);
+    } else {
+        buf_options = buf_options.device(torch::kCPU).pinned_memory(true);
     }
+
     auto buf = torch::empty({(int64_t)total_size}, buf_options);
 
     size_t offset = 0;
@@ -104,7 +107,8 @@ FlashInferAttnParams* FlashInferAttnParams::create(int batch_size, int input_tok
     params->float_workspace_ =
         torch::empty({128 * 1024 * 1024}, torch::TensorOptions(torch::kInt8).device(torch::kCUDA));
     params->int_workspace_ = torch::empty({8 * 1024 * 1024}, torch::TensorOptions(torch::kInt8).device(torch::kCUDA));
-    params->int_host_workspace_ = torch::empty({8 * 1024 * 1024}, torch::kInt8);
+    params->int_host_workspace_ =
+        torch::empty({8 * 1024 * 1024}, torch::TensorOptions(torch::kInt8).device(torch::kCPU).pinned_memory(true));
 
     params->float_workspace_d = params->float_workspace_;
     params->int_workspace_d   = params->int_workspace_;
 
@@ -66,9 +66,12 @@ cc_library(
     deps = torch_deps() + [
         ":cuda_graph_base",
         ":cuda_graph_hdrs_lib",
+        "//rtp_llm/cpp/core:exec_ops_hdr",
         "//rtp_llm/cpp/utils:core_utils",
         "//rtp_llm/cpp/utils:profiling_scope",
         "//rtp_llm/models_py/bindings:op_defs",
+        "//rtp_llm/models_py/bindings/common/kernels:fuse_copy_util",
+        "//rtp_llm/models_py/bindings/common:fuse_copy_op",
     ] + select({
         "//:using_cuda": [
             "//rtp_llm/cpp/cuda:cuda",
 
@@ -23,19 +23,21 @@ void CudaGraphRunner::capturePrefill() {
             inputs.attention_inputs.prefix_lengths.fill_(0);
             // Must set cu_seqlens/cu_kv_seqlens/input_lengths to match actual seq_len,
             // otherwise FlashInfer plans for max_seq_len tokens but q/k/v only have seq_len tokens
-            inputs.attention_inputs.cu_seqlens.data_ptr<int>()[0]    = 0;
-            inputs.attention_inputs.cu_seqlens.data_ptr<int>()[1]    = seq_len;
-            inputs.attention_inputs.input_lengths.data_ptr<int>()[0] = seq_len;
+            inputs.attention_inputs.cu_seqlens_host[0] = 0;
+            inputs.attention_inputs.cu_seqlens_host[1] = seq_len;
+            inputs.attention_inputs.cu_seqlens.copy_(inputs.attention_inputs.cu_seqlens_host, false);
+            inputs.attention_inputs.input_lengths[0] = seq_len;
         } else {
-            inputs.attention_inputs.cu_seqlens.fill_(seq_len);
+            inputs.attention_inputs.cu_seqlens_host.fill_(seq_len);
+            inputs.attention_inputs.cu_seqlens_host[0] = 0;
+            inputs.attention_inputs.cu_seqlens.copy_(inputs.attention_inputs.cu_seqlens_host, false);
             inputs.attention_inputs.input_lengths.fill_(0);
             int kv_len     = max_seq_len_ + seq_len;
             int prefix_len = kv_len;
             inputs.attention_inputs.cu_kv_seqlens.fill_(kv_len);
+            inputs.attention_inputs.cu_kv_seqlens[0] = 0;
             inputs.attention_inputs.prefix_lengths.fill_(prefix_len);
-            inputs.attention_inputs.cu_seqlens.data_ptr<int>()[0]    = 0;
-            inputs.attention_inputs.cu_kv_seqlens.data_ptr<int>()[0] = 0;
-            inputs.attention_inputs.input_lengths.data_ptr<int>()[0] = seq_len;
+            inputs.attention_inputs.input_lengths[0] = seq_len;
         }
 
         inputs.attention_inputs.context_total_kv_length = seq_len;