alibaba
diff --git a/‎rtp_llm/cpp/core/BUILD‎
Lines changed: 1 addition & 0 deletions b/‎rtp_llm/cpp/core/BUILD‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎rtp_llm/cpp/core/ExecOps.h‎
Lines changed: 4 additions & 0 deletions b/‎rtp_llm/cpp/core/ExecOps.h‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎rtp_llm/cpp/cuda/CudaGraph/CudaGraphRunner.cc‎
Lines changed: 164 additions & 104 deletions b/‎rtp_llm/cpp/cuda/CudaGraph/CudaGraphRunner.cc‎
Lines changed: 164 additions & 104 deletions
diff --git a/‎rtp_llm/cpp/cuda/CudaGraph/CudaGraphRunner.h‎
Lines changed: 0 additions & 1 deletion b/‎rtp_llm/cpp/cuda/CudaGraph/CudaGraphRunner.h‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎rtp_llm/cpp/cuda/CudaGraph/CudaGraphUtils.cc‎
Lines changed: 1 addition & 0 deletions b/‎rtp_llm/cpp/cuda/CudaGraph/CudaGraphUtils.cc‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎rtp_llm/cpp/cuda/CudaGraph/CudaGraphUtils.h‎
Lines changed: 3 additions & 0 deletions b/‎rtp_llm/cpp/cuda/CudaGraph/CudaGraphUtils.h‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎rtp_llm/cpp/cuda/ops/CudaFlashInfer.cc‎
Lines changed: 5 additions & 1 deletion b/‎rtp_llm/cpp/cuda/ops/CudaFlashInfer.cc‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎rtp_llm/cpp/kernels/BUILD‎
Lines changed: 40 additions & 0 deletions b/‎rtp_llm/cpp/kernels/BUILD‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎rtp_llm/cpp/kernels/copy_utils.h‎
Lines changed: 1 addition & 1 deletion b/‎rtp_llm/cpp/kernels/copy_utils.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎rtp_llm/cpp/kernels/fuse_copy/fuse_copy_kernel.cu‎
Lines changed: 88 additions & 0 deletions b/‎rtp_llm/cpp/kernels/fuse_copy/fuse_copy_kernel.cu‎
Lines changed: 88 additions & 0 deletions
@@ -184,6 +184,7 @@ cc_library(
         ":event",
         "//rtp_llm/cpp/config:config_modules",
         "//rtp_llm/cpp/models:stats",
+        "//rtp_llm/cpp/kernels:fuse_copy_util",
     ] + torch_deps() + select({
         "@//:using_rocm": ["@local_config_rocm//rocm:rocm_headers"],
         "//conditions:default": [],
 
@@ -5,6 +5,7 @@
 #include "rtp_llm/cpp/core/Event.h"
 #include "rtp_llm/cpp/config/ConfigModules.h"
 #include "rtp_llm/cpp/models/eplb/stats/ExpertStats.h"
+#include "rtp_llm/cpp/kernels/fuse_copy/util.h"
 
 #include <memory>
 #include <atomic>
@@ -93,6 +94,9 @@ void execNoBlockCopy(const MultiCopyParams& params);
 void execBatchCopy(const BatchCopyParams& params);
 void execMultiMergeCopy(const MultiMergeCopyParams& params);
 
+void fusedCopy(const FusedD2DCopyParams& params);
+void fusedStridedCopy(const FusedStridedCopyParams& params);
+
 // ===================================================================
 // Sample ops
 // ===================================================================
 
@@ -102,7 +102,6 @@ class CudaGraphRunner: public GraphBase {
     void              setInputEmbeddingScalar(float input_embedding_scalar) override;
 
 private:
-    void             copySmallerIntoLarger(const torch::Tensor& source_tensor, torch::Tensor& target_tensor);
     std::vector<int> getDecodeBatchSizesToCapture();
     std::vector<int> getPrefillSequenceLengthsToCapture();
     /// Select graph key for decode; false if no captured graph can serve current_batch_size (e.g. lower_bound hit end).
 
@@ -103,6 +103,7 @@ void debugPrintPyModelInputs(const PyModelInputs& inputs) {
     printTensorInfo("kv_cache_block_id_host", inputs.attention_inputs.kv_cache_block_id_host, 40);
     printTensorInfo("kv_cache_block_id_device", inputs.attention_inputs.kv_cache_block_id_device, 40);
     printTensorInfo("cu_seqlens", inputs.attention_inputs.cu_seqlens);
+    printTensorInfo("cu_seqlens_host", inputs.attention_inputs.cu_seqlens_host);
     printTensorInfo("cu_kv_seqlens", inputs.attention_inputs.cu_kv_seqlens);
     printTensorInfo("sequence_lengths_plus_1_d", inputs.attention_inputs.sequence_lengths_plus_1_d);
     printTensorInfo("decode_cu_seqlens_d", inputs.attention_inputs.decode_cu_seqlens_d);
 
@@ -29,6 +29,7 @@ class CaptureMemoryHold {
     CaptureMemoryHold(at::Tensor hidden_states, PyModelInputs& inputs, bool is_embedding):
         decoder_layer_hidden_states_(hidden_states) {
         py_model_inputs_.attention_inputs.input_lengths    = inputs.attention_inputs.input_lengths;
+        py_model_inputs_.attention_inputs.input_lengths_d  = inputs.attention_inputs.input_lengths_d;
         py_model_inputs_.attention_inputs.sequence_lengths = inputs.attention_inputs.sequence_lengths;
         py_model_inputs_.attention_inputs.kv_cache_kernel_block_id_device =
             inputs.attention_inputs.kv_cache_kernel_block_id_device;
@@ -40,11 +41,13 @@ class CaptureMemoryHold {
             inputs.attention_inputs.kv_cache_kernel_block_id_host_by_group;
         py_model_inputs_.attention_inputs.kv_cache_layer_to_group = inputs.attention_inputs.kv_cache_layer_to_group;
         py_model_inputs_.attention_inputs.prefix_lengths          = inputs.attention_inputs.prefix_lengths;
+        py_model_inputs_.attention_inputs.prefix_lengths_d        = inputs.attention_inputs.prefix_lengths_d;
         py_model_inputs_.input_ids                                = inputs.input_ids;
 
         // for spec
         py_model_inputs_.input_hiddens                            = inputs.input_hiddens;
         py_model_inputs_.attention_inputs.cu_seqlens              = inputs.attention_inputs.cu_seqlens;
+        py_model_inputs_.attention_inputs.cu_seqlens_host         = inputs.attention_inputs.cu_seqlens_host;
         py_model_inputs_.attention_inputs.cu_kv_seqlens           = inputs.attention_inputs.cu_kv_seqlens;
         py_model_inputs_.attention_inputs.padding_offset          = inputs.attention_inputs.padding_offset;
         py_model_inputs_.attention_inputs.is_prefill              = inputs.attention_inputs.is_prefill;
 
@@ -76,7 +76,10 @@ FlashInferAttnParams::allocateManyBuffer(const std::vector<std::vector<int64_t>>
     auto buf_options = torch::TensorOptions(torch::kInt32);
     if (atype == AllocationType::DEVICE) {
         buf_options = buf_options.device(torch::kCUDA);
+    } else {
+        buf_options = buf_options.device(torch::kCPU).pinned_memory(true);
     }
+
     auto buf = torch::empty({(int64_t)total_size}, buf_options);
 
     size_t offset = 0;
@@ -104,7 +107,8 @@ FlashInferAttnParams* FlashInferAttnParams::create(int batch_size, int input_tok
     params->float_workspace_ =
         torch::empty({128 * 1024 * 1024}, torch::TensorOptions(torch::kInt8).device(torch::kCUDA));
     params->int_workspace_ = torch::empty({8 * 1024 * 1024}, torch::TensorOptions(torch::kInt8).device(torch::kCUDA));
-    params->int_host_workspace_ = torch::empty({8 * 1024 * 1024}, torch::kInt8);
+    params->int_host_workspace_ =
+        torch::empty({8 * 1024 * 1024}, torch::TensorOptions(torch::kInt8).device(torch::kCPU).pinned_memory(true));
 
     params->float_workspace_d = params->float_workspace_;
     params->int_workspace_d   = params->int_workspace_;
 
@@ -489,6 +489,45 @@ cc_library(
     visibility = ["//visibility:public"],
 )
 
+cc_library(  # Header-only target: no srcs are listed, it only exports fuse_copy/util.h.
+    name = "fuse_copy_util",
+    hdrs = [
+        "fuse_copy/util.h",
+    ],
+    deps = any_cuda_deps + [
+        "//rtp_llm/cpp/utils:core_utils",
+    ],
+    copts = any_cuda_copts(),
+    include_prefix = "src",  # NOTE(review): includes in this change spell the header as "rtp_llm/cpp/kernels/fuse_copy/util.h" - confirm this prefix is intended.
+    visibility = ["//visibility:public"],
+)
+
+cc_library(  # Compiles fuse_copy/fuse_copy_kernel.cu and exports its header.
+    name = "fuse_copy_kernel",
+    srcs = [
+        "fuse_copy/fuse_copy_kernel.cu",
+    ],
+    hdrs = [
+        "fuse_copy/fuse_copy_kernel.h",
+    ],
+    deps = any_cuda_deps + [
+        ":fuse_copy_util",  # Declared above as the target that exports fuse_copy/util.h.
+        "//rtp_llm/cpp/utils:core_utils",
+    ],
+    copts = any_cuda_copts(),
+    include_prefix = "src",  # NOTE(review): the .cu file includes its header via the "rtp_llm/cpp/kernels/fuse_copy/..." path - confirm this prefix is intended.
+    visibility = ["//visibility:public"],
+)
+
+cc_library(  # Aggregation target: no srcs/hdrs of its own, it only forwards the two fuse_copy targets.
+    name = "kernels_fused_copy",
+    deps = [
+        ":fuse_copy_kernel",
+        ":fuse_copy_util",
+    ],
+    visibility = ["//visibility:public"],
+)
+
 cc_library(
     name = "kernels_copy",
     srcs = [
@@ -576,6 +615,7 @@ cc_library(
         ":kernels_embedding",
         ":kernels_tensor_ops",
         ":kernels_kv_cache",
+        ":kernels_fused_copy",
         ":kernels_copy",
         ":kernels_moe",
         ":kernels_mla",
 
@@ -3,7 +3,7 @@
 #include <assert.h>
 #include <vector>
 
-#if USEING_CUDA
+#if USING_CUDA
 #include <cuda_runtime.h>
 #endif
 
 
@@ -0,0 +1,88 @@
+#include <cstdint>
+#include <cstddef>
+
+#include "rtp_llm/cpp/kernels/fuse_copy/fuse_copy_kernel.h"
+
+namespace rtp_llm {
+
+static constexpr int FUSED_COPY_BLOCKS_PER_TASK = 8;
+static constexpr int FUSED_COPY_THREADS         = 256;
+
+// Performs params.num_copies independent contiguous byte copies in one launch.
+//
+// Launch layout: blockIdx.y selects the copy task (rows at or past num_copies exit
+// immediately); all blocks along x cooperate on that task's bytes with a grid-stride
+// loop over the x dimension.
+//
+// For task t, params.size[t] bytes are copied from params.src[t] to params.dst[t].
+// When both addresses are multiples of sizeof(int4), the bulk is moved as int4
+// elements and the leftover tail bytes are copied by the threads of block x == 0;
+// otherwise the whole range is copied one byte at a time.
+__global__ void fusedCopyKernel(FusedD2DCopyParams params) {
+    const int task = blockIdx.y;
+    if (task >= params.num_copies) {
+        return;
+    }
+
+    const size_t nbytes = params.size[task];
+    const size_t first  = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+    const size_t step   = static_cast<size_t>(gridDim.x) * blockDim.x;
+
+    const auto  src_bits  = reinterpret_cast<uintptr_t>(params.src[task]);
+    const auto  dst_bits  = reinterpret_cast<uintptr_t>(params.dst[task]);
+    const char* in_bytes  = reinterpret_cast<const char*>(src_bits);
+    char*       out_bytes = reinterpret_cast<char*>(dst_bits);
+
+    // sizeof(int4) is a power of two, so OR-ing the two addresses and masking the low
+    // bits checks the alignment of source and destination in one step.
+    const bool vectorizable = ((src_bits | dst_bits) & (sizeof(int4) - 1)) == 0;
+    if (!vectorizable) {
+        // Fallback for unaligned pointers: plain byte-wise copy.
+        for (size_t pos = first; pos < nbytes; pos += step) {
+            out_bytes[pos] = in_bytes[pos];
+        }
+        return;
+    }
+
+    // Aligned case: move whole int4 elements first.
+    const int4*  in_vec  = reinterpret_cast<const int4*>(src_bits);
+    int4*        out_vec = reinterpret_cast<int4*>(dst_bits);
+    const size_t nvec    = nbytes / sizeof(int4);
+    for (size_t v = first; v < nvec; v += step) {
+        out_vec[v] = in_vec[v];
+    }
+
+    // Only the first block along x handles the bytes that do not fill a whole int4.
+    if (blockIdx.x != 0) {
+        return;
+    }
+    for (size_t pos = nvec * sizeof(int4) + threadIdx.x; pos < nbytes; pos += blockDim.x) {
+        out_bytes[pos] = in_bytes[pos];
+    }
+}
+
+// Enqueues fusedCopyKernel on `stream`.
+//
+// The grid is FUSED_COPY_BLOCKS_PER_TASK blocks wide (x) and params.num_copies rows
+// tall (y), with FUSED_COPY_THREADS threads per block. Nothing is launched when
+// params.num_copies is not positive. The launch is not followed by any
+// synchronization or error query here.
+void invokeFusedCopy(const FusedD2DCopyParams& params, cudaStream_t stream) {
+    const bool has_work = params.num_copies > 0;
+    if (has_work) {
+        const dim3 blocks(FUSED_COPY_BLOCKS_PER_TASK, params.num_copies);
+        const dim3 threads(FUSED_COPY_THREADS);
+        fusedCopyKernel<<<blocks, threads, 0, stream>>>(params);
+    }
+}
+
+// Performs params.num_copies independent row-strided byte copies in one launch.
+//
+// Launch layout: blockIdx.y selects the copy task (rows at or past num_copies exit
+// immediately); all blocks along x cooperate on that task with a grid-stride loop
+// over the x dimension.
+//
+// For task t, num_rows[t] rows of row_bytes[t] bytes each are copied. Row r starts at
+// byte offset r * src_row_stride[t] in the source and r * dst_row_stride[t] in the
+// destination (both strides are applied to byte pointers). Each loop iteration moves
+// exactly one byte.
+__global__ void fusedStridedCopyKernel(FusedStridedCopyParams params) {
+    const int task = blockIdx.y;
+    if (task >= params.num_copies) {
+        return;
+    }
+
+    const char* in  = reinterpret_cast<const char*>(params.src[task]);
+    char*       out = reinterpret_cast<char*>(params.dst[task]);
+
+    const size_t row_count = params.num_rows[task];
+    const size_t row_len   = params.row_bytes[task];
+    const size_t in_pitch  = params.src_row_stride[task];
+    const size_t out_pitch = params.dst_row_stride[task];
+
+    // Total number of payload bytes; zero when either dimension is zero, in which
+    // case the loop below never runs (and never divides by row_len).
+    const size_t payload = row_count * row_len;
+    const size_t step    = static_cast<size_t>(gridDim.x) * blockDim.x;
+    size_t       flat    = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+
+    while (flat < payload) {
+        // Split the flat payload index into (row, byte-within-row).
+        const size_t r         = flat / row_len;
+        const size_t c         = flat - r * row_len;
+        out[r * out_pitch + c] = in[r * in_pitch + c];
+        flat += step;
+    }
+}
+
+// Enqueues fusedStridedCopyKernel on `stream`.
+//
+// The grid is FUSED_COPY_BLOCKS_PER_TASK blocks wide (x) and params.num_copies rows
+// tall (y), with FUSED_COPY_THREADS threads per block. Nothing is launched when
+// params.num_copies is not positive. The launch is not followed by any
+// synchronization or error query here.
+void invokeFusedStridedCopy(const FusedStridedCopyParams& params, cudaStream_t stream) {
+    const bool has_work = params.num_copies > 0;
+    if (has_work) {
+        const dim3 blocks(FUSED_COPY_BLOCKS_PER_TASK, params.num_copies);
+        const dim3 threads(FUSED_COPY_THREADS);
+        fusedStridedCopyKernel<<<blocks, threads, 0, stream>>>(params);
+    }
+}
+
+}  // namespace rtp_llm