fix(fuse_copy): raise cap to 64 for micro-batch accumulation

Vinkle-hzt · Vinkle-hzt · commit 22735d02bb84 · 2026-04-23T14:41:08.000+08:00
diff --git a/rtp_llm/cpp/cuda_graph/cuda_graph_runner.cc b/rtp_llm/cpp/cuda_graph/cuda_graph_runner.cc
@@ -66,6 +66,10 @@ void CudaGraphRunner::prepareInputs(const PyModelInputs& inputs, CudaGraphState&
     auto& py_model_inputs_ = graph_instances_[graph_idx].mem_hold_.py_model_inputs_;
     auto  attn_pyobj       = graph_instances_[graph_idx].mem_hold_.attn_pyobj_;
 
+    // Per-launch capacity contract: see fuse_copy_util.h sizing rationale.
+    // Worst case here is ~8 contiguous + (1 + group_count) strided copies,
+    // batched into one launch each. If new copies are added below — or if the
+    // hybrid KV-cache group_count grows materially — re-check MAX_FUSED_*_COPIES.
     FusedD2DCopyParams     d2d_copies;
     FusedStridedCopyParams strided_d2d_copies;
 
diff --git a/rtp_llm/cpp/models/PyWrappedModel.cc b/rtp_llm/cpp/models/PyWrappedModel.cc
@@ -281,6 +281,15 @@ std::optional<PyCacheStoreInputs> PyWrappedModel::prepareWriteCacheParams(const
 GptModelOutputs PyWrappedModel::forwardMicroBatched(const GptModelInputs& inputs) {
     RTP_LLM_PROFILE_SCOPE("py_model.forwardMicroBatched");
 
+    // Per-launch capacity contract: see fuse_copy_util.h sizing rationale.
+    // d2d_copies_ accumulates across ALL micro-batches before the single
+    // fusedCopy() flush below. Per micro-batch this adds ~6 copies from
+    // buildPyAttentionInputs + padding_offset, plus group_count from
+    // setupKVCacheForAttentionInputs. With the planMicroBatches cap of 2
+    // micro-batches and hybrid group_count of 4 the worst case is ~20.
+    // If new tensorHoldHostAndToCuda call sites land below — or if
+    // planMicroBatches starts producing >2 micro-batches — re-check
+    // MAX_FUSED_D2D_COPIES.
     d2d_copies_.clear();
     if (pinned_check_remaining_ > 0) {
         --pinned_check_remaining_;
diff --git a/rtp_llm/models_py/bindings/common/kernels/fuse_copy_util.h b/rtp_llm/models_py/bindings/common/kernels/fuse_copy_util.h
@@ -1,12 +1,38 @@
 #pragma once
 #include <cstddef>
 #include <stdexcept>
+#include <string>
 
 namespace rtp_llm {
 
-// NOTE: Hardcoded limits for fused copies. It is enough for most cases. If you need more, please increase the limits.
-static constexpr int MAX_FUSED_D2D_COPIES     = 16;
-static constexpr int MAX_FUSED_STRIDED_COPIES = 16;
+// Hard caps on copies fused into a single kernel launch. The structs below
+// are passed by value as kernel parameters, so the arrays must be sized at
+// compile time.
+//
+// Sizing rationale (worst-case callers as of 2026):
+//   * cuda_graph_runner.cc::prepareInputs accumulates ~8 contiguous copies
+//     plus 1 + group_count strided copies per launch (one launch per replay).
+//   * PyWrappedModel.cc::forwardMicroBatched is the tightest path: it
+//     accumulates across ALL micro-batches before a single flush. Per
+//     micro-batch it adds ~6 contiguous copies (5 from buildPyAttentionInputs
+//     plus 1 padding_offset) plus `group_count` per-group block-id copies.
+//     With the current planMicroBatches cap of 2 micro-batches and a hybrid
+//     KV-cache group_count of 4 that's (6 + 4) * 2 = 20 contiguous copies.
+//
+// 64 entries gives ~3x headroom over today's worst case (20 contiguous, 5
+// strided); the micro-batched path fits (6 + G) * 2 <= 64, i.e. G <= 26 groups.
+// FusedStridedCopyParams is 6 * 8 * 64 + 4 = 3076 bytes (plus tail padding):
+// under the classic 4 KB kernel-parameter limit, and far under the ~32 KB
+// limit that requires CUDA 12.1+ on Volta and newer GPUs.
+//
+// If you need to raise these further: bump the constant, re-check the kernel
+// parameter buffer size for the lowest supported compute capability, and
+// extend the MaxFusedCopies / micro-batch unit tests accordingly. If the
+// upper bound ever needs to be unbounded, prefer adding a chunked-launch
+// helper (split into multiple param structs and launch each) over making the
+// arrays dynamic — the kernel signature must stay POD for grid launch.
+static constexpr int MAX_FUSED_D2D_COPIES     = 64;
+static constexpr int MAX_FUSED_STRIDED_COPIES = 64;
 
 inline void copyParamsAssert(bool value, const std::string& msg) {
     if (!value) {
@@ -22,7 +48,9 @@ struct FusedD2DCopyParams {
 
     void add(const void* src_ptr, void* dst_ptr, size_t bytes) {
         copyParamsAssert(num_copies < MAX_FUSED_D2D_COPIES,
-                         "FusedD2DCopyParams: num_copies exceeds MAX_FUSED_D2D_COPIES");
+                         "FusedD2DCopyParams: num_copies (" + std::to_string(num_copies + 1)
+                             + ") exceeds MAX_FUSED_D2D_COPIES (" + std::to_string(MAX_FUSED_D2D_COPIES)
+                             + "). Bump the cap in fuse_copy_util.h after re-checking the sizing rationale.");
         src[num_copies]  = src_ptr;
         dst[num_copies]  = dst_ptr;
         size[num_copies] = bytes;
@@ -45,7 +73,9 @@ struct FusedStridedCopyParams {
 
     void add(const void* src_ptr, void* dst_ptr, size_t rows, size_t row_b, size_t src_stride, size_t dst_stride) {
         copyParamsAssert(num_copies < MAX_FUSED_STRIDED_COPIES,
-                         "FusedStridedCopyParams: num_copies exceeds MAX_FUSED_STRIDED_COPIES");
+                         "FusedStridedCopyParams: num_copies (" + std::to_string(num_copies + 1)
+                             + ") exceeds MAX_FUSED_STRIDED_COPIES (" + std::to_string(MAX_FUSED_STRIDED_COPIES)
+                             + "). Bump the cap in fuse_copy_util.h after re-checking the sizing rationale.");
         src[num_copies]            = src_ptr;
         dst[num_copies]            = dst_ptr;
         num_rows[num_copies]       = rows;
diff --git a/rtp_llm/models_py/bindings/common/kernels/test/fuse_copy_kernel_test.cc b/rtp_llm/models_py/bindings/common/kernels/test/fuse_copy_kernel_test.cc
@@ -48,6 +48,17 @@ std::vector<T> deviceToHost(const T* d_ptr, size_t n) {
     return host;
 }
 
+// Allocate mapped, page-locked (pinned) host memory holding a copy of host_data. With UVA the pointer
+// can be passed straight into FusedD2DCopyParams as a kernel source. Returns nullptr on failure.
+template<typename T>
+T* pinnedHostAlloc(const std::vector<T>& host_data) {
+    T*                h_pinned = nullptr;
+    const cudaError_t err      = cudaHostAlloc(&h_pinned, host_data.size() * sizeof(T), cudaHostAllocMapped);
+    EXPECT_EQ(err, cudaSuccess) << "cudaHostAlloc failed: " << cudaGetErrorString(err);
+    if (err == cudaSuccess) std::memcpy(h_pinned, host_data.data(), host_data.size() * sizeof(T));
+    return err == cudaSuccess ? h_pinned : nullptr;  // never hand back a buffer that was not filled
+}
+
 }  // namespace
 
 // ---------------------------------------------------------------------------
@@ -219,6 +230,117 @@ TEST_F(FusedCopyTest, MaxFusedCopies) {
     }
 }
 
+// Documented worst-case contract: PyWrappedModel::forwardMicroBatched
+// accumulates copies across all micro-batches before a single flush. With
+// the planMicroBatches cap of 2 micro-batches and a hybrid KV-cache
+// group_count of 4, the total is (6 base + 4 group) * 2 = 20 copies.
+// The counts are mirrored here by hand, so this guards MAX_FUSED_D2D_COPIES
+// (static_assert + a real 20-copy launch), not the callers' own accounting.
+TEST_F(FusedCopyTest, MicroBatchedAccumulationWorstCase) {
+    constexpr int    NUM_MICRO_BATCHES  = 2;
+    constexpr int    BASE_COPIES_PER_MB = 6;
+    constexpr int    GROUP_COUNT        = 4;
+    constexpr int    COPIES_PER_MB      = BASE_COPIES_PER_MB + GROUP_COUNT;
+    constexpr int    TOTAL_COPIES       = NUM_MICRO_BATCHES * COPIES_PER_MB;  // 20
+    constexpr size_t N                  = 256;
+
+    static_assert(TOTAL_COPIES <= rtp_llm::MAX_FUSED_D2D_COPIES,
+                  "MAX_FUSED_D2D_COPIES is below the documented forwardMicroBatched worst case; "
+                  "see fuse_copy_util.h sizing rationale.");
+
+    std::vector<std::vector<uint8_t>> host_srcs(TOTAL_COPIES);
+    std::vector<uint8_t*>             d_srcs(TOTAL_COPIES);
+    std::vector<uint8_t*>             d_dsts(TOTAL_COPIES);
+
+    for (int c = 0; c < TOTAL_COPIES; ++c) {  // each copy gets its own byte pattern
+        host_srcs[c].resize(N);
+        for (size_t i = 0; i < N; ++i)
+            host_srcs[c][i] = static_cast<uint8_t>((c * 19 + i) & 0xFF);
+        d_srcs[c] = deviceAlloc(host_srcs[c]);
+        d_dsts[c] = deviceAllocZero<uint8_t>(N);
+    }
+
+    rtp_llm::FusedD2DCopyParams params;
+    for (int c = 0; c < TOTAL_COPIES; ++c)
+        params.add(d_srcs[c], d_dsts[c], N);
+
+    rtp_llm::invokeFusedCopy(params, stream_);  // all TOTAL_COPIES copies go through this one call
+    CUDA_CHECK(cudaStreamSynchronize(stream_));  // wait before reading the results back
+
+    for (int c = 0; c < TOTAL_COPIES; ++c) {
+        auto result = deviceToHost(d_dsts[c], N);
+        for (size_t i = 0; i < N; ++i)
+            ASSERT_EQ(result[i], host_srcs[c][i]) << "copy " << c << " mismatch at byte " << i;
+    }
+
+    for (int c = 0; c < TOTAL_COPIES; ++c) {  // not reached if an ASSERT_EQ above fails
+        cudaFree(d_srcs[c]);
+        cudaFree(d_dsts[c]);
+    }
+}
+
+// Copy from page-locked (pinned) host memory directly into device memory.
+// The kernel dereferences the source pointer on the GPU, so this exercises
+// the UVA path where pinned host memory is reachable from a CUDA kernel.
+TEST_F(FusedCopyTest, PinnedHostToDeviceCopy) {
+    constexpr size_t     N = 1024;  // multiple of 16 bytes; intended to hit the vectorised fast path
+    std::vector<uint8_t> host_src(N);
+    for (size_t i = 0; i < N; ++i)
+        host_src[i] = static_cast<uint8_t>((i * 5 + 1) & 0xFF);
+
+    uint8_t* h_src_pinned = pinnedHostAlloc(host_src);
+    uint8_t* d_dst        = deviceAllocZero<uint8_t>(N);
+
+    rtp_llm::FusedD2DCopyParams params;
+    params.add(h_src_pinned, d_dst, N);  // source is the pinned host pointer, destination is device memory
+
+    rtp_llm::invokeFusedCopy(params, stream_);
+    CUDA_CHECK(cudaStreamSynchronize(stream_));  // wait before reading the result back
+
+    auto result = deviceToHost(d_dst, N);
+    for (size_t i = 0; i < N; ++i)
+        ASSERT_EQ(result[i], host_src[i]) << "mismatch at byte " << i;
+
+    cudaFreeHost(h_src_pinned);  // memory from cudaHostAlloc is released with cudaFreeHost
+    cudaFree(d_dst);
+}
+
+// Mixed sources in a single fused launch: one copy reads from pinned host
+// memory, the other from device memory (meant to mirror batched production use).
+TEST_F(FusedCopyTest, MixedPinnedAndDeviceSrc) {
+    constexpr size_t N = 512;
+
+    std::vector<uint8_t> host_a(N), host_b(N);
+    for (size_t i = 0; i < N; ++i) {  // two different patterns so swapped destinations are detected
+        host_a[i] = static_cast<uint8_t>((i + 11) & 0xFF);
+        host_b[i] = static_cast<uint8_t>((i * 3 + 7) & 0xFF);
+    }
+
+    uint8_t* h_src_pinned = pinnedHostAlloc(host_a);  // pinned host source
+    uint8_t* d_src_dev    = deviceAlloc(host_b);      // device source
+    uint8_t* d_dst_a      = deviceAllocZero<uint8_t>(N);
+    uint8_t* d_dst_b      = deviceAllocZero<uint8_t>(N);
+
+    rtp_llm::FusedD2DCopyParams params;
+    params.add(h_src_pinned, d_dst_a, N);
+    params.add(d_src_dev, d_dst_b, N);
+
+    rtp_llm::invokeFusedCopy(params, stream_);  // both copies are issued by this one call
+    CUDA_CHECK(cudaStreamSynchronize(stream_));
+
+    auto result_a = deviceToHost(d_dst_a, N);
+    auto result_b = deviceToHost(d_dst_b, N);
+    for (size_t i = 0; i < N; ++i) {
+        ASSERT_EQ(result_a[i], host_a[i]) << "pinned-src mismatch at byte " << i;
+        ASSERT_EQ(result_b[i], host_b[i]) << "device-src mismatch at byte " << i;
+    }
+
+    cudaFreeHost(h_src_pinned);  // memory from cudaHostAlloc is released with cudaFreeHost
+    cudaFree(d_src_dev);
+    cudaFree(d_dst_a);
+    cudaFree(d_dst_b);
+}
+
 // ---------------------------------------------------------------------------
 // FusedStridedCopy tests (invokeFusedStridedCopy)
 // ---------------------------------------------------------------------------
@@ -382,6 +504,36 @@ TEST_F(FusedStridedCopyTest, SingleRowCopy) {
     cudaFree(d_dst);
 }
 
+// Strided copy from pinned host memory (padded rows) directly into a compact device buffer.
+TEST_F(FusedStridedCopyTest, PinnedHostToDeviceCopy) {
+    constexpr size_t NROWS      = 8;
+    constexpr size_t ROW_BYTES  = 32;
+    constexpr size_t SRC_STRIDE = 64;         // pinned source has padding per row
+    constexpr size_t DST_STRIDE = ROW_BYTES;  // compact device destination
+
+    std::vector<uint8_t> host_src(NROWS * SRC_STRIDE, 0xCD);  // 0xCD stays in the per-row padding
+    for (size_t r = 0; r < NROWS; ++r)
+        for (size_t b = 0; b < ROW_BYTES; ++b)
+            host_src[r * SRC_STRIDE + b] = static_cast<uint8_t>((r * ROW_BYTES + b * 2) & 0xFF);
+
+    uint8_t* h_src_pinned = pinnedHostAlloc(host_src);
+    uint8_t* d_dst        = deviceAllocZero<uint8_t>(NROWS * DST_STRIDE);
+
+    rtp_llm::FusedStridedCopyParams params;
+    params.add(h_src_pinned, d_dst, NROWS, ROW_BYTES, SRC_STRIDE, DST_STRIDE);
+
+    rtp_llm::invokeFusedStridedCopy(params, stream_);
+    CUDA_CHECK(cudaStreamSynchronize(stream_));  // wait before reading the result back
+
+    auto result = deviceToHost(d_dst, NROWS * DST_STRIDE);
+    for (size_t r = 0; r < NROWS; ++r)  // compare only the ROW_BYTES payload of each source row
+        for (size_t b = 0; b < ROW_BYTES; ++b)
+            ASSERT_EQ(result[r * DST_STRIDE + b], host_src[r * SRC_STRIDE + b]) << "row " << r << " col " << b;
+
+    cudaFreeHost(h_src_pinned);  // memory from cudaHostAlloc is released with cudaFreeHost
+    cudaFree(d_dst);
+}
+
 int main(int argc, char** argv) {
     ::testing::InitGoogleTest(&argc, argv);
     return RUN_ALL_TESTS();