[TRTLLM-10947][perf] eagle3: use cudaMemcpy2DAsync custom op for hidden-state capture (#14479)

pcicotti · web-flow · commit 06456e1a3a95 · 2026-06-01T22:00:53.000+08:00
Signed-off-by: Pietro Cicotti <5833013+pcicotti@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/thop/CMakeLists.txt b/cpp/tensorrt_llm/thop/CMakeLists.txt
@@ -119,7 +119,8 @@ add_library(
   dsv3RopeOp.cpp
   fusedGemmAllreduceOp.cpp
   convertReqIndexToGlobalOp.cpp
-  trtllmGenQKVProcessOp.cpp)
+  trtllmGenQKVProcessOp.cpp
+  inplaceSliceCopyOp.cpp)
 set_property(TARGET th_common PROPERTY POSITION_INDEPENDENT_CODE ON)
 target_link_libraries(
   th_common PRIVATE ${TORCH_LIBRARIES} th_utils ${Python3_LIBRARIES}
diff --git a/cpp/tensorrt_llm/thop/inplaceSliceCopyOp.cpp b/cpp/tensorrt_llm/thop/inplaceSliceCopyOp.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/common/cudaUtils.h"
+#include "tensorrt_llm/thop/thUtils.h"
+
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda_runtime_api.h>
+
+namespace tensorrt_llm::torch_ext
+{
+
+// Copy src into dest[:numTokens, dim1Start:dim1End] with one device-to-device cudaMemcpy2DAsync. The copy is
+// enqueued on the current CUDA stream of dest's device and is not synchronized here.
+// dest      : 2-D contiguous CUDA tensor, shape [destRows, destCols]
+// src       : 2-D contiguous CUDA tensor, shape [numTokens, sliceWidth] where sliceWidth == dim1End - dim1Start
+// dim1Start : first column index in dest to write into
+// dim1End   : one-past-last column index in dest to write into (numTokens is inferred from src.size(0))
+void inplaceSliceCopy(at::Tensor& dest, at::Tensor const& src, int64_t dim1Start, int64_t dim1End)
+{
+    CHECK_TH_CUDA(dest);
+    CHECK_TH_CUDA(src);
+    TORCH_CHECK(dest.get_device() == src.get_device(), "dest and src must be on the same CUDA device");
+    TORCH_CHECK(dest.is_contiguous(), "dest must be contiguous");
+    TORCH_CHECK(src.is_contiguous(), "src must be contiguous");
+    TORCH_CHECK(dest.dim() == 2, "dest must be 2-D");
+    TORCH_CHECK(src.dim() == 2, "src must be 2-D");
+    TORCH_CHECK(dest.scalar_type() == src.scalar_type(), "dest and src must have the same dtype");
+
+    int64_t const numTokens = src.size(0);
+    int64_t const sliceWidth = dim1End - dim1Start;
+    TORCH_CHECK(dim1Start >= 0, "dim1Start must be non-negative");
+    TORCH_CHECK(sliceWidth > 0, "dim1End must be greater than dim1Start");
+    TORCH_CHECK(numTokens <= dest.size(0), "numTokens exceeds dest row count");
+    TORCH_CHECK(dim1End <= dest.size(1), "dim1End exceeds dest column count");
+    TORCH_CHECK(src.size(1) == sliceWidth, "src column count must equal dim1End - dim1Start");
+
+    if (numTokens == 0) // empty src is a valid no-op; sliceWidth > 0 is already guaranteed by the check above
+    {
+        return;
+    }
+
+    int64_t const elemSize = dest.element_size();
+    int64_t const destPitch = dest.size(1) * elemSize; // bytes per dest row
+    int64_t const srcPitch = src.size(1) * elemSize;   // bytes per src row
+    int64_t const width = sliceWidth * elemSize;       // bytes to copy per row
+
+    char* destPtr = static_cast<char*>(dest.data_ptr()) + dim1Start * elemSize; // first byte of the column band
+    char const* srcPtr = static_cast<char const*>(src.data_ptr());
+
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream(dest.get_device());
+    TLLM_CUDA_CHECK(cudaMemcpy2DAsync(
+        destPtr, destPitch, srcPtr, srcPitch, width, static_cast<size_t>(numTokens), cudaMemcpyDeviceToDevice, stream));
+}
+
+} // namespace tensorrt_llm::torch_ext
+
+TORCH_LIBRARY_FRAGMENT(trtllm, m)
+{
+    // Schema of trtllm::inplace_slice_copy: "Tensor(a!)" marks dest as mutated in place; the op returns nothing.
+    // src:  source tensor (numTokens inferred from src.size(0))
+    // dim1_start: first column index in dest
+    // dim1_end:   one-past-last column index in dest
+    m.def("inplace_slice_copy(Tensor(a!) dest, Tensor src, int dim1_start, int dim1_end) -> ()");
+}
+
+TORCH_LIBRARY_IMPL(trtllm, CUDA, m) // CUDA dispatch-key implementation of the schema declared above
+{
+    m.impl("inplace_slice_copy", TORCH_FN(tensorrt_llm::torch_ext::inplaceSliceCopy));
+}
diff --git a/tensorrt_llm/_torch/compilation/utils.py b/tensorrt_llm/_torch/compilation/utils.py
@@ -121,6 +121,9 @@ def inplace_info():
         torch.ops.trtllm.cute_dsl_bf16_gemm_blackwell.default: {
             1: "output"
         },
+        torch.ops.trtllm.inplace_slice_copy.default: {
+            1: "dest"
+        }
     }
     if IS_CUDA_TILE_AVAILABLE:
         # cuda.tile availability depends on GPU capability thus runtime check.
diff --git a/tensorrt_llm/_torch/custom_ops/__init__.py b/tensorrt_llm/_torch/custom_ops/__init__.py
@@ -1,3 +1,5 @@
+import torch
+
 from ..cuda_tile_utils import IS_CUDA_TILE_AVAILABLE
 from ..cute_dsl_utils import IS_CUTLASS_DSL_AVAILABLE
 from ..flashinfer_utils import IS_FLASHINFER_AVAILABLE
@@ -10,6 +12,12 @@
 # modules.attention and must be imported from there. They are not re-exported here to
 # avoid circular imports: custom_ops must not depend on modules.attention.
 
+
+def inplace_slice_copy(dest: torch.Tensor, src: torch.Tensor, dim1_start: int,
+                       dim1_end: int) -> None:  # thin wrapper over the custom op
+    torch.ops.trtllm.inplace_slice_copy(dest, src, dim1_start, dim1_end)
+
+
 __all__ = [
     'IS_FLASHINFER_AVAILABLE',
     '_register_fake',
@@ -20,6 +28,7 @@
     'copy_to_userbuffers',
     'matmul_to_ub',
     'IS_CUTLASS_DSL_AVAILABLE',
+    'inplace_slice_copy',
 ]
 
 if IS_FLASHINFER_AVAILABLE:
diff --git a/tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py b/tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py
@@ -204,6 +204,10 @@ def _(scores, scores_with_bias, n_group, topk_group, topk,
                                 dtype=scores_with_bias.dtype), scores.new_empty(
                                     shape, dtype=torch.int32)
 
+    @torch.library.register_fake("trtllm::inplace_slice_copy")
+    def _(dest, src, dim1_start, dim1_end):
+        pass
+
     @torch.library.register_fake("trtllm::indexer_topk_prefill")
     def _(logits, row_starts, row_ends, indices, index_topk):
         # In-place operation, no return value (void function)
diff --git a/tensorrt_llm/_torch/speculative/eagle3.py b/tensorrt_llm/_torch/speculative/eagle3.py
@@ -4,6 +4,7 @@
 import torch
 from torch import nn
 
+from tensorrt_llm._torch.custom_ops import inplace_slice_copy
 from tensorrt_llm._utils import prefer_pinned
 from tensorrt_llm.mapping import Mapping
 
@@ -457,13 +458,13 @@ def maybe_capture_hidden_states(
             layer_id: int,
             hidden_states: torch.Tensor,
             residual: Optional[torch.Tensor] = None) -> None:
+
         for i, captured_layer_id in enumerate(self.layers_to_capture):
             if captured_layer_id == layer_id:
-                num_tokens = hidden_states.shape[0]
                 to_save = hidden_states + residual if residual is not None else hidden_states
-                self.hidden_states[:num_tokens, i * self.hidden_size:(i + 1) *
-                                   self.hidden_size].copy_(to_save,
-                                                           non_blocking=True)
+                inplace_slice_copy(self.hidden_states, to_save,
+                                   i * self.hidden_size,
+                                   (i + 1) * self.hidden_size)
                 break
 
 
diff --git a/tests/unittest/_torch/thop/parallel_hw_agnostic/test_inplace_slice_copy.py b/tests/unittest/_torch/thop/parallel_hw_agnostic/test_inplace_slice_copy.py
@@ -0,0 +1,136 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Unit tests for trtllm::inplace_slice_copy.
+
+Verifies that the cudaMemcpy2DAsync-backed op produces the same result as a
+reference Python slice + Tensor.copy_, for the row-prefix / column-slice
+write pattern used in EAGLE3 hidden-state capture.
+"""
+
+import pytest
+import torch
+
+import tensorrt_llm  # noqa: F401
+
+
+def _reference(dest_shape, src, dim1_start, dim1_end, dtype):
+    dest = torch.zeros(dest_shape, dtype=dtype, device="cuda")
+    num_tokens = src.shape[0]
+    dest[:num_tokens, dim1_start:dim1_end].copy_(src)
+    return dest
+
+
+def _run(dest_shape, src, dim1_start, dim1_end, dtype):
+    dest = torch.zeros(dest_shape, dtype=dtype, device="cuda")
+    torch.ops.trtllm.inplace_slice_copy(dest, src, dim1_start, dim1_end)
+    return dest
+
+
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
+def test_full_dest_full_width(dtype):
+    """num_tokens == dest.size(0) and slice == full dest width."""
+    dest_shape = (16, 64)
+    src = torch.randn(16, 64, dtype=dtype, device="cuda")
+    out = _run(dest_shape, src, 0, 64, dtype)
+    ref = _reference(dest_shape, src, 0, 64, dtype)
+    torch.testing.assert_close(out, ref)
+
+
+def test_partial_rows():
+    """num_tokens < dest.size(0): trailing rows must stay zero."""
+    dtype = torch.bfloat16
+    dest_shape = (32, 64)
+    src = torch.randn(8, 64, dtype=dtype, device="cuda")
+    out = _run(dest_shape, src, 0, 64, dtype)
+    ref = _reference(dest_shape, src, 0, 64, dtype)
+    torch.testing.assert_close(out, ref)
+    assert torch.all(out[8:] == 0)
+
+
+def test_column_slice_middle():
+    """Write to a middle column band; flanking columns must stay zero."""
+    dtype = torch.bfloat16
+    dest_shape = (16, 96)
+    src = torch.randn(16, 32, dtype=dtype, device="cuda")
+    out = _run(dest_shape, src, 32, 64, dtype)
+    ref = _reference(dest_shape, src, 32, 64, dtype)
+    torch.testing.assert_close(out, ref)
+    assert torch.all(out[:, :32] == 0)
+    assert torch.all(out[:, 64:] == 0)
+
+
+def test_layered_capture_pattern():
+    """Mimic EAGLE3 hidden-state capture: write each layer into its band."""
+    dtype = torch.bfloat16
+    num_tokens, hidden_size, num_layers = 12, 48, 3
+    dest_shape = (24, hidden_size * num_layers)
+    srcs = [
+        torch.randn(num_tokens, hidden_size, dtype=dtype, device="cuda") for _ in range(num_layers)
+    ]
+
+    out = torch.zeros(dest_shape, dtype=dtype, device="cuda")
+    for i, s in enumerate(srcs):
+        torch.ops.trtllm.inplace_slice_copy(out, s, i * hidden_size, (i + 1) * hidden_size)
+
+    ref = torch.zeros(dest_shape, dtype=dtype, device="cuda")
+    for i, s in enumerate(srcs):
+        ref[:num_tokens, i * hidden_size : (i + 1) * hidden_size].copy_(s)
+
+    torch.testing.assert_close(out, ref)
+
+
+def test_empty_src_is_noop():
+    """num_tokens == 0 must not modify dest and must not raise."""
+    dtype = torch.bfloat16
+    dest_shape = (16, 64)
+    dest = torch.full(dest_shape, 7, dtype=dtype, device="cuda")
+    src = torch.empty(0, 32, dtype=dtype, device="cuda")
+    torch.ops.trtllm.inplace_slice_copy(dest, src, 16, 48)
+    assert torch.all(dest == 7)
+
+
+def test_dtype_mismatch_raises():
+    dest = torch.zeros(8, 32, dtype=torch.bfloat16, device="cuda")
+    src = torch.randn(8, 32, dtype=torch.float16, device="cuda")
+    with pytest.raises(RuntimeError):
+        torch.ops.trtllm.inplace_slice_copy(dest, src, 0, 32)
+
+
+def test_out_of_bounds_raises():
+    dtype = torch.bfloat16
+    dest = torch.zeros(8, 32, dtype=dtype, device="cuda")
+    src = torch.randn(8, 8, dtype=dtype, device="cuda")
+    with pytest.raises(RuntimeError):
+        torch.ops.trtllm.inplace_slice_copy(dest, src, 28, 36)
+
+
+def test_negative_dim1_start_raises():
+    """A negative dim1_start would underflow the dest pointer."""
+    dtype = torch.bfloat16
+    dest = torch.zeros(8, 32, dtype=dtype, device="cuda")
+    src = torch.randn(8, 8, dtype=dtype, device="cuda")
+    with pytest.raises(RuntimeError):
+        torch.ops.trtllm.inplace_slice_copy(dest, src, -8, 0)
+
+
+def test_device_mismatch_raises():
+    """dest and src on different CUDA devices must be rejected."""
+    if torch.cuda.device_count() < 2:
+        pytest.skip("requires >= 2 CUDA devices")
+    dtype = torch.bfloat16
+    dest = torch.zeros(8, 32, dtype=dtype, device="cuda:0")
+    src = torch.randn(8, 32, dtype=dtype, device="cuda:1")
+    with pytest.raises(RuntimeError):
+        torch.ops.trtllm.inplace_slice_copy(dest, src, 0, 32)