Skip to content

Commit 000d810

Browse files
authored
[ET Device Support] Define et_copy runtime h2d and d2h copy ops (pytorch#19858)
clone pytorch#18729 due to bot crash
1 parent c8c04e4 commit 000d810

8 files changed

Lines changed: 698 additions & 3 deletions

File tree

backends/cuda/runtime/shims/tests/targets.bzl

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,27 @@ def define_common_targets():
4242
cuda_shim_cpp_unittest("aoti_torch_new_tensor_handle")
4343
cuda_shim_cpp_unittest("aoti_torch_item_bool")
4444
cuda_shim_cpp_unittest("aoti_torch_assign_tensors_out")
45+
46+
cpp_unittest(
47+
name = "test_op__device_copy",
48+
srcs = ["test_op__device_copy.cpp"],
49+
deps = [
50+
"//executorch/backends/cuda/runtime:cuda_backend",
51+
"//executorch/kernels/portable:generated_lib",
52+
"//executorch/kernels/portable:generated_lib_headers",
53+
"//executorch/kernels/portable/cpu:op__device_copy",
54+
"//executorch/runtime/core:device_allocator",
55+
"//executorch/runtime/core/exec_aten:lib",
56+
"//executorch/runtime/core/portable_type:portable_type",
57+
"//executorch/runtime/kernel:kernel_runtime_context",
58+
"//executorch/runtime/platform:platform",
59+
],
60+
external_deps = [
61+
("cuda", None, "cuda-lazy"),
62+
],
63+
preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
64+
keep_gpu_sections = True,
65+
remote_execution = re_test_utils.remote_execution(
66+
platform = "gpu-remote-execution",
67+
),
68+
)
Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <cuda_runtime.h>
10+
#include <executorch/kernels/portable/Functions.h>
11+
#include <executorch/runtime/core/device_allocator.h>
12+
#include <executorch/runtime/core/exec_aten/exec_aten.h>
13+
#include <executorch/runtime/core/portable_type/tensor_impl.h>
14+
#include <executorch/runtime/kernel/kernel_runtime_context.h>
15+
#include <executorch/runtime/platform/runtime.h>
16+
#include <gtest/gtest.h>
17+
18+
#if (defined(__has_feature) && __has_feature(address_sanitizer)) || \
19+
defined(__SANITIZE_ADDRESS__)
20+
#include <sanitizer/lsan_interface.h>
21+
#define EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE 1
22+
#else
23+
#define EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE 0
24+
#endif
25+
26+
#include <cstdint>
27+
#include <memory>
28+
#include <vector>
29+
30+
using executorch::aten::ScalarType;
31+
using executorch::aten::Tensor;
32+
using executorch::aten::TensorImpl;
33+
using executorch::runtime::Error;
34+
using executorch::runtime::get_device_allocator;
35+
using executorch::runtime::KernelRuntimeContext;
36+
using executorch::runtime::TensorShapeDynamism;
37+
using executorch::runtime::etensor::DeviceIndex;
38+
using executorch::runtime::etensor::DeviceType;
39+
40+
namespace {
41+
42+
struct CudaDeleter {
43+
void operator()(void* ptr) const {
44+
if (ptr != nullptr) {
45+
cudaFree(ptr);
46+
}
47+
}
48+
};
49+
50+
using CudaPtr = std::unique_ptr<void, CudaDeleter>;
51+
52+
CudaPtr allocate_cuda(size_t nbytes) {
53+
void* ptr = nullptr;
54+
const cudaError_t err = cudaMalloc(&ptr, nbytes);
55+
EXPECT_EQ(err, cudaSuccess) << "cudaMalloc failed";
56+
return CudaPtr(ptr);
57+
}
58+
59+
bool is_cuda_available() {
60+
#if EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE
61+
__lsan_disable();
62+
#endif
63+
int device_count = 0;
64+
const cudaError_t err = cudaGetDeviceCount(&device_count);
65+
#if EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE
66+
__lsan_enable();
67+
#endif
68+
return err == cudaSuccess && device_count > 0;
69+
}
70+
71+
std::vector<float> copy_cuda_to_host(const void* device_ptr, size_t numel) {
72+
std::vector<float> host(numel);
73+
const cudaError_t err = cudaMemcpy(
74+
host.data(), device_ptr, numel * sizeof(float), cudaMemcpyDeviceToHost);
75+
EXPECT_EQ(err, cudaSuccess) << "cudaMemcpy D2H failed";
76+
return host;
77+
}
78+
79+
void copy_host_to_cuda(const std::vector<float>& host, void* device_ptr) {
80+
const cudaError_t err = cudaMemcpy(
81+
device_ptr,
82+
host.data(),
83+
host.size() * sizeof(float),
84+
cudaMemcpyHostToDevice);
85+
EXPECT_EQ(err, cudaSuccess) << "cudaMemcpy H2D failed";
86+
}
87+
88+
class CudaDeviceCopyOpTest : public ::testing::Test {
89+
protected:
90+
static void SetUpTestSuite() {
91+
executorch::runtime::runtime_init();
92+
ASSERT_NE(get_device_allocator(DeviceType::CUDA), nullptr)
93+
<< "Linking cuda_backend should auto-register the CUDA allocator";
94+
}
95+
96+
void SetUp() override {
97+
if (!is_cuda_available()) {
98+
GTEST_SKIP() << "CUDA not available, skipping CUDA device copy op tests";
99+
}
100+
}
101+
102+
Tensor& op_h2d_copy_out(const Tensor& self, Tensor& out) {
103+
return torch::executor::et_copy::_h2d_copy_outf(context_, self, out);
104+
}
105+
106+
Tensor& op_d2h_copy_out(const Tensor& self, Tensor& out) {
107+
return torch::executor::et_copy::_d2h_copy_outf(context_, self, out);
108+
}
109+
110+
KernelRuntimeContext context_;
111+
};
112+
113+
} // namespace
114+
115+
TEST_F(CudaDeviceCopyOpTest, H2dCopyUsesRegisteredCudaAllocator) {
116+
std::vector<float> src_data = {1.0f, 2.0f, 3.0f, 4.0f};
117+
auto device_data = allocate_cuda(src_data.size() * sizeof(float));
118+
ASSERT_NE(device_data.get(), nullptr);
119+
120+
int32_t sizes[] = {static_cast<int32_t>(src_data.size())};
121+
uint8_t dim_order[] = {0};
122+
int32_t strides[] = {1};
123+
124+
TensorImpl src_impl(
125+
ScalarType::Float,
126+
1,
127+
sizes,
128+
src_data.data(),
129+
dim_order,
130+
strides,
131+
TensorShapeDynamism::STATIC,
132+
DeviceType::CPU,
133+
0);
134+
Tensor src(&src_impl);
135+
136+
TensorImpl dst_impl(
137+
ScalarType::Float,
138+
1,
139+
sizes,
140+
device_data.get(),
141+
dim_order,
142+
strides,
143+
TensorShapeDynamism::STATIC,
144+
DeviceType::CUDA,
145+
0);
146+
Tensor dst(&dst_impl);
147+
148+
Tensor& result = op_h2d_copy_out(src, dst);
149+
150+
EXPECT_EQ(context_.failure_state(), Error::Ok);
151+
EXPECT_EQ(&result, &dst);
152+
EXPECT_EQ(copy_cuda_to_host(device_data.get(), src_data.size()), src_data);
153+
}
154+
155+
TEST_F(CudaDeviceCopyOpTest, D2hCopyUsesRegisteredCudaAllocator) {
156+
const std::vector<float> expected = {5.0f, 6.0f, 7.0f, 8.0f};
157+
auto device_data = allocate_cuda(expected.size() * sizeof(float));
158+
ASSERT_NE(device_data.get(), nullptr);
159+
copy_host_to_cuda(expected, device_data.get());
160+
161+
std::vector<float> dst_data(expected.size(), 0.0f);
162+
int32_t sizes[] = {static_cast<int32_t>(expected.size())};
163+
uint8_t dim_order[] = {0};
164+
int32_t strides[] = {1};
165+
166+
TensorImpl src_impl(
167+
ScalarType::Float,
168+
1,
169+
sizes,
170+
device_data.get(),
171+
dim_order,
172+
strides,
173+
TensorShapeDynamism::STATIC,
174+
DeviceType::CUDA,
175+
0);
176+
Tensor src(&src_impl);
177+
178+
TensorImpl dst_impl(
179+
ScalarType::Float,
180+
1,
181+
sizes,
182+
dst_data.data(),
183+
dim_order,
184+
strides,
185+
TensorShapeDynamism::STATIC,
186+
DeviceType::CPU,
187+
0);
188+
Tensor dst(&dst_impl);
189+
190+
Tensor& result = op_d2h_copy_out(src, dst);
191+
192+
EXPECT_EQ(context_.failure_state(), Error::Ok);
193+
EXPECT_EQ(&result, &dst);
194+
EXPECT_EQ(dst_data, expected);
195+
}
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
/**
10+
* Runtime kernels for et_copy._h2d_copy and et_copy._d2h_copy ops.
11+
*
12+
* These ops transfer tensor data between CPU and device memory using
13+
* the DeviceAllocator interface. The device type is inferred from the
14+
* tensor metadata (out.device_type() for H2D, self.device_type() for D2H),
15+
* which was set during AOT serialization by PropagateDevicePass.
16+
*/
17+
18+
#include <executorch/runtime/core/device_allocator.h>
19+
#include <executorch/runtime/core/exec_aten/exec_aten.h>
20+
#include <executorch/runtime/kernel/kernel_includes.h>
21+
22+
namespace torch {
23+
namespace executor {
24+
namespace native {
25+
26+
using Tensor = executorch::aten::Tensor;
27+
using DeviceAllocator = executorch::runtime::DeviceAllocator;
28+
using Error = executorch::runtime::Error;
29+
30+
/**
31+
* Copies tensor data from host (CPU) memory to device memory.
32+
*
33+
* self: source tensor on CPU
34+
* out: destination tensor on device (memory-planned by runtime)
35+
*
36+
* The device type and index are inferred from out's TensorImpl metadata.
37+
*/
38+
Tensor&
39+
_h2d_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out) {
40+
auto device_type = out.unsafeGetTensorImpl()->device_type();
41+
auto device_index = out.unsafeGetTensorImpl()->device_index();
42+
43+
ET_KERNEL_CHECK_MSG(
44+
ctx,
45+
self.unsafeGetTensorImpl()->device_type() ==
46+
executorch::runtime::etensor::DeviceType::CPU,
47+
InvalidArgument,
48+
out,
49+
"_h2d_copy: source tensor must be on CPU, got device_type=%d",
50+
static_cast<int>(self.unsafeGetTensorImpl()->device_type()));
51+
52+
ET_KERNEL_CHECK_MSG(
53+
ctx,
54+
device_type != executorch::runtime::etensor::DeviceType::CPU,
55+
InvalidArgument,
56+
out,
57+
"_h2d_copy: destination tensor must be on a non-CPU device");
58+
59+
auto nbytes = self.nbytes();
60+
ET_KERNEL_CHECK_MSG(
61+
ctx,
62+
nbytes == out.nbytes(),
63+
InvalidArgument,
64+
out,
65+
"_h2d_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu",
66+
nbytes,
67+
out.nbytes());
68+
69+
DeviceAllocator* allocator =
70+
executorch::runtime::get_device_allocator(device_type);
71+
ET_KERNEL_CHECK_MSG(
72+
ctx,
73+
allocator != nullptr,
74+
NotFound,
75+
out,
76+
"_h2d_copy: no device allocator registered for device_type=%d",
77+
static_cast<int>(device_type));
78+
79+
Error err = allocator->copy_host_to_device(
80+
out.mutable_data_ptr(), self.const_data_ptr(), nbytes, device_index);
81+
ET_KERNEL_CHECK_MSG(
82+
ctx,
83+
err == Error::Ok,
84+
Internal,
85+
out,
86+
"_h2d_copy: copy_host_to_device failed");
87+
88+
return out;
89+
}
90+
91+
/**
92+
* Copies tensor data from device memory to host (CPU) memory.
93+
*
94+
* self: source tensor on device
95+
* out: destination tensor on CPU (memory-planned by runtime)
96+
*
97+
* The device type and index are inferred from self's TensorImpl metadata.
98+
*/
99+
Tensor&
100+
_d2h_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out) {
101+
auto device_type = self.unsafeGetTensorImpl()->device_type();
102+
auto device_index = self.unsafeGetTensorImpl()->device_index();
103+
104+
ET_KERNEL_CHECK_MSG(
105+
ctx,
106+
device_type != executorch::runtime::etensor::DeviceType::CPU,
107+
InvalidArgument,
108+
out,
109+
"_d2h_copy: source tensor must be on a non-CPU device");
110+
111+
ET_KERNEL_CHECK_MSG(
112+
ctx,
113+
out.unsafeGetTensorImpl()->device_type() ==
114+
executorch::runtime::etensor::DeviceType::CPU,
115+
InvalidArgument,
116+
out,
117+
"_d2h_copy: destination tensor must be on CPU, got device_type=%d",
118+
static_cast<int>(out.unsafeGetTensorImpl()->device_type()));
119+
120+
auto nbytes = self.nbytes();
121+
ET_KERNEL_CHECK_MSG(
122+
ctx,
123+
nbytes == out.nbytes(),
124+
InvalidArgument,
125+
out,
126+
"_d2h_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu",
127+
nbytes,
128+
out.nbytes());
129+
130+
DeviceAllocator* allocator =
131+
executorch::runtime::get_device_allocator(device_type);
132+
ET_KERNEL_CHECK_MSG(
133+
ctx,
134+
allocator != nullptr,
135+
NotFound,
136+
out,
137+
"_d2h_copy: no device allocator registered for device_type=%d",
138+
static_cast<int>(device_type));
139+
140+
Error err = allocator->copy_device_to_host(
141+
out.mutable_data_ptr(), self.const_data_ptr(), nbytes, device_index);
142+
ET_KERNEL_CHECK_MSG(
143+
ctx,
144+
err == Error::Ok,
145+
Internal,
146+
out,
147+
"_d2h_copy: copy_device_to_host failed");
148+
149+
return out;
150+
}
151+
152+
} // namespace native
153+
} // namespace executor
154+
} // namespace torch

kernels/portable/functions.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1045,6 +1045,16 @@
10451045
- arg_meta: null
10461046
kernel_name: torch::executor::zeros_out
10471047

1048+
- func: et_copy::_h2d_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
1049+
kernels:
1050+
- arg_meta: null
1051+
kernel_name: torch::executor::_h2d_copy_out
1052+
1053+
- func: et_copy::_d2h_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
1054+
kernels:
1055+
- arg_meta: null
1056+
kernel_name: torch::executor::_d2h_copy_out
1057+
10481058
- func: dim_order_ops::_empty_dim_order.out(int[] size, *, int[]? dim_order=None, Tensor(a!) out) -> Tensor(a!)
10491059
kernels:
10501060
- arg_meta: null

0 commit comments

Comments
 (0)