Skip to content

Commit e665842

Browse files
committed
[ET Device Support] CudaAllocator: device memory allocator for CUDA backend
Implement CudaAllocator, a concrete DeviceAllocator that uses cudaMalloc/cudaFree/cudaMemcpy for CUDA device memory management. The allocator is automatically registered with the DeviceAllocatorRegistry when the CUDA backend library is linked, ensuring that DeviceMemoryBuffer::create(CUDA) works transparently. Differential Revision: [D98014184](https://our.internmc.facebook.com/intern/diff/D98014184/) ghstack-source-id: 357086933. Pull Request resolved: #18468
1 parent 6f09c45 commit e665842

6 files changed

Lines changed: 351 additions & 30 deletions

File tree

backends/aoti/slim/core/storage.h

Lines changed: 14 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#ifdef CUDA_AVAILABLE
1414
#include <executorch/backends/aoti/slim/c10/cuda/Exception.h>
1515
#include <executorch/backends/aoti/slim/cuda/guard.h>
16+
#include <executorch/backends/cuda/runtime/cuda_allocator.h>
1617
#endif
1718

1819
#include <executorch/backends/aoti/slim/c10/core/Device.h>
@@ -107,9 +108,6 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
107108
/// @param device The target CUDA device (used to get the stream).
108109
/// @return Pointer to allocated device memory.
109110
static void* allocate(size_t nbytes, const c10::Device& device) {
110-
// Get the current stream for this device (set by CUDAStreamGuard if any)
111-
// This follows PyTorch's pattern where the allocator assumes the caller
112-
// has already set the correct device via CUDAStreamGuard.
113111
auto stream_result =
114112
executorch::backends::cuda::getCurrentCUDAStream(device.index());
115113
ET_CHECK_MSG(
@@ -118,31 +116,23 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
118116
static_cast<int>(device.index()));
119117

120118
cudaStream_t stream = stream_result.get();
121-
void* data = nullptr;
122-
ET_CUDA_CHECK(cudaMallocAsync(&data, nbytes, stream));
123-
return data;
119+
auto result = executorch::backends::cuda::CudaAllocator::allocate_async(
120+
nbytes, device.index(), stream);
121+
ET_CHECK_MSG(
122+
result.ok(),
123+
"CudaAllocator::allocate_async failed for %zu bytes on device %d",
124+
nbytes,
125+
static_cast<int>(device.index()));
126+
return result.get();
124127
}
125128

126-
/// Frees CUDA device memory on the current stream.
127-
/// @param ptr Pointer to device memory to free.
128129
static void free(void* ptr) {
129-
// Get the current stream for the current device
130-
// Currently all cuda slimtensors should be on the same device same stream,
131-
// so we can just use the stream on current device.
132-
// TODO(gasoonjia): add cuda stream as a member of MaybeOwningStorage to
133-
// support multiple devices.
134130
auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(-1);
135131
ET_CHECK_MSG(stream_result.ok(), "Failed to get current CUDA stream");
136-
ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get()));
132+
executorch::backends::cuda::CudaAllocator::deallocate_async(
133+
ptr, -1, stream_result.get());
137134
}
138135

139-
/// Copies memory between CPU and CUDA or CUDA and CUDA asynchronously.
140-
/// @param dst Destination pointer.
141-
/// @param src Source pointer.
142-
/// @param nbytes Number of bytes to copy.
143-
/// @param dst_device Destination device.
144-
/// @param src_device Source device.
145-
/// @param stream CUDA stream for async copy.
146136
static void memcpy_async(
147137
void* dst,
148138
const void* src,
@@ -151,7 +141,6 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
151141
const c10::Device& src_device,
152142
cudaStream_t stream) {
153143
cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;
154-
155144
if (src_device.is_cpu()) {
156145
direction = cudaMemcpyHostToDevice;
157146
} else if (dst_device.is_cpu()) {
@@ -164,23 +153,18 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
164153
static_cast<int>(dst_device.index()));
165154
}
166155

167-
ET_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, direction, stream));
156+
auto err = executorch::backends::cuda::CudaAllocator::memcpy_async(
157+
dst, src, nbytes, direction, stream);
158+
ET_CHECK_MSG(err == executorch::runtime::Error::Ok, "memcpy_async failed");
168159
}
169160

170-
/// Copies memory between CPU and CUDA or CUDA and CUDA synchronously.
171-
/// @param dst Destination pointer.
172-
/// @param src Source pointer.
173-
/// @param nbytes Number of bytes to copy.
174-
/// @param dst_device Destination device.
175-
/// @param src_device Source device.
176161
static void memcpy(
177162
void* dst,
178163
const void* src,
179164
size_t nbytes,
180165
const c10::Device& dst_device,
181166
const c10::Device& src_device) {
182167
cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;
183-
184168
if (src_device.is_cpu()) {
185169
direction = cudaMemcpyHostToDevice;
186170
} else if (dst_device.is_cpu()) {

backends/aoti/slim/core/targets.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ def define_common_targets():
1919
"//executorch/runtime/platform:platform",
2020
"//executorch/backends/aoti/slim/c10/cuda:exception",
2121
"//executorch/backends/aoti/slim/cuda:guard",
22+
"//executorch/backends/cuda/runtime:cuda_allocator",
2223
],
2324
)
2425

backends/cuda/runtime/TARGETS

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,33 @@ runtime.cxx_library(
7070
],
7171
)
7272

73+
runtime.cxx_library(
74+
name = "cuda_allocator",
75+
srcs = [
76+
"cuda_allocator.cpp",
77+
],
78+
headers = [
79+
"cuda_allocator.h",
80+
],
81+
# @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
82+
link_whole = True,
83+
supports_python_dlopen = True,
84+
visibility = ["PUBLIC"],
85+
exported_deps = [
86+
"//executorch/runtime/core:device_allocator",
87+
],
88+
deps = [
89+
"//executorch/runtime/platform:platform",
90+
],
91+
nvcc_flags = get_nvcc_arch_args() + [
92+
"-_NVCC_HOST_COMPILER_FLAG_",
93+
"gcc",
94+
],
95+
external_deps = [
96+
("cuda", None, "cuda-lazy"),
97+
],
98+
)
99+
73100
runtime.cxx_library(
74101
name = "cuda_backend",
75102
srcs = [
@@ -87,6 +114,8 @@ runtime.cxx_library(
87114
visibility = ["PUBLIC"],
88115
deps = [
89116
":runtime_shims",
117+
":cuda_allocator",
118+
":cuda_platform",
90119
"//executorch/backends/aoti:aoti_common_slim",
91120
"//executorch/backends/aoti/slim/core:slimtensor",
92121
"//executorch/backends/aoti/slim/factory:empty",
Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <executorch/backends/cuda/runtime/cuda_allocator.h>
10+
11+
#include <cuda_runtime.h>
12+
13+
#include <executorch/runtime/platform/log.h>
14+
15+
namespace executorch::backends::cuda {
16+
17+
using executorch::runtime::Error;
18+
using executorch::runtime::Result;
19+
using executorch::runtime::etensor::DeviceIndex;
20+
using executorch::runtime::etensor::DeviceType;
21+
22+
/// Allocates `nbytes` of CUDA device memory with cudaMalloc.
///
/// When `index` is non-negative the allocation is performed on that device
/// and the caller's current device is restored afterwards; a negative index
/// means "use whatever device is current".
///
/// @param nbytes Number of bytes to allocate.
/// @param index Target CUDA device index, or negative for the current device.
/// @return Pointer to the allocated device memory on success;
///     Error::Internal if the device switch fails,
///     Error::MemoryAllocationFailed if cudaMalloc fails.
Result<void*> CudaAllocator::allocate(size_t nbytes, DeviceIndex index) {
  void* ptr = nullptr;
  int prev_device = 0;
  bool restore_device = false;

  // Fix: the original ignored the results of cudaGetDevice/cudaSetDevice,
  // so a failed switch silently allocated on the wrong device. Treat a
  // failed switch as an error instead.
  if (index >= 0) {
    cudaError_t get_err = cudaGetDevice(&prev_device);
    if (get_err != cudaSuccess) {
      ET_LOG(Error, "cudaGetDevice failed: %s", cudaGetErrorString(get_err));
      return Error::Internal;
    }
    cudaError_t set_err = cudaSetDevice(index);
    if (set_err != cudaSuccess) {
      ET_LOG(
          Error,
          "cudaSetDevice(%d) failed: %s",
          static_cast<int>(index),
          cudaGetErrorString(set_err));
      return Error::Internal;
    }
    restore_device = true;
  }

  cudaError_t err = cudaMalloc(&ptr, nbytes);

  // Restore the caller's device even if the allocation failed.
  if (restore_device) {
    cudaError_t restore_err = cudaSetDevice(prev_device);
    if (restore_err != cudaSuccess) {
      ET_LOG(
          Error,
          "Failed to restore device %d: %s",
          prev_device,
          cudaGetErrorString(restore_err));
    }
  }

  if (err != cudaSuccess) {
    ET_LOG(
        Error,
        "cudaMalloc failed: %s (requested %zu bytes on device %d)",
        cudaGetErrorString(err),
        nbytes,
        static_cast<int>(index));
    return Error::MemoryAllocationFailed;
  }

  return ptr;
}
52+
53+
void CudaAllocator::deallocate(void* ptr, DeviceIndex index) {
54+
if (ptr == nullptr) {
55+
return;
56+
}
57+
58+
int prev_device = 0;
59+
cudaError_t prev_device_err = cudaSuccess;
60+
61+
if (index >= 0) {
62+
prev_device_err = cudaGetDevice(&prev_device);
63+
if (prev_device_err == cudaSuccess) {
64+
cudaSetDevice(index);
65+
}
66+
}
67+
68+
cudaError_t err = cudaFree(ptr);
69+
70+
if (index >= 0 && prev_device_err == cudaSuccess) {
71+
cudaSetDevice(prev_device);
72+
}
73+
74+
if (err != cudaSuccess) {
75+
ET_LOG(
76+
Error,
77+
"cudaFree failed: %s (ptr=%p, device %d)",
78+
cudaGetErrorString(err),
79+
ptr,
80+
static_cast<int>(index));
81+
}
82+
}
83+
84+
/// Synchronously copies `nbytes` from host memory to device memory.
///
/// When `index` is non-negative the copy runs in that device's context and
/// the caller's current device is restored afterwards.
///
/// @param dst Destination device pointer.
/// @param src Source host pointer.
/// @param nbytes Number of bytes to copy.
/// @param index CUDA device index, or negative for the current device.
/// @return Error::Ok on success; Error::Internal on a failed device switch
///     or a failed cudaMemcpy.
Error CudaAllocator::copy_host_to_device(
    void* dst,
    const void* src,
    size_t nbytes,
    DeviceIndex index) {
  int prev_device = 0;
  bool restore_device = false;

  // Fix: the original ignored the results of cudaGetDevice/cudaSetDevice,
  // so a failed switch silently copied in the wrong device context.
  if (index >= 0) {
    cudaError_t get_err = cudaGetDevice(&prev_device);
    if (get_err != cudaSuccess) {
      ET_LOG(Error, "cudaGetDevice failed: %s", cudaGetErrorString(get_err));
      return Error::Internal;
    }
    cudaError_t set_err = cudaSetDevice(index);
    if (set_err != cudaSuccess) {
      ET_LOG(
          Error,
          "cudaSetDevice(%d) failed: %s",
          static_cast<int>(index),
          cudaGetErrorString(set_err));
      return Error::Internal;
    }
    restore_device = true;
  }

  cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice);

  // Restore the caller's device even if the copy failed.
  if (restore_device) {
    cudaError_t restore_err = cudaSetDevice(prev_device);
    if (restore_err != cudaSuccess) {
      ET_LOG(
          Error,
          "Failed to restore device %d: %s",
          prev_device,
          cudaGetErrorString(restore_err));
    }
  }

  if (err != cudaSuccess) {
    ET_LOG(
        Error,
        "cudaMemcpy H2D failed: %s (%zu bytes, device %d)",
        cudaGetErrorString(err),
        nbytes,
        static_cast<int>(index));
    return Error::Internal;
  }
  return Error::Ok;
}
116+
117+
/// Synchronously copies `nbytes` from device memory to host memory.
///
/// When `index` is non-negative the copy runs in that device's context and
/// the caller's current device is restored afterwards.
///
/// @param dst Destination host pointer.
/// @param src Source device pointer.
/// @param nbytes Number of bytes to copy.
/// @param index CUDA device index, or negative for the current device.
/// @return Error::Ok on success; Error::Internal on a failed device switch
///     or a failed cudaMemcpy.
Error CudaAllocator::copy_device_to_host(
    void* dst,
    const void* src,
    size_t nbytes,
    DeviceIndex index) {
  int prev_device = 0;
  bool restore_device = false;

  // Fix: the original ignored the results of cudaGetDevice/cudaSetDevice,
  // so a failed switch silently copied in the wrong device context.
  if (index >= 0) {
    cudaError_t get_err = cudaGetDevice(&prev_device);
    if (get_err != cudaSuccess) {
      ET_LOG(Error, "cudaGetDevice failed: %s", cudaGetErrorString(get_err));
      return Error::Internal;
    }
    cudaError_t set_err = cudaSetDevice(index);
    if (set_err != cudaSuccess) {
      ET_LOG(
          Error,
          "cudaSetDevice(%d) failed: %s",
          static_cast<int>(index),
          cudaGetErrorString(set_err));
      return Error::Internal;
    }
    restore_device = true;
  }

  cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost);

  // Restore the caller's device even if the copy failed.
  if (restore_device) {
    cudaError_t restore_err = cudaSetDevice(prev_device);
    if (restore_err != cudaSuccess) {
      ET_LOG(
          Error,
          "Failed to restore device %d: %s",
          prev_device,
          cudaGetErrorString(restore_err));
    }
  }

  if (err != cudaSuccess) {
    ET_LOG(
        Error,
        "cudaMemcpy D2H failed: %s (%zu bytes, device %d)",
        cudaGetErrorString(err),
        nbytes,
        static_cast<int>(index));
    return Error::Internal;
  }
  return Error::Ok;
}
149+
150+
/// Identifies this allocator as the one managing CUDA device memory.
/// @return DeviceType::CUDA, always.
DeviceType CudaAllocator::device_type() const {
  return DeviceType::CUDA;
}
153+
154+
/// Returns the process-wide CudaAllocator singleton.
/// The instance is lazily constructed on first call (function-local static;
/// initialization is thread-safe per C++11).
CudaAllocator& CudaAllocator::instance() {
  static CudaAllocator singleton;
  return singleton;
}
158+
159+
/// Stream-ordered allocation via cudaMallocAsync.
///
/// @param nbytes Number of bytes to allocate.
/// @param index Device index; used only in the error message here — the
///     stream determines where the allocation is ordered.
/// @param stream CUDA stream the allocation is ordered on.
/// @return Device pointer on success; Error::MemoryAllocationFailed if
///     cudaMallocAsync fails.
Result<void*> CudaAllocator::allocate_async(
    size_t nbytes,
    DeviceIndex index,
    cudaStream_t stream) {
  void* device_ptr = nullptr;
  const cudaError_t status = cudaMallocAsync(&device_ptr, nbytes, stream);
  if (status == cudaSuccess) {
    return device_ptr;
  }
  ET_LOG(
      Error,
      "cudaMallocAsync failed: %s (requested %zu bytes on device %d)",
      cudaGetErrorString(status),
      nbytes,
      static_cast<int>(index));
  return Error::MemoryAllocationFailed;
}
176+
177+
void CudaAllocator::deallocate_async(
178+
void* ptr,
179+
DeviceIndex index,
180+
cudaStream_t stream) {
181+
if (ptr == nullptr) {
182+
return;
183+
}
184+
cudaError_t err = cudaFreeAsync(ptr, stream);
185+
if (err != cudaSuccess) {
186+
ET_LOG(
187+
Error,
188+
"cudaFreeAsync failed: %s (ptr=%p, device %d)",
189+
cudaGetErrorString(err),
190+
ptr,
191+
static_cast<int>(index));
192+
}
193+
}
194+
195+
/// Asynchronous memcpy on the given stream via cudaMemcpyAsync.
///
/// @param dst Destination pointer (host or device per `direction`).
/// @param src Source pointer (host or device per `direction`).
/// @param nbytes Number of bytes to copy.
/// @param direction cudaMemcpyKind describing the transfer direction.
/// @param stream CUDA stream the copy is ordered on.
/// @return Error::Ok on success; Error::Internal if cudaMemcpyAsync fails.
Error CudaAllocator::memcpy_async(
    void* dst,
    const void* src,
    size_t nbytes,
    cudaMemcpyKind direction,
    cudaStream_t stream) {
  const cudaError_t status =
      cudaMemcpyAsync(dst, src, nbytes, direction, stream);
  if (status == cudaSuccess) {
    return Error::Ok;
  }
  ET_LOG(
      Error,
      "cudaMemcpyAsync failed: %s (%zu bytes)",
      cudaGetErrorString(status),
      nbytes);
  return Error::Internal;
}
212+
213+
} // namespace executorch::backends::cuda

0 commit comments

Comments
 (0)