Skip to content

Commit 0506f2c

Browse files
committed
[ET Device Support] MemoryManager: add per-buffer device metadata
This diff extends MemoryManager with optional per-buffer device type metadata so the runtime explicitly knows which planned memory buffers are on which device. This enables future device-aware dispatch and debugging. Changes: - New constructor taking planned_buffer_devices as extra input for device info - New accessors: planned_buffer_devices(), has_device_memory() - No existing functionality has been changed. Differential Revision: [D97850706](https://our.internmc.facebook.com/intern/diff/D97850706/) ghstack-source-id: 357060903 Pull Request resolved: #18475
1 parent adff306 commit 0506f2c

6 files changed

Lines changed: 265 additions & 0 deletions

File tree

runtime/core/portable_type/targets.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ def define_common_targets():
2828
"//executorch/extension/fb/dynamic_shim/...",
2929
"//executorch/kernels/portable/cpu/...",
3030
"//executorch/runtime/core/...",
31+
"//executorch/runtime/executor/...",
3132
"//executorch/runtime/core/exec_aten/...",
3233
"//executorch/runtime/core/portable_type/test/...",
3334
],

runtime/executor/memory_manager.h

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010

1111
#include <executorch/runtime/core/hierarchical_allocator.h>
1212
#include <executorch/runtime/core/memory_allocator.h>
13+
#include <executorch/runtime/core/portable_type/device.h>
14+
#include <executorch/runtime/core/span.h>
1315

1416
namespace executorch {
1517
namespace runtime {
@@ -61,6 +63,32 @@ class MemoryManager final {
6163
"method allocator cannot be the same as temp allocator");
6264
}
6365

66+
/**
 * Constructs a new MemoryManager with per-buffer device metadata.
 *
 * @param[in] method_allocator Same as above.
 * @param[in] planned_memory Same as above. May contain a mix of CPU and
 *     device pointers — HierarchicalAllocator only does pointer arithmetic,
 *     so device pointers are valid.
 * @param[in] temp_allocator Same as above.
 * @param[in] planned_buffer_devices One entry per planned memory buffer
 *     (same count as planned_memory buffers), indicating the device type for
 *     each buffer. For CPU-only programs, use the 3-arg constructor instead.
 *     NOTE: Span is a non-owning view; the caller must keep the underlying
 *     DeviceType array alive for the lifetime of this MemoryManager.
 */
MemoryManager(
    MemoryAllocator* method_allocator,
    HierarchicalAllocator* planned_memory,
    MemoryAllocator* temp_allocator,
    Span<const etensor::DeviceType> planned_buffer_devices)
    : method_allocator_(method_allocator),
      planned_memory_(planned_memory),
      temp_allocator_(temp_allocator),
      planned_buffer_devices_(planned_buffer_devices) {
  // Same invariant as the 3-arg constructor: the method and temp allocators
  // must be distinct objects. ET_CHECK_MSG aborts on violation.
  ET_CHECK_MSG(
      method_allocator != temp_allocator,
      "method allocator cannot be the same as temp allocator");
}
91+
6492
/**
6593
* DEPRECATED: Use the constructor without `constant_allocator` instead.
6694
*
@@ -105,10 +133,28 @@ class MemoryManager final {
105133
return temp_allocator_;
106134
}
107135

136+
/**
 * Returns per-buffer device metadata. One entry per planned memory buffer,
 * same count as planned_memory buffers. Empty if no device metadata was
 * provided (CPU-only program).
 *
 * The returned Span is a non-owning view of the array passed to the 4-arg
 * constructor; it is valid only while that array is alive.
 */
Span<const etensor::DeviceType> planned_buffer_devices() const {
  return planned_buffer_devices_;
}
144+
145+
/**
146+
* Returns true if any planned buffer is on a non-CPU device.
147+
* When false, the memory setup is CPU-only and follows the legacy path.
148+
*/
149+
bool has_device_memory() const {
150+
return planned_buffer_devices_.size() > 0;
151+
}
152+
108153
private:
109154
MemoryAllocator* method_allocator_;
110155
HierarchicalAllocator* planned_memory_;
111156
MemoryAllocator* temp_allocator_;
157+
Span<const etensor::DeviceType> planned_buffer_devices_;
112158
};
113159

114160
} // namespace runtime

runtime/executor/targets.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def define_common_targets():
3636
],
3737
exported_deps = [
3838
"//executorch/runtime/core:memory_allocator",
39+
"//executorch/runtime/core/portable_type:portable_type",
3940
],
4041
visibility = ["PUBLIC"],
4142
)

runtime/executor/test/memory_manager_test.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ using namespace ::testing;
1717
using executorch::runtime::HierarchicalAllocator;
1818
using executorch::runtime::MemoryAllocator;
1919
using executorch::runtime::MemoryManager;
20+
using executorch::runtime::Span;
21+
using executorch::runtime::etensor::DeviceType;
2022

2123
TEST(MemoryManagerTest, MinimalCtor) {
2224
MemoryAllocator method_allocator(0, nullptr);
@@ -93,3 +95,45 @@ TEST(MemoryManagerTest, CtorWithSameAllocator) {
9395
/*temp_allocator=*/&method_allocator),
9496
"cannot be the same");
9597
}
98+
99+
// A MemoryManager built via the 3-arg constructor (no device metadata) must
// report a CPU-only (legacy) memory setup.
TEST(MemoryManagerTest, ThreeArgCtorHasNoDeviceMemory) {
  MemoryAllocator temp_allocator(0, nullptr);
  MemoryAllocator method_allocator(0, nullptr);
  HierarchicalAllocator planned_memory({});

  MemoryManager manager(&method_allocator, &planned_memory, &temp_allocator);

  EXPECT_EQ(manager.planned_buffer_devices().size(), 0);
  EXPECT_FALSE(manager.has_device_memory());
}
109+
110+
// The 4-arg constructor must expose the caller-provided per-buffer device
// metadata verbatim and report that device memory is present.
TEST(MemoryManagerTest, FourArgCtorWithDeviceMetadata) {
  MemoryAllocator method_allocator(0, nullptr);
  HierarchicalAllocator planned_memory({});
  MemoryAllocator temp_allocator(0, nullptr);

  // 3 buffers: CPU, CUDA, CPU
  DeviceType expected[] = {DeviceType::CPU, DeviceType::CUDA, DeviceType::CPU};

  MemoryManager manager(
      &method_allocator,
      &planned_memory,
      &temp_allocator,
      Span<const DeviceType>(expected, 3));

  // Plain accessors pass through unchanged.
  EXPECT_EQ(manager.method_allocator(), &method_allocator);
  EXPECT_EQ(manager.planned_memory(), &planned_memory);
  EXPECT_EQ(manager.temp_allocator(), &temp_allocator);
  EXPECT_TRUE(manager.has_device_memory());

  // Metadata is returned entry-for-entry.
  const auto devices = manager.planned_buffer_devices();
  EXPECT_EQ(devices.size(), 3);
  for (size_t i = 0; i < 3; ++i) {
    EXPECT_EQ(devices[i], expected[i]);
  }
}
131+
132+
// Even the minimal single-allocator constructor must leave the device
// metadata empty and report a CPU-only setup.
TEST(MemoryManagerTest, MinimalCtorHasNoDeviceMemory) {
  MemoryAllocator method_allocator(0, nullptr);
  MemoryManager manager(&method_allocator);

  EXPECT_EQ(manager.planned_buffer_devices().size(), 0);
  EXPECT_FALSE(manager.has_device_memory());
}

runtime/executor/test/targets.bzl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ def define_common_targets(is_fbcode = False):
1919
"//executorch/exir/backend/test/...",
2020
"//executorch/runtime/backend/...",
2121
"//executorch/extension/pybindings/...",
22+
"//executorch/extension/module/test/...",
2223
"//executorch/devtools/fb/runners/...",
2324
"//executorch/test/...",
2425
"//executorch/examples/...",
@@ -326,6 +327,8 @@ def define_common_targets(is_fbcode = False):
326327
deps = [
327328
":managed_memory_manager",
328329
"//executorch/runtime/executor:program",
330+
"//executorch/runtime/core:device_allocator",
331+
"//executorch/runtime/core:device_memory_buffer",
329332
"//executorch/extension/data_loader:file_data_loader",
330333
"//executorch/schema:program",
331334
],

runtime/executor/test/tensor_parser_device_test.cpp

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,18 +17,32 @@
1717
#include <executorch/runtime/executor/tensor_parser.h>
1818

1919
#include <executorch/extension/data_loader/file_data_loader.h>
20+
#include <executorch/runtime/core/device_allocator.h>
21+
#include <executorch/runtime/core/device_memory_buffer.h>
2022
#include <executorch/runtime/core/exec_aten/exec_aten.h>
2123
#include <executorch/runtime/executor/test/managed_memory_manager.h>
24+
#include <executorch/runtime/platform/runtime.h>
2225
#include <executorch/schema/program_generated.h>
2326

2427
#include <gtest/gtest.h>
2528

2629
using executorch::aten::Tensor;
30+
using executorch::runtime::DeviceAllocator;
31+
using executorch::runtime::DeviceMemoryBuffer;
2732
using executorch::runtime::Error;
33+
using executorch::runtime::HierarchicalAllocator;
34+
using executorch::runtime::MemoryAllocator;
35+
using executorch::runtime::MemoryManager;
36+
using executorch::runtime::MethodMeta;
2837
using executorch::runtime::Program;
2938
using executorch::runtime::Result;
39+
using executorch::runtime::Span;
40+
using executorch::runtime::get_device_allocator;
41+
using executorch::runtime::register_device_allocator;
3042
using executorch::runtime::deserialization::parseTensor;
3143
using executorch::runtime::testing::ManagedMemoryManager;
44+
using executorch::runtime::etensor::DeviceIndex;
45+
using executorch::runtime::etensor::DeviceType;
3246
using torch::executor::util::FileDataLoader;
3347

3448
constexpr size_t kDefaultNonConstMemBytes = 32 * 1024U;
@@ -50,15 +64,77 @@ class ProgramTestFriend final {
5064

5165
using executorch::runtime::testing::ProgramTestFriend;
5266

67+
namespace {

/**
 * Mock CUDA allocator that uses host memory for testing.
 * Tracks the allocated range so tests can verify tensor data_ptr
 * falls within the "device" memory region.
 *
 * NOTE: supports only one live allocation at a time — a second allocate()
 * call drops the previous buffer, and deallocate() ignores `ptr` and frees
 * whatever buffer is currently held.
 */
class MockCudaAllocator : public DeviceAllocator {
 public:
  // Hands out plain host memory standing in for device memory; the device
  // `index` is ignored by this mock.
  Result<void*> allocate(size_t nbytes, DeviceIndex index) override {
    allocate_count_++;
    buffer_ = std::make_unique<uint8_t[]>(nbytes);
    buffer_size_ = nbytes;
    return static_cast<void*>(buffer_.get());
  }

  // Frees the currently-held buffer regardless of the `ptr` argument.
  void deallocate(void* ptr, DeviceIndex index) override {
    deallocate_count_++;
    buffer_.reset();
    buffer_size_ = 0;
  }

  // Host<->device copies are no-ops in this mock; only success is reported.
  Error copy_host_to_device(void*, const void*, size_t, DeviceIndex) override {
    return Error::Ok;
  }

  Error copy_device_to_host(void*, const void*, size_t, DeviceIndex) override {
    return Error::Ok;
  }

  DeviceType device_type() const override {
    return DeviceType::CUDA;
  }

  // Returns true iff `ptr` lies within the most recent (still live)
  // allocation. Used by tests to prove a tensor's data_ptr is "on device".
  bool is_device_ptr(const void* ptr) const {
    if (buffer_ == nullptr || buffer_size_ == 0) {
      return false;
    }
    auto* p = static_cast<const uint8_t*>(ptr);
    return p >= buffer_.get() && p < buffer_.get() + buffer_size_;
  }

  // Call counters; reset in the test fixture's SetUp().
  int allocate_count_ = 0;
  int deallocate_count_ = 0;

 private:
  std::unique_ptr<uint8_t[]> buffer_;
  size_t buffer_size_ = 0;
};

} // namespace

// Process-wide mock instance; registered for DeviceType::CUDA in
// SetUpTestSuite().
static MockCudaAllocator g_mock_cuda;
120+
53121
class TensorParserDeviceTest : public ::testing::Test {
54122
protected:
123+
// Runs once for the whole suite: initializes the ExecuTorch runtime and
// registers the mock allocator so CUDA device buffers route to it.
static void SetUpTestSuite() {
  executorch::runtime::runtime_init();
  register_device_allocator(DeviceType::CUDA, &g_mock_cuda);
}
127+
55128
// Per-test setup: loads the device-annotated test module and resets the
// mock allocator's counters so each test observes only its own calls.
void SetUp() override {
  // The test module path is injected through an environment variable by
  // the test harness.
  const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
  ASSERT_NE(path, nullptr)
      << "ET_MODULE_ADD_WITH_DEVICE_PATH env var not set";
  Result<FileDataLoader> loader = FileDataLoader::from(path);
  ASSERT_EQ(loader.error(), Error::Ok);
  loader_ = std::make_unique<FileDataLoader>(std::move(loader.get()));

  g_mock_cuda.allocate_count_ = 0;
  g_mock_cuda.deallocate_count_ = 0;
}
63139

64140
std::unique_ptr<FileDataLoader> loader_;
@@ -169,3 +245,97 @@ TEST_F(TensorParserDeviceTest, NonDelegatedTensorsDefaultToCPU) {
169245
<< " without device annotation should have device_index=0";
170246
}
171247
}
248+
// End-to-end check: when planned memory is backed by a mock "device"
// allocation, CUDA-annotated tensors produced by parseTensor() must have
// their data_ptr inside that device memory region.
TEST_F(TensorParserDeviceTest, CudaTensorDataPtrPointsToDeviceMemory) {
  Result<Program> program =
      Program::load(loader_.get(), Program::Verification::Minimal);
  ASSERT_EQ(program.error(), Error::Ok);

  Result<MethodMeta> method_meta = program->method_meta("forward");
  ASSERT_EQ(method_meta.error(), Error::Ok);

  // ModuleAddWithDevice has:
  //   non_const_buffer_sizes: [0, 48] (index 0 reserved, buffer 0 = 48 bytes)
  //   non_const_buffer_device: [{buffer_idx=1, device_type=CUDA}]
  const size_t num_buffers = method_meta->num_memory_planned_buffers();
  ASSERT_EQ(num_buffers, 1);

  // Set up device-aware planned memory. cpu_buffers and device_buffers own
  // the storage; planned_spans only views into it.
  std::vector<Span<uint8_t>> planned_spans;
  std::vector<std::vector<uint8_t>> cpu_buffers;
  std::vector<DeviceMemoryBuffer> device_buffers;

  for (size_t i = 0; i < num_buffers; ++i) {
    auto size = method_meta->memory_planned_buffer_size(i);
    ASSERT_TRUE(size.ok());
    auto device = method_meta->memory_planned_buffer_device(i);
    ASSERT_TRUE(device.ok());

    if (device->is_cpu()) {
      cpu_buffers.emplace_back(size.get());
      planned_spans.emplace_back(
          cpu_buffers.back().data(), cpu_buffers.back().size());
    } else {
      cpu_buffers.emplace_back(); // empty placeholder
      auto dmb = DeviceMemoryBuffer::create(
          size.get(), device->type(), device->index());
      ASSERT_TRUE(dmb.ok())
          << "DeviceMemoryBuffer::create failed for buffer " << i;
      planned_spans.emplace_back(dmb->as_span());
      device_buffers.push_back(std::move(dmb.get()));
    }
  }

  // The single planned buffer is CUDA-annotated, so exactly one device
  // allocation must have been routed to the mock.
  ASSERT_EQ(g_mock_cuda.allocate_count_, 1);

  // Build HierarchicalAllocator with mixed CPU/device spans.
  HierarchicalAllocator planned_memory(
      {planned_spans.data(), planned_spans.size()});

  constexpr size_t kMethodAllocBytes = 32 * 1024U;
  auto method_alloc_pool = std::make_unique<uint8_t[]>(kMethodAllocBytes);
  MemoryAllocator method_allocator(kMethodAllocBytes, method_alloc_pool.get());
  // NOTE(review): the device-metadata MemoryManager constructor is not used
  // here — parseTensor presumably reads the device annotation from the
  // flatbuffer itself; confirm this is intentional.
  MemoryManager memory_manager(&method_allocator, &planned_memory);

  // Parse tensors and verify CUDA tensors have device memory.
  const executorch_flatbuffer::Program* internal_program =
      ProgramTestFriend::GetInternalProgram(&program.get());
  auto* execution_plan =
      internal_program->execution_plan()->GetMutableObject(0);
  auto* flatbuffer_values = execution_plan->values();

  int cuda_with_device_memory = 0;

  for (size_t i = 0; i < flatbuffer_values->size(); ++i) {
    auto* serialization_value = flatbuffer_values->Get(i);
    if (serialization_value->val_type() !=
        executorch_flatbuffer::KernelTypes::Tensor) {
      continue;
    }

    auto* s_tensor = serialization_value->val_as_Tensor();
    // A tensor counts as CUDA when its serialized extra_tensor_info carries
    // a CUDA device annotation.
    bool is_cuda = s_tensor->extra_tensor_info() != nullptr &&
        s_tensor->extra_tensor_info()->device_type() ==
            executorch_flatbuffer::DeviceType::CUDA;

    Result<Tensor> tensor =
        parseTensor(&program.get(), &memory_manager, s_tensor);
    ASSERT_TRUE(tensor.ok())
        << "parseTensor failed at index " << i
        << " with error 0x" << std::hex
        << static_cast<uint32_t>(tensor.error());

    Tensor t = tensor.get();

    if (is_cuda && t.unsafeGetTensorImpl()->device_type() == DeviceType::CUDA) {
      EXPECT_TRUE(g_mock_cuda.is_device_ptr(t.const_data_ptr()))
          << "CUDA tensor at index " << i
          << " should have data_ptr in device memory, but got CPU memory";
      cuda_with_device_memory++;
    }
  }

  // All 3 CUDA tensors (2 inputs + 1 output of the delegate) should have
  // their data_ptr pointing to the mock device memory buffer.
  EXPECT_EQ(cuda_with_device_memory, 3)
      << "All 3 CUDA tensors should have data_ptr in device memory";
}

0 commit comments

Comments
 (0)