Skip to content

Commit 6f09c45

Browse files
committed
[ET Device Support] Module: allocate device memory for planned buffers
This diff enables the Module API to load programs whose memory is planned on a non-CPU device. It updates Module::load_method() to detect device buffers via MethodMeta and allocate device memory using the registered DeviceAllocator. Device memory is managed via DeviceMemoryBuffer RAII objects stored in PlannedMemory, ensuring proper cleanup when the Method is destroyed. Differential Revision: [D97850705](https://our.internmc.facebook.com/intern/diff/D97850705/) ghstack-source-id: 357060902 Pull Request resolved: #18467
1 parent 6091b66 commit 6f09c45

6 files changed

Lines changed: 335 additions & 5 deletions

File tree

extension/module/module.cpp

Lines changed: 90 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <executorch/extension/flat_tensor/flat_tensor_data_map.h>
1414
#include <executorch/extension/memory_allocator/malloc_memory_allocator.h>
1515
#include <executorch/extension/named_data_map/merged_data_map.h>
16+
#include <executorch/runtime/core/device_memory_buffer.h>
1617
#include <executorch/runtime/platform/runtime.h>
1718

1819
namespace executorch {
@@ -314,6 +315,45 @@ Module::make_planned_memory_with_shared_arenas(
314315
return planned;
315316
}
316317

318+
std::unique_ptr<Module::PlannedMemory>
319+
Module::make_planned_memory_with_devices(
320+
const ET_RUNTIME_NAMESPACE::MethodMeta& method_meta) {
321+
auto planned = std::make_unique<PlannedMemory>();
322+
const size_t num_buffers = method_meta.num_memory_planned_buffers();
323+
planned->planned_buffers.reserve(num_buffers);
324+
planned->planned_spans.reserve(num_buffers);
325+
326+
for (size_t i = 0; i < num_buffers; ++i) {
327+
auto size = method_meta.memory_planned_buffer_size(i);
328+
ET_CHECK_MSG(size.ok(), "Failed to get buffer size for index %zu", i);
329+
auto device = method_meta.memory_planned_buffer_device(i);
330+
ET_CHECK_MSG(device.ok(), "Failed to get buffer device for index %zu", i);
331+
332+
if (device->is_cpu()) {
333+
planned->planned_buffers.emplace_back(size.get());
334+
planned->planned_spans.emplace_back(
335+
planned->planned_buffers.back().data(), size.get());
336+
} else {
337+
// Allocate device memory via DeviceAllocator and store the RAII buffer.
338+
planned->planned_buffers.emplace_back(); // empty CPU placeholder
339+
auto dmb = runtime::DeviceMemoryBuffer::create(
340+
size.get(), device->type(), device->index());
341+
ET_CHECK_MSG(
342+
dmb.ok(),
343+
"Failed to allocate device memory for buffer %zu (device_type=%d)",
344+
i,
345+
static_cast<int>(device->type()));
346+
planned->planned_spans.emplace_back(dmb->as_span());
347+
planned->device_buffers.push_back(std::move(dmb.get()));
348+
}
349+
}
350+
351+
planned->planned_memory =
352+
std::make_unique<runtime::HierarchicalAllocator>(runtime::Span(
353+
planned->planned_spans.data(), planned->planned_spans.size()));
354+
return planned;
355+
}
356+
317357
runtime::Result<std::vector<size_t>> Module::get_mem_planned_buffer_sizes(
318358
const std::string& method_name) {
319359
auto meta_res = program_->method_meta(method_name.c_str());
@@ -365,10 +405,54 @@ runtime::Error Module::load_method(
365405
MethodHolder method_holder;
366406

367407
if (!planned_memory) {
368-
if (!share_memory_arenas_) {
408+
// Check if any buffers need device memory allocation.
409+
auto meta_res = program_->method_meta(method_name.c_str());
410+
ET_CHECK_OK_OR_RETURN_ERROR(meta_res.error());
411+
auto& meta = meta_res.get();
412+
413+
bool has_device_buffers = false;
414+
for (size_t i = 0; i < meta.num_memory_planned_buffers(); ++i) {
415+
auto dev = meta.memory_planned_buffer_device(i);
416+
if (dev.ok() && !dev->is_cpu()) {
417+
has_device_buffers = true;
418+
break;
419+
}
420+
}
421+
422+
if (has_device_buffers) {
423+
// Device memory with shared arenas is not yet supported.
424+
ET_CHECK_OR_RETURN_ERROR(
425+
!share_memory_arenas_,
426+
NotSupported,
427+
"Device memory buffers are not yet compatible with "
428+
"share_memory_arenas. Please disable share_memory_arenas "
429+
"when using models with device-planned memory.");
430+
431+
// Device-aware path: allocate CPU and device buffers, build metadata.
432+
method_holder.planned_memory =
433+
make_planned_memory_with_devices(meta);
434+
435+
// Build per-buffer device type array for MemoryManager metadata.
436+
for (size_t i = 0; i < meta.num_memory_planned_buffers(); ++i) {
437+
auto dev = meta.memory_planned_buffer_device(i);
438+
method_holder.buffer_devices.push_back(
439+
dev.ok() ? dev->type()
440+
: runtime::etensor::DeviceType::CPU);
441+
}
442+
planned_memory = method_holder.planned_memory->planned_memory.get();
443+
444+
method_holder.memory_manager = std::make_unique<runtime::MemoryManager>(
445+
memory_allocator_.get(),
446+
planned_memory,
447+
temp_allocator_.get(),
448+
runtime::Span<const runtime::etensor::DeviceType>(
449+
method_holder.buffer_devices.data(),
450+
method_holder.buffer_devices.size()));
451+
} else if (!share_memory_arenas_) {
369452
auto sizes_res = get_mem_planned_buffer_sizes(method_name);
370453
ET_CHECK_OK_OR_RETURN_ERROR(sizes_res.error());
371454
method_holder.planned_memory = make_planned_memory(sizes_res.get());
455+
planned_memory = method_holder.planned_memory->planned_memory.get();
372456
} else {
373457
auto sizes_res = get_mem_planned_buffer_sizes(method_name);
374458
ET_CHECK_OK_OR_RETURN_ERROR(sizes_res.error());
@@ -385,12 +469,14 @@ runtime::Error Module::load_method(
385469
}
386470
method_holder.planned_memory =
387471
make_planned_memory_with_shared_arenas(sizes, shared_arenas_);
472+
planned_memory = method_holder.planned_memory->planned_memory.get();
388473
}
389-
planned_memory = method_holder.planned_memory->planned_memory.get();
390474
}
391475

392-
method_holder.memory_manager = std::make_unique<runtime::MemoryManager>(
393-
memory_allocator_.get(), planned_memory, temp_allocator_.get());
476+
if (!method_holder.memory_manager) {
477+
method_holder.memory_manager = std::make_unique<runtime::MemoryManager>(
478+
memory_allocator_.get(), planned_memory, temp_allocator_.get());
479+
}
394480
auto res_method = program_->load_method(
395481
method_name.c_str(),
396482
method_holder.memory_manager.get(),

extension/module/module.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616

1717
#include <executorch/runtime/executor/program.h>
1818

19+
#include <executorch/runtime/core/device_memory_buffer.h>
20+
1921
#ifdef USE_ATEN_LIB
2022
#define ET_MODULE_NAMESPACE module::aten
2123
#else // !USE_ATEN_LIB
@@ -682,12 +684,15 @@ class Module {
682684
std::vector<std::vector<uint8_t>> planned_buffers;
683685
std::vector<runtime::Span<uint8_t>> planned_spans;
684686
std::unique_ptr<runtime::HierarchicalAllocator> planned_memory;
687+
std::vector<runtime::DeviceMemoryBuffer> device_buffers;
685688
};
686689
std::unique_ptr<PlannedMemory> make_planned_memory(
687690
const std::vector<size_t>& buffer_sizes);
688691
std::unique_ptr<PlannedMemory> make_planned_memory_with_shared_arenas(
689692
const std::vector<size_t>& buffer_sizes,
690693
std::vector<std::vector<uint8_t>>& shared_arenas);
694+
std::unique_ptr<PlannedMemory> make_planned_memory_with_devices(
695+
const ET_RUNTIME_NAMESPACE::MethodMeta& method_meta);
691696
runtime::Result<std::vector<size_t>> get_mem_planned_buffer_sizes(
692697
const std::string& method_name);
693698
runtime::Result<std::vector<size_t>> get_max_mem_planned_buffer_sizes();
@@ -696,6 +701,7 @@ class Module {
696701
std::unique_ptr<PlannedMemory> planned_memory;
697702
std::unique_ptr<runtime::MemoryManager> memory_manager;
698703
std::unique_ptr<Method> method;
704+
std::vector<runtime::etensor::DeviceType> buffer_devices;
699705
};
700706

701707
std::string file_path_;

extension/module/targets.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ def define_common_targets():
2828
],
2929
exported_deps = [
3030
"//executorch/runtime/executor:program_no_prim_ops" + aten_suffix,
31+
"//executorch/runtime/core:device_memory_buffer",
3132
],
3233
)
3334

Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
/**
10+
* Tests that Module's device-aware memory allocation path works correctly.
11+
*
12+
* Uses ModuleAddWithDevice.pte which has:
13+
* non_const_buffer_sizes: [0, 48] (1 buffer, index 0 reserved)
14+
* non_const_buffer_device: [{buffer_idx=1, device_type=CUDA, device_index=0}]
15+
*
16+
* Since we don't have a real CUDA backend, we test that:
17+
* 1. CPU-only models load through Module without invoking device allocator
18+
* 2. Device-annotated models trigger DeviceMemoryBuffer::create via a mock
19+
*/
20+
21+
#include <executorch/extension/module/module.h>

#include <gtest/gtest.h>

#include <cstdint>
#include <cstdlib>
#include <memory>
#include <unordered_map>

#include <executorch/runtime/core/device_allocator.h>
#include <executorch/runtime/core/device_memory_buffer.h>
#include <executorch/runtime/platform/runtime.h>
28+
29+
using executorch::extension::Module;
30+
using executorch::runtime::DeviceAllocator;
31+
using executorch::runtime::DeviceMemoryBuffer;
32+
using executorch::runtime::Error;
33+
using executorch::runtime::Result;
34+
using executorch::runtime::register_device_allocator;
35+
using executorch::runtime::etensor::DeviceIndex;
36+
using executorch::runtime::etensor::DeviceType;
37+
38+
namespace {
39+
40+
class MockCudaAllocator : public DeviceAllocator {
41+
public:
42+
Result<void*> allocate(size_t nbytes, DeviceIndex index) override {
43+
allocate_count_++;
44+
last_allocate_size_ = nbytes;
45+
last_allocate_index_ = index;
46+
buffer_ = std::make_unique<uint8_t[]>(nbytes);
47+
return static_cast<void*>(buffer_.get());
48+
}
49+
50+
void deallocate(void* ptr, DeviceIndex index) override {
51+
deallocate_count_++;
52+
buffer_.reset();
53+
}
54+
55+
Error copy_host_to_device(void*, const void*, size_t, DeviceIndex) override {
56+
return Error::Ok;
57+
}
58+
59+
Error copy_device_to_host(void*, const void*, size_t, DeviceIndex) override {
60+
return Error::Ok;
61+
}
62+
63+
DeviceType device_type() const override {
64+
return DeviceType::CUDA;
65+
}
66+
67+
int allocate_count_ = 0;
68+
int deallocate_count_ = 0;
69+
size_t last_allocate_size_ = 0;
70+
DeviceIndex last_allocate_index_ = -1;
71+
72+
private:
73+
std::unique_ptr<uint8_t[]> buffer_;
74+
};
75+
76+
} // namespace
77+
78+
static MockCudaAllocator g_mock_cuda;
79+
80+
class ModuleDeviceMemoryTest : public ::testing::Test {
81+
protected:
82+
static void SetUpTestSuite() {
83+
executorch::runtime::runtime_init();
84+
register_device_allocator(DeviceType::CUDA, &g_mock_cuda);
85+
}
86+
87+
void SetUp() override {
88+
g_mock_cuda.allocate_count_ = 0;
89+
g_mock_cuda.deallocate_count_ = 0;
90+
g_mock_cuda.last_allocate_size_ = 0;
91+
g_mock_cuda.last_allocate_index_ = -1;
92+
}
93+
};
94+
95+
// A model with no device annotations must never touch the device allocator.
TEST_F(ModuleDeviceMemoryTest, CpuOnlyModelDoesNotAllocateDeviceMemory) {
  const char* model_path = std::getenv("ET_MODULE_ADD_PATH");
  ASSERT_NE(model_path, nullptr) << "ET_MODULE_ADD_PATH not set";

  Module module(model_path);
  ASSERT_EQ(module.load_method("forward"), Error::Ok);

  EXPECT_EQ(g_mock_cuda.allocate_count_, 0)
      << "CPU-only model should not allocate device memory";
}
106+
107+
// Exercises the RAII allocate/deallocate path Module relies on, directly
// against the registered mock allocator.
TEST_F(ModuleDeviceMemoryTest, DeviceMemoryBufferCreateCallsAllocator) {
  {
    auto created = DeviceMemoryBuffer::create(48, DeviceType::CUDA, 0);
    ASSERT_TRUE(created.ok());
    auto buffer = std::move(created.get());

    EXPECT_EQ(g_mock_cuda.allocate_count_, 1);
    EXPECT_EQ(g_mock_cuda.last_allocate_size_, 48);
    EXPECT_EQ(g_mock_cuda.last_allocate_index_, 0);
    EXPECT_NE(buffer.data(), nullptr);
    EXPECT_EQ(buffer.size(), 48);

    // as_span() exposes the device pointer for HierarchicalAllocator.
    auto view = buffer.as_span();
    EXPECT_EQ(view.data(), static_cast<uint8_t*>(buffer.data()));
    EXPECT_EQ(view.size(), 48);

    // Nothing freed while the buffer is alive.
    EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
  }
  // Leaving the scope must free the device memory exactly once (RAII).
  EXPECT_EQ(g_mock_cuda.deallocate_count_, 1);
}
131+
132+
// MethodMeta alone should report the CUDA placement of the planned buffer in
// the device-annotated model, without loading the full method.
TEST_F(ModuleDeviceMemoryTest, DeviceModelMethodMetaReportsCudaBuffer) {
  const char* model_path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
  ASSERT_NE(model_path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set";

  Module module(model_path);
  ASSERT_EQ(module.load(), Error::Ok);

  auto meta = module.method_meta("forward");
  ASSERT_TRUE(meta.ok());

  // ModuleAddWithDevice plans exactly one 48-byte buffer on cuda:0.
  ASSERT_EQ(meta->num_memory_planned_buffers(), 1);

  auto buffer_size = meta->memory_planned_buffer_size(0);
  ASSERT_TRUE(buffer_size.ok());
  EXPECT_EQ(buffer_size.get(), 48);

  auto buffer_device = meta->memory_planned_buffer_device(0);
  ASSERT_TRUE(buffer_device.ok());
  EXPECT_EQ(buffer_device->type(), DeviceType::CUDA);
  EXPECT_EQ(buffer_device->index(), 0);
}
157+
158+
// Device-planned buffers combined with shared memory arenas are rejected.
TEST_F(
    ModuleDeviceMemoryTest,
    DeviceModelWithSharedArenasReturnsNotSupported) {
  const char* model_path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
  ASSERT_NE(model_path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set";

  // Enable share_memory_arenas on a device-annotated model.
  Module module(
      model_path,
      Module::LoadMode::File,
      /*event_tracer=*/nullptr,
      /*memory_allocator=*/nullptr,
      /*temp_allocator=*/nullptr,
      /*share_memory_arenas=*/true);

  EXPECT_EQ(module.load_method("forward"), Error::NotSupported);
}
176+
177+
// End-to-end: load_method must allocate the CUDA buffer through the mock,
// and every allocation must be released by the time the Module dies.
TEST_F(
    ModuleDeviceMemoryTest,
    LoadMethodAllocatesDeviceMemoryAndDeallocatesOnDestroy) {
  const char* model_path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
  ASSERT_NE(model_path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set";

  {
    Module module(model_path);
    auto status = module.load_method("forward");

    // The device-aware allocation path (make_planned_memory_with_devices)
    // runs before backend init, so the allocator must have been hit exactly
    // once whether or not load_method ultimately succeeded.
    EXPECT_EQ(g_mock_cuda.allocate_count_, 1)
        << "Expected 1 device allocation for the CUDA buffer"
        << " (actual: " << g_mock_cuda.allocate_count_ << ")"
        << ", deallocate_count=" << g_mock_cuda.deallocate_count_
        << ", load_method returned error=" << static_cast<int>(status);
    EXPECT_EQ(g_mock_cuda.last_allocate_size_, 48)
        << "Expected 48 bytes allocated (3 CUDA tensors sharing one buffer)";
    EXPECT_EQ(g_mock_cuda.last_allocate_index_, 0)
        << "Expected device_index=0 (cuda:0)";

    if (status == Error::Ok) {
      // Success: the MethodHolder (and its DeviceMemoryBuffer) now lives in
      // the Module's method map, so nothing has been freed yet.
      EXPECT_EQ(g_mock_cuda.deallocate_count_, 0)
          << "No deallocation while method is loaded";
    } else {
      // Failure: the local MethodHolder died inside load_method and its
      // RAII buffer already released the device memory.
      EXPECT_EQ(g_mock_cuda.deallocate_count_, 1)
          << "RAII deallocation on error path";
    }
  }

  // Module destruction must free every device allocation.
  EXPECT_EQ(g_mock_cuda.deallocate_count_, 1)
      << "Expected deallocation after Module destroyed";
}

0 commit comments

Comments
 (0)