Skip to content

Commit c5e0a03

Browse files
authored
[ET Device Support] MemoryManager: add per-buffer device metadata (#19737)
Create #18475 manually due to bot crash
1 parent 05e3545 commit c5e0a03

8 files changed

Lines changed: 375 additions & 1 deletion

File tree

runtime/core/hierarchical_allocator.h

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <c10/util/safe_numerics.h>
1313

1414
#include <executorch/runtime/core/memory_allocator.h>
15+
#include <executorch/runtime/core/portable_type/device.h>
1516
#include <executorch/runtime/core/result.h>
1617
#include <executorch/runtime/core/span.h>
1718

@@ -34,6 +35,30 @@ class HierarchicalAllocator final {
3435
explicit HierarchicalAllocator(Span<Span<uint8_t>> buffers)
3536
: buffers_(buffers) {}
3637

38+
/**
39+
* Constructs a new hierarchical allocator with per-buffer device metadata.
40+
*
41+
* @param[in] buffers Same as above. May contain a mix of CPU and device
42+
* pointers — HierarchicalAllocator only does pointer arithmetic, so
43+
* device pointers are valid.
44+
* @param[in] planned_buffer_devices One entry per buffer (same count as
45+
* `buffers`), indicating the `Device` (type + index) for each buffer.
46+
* Different buffers can target the same device type but different
47+
* indices (e.g., `cuda:0` vs `cuda:1`). For CPU-only programs, use the
48+
* single-arg constructor instead.
*
* The `planned_buffer_devices` span is stored as passed in and later handed
* back by `planned_buffer_devices()`.
* NOTE(review): presumably the array behind the span must outlive this
* allocator, as with `buffers` — confirm against the Span contract.
*
* Aborts via ET_CHECK_MSG if `planned_buffer_devices.size()` differs from
* `buffers.size()`.
49+
*/
50+
HierarchicalAllocator(
51+
Span<Span<uint8_t>> buffers,
52+
Span<const etensor::Device> planned_buffer_devices)
53+
: buffers_(buffers), planned_buffer_devices_(planned_buffer_devices) {
// Every buffer must have exactly one device entry; a count mismatch is
// treated as a fatal setup error.
54+
ET_CHECK_MSG(
55+
planned_buffer_devices.size() == buffers.size(),
56+
"planned_buffer_devices size (%" ET_PRIsize_t
57+
") must match buffers size (%" ET_PRIsize_t ")",
58+
planned_buffer_devices.size(),
59+
buffers.size());
60+
}
61+
3762
/**
3863
* DEPRECATED: Use spans instead.
3964
*/
@@ -88,6 +113,17 @@ class HierarchicalAllocator final {
88113
return buffer.data() + offset_bytes;
89114
}
90115

116+
/**
117+
* Returns per-buffer device metadata. One entry per buffer, same count as
118+
* the `buffers` passed to the constructor. Each entry is a `Device`
119+
* carrying both type and index, so callers can distinguish e.g. `cuda:0`
120+
* from `cuda:1`. Empty if no device metadata was provided (CPU-only
121+
* program).
*
* This simply returns the span given to the two-argument constructor; an
* allocator built with the buffers-only constructor yields a span of
* size 0.
122+
*/
123+
Span<const etensor::Device> planned_buffer_devices() const {
124+
return planned_buffer_devices_;
125+
}
126+
91127
private:
92128
// TODO(T162089316): Remove the span array and to_spans once all users move to
93129
// spans. This array is necessary to hold the pointers and sizes that were
@@ -113,6 +149,10 @@ class HierarchicalAllocator final {
113149

114150
/// The underlying buffers.
115151
Span<Span<uint8_t>> buffers_;
152+
153+
/// Per-buffer device metadata. Empty when no device info was provided
154+
/// (CPU-only program).
155+
Span<const etensor::Device> planned_buffer_devices_;
116156
};
117157

118158
} // namespace runtime

runtime/core/targets.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ def define_common_targets():
7777
],
7878
exported_deps = [
7979
":core",
80+
"//executorch/runtime/core/exec_aten:lib",
8081
"//executorch/runtime/core/portable_type/c10/c10:c10",
8182
],
8283
visibility = ["PUBLIC"],

runtime/core/test/hierarchical_allocator_test.cpp

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@
1010

1111
#include <executorch/runtime/core/hierarchical_allocator.h>
1212
#include <executorch/runtime/core/memory_allocator.h>
13+
#include <executorch/runtime/core/portable_type/device.h>
1314
#include <executorch/runtime/core/span.h>
1415
#include <executorch/runtime/platform/runtime.h>
16+
#include <executorch/test/utils/DeathTest.h>
1517
#include <executorch/test/utils/alignment.h>
1618

1719
#include <gtest/gtest.h>
@@ -22,6 +24,8 @@ using executorch::runtime::HierarchicalAllocator;
2224
using executorch::runtime::MemoryAllocator;
2325
using executorch::runtime::Result;
2426
using executorch::runtime::Span;
27+
using executorch::runtime::etensor::Device;
28+
using executorch::runtime::etensor::DeviceType;
2529

2630
class HierarchicalAllocatorTest : public ::testing::Test {
2731
protected:
@@ -87,6 +91,67 @@ TEST_F(HierarchicalAllocatorTest, Smoke) {
8791
}
8892
}
8993

94+
// An allocator built with the buffers-only constructor must report an empty
// device metadata span.
TEST_F(HierarchicalAllocatorTest, NoDeviceMetadataByDefault) {
95+
Span<Span<uint8_t>> empty_buffers{};
96+
HierarchicalAllocator allocator(empty_buffers);
97+
98+
EXPECT_EQ(allocator.planned_buffer_devices().size(), 0);
99+
}
100+
101+
// Device entries passed to the two-argument constructor must come back from
// planned_buffer_devices() with the same count, order, type, and index.
TEST_F(HierarchicalAllocatorTest, ExposesDeviceMetadataWhenProvided) {
102+
// Use 4 buffers so the device span size matches.
103+
constexpr size_t n_buffers = 4;
104+
uint8_t mem0[4];
105+
uint8_t mem1[4];
106+
uint8_t mem2[4];
107+
uint8_t mem3[4];
108+
Span<uint8_t> buffers[n_buffers]{
109+
{mem0, sizeof(mem0)},
110+
{mem1, sizeof(mem1)},
111+
{mem2, sizeof(mem2)},
112+
{mem3, sizeof(mem3)},
113+
};
114+
115+
// CPU buffers come first because the runtime always sets up host-side
116+
// planned memory before any device buffers. The two CUDA entries use
117+
// distinct device indices to verify per-buffer index tracking.
118+
Device devices[] = {
119+
Device(DeviceType::CPU, 0),
120+
Device(DeviceType::CPU, 0),
121+
Device(DeviceType::CUDA, 0),
122+
Device(DeviceType::CUDA, 1),
123+
};
124+
Span<const Device> device_span(devices, n_buffers);
125+
126+
HierarchicalAllocator allocator({buffers, n_buffers}, device_span);
127+
// ASSERT (not EXPECT) on the size so the indexed checks below never read
// past the end of the returned span.
128+
ASSERT_EQ(allocator.planned_buffer_devices().size(), n_buffers);
129+
EXPECT_EQ(allocator.planned_buffer_devices()[0], Device(DeviceType::CPU, 0));
130+
EXPECT_EQ(allocator.planned_buffer_devices()[1], Device(DeviceType::CPU, 0));
131+
EXPECT_EQ(allocator.planned_buffer_devices()[2], Device(DeviceType::CUDA, 0));
132+
EXPECT_EQ(allocator.planned_buffer_devices()[3], Device(DeviceType::CUDA, 1));
133+
}
134+
135+
// Constructing an allocator whose device span length differs from the buffer
// count must trip the constructor's ET_CHECK_MSG.
TEST_F(HierarchicalAllocatorTest, MismatchedDeviceCountAborts) {
136+
constexpr size_t n_buffers = 2;
137+
uint8_t mem0[4];
138+
uint8_t mem1[4];
139+
Span<uint8_t> buffers[n_buffers]{
140+
{mem0, sizeof(mem0)},
141+
{mem1, sizeof(mem1)},
142+
};
143+
144+
// 3 device entries vs 2 buffers — should abort.
145+
Device devices[] = {
146+
Device(DeviceType::CPU, 0),
147+
Device(DeviceType::CPU, 0),
148+
Device(DeviceType::CUDA, 0),
149+
};
150+
Span<const Device> device_span(devices, 3);
151+
// NOTE(review): the empty pattern presumably accepts any death message —
// confirm against the ET_EXPECT_DEATH definition.
152+
ET_EXPECT_DEATH(HierarchicalAllocator({buffers, n_buffers}, device_span), "");
153+
}
154+
90155
// TODO(T162089316): Tests the deprecated API. Remove this when removing the
91156
// API.
92157
TEST_F(HierarchicalAllocatorTest, DEPRECATEDSmoke) {

runtime/core/test/targets.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ def define_common_targets():
108108
],
109109
deps = [
110110
"//executorch/runtime/core:memory_allocator",
111+
"//executorch/test/utils:utils",
111112
],
112113
)
113114

runtime/executor/memory_manager.h

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010

1111
#include <executorch/runtime/core/hierarchical_allocator.h>
1212
#include <executorch/runtime/core/memory_allocator.h>
13+
#include <executorch/runtime/core/portable_type/device.h>
14+
#include <executorch/runtime/core/span.h>
1315

1416
namespace executorch {
1517
namespace runtime {
@@ -42,7 +44,8 @@ class MemoryManager final {
4244
* must agree with the corresponding
4345
* `MethodMeta::num_memory_planned_buffers()` and
4446
* `MethodMeta::memory_planned_buffer_size(N)` values, which are embedded
45-
* in the Program.
47+
* in the Program. For device-aware programs, the per-buffer device
48+
* metadata is supplied through the HierarchicalAllocator as well.
4649
* @param[in] temp_allocator The allocator to use when allocating temporary
4750
* data during kernel or delegate execution. Must outlive the Method that
4851
* uses it. May be `nullptr` if the Method does not use kernels or
@@ -105,6 +108,29 @@ class MemoryManager final {
105108
return temp_allocator_;
106109
}
107110

111+
/**
112+
* Returns per-buffer device metadata. One entry per planned memory buffer,
113+
* same count as planned_memory buffers. Empty if no device metadata was
114+
* provided (CPU-only program) or if `planned_memory` is null.
115+
*
116+
* This is a thin wrapper around
117+
* `HierarchicalAllocator::planned_buffer_devices()`.
118+
*/
119+
Span<const etensor::Device> planned_buffer_devices() const {
// Without a planned-memory allocator there is nothing to forward, so
// return an empty span instead of dereferencing a null pointer.
120+
if (planned_memory_ == nullptr) {
121+
return {};
122+
}
123+
return planned_memory_->planned_buffer_devices();
124+
}
125+
126+
/**
127+
* Returns true if any planned buffer has device metadata attached.
128+
* When false, the memory setup is CPU-only.
*
* Implemented as "`planned_buffer_devices()` is non-empty".
* NOTE(review): only the presence of metadata is checked, not the device
* type of each entry, so a metadata span consisting solely of CPU entries
* still yields true — confirm this is the intended meaning.
129+
*/
130+
bool has_device_memory() const {
131+
return planned_buffer_devices().size() > 0;
132+
}
133+
108134
private:
109135
MemoryAllocator* method_allocator_;
110136
HierarchicalAllocator* planned_memory_;

runtime/executor/test/memory_manager_test.cpp

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ using namespace ::testing;
1717
using executorch::runtime::HierarchicalAllocator;
1818
using executorch::runtime::MemoryAllocator;
1919
using executorch::runtime::MemoryManager;
20+
using executorch::runtime::Span;
21+
using executorch::runtime::etensor::Device;
22+
using executorch::runtime::etensor::DeviceType;
2023

2124
TEST(MemoryManagerTest, MinimalCtor) {
2225
MemoryAllocator method_allocator(0, nullptr);
@@ -93,3 +96,64 @@ TEST(MemoryManagerTest, CtorWithSameAllocator) {
9396
/*temp_allocator=*/&method_allocator),
9497
"cannot be the same");
9598
}
99+
100+
// A MemoryManager whose planned memory was built without device metadata
// must report no device memory and an empty device span.
TEST(MemoryManagerTest, ThreeArgCtorHasNoDeviceMemory) {
101+
MemoryAllocator method_allocator(0, nullptr);
102+
HierarchicalAllocator planned_memory({});
103+
MemoryAllocator temp_allocator(0, nullptr);
104+
105+
MemoryManager mm(&method_allocator, &planned_memory, &temp_allocator);
106+
107+
EXPECT_FALSE(mm.has_device_memory());
108+
EXPECT_EQ(mm.planned_buffer_devices().size(), 0);
109+
}
110+
111+
// MemoryManager must forward the device metadata held by its
// HierarchicalAllocator unchanged, while the three allocator accessors still
// return the pointers passed to the constructor.
TEST(MemoryManagerTest, DelegatesDeviceMetadataToHierarchicalAllocator) {
112+
MemoryAllocator method_allocator(0, nullptr);
113+
MemoryAllocator temp_allocator(0, nullptr);
114+
115+
// 4 buffers: cpu:0, cpu:0, cuda:0, cuda:1. CPU buffers come first because
116+
// the runtime always sets up host-side planned memory before any device
117+
// buffers. The two CUDA entries use distinct indices to verify per-buffer
118+
// index tracking.
119+
constexpr size_t n_buffers = 4;
120+
uint8_t mem0[4];
121+
uint8_t mem1[4];
122+
uint8_t mem2[4];
123+
uint8_t mem3[4];
124+
Span<uint8_t> buffers[n_buffers]{
125+
{mem0, sizeof(mem0)},
126+
{mem1, sizeof(mem1)},
127+
{mem2, sizeof(mem2)},
128+
{mem3, sizeof(mem3)},
129+
};
130+
Device devices[] = {
131+
Device(DeviceType::CPU, 0),
132+
Device(DeviceType::CPU, 0),
133+
Device(DeviceType::CUDA, 0),
134+
Device(DeviceType::CUDA, 1),
135+
};
136+
Span<const Device> device_span(devices, n_buffers);
137+
138+
HierarchicalAllocator planned_memory({buffers, n_buffers}, device_span);
139+
MemoryManager mm(&method_allocator, &planned_memory, &temp_allocator);
140+
141+
EXPECT_EQ(mm.method_allocator(), &method_allocator);
142+
EXPECT_EQ(mm.planned_memory(), &planned_memory);
143+
EXPECT_EQ(mm.temp_allocator(), &temp_allocator);
144+
EXPECT_TRUE(mm.has_device_memory());
145+
EXPECT_EQ(mm.planned_buffer_devices().size(), n_buffers);
146+
EXPECT_EQ(mm.planned_buffer_devices()[0], Device(DeviceType::CPU, 0));
147+
EXPECT_EQ(mm.planned_buffer_devices()[1], Device(DeviceType::CPU, 0));
148+
EXPECT_EQ(mm.planned_buffer_devices()[2], Device(DeviceType::CUDA, 0));
149+
EXPECT_EQ(mm.planned_buffer_devices()[3], Device(DeviceType::CUDA, 1));
150+
}
151+
152+
// A MemoryManager built from a method allocator alone (no planned memory
// argument) must report no device memory and an empty device span.
TEST(MemoryManagerTest, MinimalCtorHasNoDeviceMemory) {
153+
MemoryAllocator method_allocator(0, nullptr);
154+
155+
MemoryManager mm(&method_allocator);
156+
157+
EXPECT_FALSE(mm.has_device_memory());
158+
EXPECT_EQ(mm.planned_buffer_devices().size(), 0);
159+
}

runtime/executor/test/targets.bzl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ def define_common_targets(is_fbcode = False):
1919
"//executorch/exir/backend/test/...",
2020
"//executorch/runtime/backend/...",
2121
"//executorch/extension/pybindings/...",
22+
"//executorch/extension/module/test/...",
2223
"//executorch/devtools/fb/runners/...",
2324
"//executorch/test/...",
2425
"//executorch/examples/...",
@@ -326,6 +327,8 @@ def define_common_targets(is_fbcode = False):
326327
deps = [
327328
":managed_memory_manager",
328329
"//executorch/runtime/executor:program",
330+
"//executorch/runtime/core:device_allocator",
331+
"//executorch/runtime/core:device_memory_buffer",
329332
"//executorch/extension/data_loader:file_data_loader",
330333
"//executorch/schema:program",
331334
],

0 commit comments

Comments
 (0)