 #include <executorch/runtime/executor/tensor_parser.h>

 #include <executorch/extension/data_loader/file_data_loader.h>
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/core/device_memory_buffer.h>
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
 #include <executorch/runtime/executor/test/managed_memory_manager.h>
+#include <executorch/runtime/platform/runtime.h>
 #include <executorch/schema/program_generated.h>

 #include <gtest/gtest.h>

 using executorch::aten::Tensor;
+using executorch::runtime::DeviceAllocator;
+using executorch::runtime::DeviceMemoryBuffer;
 using executorch::runtime::Error;
+using executorch::runtime::HierarchicalAllocator;
+using executorch::runtime::MemoryAllocator;
+using executorch::runtime::MemoryManager;
+using executorch::runtime::MethodMeta;
 using executorch::runtime::Program;
 using executorch::runtime::Result;
+using executorch::runtime::Span;
+using executorch::runtime::get_device_allocator;
+using executorch::runtime::register_device_allocator;
 using executorch::runtime::deserialization::parseTensor;
 using executorch::runtime::testing::ManagedMemoryManager;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
 using torch::executor::util::FileDataLoader;

 constexpr size_t kDefaultNonConstMemBytes = 32 * 1024U;
@@ -50,15 +64,77 @@ class ProgramTestFriend final {

 using executorch::runtime::testing::ProgramTestFriend;

+namespace {
+
+/**
+ * Mock CUDA allocator that uses host memory for testing.
+ * Tracks the allocated range so tests can verify that a tensor's data_ptr
+ * falls within the "device" memory region.
+ */
+class MockCudaAllocator : public DeviceAllocator {
+ public:
+  // Host allocation stands in for cudaMalloc; only the most recent
+  // allocation is tracked, which is enough for these single-buffer tests.
+  Result<void*> allocate(size_t nbytes, DeviceIndex index) override {
+    allocate_count_++;
+    buffer_ = std::make_unique<uint8_t[]>(nbytes);
+    buffer_size_ = nbytes;
+    return static_cast<void*>(buffer_.get());
+  }
+
+  void deallocate(void* ptr, DeviceIndex index) override {
+    deallocate_count_++;
+    buffer_.reset();
+    buffer_size_ = 0;
+  }
+
+  // No-op copies: these tests check pointer provenance, not data movement.
+  Error copy_host_to_device(void*, const void*, size_t, DeviceIndex) override {
+    return Error::Ok;
+  }
+
+  Error copy_device_to_host(void*, const void*, size_t, DeviceIndex) override {
+    return Error::Ok;
+  }
+
+  DeviceType device_type() const override {
+    return DeviceType::CUDA;
+  }
+
+  // True iff ptr lies inside the most recent "device" allocation.
+  bool is_device_ptr(const void* ptr) const {
+    if (buffer_ == nullptr || buffer_size_ == 0) {
+      return false;
+    }
+    auto* p = static_cast<const uint8_t*>(ptr);
+    return p >= buffer_.get() && p < buffer_.get() + buffer_size_;
+  }
+
+  int allocate_count_ = 0;
+  int deallocate_count_ = 0;
+
+ private:
+  std::unique_ptr<uint8_t[]> buffer_;
+  size_t buffer_size_ = 0;
+};
+
+} // namespace
+
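+// File-scope instance so tests can reset counters and query is_device_ptr();
+// registered with the runtime in SetUpTestSuite().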
+static MockCudaAllocator g_mock_cuda;
+
 class TensorParserDeviceTest : public ::testing::Test {
  protected:
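+  // Register the mock CUDA allocator once for the whole suite, before any
+  // test runs.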
+  static void SetUpTestSuite() {
+    executorch::runtime::runtime_init();
+    register_device_allocator(DeviceType::CUDA, &g_mock_cuda);
+  }
+
   void SetUp() override {
     const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
     ASSERT_NE(path, nullptr)
         << "ET_MODULE_ADD_WITH_DEVICE_PATH env var not set";
     Result<FileDataLoader> loader = FileDataLoader::from(path);
     ASSERT_EQ(loader.error(), Error::Ok);
     loader_ = std::make_unique<FileDataLoader>(std::move(loader.get()));
+
+    // Reset mock counters so each test starts from a clean slate.
+    g_mock_cuda.allocate_count_ = 0;
+    g_mock_cuda.deallocate_count_ = 0;
   }

   std::unique_ptr<FileDataLoader> loader_;
@@ -169,3 +245,97 @@ TEST_F(TensorParserDeviceTest, NonDelegatedTensorsDefaultToCPU) {
         << " without device annotation should have device_index=0";
   }
 }
+TEST_F(TensorParserDeviceTest, CudaTensorDataPtrPointsToDeviceMemory) {
+  Result<Program> program =
+      Program::load(loader_.get(), Program::Verification::Minimal);
+  ASSERT_EQ(program.error(), Error::Ok);
+
+  Result<MethodMeta> method_meta = program->method_meta("forward");
+  ASSERT_EQ(method_meta.error(), Error::Ok);
+
+  // ModuleAddWithDevice has:
+  //   non_const_buffer_sizes: [0, 48] (index 0 reserved, buffer 0 = 48 bytes)
+  //   non_const_buffer_device: [{buffer_idx=1, device_type=CUDA}]
+  const size_t num_buffers = method_meta->num_memory_planned_buffers();
+  ASSERT_EQ(num_buffers, 1);
+
+  // Set up device-aware planned memory.
+  std::vector<Span<uint8_t>> planned_spans;
+  std::vector<std::vector<uint8_t>> cpu_buffers;
+  std::vector<DeviceMemoryBuffer> device_buffers;
+
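+  // Each planned buffer gets a backing store that matches its device:
+  // host memory (a std::vector) for CPU, a DeviceMemoryBuffer for CUDA.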
+  for (size_t i = 0; i < num_buffers; ++i) {
+    auto size = method_meta->memory_planned_buffer_size(i);
+    ASSERT_TRUE(size.ok());
+    auto device = method_meta->memory_planned_buffer_device(i);
+    ASSERT_TRUE(device.ok());
+
+    if (device->is_cpu()) {
+      cpu_buffers.emplace_back(size.get());
+      planned_spans.emplace_back(
+          cpu_buffers.back().data(), cpu_buffers.back().size());
+    } else {
+      cpu_buffers.emplace_back(); // empty placeholder keeps indices aligned
+      auto dmb = DeviceMemoryBuffer::create(
+          size.get(), device->type(), device->index());
+      ASSERT_TRUE(dmb.ok())
+          << "DeviceMemoryBuffer::create failed for buffer " << i;
+      planned_spans.emplace_back(dmb->as_span());
+      device_buffers.push_back(std::move(dmb.get()));
+    }
+  }
+
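+  // The single CUDA planned buffer should have triggered exactly one
+  // allocation through the registered mock allocator.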
+  ASSERT_EQ(g_mock_cuda.allocate_count_, 1);
+
+  // Build a HierarchicalAllocator with mixed CPU/device spans.
+  HierarchicalAllocator planned_memory(
+      {planned_spans.data(), planned_spans.size()});
+
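+  // The method allocator is scratch space for runtime structures that are
+  // not memory-planned; a plain host pool is fine here.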
+  constexpr size_t kMethodAllocBytes = 32 * 1024U;
+  auto method_alloc_pool = std::make_unique<uint8_t[]>(kMethodAllocBytes);
+  MemoryAllocator method_allocator(kMethodAllocBytes, method_alloc_pool.get());
+  MemoryManager memory_manager(&method_allocator, &planned_memory);
+
+  // Parse tensors and verify that CUDA tensors have device memory.
+  const executorch_flatbuffer::Program* internal_program =
+      ProgramTestFriend::GetInternalProgram(&program.get());
+  auto* execution_plan =
+      internal_program->execution_plan()->GetMutableObject(0);
+  auto* flatbuffer_values = execution_plan->values();
+
+  int cuda_with_device_memory = 0;
+
+  for (size_t i = 0; i < flatbuffer_values->size(); ++i) {
+    auto* serialization_value = flatbuffer_values->Get(i);
+    if (serialization_value->val_type() !=
+        executorch_flatbuffer::KernelTypes::Tensor) {
+      continue;
+    }
+
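+    // A tensor counts as CUDA-annotated when its extra_tensor_info names
+    // DeviceType::CUDA in the serialized program.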
+    auto* s_tensor = serialization_value->val_as_Tensor();
+    bool is_cuda = s_tensor->extra_tensor_info() != nullptr &&
+        s_tensor->extra_tensor_info()->device_type() ==
+            executorch_flatbuffer::DeviceType::CUDA;
+
+    Result<Tensor> tensor =
+        parseTensor(&program.get(), &memory_manager, s_tensor);
+    ASSERT_TRUE(tensor.ok())
+        << "parseTensor failed at index " << i << " with error 0x" << std::hex
+        << static_cast<uint32_t>(tensor.error());
+
+    Tensor t = tensor.get();
+
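+    // Count only tensors that both carry the CUDA annotation and report
+    // DeviceType::CUDA from the parsed TensorImpl; the final expectation
+    // catches any annotated tensor that landed in CPU memory.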
+    if (is_cuda &&
+        t.unsafeGetTensorImpl()->device_type() == DeviceType::CUDA) {
+      EXPECT_TRUE(g_mock_cuda.is_device_ptr(t.const_data_ptr()))
+          << "CUDA tensor at index " << i
+          << " should have data_ptr in device memory, but got CPU memory";
+      cuda_with_device_memory++;
+    }
+  }
+
+  // All 3 CUDA tensors (2 inputs + 1 output of the delegate) should have
+  // their data_ptr pointing to the mock device memory buffer.
+  EXPECT_EQ(cuda_with_device_memory, 3)
+      << "All 3 CUDA tensors should have data_ptr in device memory";
+}